# Cover Type Classification Problem

In [1]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import math
from sklearn.svm import SVC

In [93]:
cols = ['elevation', 'aspect', 'slope', 'horizontal_distance_to_hydrology', 'vertical_distance_to_hydrology',
       'horizontal_distance_to_roadways', 'hillshade_9am', 'hillshade_noon', 'hillshade_3pm', 
       'horizontal_distance_to_fire_points', 'wilderness_area1', 'wilderness_area2','wilderness_area3','wilderness_area4',
        'soil_type1','soil_type2','soil_type3','soil_type4','soil_type5','soil_type6','soil_type7','soil_type8',
        'soil_type9','soil_type10','soil_type11','soil_type12','soil_type13','soil_type14','soil_type15','soil_type16','soil_type17',
        'soil_type18', 'soil_type19', 'soil_type20', 'soil_type21', 'soil_type22', 'soil_type23', 'soil_type24', 'soil_type25', 'soil_type26',
        'soil_type27', 'soil_type28', 'soil_type29', 'soil_type30', 'soil_type31', 'soil_type32', 'soil_type33', 'soil_type34', 'soil_type35', 
        'soil_type36', 'soil_type37', 'soil_type38', 'soil_type39', 'soil_type40', 'cover_type']
data = pd.read_csv('./COVTYPE/covtype.data', header = None, names = cols)

In [94]:
data.head()

Unnamed: 0,elevation,aspect,slope,horizontal_distance_to_hydrology,vertical_distance_to_hydrology,horizontal_distance_to_roadways,hillshade_9am,hillshade_noon,hillshade_3pm,horizontal_distance_to_fire_points,...,soil_type32,soil_type33,soil_type34,soil_type35,soil_type36,soil_type37,soil_type38,soil_type39,soil_type40,cover_type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


## 1. Import Data


In [95]:
# Types in each column
print(data.dtypes)

elevation                             int64
aspect                                int64
slope                                 int64
horizontal_distance_to_hydrology      int64
vertical_distance_to_hydrology        int64
horizontal_distance_to_roadways       int64
hillshade_9am                         int64
hillshade_noon                        int64
hillshade_3pm                         int64
horizontal_distance_to_fire_points    int64
wilderness_area1                      int64
wilderness_area2                      int64
wilderness_area3                      int64
wilderness_area4                      int64
soil_type1                            int64
soil_type2                            int64
soil_type3                            int64
soil_type4                            int64
soil_type5                            int64
soil_type6                            int64
soil_type7                            int64
soil_type8                            int64
soil_type9                      

In [96]:
# Check if there are any null variables
data.isnull().values.any()

False

## Data Preprocessing

In [97]:
# Method in Caruana Paper: Largest class(7) is positive and everything else is negative
def transform_type(covtype):
    if covtype == 7:
        return 1
    else:
        return 0

In [98]:
data['cover_type'] = data['cover_type'].apply(transform_type)

In [99]:
data.head()

Unnamed: 0,elevation,aspect,slope,horizontal_distance_to_hydrology,vertical_distance_to_hydrology,horizontal_distance_to_roadways,hillshade_9am,hillshade_noon,hillshade_3pm,horizontal_distance_to_fire_points,...,soil_type32,soil_type33,soil_type34,soil_type35,soil_type36,soil_type37,soil_type38,soil_type39,soil_type40,cover_type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,0
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,0
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,0
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,0
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,0


In [100]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

## Algorithm 1: Logistic Regression

In [115]:
def log_reg():
    # Logistic Regression binary classification
    C_list = [1e-4, 1e-3, 1e-2, 1e-1,1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8]
    scoring = ['accuracy', 'f1', 'roc_auc']
    parameters = {'C': C_list}
    classifier = LogisticRegression(n_jobs = -1)
    clf = GridSearchCV(classifier, parameters, return_train_score = True, n_jobs = -1, scoring = scoring, refit = False)
    clf.fit(X_train, y_train)
    
    # Find best parameters for each performance metric
    best_acc = 0
    best_C_acc = 0
    for i in range(len(clf.cv_results_['mean_test_accuracy'])):
        if clf.cv_results_['mean_test_accuracy'][i] > best_acc:
            best_C_acc = clf.cv_results_['param_C'][i]
        
    best_f1 = 0
    best_C_f1 = 0
    for j in range(len(clf.cv_results_['mean_test_f1'])):
        if clf.cv_results_['mean_test_f1'][j] > best_f1:
            best_C_f1 = clf.cv_results_['param_C'][j]
        
    best_auc = 0
    best_C_auc = 0
    for k in range(len(clf.cv_results_['mean_test_roc_auc'])):
        if clf.cv_results_['mean_test_roc_auc'][k] > best_auc:
            best_C_auc = clf.cv_results_['param_C'][k]
    print(clf.cv_results_['mean_test_f1'])  
    # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
    best_model_acc = LogisticRegression(C=best_C_acc, n_jobs = -1, max_iter = 5000)
    best_model_f1 = LogisticRegression(C=best_C_f1, n_jobs = -1, max_iter = 5000)
    best_model_auc = LogisticRegression(C=best_C_auc, n_jobs = -1, max_iter = 5000)
    best_model_acc.fit(X_train, y_train)
    best_model_f1.fit(X_train, y_train)
    best_model_auc.fit(X_train, y_train)
    
    # Find and store accuracy, FSC, and AUC of the 3 models from previous line on test set
    y_pred1 = best_model_acc.predict(X_test)
    y_pred2 = best_model_f1.predict(X_test)
    y_pred3 = best_model_auc.predict(X_test)
    
    metrics_acc = accuracy_score(y_test, y_pred1)
    metrics_f1 = f1_score(y_test, y_pred2)
    metrics_auc = roc_auc_score(y_test, y_pred3)
    
    
    
    
    
    #Extract training and validation accuracies and plot as heat maps to visualize best C parameter
    #train_acc = clf.cv_results_['mean_train_score'].reshape(-1, 1)
    #draw_heatmap(train_acc, C_list, 'Training ')
    
    #val_acc = clf.cv_results_['mean_test_score'].reshape(-1, 1)
    #draw_heatmap(val_acc, C_list, 'Validation ')
    
    # Use best parameters to define best model and fit to all training data
    #best_model = LogisticRegression(C=clf.best_params_['C'])
    #best_model.fit(X_train, y_train)
    
    # Find test accuracy and generalization error
    #for i,j in enumerate(C_list):
        #if j == clf.best_params_['C']:
            #best_train_acc = train_acc[i]
            
    #test_acc = best_model.score(X_test, y_test)
    #return test_acc, best_train_acc, clf.best_params_['C']
    
    return metrics_acc, metrics_f1, metrics_auc

In [116]:
LRD2 = np.zeros((5, 3))
for trial in range(5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 5000)
    metrics_acc, metrics_f1, metrics_auc = log_reg()
    LRD2[trial][0] = metrics_acc
    LRD2[trial][1] = metrics_f1
    LRD2[trial][2] = metrics_auc
LRD2 = pd.DataFrame(LRD2, columns=['accuracy', 'f1', 'roc_auc'])

[0.07100617 0.10644274 0.16247693 0.08730955 0.19672445 0.16089947
 0.1975425  0.17472829 0.0936643  0.15767883 0.22446118 0.15890605
 0.1828605 ]
[0.07289945 0.10236767 0.13905477 0.16808486 0.14223017 0.19802566
 0.18638243 0.16849765 0.17792331 0.16154165 0.15786779 0.17641726
 0.16976005]
[0.09473684 0.15872171 0.29749529 0.29097608 0.23504867 0.26925143
 0.2047619  0.20521777 0.18299822 0.27462558 0.19303303 0.21236141
 0.18789284]
[0.04297455 0.09820371 0.12350797 0.16898228 0.12620388 0.14399416
 0.16533418 0.16385926 0.19071556 0.14095093 0.14172323 0.14982163
 0.18692151]
[0.06575758 0.14877822 0.15156806 0.20880974 0.23045916 0.23208461
 0.25105101 0.18683306 0.2292327  0.17153571 0.18929308 0.18920896
 0.18882249]


In [118]:
LRD2

Unnamed: 0,accuracy,f1,roc_auc
0,0.976502,0.611677,0.758588
1,0.975495,0.608949,0.76613
2,0.975527,0.607927,0.76438
3,0.976035,0.604696,0.755883
4,0.97474,0.562879,0.727295
