#### Investigation of classification of forest cover type from the Covertype database

https://archive.ics.uci.edu/ml/datasets/Covertype

This consists of columns that describe the characteristics of 30x30 m cells of land, followed by a column containing the ID of the forest cover type found there. The aim of the exercise is to classify the forest cover type based on the characteristics of each cell.

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import sys
import time
import matplotlib.pylab as plt
%matplotlib inline
# Use the 'ggplot' style sheet for every figure drawn in this notebook.
plt.style.use('ggplot')

# Directory added to the import search path so that the GeneticAlgorithm
# module imported further down can be found.
# NOTE(review): this is an absolute, machine-specific path - adjust it when
# running the notebook anywhere else.
GENETIC_ALGORITHM_DIR = '/Users/rmartinshort/Documents/Berkeley/GDSO/GeneticAlgorithm'

# Guarded so that re-running this cell does not keep appending duplicates.
if GENETIC_ALGORITHM_DIR not in sys.path:
    sys.path.append(GENETIC_ALGORITHM_DIR)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from GeneticAlgorithm import GeneticAlgorithm

In [3]:
# Read 'covdata_labelled.csv' (path is relative to the working directory)
# into a DataFrame.
covdata = pd.read_csv('covdata_labelled.csv')

In [4]:
# Preview the first ten rows of the table.
covdata.head(10)

Unnamed: 0,elevation,aspect,slope,H_dist_to_water,V_dist_to_water,H_dist_to_road,hillshade9am,hillshade_12pm,hillshade3pm,H_dist_to_firepoints,...,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39,soil_type_40,cover_type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
5,2579,132,6,300,-15,67,230,237,140,6031,...,0,0,0,0,0,0,0,0,0,2
6,2606,45,7,270,5,633,222,225,138,6256,...,0,0,0,0,0,0,0,0,0,5
7,2605,49,4,234,7,573,222,230,144,6228,...,0,0,0,0,0,0,0,0,0,5
8,2617,45,9,240,56,666,223,221,133,6244,...,0,0,0,0,0,0,0,0,0,5
9,2612,59,10,247,11,636,228,219,124,6230,...,0,0,0,0,0,0,0,0,0,5


In [5]:
#tally the missing entries column by column (axis=0 sums down the rows)
covdata.isnull().sum(axis=0)

elevation               0
aspect                  0
slope                   0
H_dist_to_water         0
V_dist_to_water         0
H_dist_to_road          0
hillshade9am            0
hillshade_12pm          0
hillshade3pm            0
H_dist_to_firepoints    0
WA_1                    0
WA_2                    0
WA_3                    0
WA_4                    0
soil_type_1             0
soil_type_2             0
soil_type_3             0
soil_type_4             0
soil_type_5             0
soil_type_6             0
soil_type_7             0
soil_type_8             0
soil_type_9             0
soil_type_10            0
soil_type_11            0
soil_type_12            0
soil_type_13            0
soil_type_14            0
soil_type_15            0
soil_type_16            0
soil_type_17            0
soil_type_18            0
soil_type_19            0
soil_type_20            0
soil_type_21            0
soil_type_22            0
soil_type_23            0
soil_type_24            0
soil_type_25

In [10]:
#Function to do a grid search and find the best parameters for an input model

def test_model(model,X,Y,test_parameters,hold_out_fraction=0.3,treebased=False,
               random_state=None):

    '''
    Explore parameter space and report the best model and feature selection

    Input:
    model is an sklearn classifier object
    X is the input feature dataframe (its column labels and index are reused
    to build the reduced dataframe when treebased is True)
    Y is the target vector
    test_parameters is a dictionary to be fed into GridSearchCV. Because the
    search runs over a Pipeline, keys must carry the step name as a prefix,
    e.g. 'classify__n_estimators'
    hold_out_fraction is the fraction of the dataset to use as a test once the
    hyperparameters have been tuned
    treebased is used to identify whether or not a SelectFromModel feature
    selection step (threshold='mean') is placed in front of the classifier.
    This is intended for tree-based classifiers
    random_state is handed to train_test_split to make the train / hold-out
    split repeatable. The default (None) leaves the split unseeded

    Output:
    Always a list.
    If treebased is False: [best_classifier], the fitted classifier with the
    best hyperparameters found by the search.
    If treebased is True: [best_classifier, X_new], where X_new is the
    redacted input dataframe containing just the selected columns (same index
    as X).

    Side effects:
    Prints the search progress, the best cross-validation score and
    parameters, the hold-out score and a classification report.
    '''

    #Split into training and hold-out sets. We are going to do our search for optimal
    #hyperparameters on the training set and preserve the holdout set for testing
    #later
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=hold_out_fraction, random_state=random_state)

    #Build the pipeline: optional feature selection followed by the classifier
    if treebased:
        #Feature selector, ranking features with its own fit of the model
        selector = SelectFromModel(model, threshold='mean', prefit=False)
        CF_pipeline = Pipeline([('select', selector), ('classify', model)])
    else:
        CF_pipeline = Pipeline([('classify', model)])

    #Search parameter space (5-fold cross validation, 4 parallel jobs)
    grid_search = GridSearchCV(CF_pipeline, test_parameters, verbose=1, cv=5, n_jobs=4)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in CF_pipeline.steps])
    print("parameters:")
    print(test_parameters)
    t0 = time.time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time.time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(test_parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    #Test on the hold_out set. The refitted pipeline applies the feature
    #selection step itself (when present) before the classifier, so the same
    #two calls cover both the treebased and the plain case.
    best_pipeline = grid_search.best_estimator_
    best_classifier = best_pipeline.named_steps['classify']

    hold_out_score = best_pipeline.score(X_test, y_test)
    hold_out_prediction = best_pipeline.predict(X_test)

    print("Hold out score: %0.3f" %hold_out_score)

    #Do classification report on the hold_out dataset
    print(classification_report(y_test,hold_out_prediction))

    if not treebased:
        return [best_classifier]

    #Transform the original dataframe by removing all but the selected columns.
    #The boolean support mask picks the surviving column labels, and X's index
    #is carried over so that X_new still lines up row-for-row with Y.
    selector = best_pipeline.named_steps['select']
    support = selector.get_support()
    X_new = pd.DataFrame(selector.transform(X),
                         columns=X.columns[support],
                         index=X.index)

    return [best_classifier,X_new]

In [11]:
# Base estimator for the search. The fixed seed removes the forest's own
# run-to-run randomness, so a repeated search on the same data split gives
# the same fitted trees.
model = RandomForestClassifier(random_state=42)

# Hyperparameter grid for GridSearchCV. The 'classify__' prefix routes each
# setting to the pipeline step named 'classify' that test_model builds.
parameters = {
    'classify__n_estimators': (20, 50, 100),
    'classify__max_depth': (10, 20, None),
    'classify__criterion': ('gini', 'entropy'),
}

In [12]:
# Separate the label column from the predictor columns.
label_column = 'cover_type'
target = covdata[label_column]
X = covdata.drop([label_column], axis=1)

In [13]:
# Run the grid search with tree-based feature selection switched on; the
# result is unpacked into the tuned classifier and the reduced feature frame.
best_classifier, X_new = test_model(
    model,
    X,
    target,
    parameters,
    treebased=True,
)

Performing grid search...
pipeline: ['select', 'classify']
parameters:
{'classify__n_estimators': (20, 50, 100), 'classify__max_depth': (10, 20, None), 'classify__criterion': ('gini', 'entropy')}
Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 21.6min
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed: 50.9min finished


done in 3253.960s

Best score: 0.946
Best parameters set:
	classify__criterion: 'entropy'
	classify__max_depth: None
	classify__n_estimators: 100
Hold out score: 0.951
             precision    recall  f1-score   support

          1       0.96      0.95      0.95     63491
          2       0.95      0.97      0.96     85012
          3       0.94      0.95      0.95     10782
          4       0.89      0.83      0.86       834
          5       0.95      0.76      0.84      2840
          6       0.93      0.88      0.91      5290
          7       0.98      0.94      0.96      6055

avg / total       0.95      0.95      0.95    174304

