# Dataset: 
https://archive.ics.uci.edu/ml/datasets/covertype

## Importing Required Libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import time
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn import ensemble
from sklearn import cross_validation
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.model_selection import cross_val_predict



## Reading Downloaded data

In [2]:
data = pd.read_csv('covtype.data',header=None) # This is the base file downloaded

#data = pd.read_csv('covtype_reduced.csv',header=0) #This file contains data reduced through Reverse 'One-Hot' Encoding for Soil type and wilderness area feature

## Split data as 80/20 for training and testing

In [3]:
#data split for Base Data
train_x, test_x, train_y, test_y = train_test_split(data.iloc[:,0:54].values,
                                                        data.iloc[:,54:55].values.ravel(), train_size=0.8, random_state=44)

#data split while using reduced features
#train_x, test_x, train_y, test_y = train_test_split(data.iloc[:,0:12].values,
#                                                        data.iloc[:,12:13].values.ravel(), train_size=0.8, random_state=44)



## Normalization using standardscaler

In [4]:
scaler = StandardScaler()
scaled_train_x = scaler.fit_transform(train_x)
scaled_test_x = scaler.transform(test_x)
print(scaled_train_x.shape)
print(scaled_test_x.shape)



(464809, 54)
(116203, 54)


## Dimensionality Reduction using PCA

In [5]:
pca=PCA(.95) #scikit-learn choose the minimum number of principal components such that 95% of the variance is retained

pca.fit(scaled_train_x)

print(pca.n_components_)

train_x_pca = pca.transform(scaled_train_x)
test_x_pca = pca.transform(scaled_test_x)

print(train_x_pca.shape)
print(test_x_pca.shape)

43
(464809, 43)
(116203, 43)


## SVM

### Grid Search to tune parameters for SVM

In [None]:
start_time = time.time()

#Grid Search
parameters = {'multi_class':('crammer_singer','ovr'),'C':[1, 0.1], 'loss':('hinge','squared_hinge')}

C = 1  # SVM regularization parameter
model = LinearSVC(random_state=0)
cls = GridSearchCV(model, parameters)
#print(cls)
cls.fit(scaled_train_x, train_y)

print('Best parameters for the model', cls.best_params_)

y = cls.predict(scaled_test_x)

print('test accuracy SVM: ', metrics.accuracy_score(test_y.ravel(),y))
print('train accuracy SVM: ', cls.score(scaled_train_x, train_y))
    
print("computation time=--- %s seconds ---" % (time.time() - start_time))
#C=1 test accuracy SVM:  0.711471079531
#C=1 train accuracy SVM:  0.708537005164

Best parameters for the model {'C': 1, 'multi_class': 'crammer_singer', 'loss': 'hinge'}

test accuracy SVM:  0.72369837054

train accuracy SVM:  0.722788296041

### SVM Model with Tuned Parameters

In [6]:
start_time = time.time()

C = 1  # SVM regularization parameter
cls = LinearSVC(random_state=0, multi_class= 'crammer_singer', loss = 'hinge')
cls.fit(scaled_train_x, train_y)

y = cls.predict(scaled_test_x)

print(metrics.classification_report(test_y.ravel(),y))
print('test accuracy SVM: ', metrics.accuracy_score(test_y.ravel(),y))
print('train accuracy SVM: ', cls.score(scaled_train_x, train_y))

print("computation time=--- %s seconds ---" % (time.time() - start_time))

             precision    recall  f1-score   support

          1       0.71      0.70      0.71     42219
          2       0.75      0.80      0.77     56617
          3       0.66      0.87      0.75      7120
          4       0.51      0.42      0.46       554
          5       0.20      0.01      0.01      1992
          6       0.57      0.09      0.15      3518
          7       0.70      0.54      0.61      4183

avg / total       0.71      0.72      0.71    116203

test accuracy SVM:  0.723131072348
train accuracy SVM:  0.724256630143
computation time=--- 1649.6820979118347 seconds ---


## Random Forest

### Grid Search to find optimal parameters for Random Forest Model

In [None]:
#Grid Search
start_time = time.time()

parameters = {'class_weight':('balanced', 'balanced_subsample'),'n_estimators':[250, 300, 350], 'n_jobs':[-1,2]}

model = ensemble.RandomForestClassifier(random_state=44)
cls = GridSearchCV(model, parameters)
cls.fit(train_x, train_y)

print('Best parameters for the model', cls.best_params_)

y = cls.predict(test_x)

print('test accuracy', metrics.accuracy_score(test_y.ravel(),y))

y_train_rf = cls.predict(train_x)
print(metrics.classification_report(train_y.ravel(),y_train_rf))
print('train accuracy', metrics.accuracy_score(train_y.ravel(),y_train_rf))

Best parameters for the model {'class_weight': 'balanced', 'n_estimators': 300, 'n_jobs': -1}

test accuracy SVM: 0.72369837054

train accuracy SVM: 0.722788296041

### Random Forest Model with the Tuned Parameters

In [7]:
start_time = time.time()

# Train and predict with the random forest classifier
rf = ensemble.RandomForestClassifier(n_estimators=300, class_weight = 'balanced', n_jobs=-1, random_state=44)
rf.fit(train_x,train_y.ravel())
y = rf.predict(test_x)
print(metrics.classification_report(test_y.ravel(),y))
print('test accuracy', metrics.accuracy_score(test_y.ravel(),y))

y_train_rf = rf.predict(train_x)
print(metrics.classification_report(train_y.ravel(),y_train_rf))
print('train accuracy', metrics.accuracy_score(train_y.ravel(),y_train_rf))

print("computation time=--- %s seconds ---" % (time.time() - start_time))

             precision    recall  f1-score   support

          1       0.97      0.95      0.96     42219
          2       0.95      0.98      0.96     56617
          3       0.94      0.96      0.95      7120
          4       0.92      0.88      0.90       554
          5       0.94      0.79      0.86      1992
          6       0.94      0.90      0.92      3518
          7       0.98      0.95      0.96      4183

avg / total       0.96      0.96      0.96    116203

test accuracy 0.95676531587
             precision    recall  f1-score   support

          1       1.00      1.00      1.00    169621
          2       1.00      1.00      1.00    226684
          3       1.00      1.00      1.00     28634
          4       1.00      1.00      1.00      2193
          5       1.00      1.00      1.00      7501
          6       1.00      1.00      1.00     13849
          7       1.00      1.00      1.00     16327

avg / total       1.00      1.00      1.00    464809

train accura

### Random Forest with K-fold cross validation

In [None]:
#k fold cross valdation on random forest
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.model_selection import cross_val_predict

start_time = time.time()
k_fold = KFold(len(data.iloc[:,54:55].values.ravel()), n_folds=10, shuffle=True, random_state=44)

cls = ensemble.RandomForestClassifier(n_estimators=300,class_weight='balanced',n_jobs=-1, random_state=44) #Ramdom Forest
print (cross_val_score(cls, data.iloc[:,0:54].values, data.iloc[:,54:55].values.ravel(), cv=k_fold, n_jobs=1))

print("computation time=--- %s seconds ---" % (time.time() - start_time))

## Extra Trees Classifier

In [7]:
from sklearn.ensemble import ExtraTreesClassifier
start_time = time.time()

# Train and predict with the random forest classifier
forest = ExtraTreesClassifier(n_estimators=100,
                              random_state=0)

forest.fit(train_x, train_y.ravel())
importances = forest.feature_importances_
y = forest.predict(test_x)

print(metrics.classification_report(test_y.ravel(),y))
print('test accuracy', metrics.accuracy_score(test_y.ravel(),y))

y_train_rf = forest.predict(train_x)
print(metrics.classification_report(train_y.ravel(),y_train_rf))
print('train accuracy', metrics.accuracy_score(train_y.ravel(),y_train_rf))

print("computation time=--- %s seconds ---" % (time.time() - start_time))

             precision    recall  f1-score   support

          1       0.95      0.94      0.95     42219
          2       0.95      0.97      0.96     56617
          3       0.94      0.96      0.95      7120
          4       0.92      0.86      0.89       554
          5       0.94      0.77      0.84      1992
          6       0.94      0.89      0.91      3518
          7       0.97      0.95      0.96      4183

avg / total       0.95      0.95      0.95    116203

test accuracy 0.950035713364
             precision    recall  f1-score   support

          1       1.00      1.00      1.00    169621
          2       1.00      1.00      1.00    226684
          3       1.00      1.00      1.00     28634
          4       1.00      1.00      1.00      2193
          5       1.00      1.00      1.00      7501
          6       1.00      1.00      1.00     13849
          7       1.00      1.00      1.00     16327

avg / total       1.00      1.00      1.00    464809

train accur

### k fold cross valdation on Extra Trees

In [None]:
start_time = time.time()
k_fold = KFold(len(data.iloc[:,54:55].values.ravel()), n_folds=10, shuffle=True, random_state=44)

cls = ExtraTreesClassifier(n_estimators=100,random_state=0)
print (cross_val_score(cls, data.iloc[:,0:54].values, data.iloc[:,54:55].values.ravel(), cv=k_fold, n_jobs=1))

print("computation time=--- %s seconds ---" % (time.time() - start_time))

In [None]:
[ 0.95969158  0.95924409  0.95858935  0.96099895  0.95919175  0.96020723
  0.95778042  0.95752225  0.95946713  0.95907127]
computation time=--- 2531.3313714903211 seconds ---

## Logistic Regression

#### Due to the system limitations, grid search was not performed in Logistic Regression but were executed separately to get the accuracy of each parameters

### Logistic regression without scaling and pca #Multinomial

In [22]:
start_time = time.time()

# Train multinomial logistic regression model
multinomial_class = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg', max_iter = 100).fit(train_x, train_y.values.ravel())
 
# predict and calculate accuracy in training data
prob = multinomial_class.predict(train_x)
prob_test = multinomial_class.predict(test_x)

print ('Training Accuracy without scaling and pca: ',metrics.accuracy_score(train_y, prob)) 
print ('Testing Accuracy without scalng and pca: ',metrics.accuracy_score(test_y, prob_test)) 

print(prob_test)
print(test_y)

print("computation time=--- %s seconds ---" % (time.time() - start_time))



Training Accuracy without scaling and pca:  0.732228915663
Testing Accuracy without scalng and pca:  0.574958943798
[2 2 2 ..., 3 3 3]
[2 2 2 ..., 3 3 3]
computation time=--- 63.02908754348755 seconds ---


### Logistic regression without scaling and pca #ovr

In [28]:
start_time = time.time()

# Train multinomial logistic regression model
ovr_class = linear_model.LogisticRegression(multi_class='ovr', solver='newton-cg', max_iter = 100).fit(train_x, train_y.values.ravel())

# predict and calculate accuracy in training data
prob = ovr_class.predict(train_x)
prob_test = ovr_class.predict(test_x)

print ('Training Accuracy OVR without pca and scaling: ',metrics.accuracy_score(train_y, prob)) 
print ('Testing Accuracy OVR without pca and scaling: ',metrics.accuracy_score(test_y, prob_test)) 

print(prob_test)

print("computation time=--- %s seconds ---" % (time.time() - start_time))

Training Accuracy OVR without pca and scaling:  0.705388665022
Testing Accuracy OVR without pca and scaling:  0.705850967703
[3 2 2 ..., 2 3 2]
        54
240609   3
333022   1
375315   2
453031   1
229408   2
381877   2
290574   2
519153   1
365669   1
45099    1
232648   1
75915    2
127025   1
248350   1
570632   2
541677   1
170763   2
547630   2
530847   1
121703   1
356223   1
525117   1
438716   1
284025   3
423295   1
466296   2
547393   2
140887   2
67789    1
195806   2
...     ..
37387    1
474215   1
221515   2
386201   2
37220    2
541717   1
444714   1
564448   1
65279    2
574351   6
318678   3
47317    2
517505   7
114503   2
259769   2
61739    2
539836   2
244692   2
321509   1
9497     7
54265    1
569008   2
82938    1
126763   2
370942   2
305999   2
526794   2
91835    2
371636   3
172344   2

[116203 rows x 1 columns]
computation time=--- 248.20125651359558 seconds ---


### Logistic regression with scaling and without pca #Multinomial

In [7]:
start_time = time.time()

# Train multinomial logistic regression model
multiscal_class = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg', max_iter = 100).fit(scaled_train_x, train_y)

# predict and calculate accuracy in training data
prob = multiscal_class.predict(scaled_train_x)
prob_test = multiscal_class.predict(scaled_test_x)

print ('Training Accuracy multinomial scaled data without pca: ',metrics.accuracy_score(train_y, prob)) 
print ('Testing Accuracy multinomial scaled data without pca: ',metrics.accuracy_score(test_y, prob_test)) 

print(prob_test)

print("computation time=--- %s seconds ---" % (time.time() - start_time))



Training Accuracy multinomial scaled data without pca:  0.72515377284
Testing Accuracy multinomial scaled data without pca:  0.723294579314
[2 2 2 ..., 1 1 2]
computation time=--- 2873.342022418976 seconds ---


### Logistic regression with scaling and without pca #ovr

In [37]:
start_time = time.time()

# Train multinomial logistic regression model
ovrscal_class = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', max_iter = 100).fit(scaled_train_x, train_y.values.ravel())

# predict and calculate accuracy in training data
prob = ovrscal_class.predict(scaled_train_x)
prob_test = ovrscal_class.predict(scaled_test_x)

print ('Training Accuracy OVR scaled data without pca: ',metrics.accuracy_score(train_y, prob)) 
print ('Testing Accuracy OVR scaled data without pca: ',metrics.accuracy_score(test_y, prob_test)) 

print(prob_test)

print("computation time=--- %s seconds ---" % (time.time() - start_time))

Training Accuracy OVR scaled data without pca:  0.715872541194
Testing Accuracy OVR scaled data without pca:  0.713492766968
computation time=--- 256.93520402908325 seconds ---


### Logistic regression with scaling and pca #Multinomial

In [34]:
start_time = time.time()

# Train multinomial logistic regression model
multiscalpca_class = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg', max_iter = 100).fit(train_x_pca, train_y.values.ravel())

# predict and calculate accuracy in training data
prob = multiscalpca_class.predict(train_x_pca)
prob_test = multiscalpca_class.predict(test_x_pca)

print ('Training Accuracy multinomial scaled data &  pca: ',metrics.accuracy_score(train_y, prob)) 
print ('Testing Accuracy multinomial scaled data & pca: ',metrics.accuracy_score(test_y, prob_test)) 

print(prob_test)

print("computation time=--- %s seconds ---" % (time.time() - start_time))

Training Accuracy multinomial scaled data &  pca:  0.7225785215
Testing Accuracy multinomial scaled data & pca:  0.722184453069
[3 2 2 ..., 2 3 2]
        54
240609   3
333022   1
375315   2
453031   1
229408   2
381877   2
290574   2
519153   1
365669   1
45099    1
232648   1
75915    2
127025   1
248350   1
570632   2
541677   1
170763   2
547630   2
530847   1
121703   1
356223   1
525117   1
438716   1
284025   3
423295   1
466296   2
547393   2
140887   2
67789    1
195806   2
...     ..
37387    1
474215   1
221515   2
386201   2
37220    2
541717   1
444714   1
564448   1
65279    2
574351   6
318678   3
47317    2
517505   7
114503   2
259769   2
61739    2
539836   2
244692   2
321509   1
9497     7
54265    1
569008   2
82938    1
126763   2
370942   2
305999   2
526794   2
91835    2
371636   3
172344   2

[116203 rows x 1 columns]
computation time=--- 1604.5519785881042 seconds ---


### Logistic regression with scaling and pca #OVR

In [37]:
start_time = time.time()

# Train multinomial logistic regression model
ovrscalpca_class = linear_model.LogisticRegression(multi_class='ovr', solver='newton-cg', max_iter = 200).fit(train_x_pca, train_y.values.ravel())

# predict and calculate accuracy in training data
prob = ovrscalpca_class.predict(train_x_pca)
prob_test = ovrscalpca_class.predict(test_x_pca)

print ('Training Accuracy OVR scaled data &  pca: ',metrics.accuracy_score(train_y, prob)) 
print ('Testing Accuracy OVR scaled data & pca: ',metrics.accuracy_score(test_y, prob_test)) 

print(prob_test)

print("computation time=--- %s seconds ---" % (time.time() - start_time))

Training Accuracy OVR scaled data &  pca:  0.705388665022
Testing Accuracy OVR scaled data & pca:  0.705850967703
[3 2 2 ..., 2 3 2]
        54
240609   3
333022   1
375315   2
453031   1
229408   2
381877   2
290574   2
519153   1
365669   1
45099    1
232648   1
75915    2
127025   1
248350   1
570632   2
541677   1
170763   2
547630   2
530847   1
121703   1
356223   1
525117   1
438716   1
284025   3
423295   1
466296   2
547393   2
140887   2
67789    1
195806   2
...     ..
37387    1
474215   1
221515   2
386201   2
37220    2
541717   1
444714   1
564448   1
65279    2
574351   6
318678   3
47317    2
517505   7
114503   2
259769   2
61739    2
539836   2
244692   2
321509   1
9497     7
54265    1
569008   2
82938    1
126763   2
370942   2
305999   2
526794   2
91835    2
371636   3
172344   2

[116203 rows x 1 columns]
computation time=--- 266.72931575775146 seconds ---


## Naive Bayes

### k-Fold cross validation for Naive Bayes Algorithm with optimal parameters

#### Naive bayes have been executed for Gaussian and Bernoulli algorithm

In [9]:
def getmetrics(y_true, y_pred, algo,target_names):
        from sklearn.metrics import explained_variance_score
        from sklearn.metrics import mean_squared_error
        from sklearn.metrics import mean_absolute_error
        from sklearn.metrics import accuracy_score
        #from sklearn.metrics import confusion_matrix
        from sklearn.metrics import classification_report
        from math import sqrt

In [18]:
target_names=['1 - Spruce/Fir', '2 - Lodgepole Pine', '3 - Ponderosa Pine', '4 - Cottonwood/Willow', '5 – Aspen', '6 - Douglas-fir', '7 – Krummholz']

gnb = GaussianNB()
y_gnb = gnb.fit(train_x, train_y).predict(test_x)

getmetrics(test_y,y_gnb,'GaussianNB',target_names)

st=time.time()

##k fold for Gaussian
k_fold = KFold(data.shape[0], n_folds=10, shuffle=True, random_state=44)
cls = GaussianNB()
print(cross_val_score(cls, data.iloc[:,0:54].values, data.iloc[:,54].values, cv=k_fold, n_jobs=1))


bnb = BernoulliNB()
y_bnb = bnb.fit(train_x, train_y).predict(test_x)

getmetrics(test_y,y_bnb,'BernoulliNB',target_names)

st=time.time()
##k fold for Bernoulli
k_fold = KFold(data.shape[0], n_folds=10, shuffle=True, random_state=44)
cls = BernoulliNB()
print(cross_val_score(cls, data.iloc[:,0:54].values, data.iloc[:,54].values, cv=k_fold, n_jobs=1))


[ 0.45900313  0.45702385  0.45877007  0.45797835  0.45961343  0.46355484
  0.45761691  0.45949295  0.45844306  0.46012977]
[ 0.6325944   0.62832605  0.63558286  0.63434364  0.6332249   0.62969656
  0.63012685  0.63270856  0.62935233  0.62900811]
