In [1]:
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KernelDensity
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import pandas as pd

In [2]:
#Importing Data into Pandas Dataframe
# header=None: the file is read as headerless so that the first sample is kept
# as data instead of being consumed as column names (the later rename of a
# column literally called '5' indicates the first row was being used as the
# header). NOTE(review): confirm the archive really has no header line.
cov_df2 = pd.read_csv('covtype.data.gz', header=None)
# The label is the last column; the following cell looks it up under the
# name '5', so that name is preserved here.
cov_df2 = cov_df2.rename(columns={cov_df2.columns[-1]: '5'})

# COVERTYPE DATASET

In [3]:
#Renaming the label column (it arrives under the name '5').
COV_LABEL_COL = 'Forest_cov_desig.'
COV_SAMPLE_SIZE = 30000   # target number of rows after down-sampling
COV_SEED = 42             # fixed seed so the sampled subset is reproducible
cov_df2 = cov_df2.rename(columns={'5': COV_LABEL_COL})

#Since the Dataset was way too big and my computer couldn't handle it, it is cut down
#to roughly the same 30000 row set as the paper. Every class is sampled with the same
#fraction g, so the class proportions of the full dataset are preserved.
#g is derived from the actual row count instead of a hard-coded total.
g = COV_SAMPLE_SIZE / len(cov_df2)

#Creating one down-sampled frame per class (same class order as before).
frames = [
    cov_df2[cov_df2[COV_LABEL_COL] == cover_type]
    .sample(frac=g, replace=False, random_state=COV_SEED)
    .reset_index(drop=True)
    for cover_type in (2, 1, 3, 7, 6, 5, 4)
]
#Per-class names kept available for any later cell that refers to them.
class_2, class_1, class_3, class_7, class_6, class_5, class_4 = frames

#Combined the per-class frames into a dataframe.
#ignore_index avoids the duplicate index labels left by the per-class reset_index.
cov_df2 = pd.concat(frames, ignore_index=True)
#Changed class to 1 and 0 using .map to have largest class as 1 and the rest as 0.
class_map = {1: 0, 2: 1, 3: 0,
             4: 0, 5: 0, 6: 0, 7: 0}
cov_df2['Class'] = cov_df2[COV_LABEL_COL].map(class_map)
cov_df2 = cov_df2.drop([COV_LABEL_COL], axis=1)
#Shuffling data (seeded) and converting into numpy array; 'Class' is the last column.
cov_df2 = shuffle(cov_df2, random_state=COV_SEED)
cov_arr = cov_df2.to_numpy()

In [4]:
#Accuracy function. Compares predicted labels vs actual labels and counts accuracy.
def prediction(predictions, Y_given):
    """Return the fraction of predicted labels that equal the true labels.

    Parameters
    ----------
    predictions : array-like
        Predicted labels, one per sample.
    Y_given : array-like
        True labels, in the same order and of the same length as `predictions`.

    Returns
    -------
    float
        Accuracy in [0, 1], computed as (number of matches) / (number of samples).

    Raises
    ------
    ValueError
        If the two inputs differ in shape (a pairwise loop would silently
        score only the common prefix) or if they are empty (accuracy is
        undefined for zero samples).
    """
    predicted = np.asarray(predictions)
    actual = np.asarray(Y_given)
    if predicted.shape != actual.shape:
        raise ValueError(
            "predictions and Y_given must have the same shape, got {} and {}".format(
                predicted.shape, actual.shape))
    if predicted.size == 0:
        raise ValueError("cannot compute accuracy of zero samples")
    # matches / total is exact for the ratio, unlike 1 - wrong/total which
    # adds a second rounding step.
    return float(np.count_nonzero(predicted == actual) / predicted.size)

# SVM

In [5]:
#Three lists to store training accuracy for each trial, test accuracy for each trial, and best parameters for each trial.
SVM_train_accuracy = []
SVM_test_accuracy = []
SVM_best_params = []

#Param-grid. It is the same for every trial, so it is built once outside the loop.
svm_C_values = [10**-7, 10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1]
parameters = [{'kernel': ['rbf'], 'gamma': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0], 'C': svm_C_values},
              {'kernel': ['poly'], 'degree': [2, 3], 'C': svm_C_values},
              {'kernel': ['linear'], 'C': svm_C_values}]

#For each Trial
for i in range(3):
    # Seeding the shuffle with the trial index gives each trial a different
    # but repeatable train/test partition.
    data_svm = shuffle(cov_arr, random_state=i)
    # Data splitting: the last column is the label, everything before it is a feature.
    X_svm = data_svm[:, 0:-1]
    Y_svm = data_svm[:, -1]
    # test_size = 5/6 keeps one sixth of the rows for training.
    X_train_svm, X_test_svm, Y_train_svm, Y_test_svm = train_test_split(X_svm, Y_svm, test_size = 5/6, random_state=42)
    # Scaling training data using StandardScaler (fitted on the training split only).
    scaler_svm = preprocessing.StandardScaler().fit(X_train_svm)
    X_train_svm = scaler_svm.transform(X_train_svm)
    # Setting SVM class, grid search parameters, and fitting training data.
    svc = svm.SVC(gamma = 'auto')
    grid_search_svm = GridSearchCV(svc, parameters, cv = 5, error_score = np.nan)
    grid_search2_svm = grid_search_svm.fit(X_train_svm, Y_train_svm)
    best_params_svm = grid_search2_svm.best_params_
    # Printing and storing best params into list.
    print("Best params: {}".format(best_params_svm))
    SVM_best_params.append('Trial ' + str(i + 1) + ': ' + str(best_params_svm))
    # Using best_estimator_ to predict training data labels, then the
    # prediction function to calculate training accuracy.
    train_predictions_svm = grid_search2_svm.best_estimator_.predict(X_train_svm)
    train_accuracy_svm = prediction(train_predictions_svm, Y_train_svm)
    print("Train accuracy: {}".format(train_accuracy_svm))
    SVM_train_accuracy.append('Trial ' + str(i + 1) + ': ' + str(train_accuracy_svm))
    # Scaling testing data with the training data scaler, so training and
    # testing data are scaled the same, then scoring the test split.
    X_test_svm = scaler_svm.transform(X_test_svm)
    test_predictions_svm = grid_search2_svm.best_estimator_.predict(X_test_svm)
    test_accuracy_svm = prediction(test_predictions_svm, Y_test_svm)
    print("Test accuracy: {}".format(test_accuracy_svm))
    SVM_test_accuracy.append('Trial ' + str(i + 1) + ': ' + str(test_accuracy_svm))

Best params: {'C': 0.1, 'kernel': 'linear'}
Train accuracy: 0.764
Test accuracy: 0.76076
Best params: {'C': 0.1, 'kernel': 'linear'}
Train accuracy: 0.7668
Test accuracy: 0.7628
Best params: {'C': 0.1, 'kernel': 'linear'}
Train accuracy: 0.7766
Test accuracy: 0.7587200000000001


# kNN

In [6]:
#Three lists to store training accuracy for each trial, test accuracy for each trial, and best parameters for each trial.
kNN_train_accuracy = []
kNN_test_accuracy = []
kNN_best_params = []

#Setting K parameters: 25 evenly spaced values from 1 to 500, truncated to integers.
p = (np.linspace(1,500,25))
p = p.astype('int64')

#Param-grid. It is the same for every trial, so it is built once outside the loop.
params = [{'weights' : ['uniform', 'distance'], 'metric' : ['minkowski'],'n_neighbors': p}]

#For each trial
for i in range(3):
    # Seeding the shuffle with the trial index gives each trial a different
    # but repeatable train/test partition.
    data_knn = shuffle(cov_arr, random_state=i)
    # Data splitting: the last column is the label, everything before it is a feature.
    X_knn = data_knn[:, 0:-1]
    Y_knn = data_knn[:, -1]
    # test_size = 5/6 keeps one sixth of the rows for training.
    X_train_knn, X_test_knn, Y_train_knn, Y_test_knn = train_test_split(X_knn, Y_knn, test_size = 5/6, random_state=42)
    # Scaling training data using StandardScaler (fitted on the training split only).
    scaler_knn = preprocessing.StandardScaler().fit(X_train_knn)
    X_train_knn = scaler_knn.transform(X_train_knn)
    # Initializing kNN class, grid search parameters and fitting training data.
    neighbor = KNeighborsClassifier()
    grid_search_knn = GridSearchCV(neighbor, params, cv=5, error_score = np.nan)
    grid_search_knn2 = grid_search_knn.fit(X_train_knn, Y_train_knn)
    # Printing and storing best parameters in list.
    best_params_knn = grid_search_knn2.best_params_
    print("Best params: {}".format(best_params_knn))
    kNN_best_params.append('Trial ' + str(i + 1) + ': ' + str(best_params_knn))
    # Using best_estimator_ to predict training data labels, then the
    # prediction function to calculate training accuracy.
    train_predictions_knn = grid_search_knn2.best_estimator_.predict(X_train_knn)
    train_accuracy_knn = prediction(train_predictions_knn, Y_train_knn)
    print("Train accuracy: {}".format(train_accuracy_knn))
    kNN_train_accuracy.append('Trial ' + str(i + 1) + ': ' + str(train_accuracy_knn))
    # Scaling testing data with the training data scaler, so training and
    # testing data are scaled the same, then scoring the test split.
    X_test_knn = scaler_knn.transform(X_test_knn)
    test_predictions_knn = grid_search_knn2.best_estimator_.predict(X_test_knn)
    test_accuracy_knn = prediction(test_predictions_knn, Y_test_knn)
    print("Test accuracy: {}".format(test_accuracy_knn))
    kNN_test_accuracy.append('Trial ' + str(i + 1) + ': ' + str(test_accuracy_knn))

Best params: {'metric': 'minkowski', 'n_neighbors': 1, 'weights': 'uniform'}
Train accuracy: 1.0
Test accuracy: 0.77932
Best params: {'metric': 'minkowski', 'n_neighbors': 1, 'weights': 'uniform'}
Train accuracy: 1.0
Test accuracy: 0.77568
Best params: {'metric': 'minkowski', 'n_neighbors': 1, 'weights': 'uniform'}
Train accuracy: 1.0
Test accuracy: 0.7782


# Random Forest

In [7]:
#Three lists to store training accuracy for each trial, test accuracy for each trial, and best parameters for each trial.
rf_train_accuracy = []
rf_test_accuracy = []
rf_best_params = []

#Param-grid. It is the same for every trial, so it is built once outside the loop.
params = [{'n_estimators' : [1024], 'max_features' : [1, 2, 4, 6, 8, 12, 16, 20]}]

#For each trial
for i in range(3):
    # Seeding the shuffle with the trial index gives each trial a different
    # but repeatable train/test partition.
    data_rf = shuffle(cov_arr, random_state=i)
    # Data splitting: the last column is the label, everything before it is a feature.
    X_rf = data_rf[:, 0:-1]
    Y_rf = data_rf[:, -1]
    # test_size = 5/6 keeps one sixth of the rows for training.
    X_train_rf, X_test_rf, Y_train_rf, Y_test_rf = train_test_split(X_rf, Y_rf, test_size = 5/6, random_state=42)
    # Initialize random forest class and fit training data with grid search.
    # The forest is seeded too: its bootstrap and feature sampling are random,
    # so without a seed the selected max_features and accuracies change between runs.
    forest = RandomForestClassifier(random_state=i)
    grid_search_rf = GridSearchCV(forest, params, cv=5, error_score = np.nan)
    grid_search_rf2 = grid_search_rf.fit(X_train_rf, Y_train_rf)
    # Printing and storing best params into list.
    best_params_rf = grid_search_rf2.best_params_
    print("Best params: {}".format(best_params_rf))
    rf_best_params.append('Trial ' + str(i + 1) + ': ' + str(best_params_rf))
    # Using best_estimator_ to predict training data, then the prediction
    # function to calculate training accuracy.
    train_predictions_rf = grid_search_rf2.best_estimator_.predict(X_train_rf)
    train_accuracy_rf = prediction(train_predictions_rf, Y_train_rf)
    print("Train accuracy: {}".format(train_accuracy_rf))
    rf_train_accuracy.append('Trial ' + str(i + 1) + ': ' + str(train_accuracy_rf))
    # Predicting test data with best_estimator_ and scoring it the same way.
    test_predictions_rf = grid_search_rf2.best_estimator_.predict(X_test_rf)
    test_accuracy_rf = prediction(test_predictions_rf, Y_test_rf)
    print("Test accuracy: {}".format(test_accuracy_rf))
    rf_test_accuracy.append('Trial ' + str(i + 1) + ': ' + str(test_accuracy_rf))

Best params: {'max_features': 6, 'n_estimators': 1024}
Train accuracy: 1.0
Test accuracy: 0.8218
Best params: {'max_features': 2, 'n_estimators': 1024}
Train accuracy: 1.0
Test accuracy: 0.81464
Best params: {'max_features': 12, 'n_estimators': 1024}
Train accuracy: 1.0
Test accuracy: 0.82608
