Draft code for comparing different methods/models using nested 5-fold cross-validation (CV).

Read in needed packages.

In [None]:
import csv
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

Read in test and training data.

In [None]:
#set working directory
# The data directory can be overridden through the IST557_DATA_DIR environment
# variable so the notebook can run on machines other than the original
# author's; when the variable is unset the original absolute path is used.
data_dir = os.environ.get('IST557_DATA_DIR', '/Users/brinberg/Desktop/ist-557-final/data')
os.chdir(data_dir)

In [None]:
#reading in data
# Load the training CSV from the working directory and preview the first rows.
# pandas is imported under the alias `pd`, so the call must go through `pd`
# (the bare name `pandas` is not defined in this notebook).
train = pd.read_csv('train_full_feature.csv', encoding='utf-8')
train.head(n = 10)

In [None]:
#reading in data
# Load the test CSV from the working directory and preview the first rows.
# pandas is imported under the alias `pd`, so the call must go through `pd`
# (the bare name `pandas` is not defined in this notebook).
test = pd.read_csv('test_full_feature.csv', encoding='utf-8')
test.head(n = 10)

In [None]:
#convert both data sets to numpy arrays
# DataFrame.as_matrix() was deprecated and then removed from pandas (1.0);
# DataFrame.to_numpy() is the supported replacement and yields the same
# 2-D ndarray of the frame's values.
train1 = train.to_numpy()
test1 = test.to_numpy()

Separate features from outcome and standardize training and test data.

In [None]:
# Feature matrix: every row of the training array, columns 0 through 499.
# The column count below must be updated to match the real number of feature
# columns; a user_id column (if one is present) may also need to be dropped.
n_feature_cols = 500
pred = train1[:, :n_feature_cols]

In [None]:
# Outcome vector: every row of the training array, column index 500.
# Update the index below if the number of columns changes.
outcome_col = 500
outcome = train1[:, outcome_col]

Scale the features of the training and test data sets.

In [None]:
# Standardize each training feature column (axis 0) to zero mean and unit
# variance; the keyword arguments spell out scale()'s defaults explicitly.
pred_scaled = preprocessing.scale(
    pred,
    axis=0,
    with_mean=True,
    with_std=True,
)

In [None]:
# Standardize the test features with the mean and standard deviation learned
# from the *training* features. Scaling the test set with its own statistics
# (as preprocessing.scale(test1) did) puts train and test on different scales,
# so a model fitted on pred_scaled would see shifted inputs at prediction time.
# NOTE(review): assumes test1 holds the same feature columns, in the same
# order, as pred -- confirm (drop any id/outcome column first if present).
testpred_scaled = preprocessing.StandardScaler().fit(pred).transform(test1)

Creating folds that will be used to evaluate all algorithms.

In [None]:
# Build the 5 outer folds from a seeded random permutation of the rows, so the
# folds no longer follow the row order of the input file and stay reproducible
# across runs. The same permutation is applied to the features and the outcome
# so rows stay aligned. (numpy is imported as `np`; the bare name `numpy` is
# not defined in this notebook.)
fold_rng = np.random.RandomState(0)
shuffled_idx = fold_rng.permutation(len(outcome))
X_folds = np.array_split(pred_scaled[shuffled_idx], 5)
y_folds = np.array_split(outcome[shuffled_idx], 5)

Evaluating Random Forest.

In [None]:
# Base random-forest estimator with a fixed seed; the remaining
# hyper-parameters are supplied by the grid search.
rf_clf = RandomForestClassifier(
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the random forest. Only n_estimators and
# max_features take more than one value; the other entries are single-value
# lists that pin that parameter for every candidate.
rf_param_grid = dict(
    n_estimators=[10, 50, 100],
    criterion=["gini"],
    max_features=[2, 3, 4, 5, 6],
    min_samples_split=[10],
    max_depth=[None],
    min_samples_leaf=[10],
    max_leaf_nodes=[None],
)

In [None]:
# Inner loop of the nested CV: a 5-fold grid search over rf_param_grid.
rf_model = GridSearchCV(
    estimator=rf_clf,
    param_grid=rf_param_grid,
    cv=5,
)

In [None]:
#nested cross-validation
# Outer loop: each of the 5 folds is held out once as the test set while the
# grid search (inner 5-fold CV) is fitted on the remaining four folds.
scores = list()

for k in range(5):
    # Hold out fold k; concatenate every other fold into the training set.
    X_test, y_test = X_folds[k], y_folds[k]
    X_train = np.concatenate([fold for i, fold in enumerate(X_folds) if i != k])
    y_train = np.concatenate([fold for i, fold in enumerate(y_folds) if i != k])

    # Fit the grid search once and reuse its held-out score for both the
    # scores list and the printout (a second fit only repeats the search).
    fold_score = rf_model.fit(X_train, y_train).score(X_test, y_test)
    scores.append(fold_score)

    print("Fold:")
    print(k)
    print()
    print("Accuracy:")
    print(fold_score)
    print()
    print("Best parameters set found on development set:")
    print()
    print(rf_model.best_params_)
    print()

Evaluate a multi-class SVM. Open question: should we use a one-vs-one or a one-vs-rest approach? Going with one-vs-rest for now.

In [None]:
# Base linear SVM estimator with a fixed seed; C and the multi-class strategy
# are supplied by the grid search.
svm_clf = svm.LinearSVC(
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the linear SVM: four values of C, with the
# multi-class strategy pinned to one-vs-rest ("ovr").
svm_param_grid = [
    dict(
        C=[1, 10, 100, 1000],
        multi_class=["ovr"],
    ),
]

In [None]:
# Inner loop of the nested CV: a 5-fold grid search over svm_param_grid.
svm_model = GridSearchCV(
    estimator=svm_clf,
    param_grid=svm_param_grid,
    cv=5,
)

In [None]:
# Nested cross-validation for the linear SVM: each of the 5 folds is held out
# once as the test set while the grid search (inner 5-fold CV) is fitted on
# the remaining four folds.
scores = list()

for k in range(5):
    # Hold out fold k; concatenate every other fold into the training set.
    X_test, y_test = X_folds[k], y_folds[k]
    X_train = np.concatenate([fold for i, fold in enumerate(X_folds) if i != k])
    y_train = np.concatenate([fold for i, fold in enumerate(y_folds) if i != k])

    fold_score = svm_model.fit(X_train, y_train).score(X_test, y_test)
    scores.append(fold_score)

    print ('Best C:',svm_model.best_estimator_.C)
    # Report the accuracy on the held-out outer fold (the value stored in
    # `scores`), not best_score_, which is the mean inner-CV score measured
    # on the training folds.
    print ('Accuracy:', fold_score)

Evaluate k-NN.

In [None]:
# Base k-nearest-neighbours estimator; n_neighbors, weights and algorithm are
# supplied by the grid search.
knn_clf = KNeighborsClassifier(
)

In [None]:
# Hyper-parameter grid for k-NN: k from 1 to 10, both weighting schemes and
# every neighbour-search algorithm.
knn_param_grid = dict(
    n_neighbors=list(range(1, 11)),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

In [None]:
# Inner loop of the nested CV: a 5-fold grid search over knn_param_grid.
knn_model = GridSearchCV(
    estimator=knn_clf,
    param_grid=knn_param_grid,
    cv=5,
)

In [None]:
# Nested cross-validation for k-NN: each of the 5 folds is held out once as
# the test set while the grid search (inner 5-fold CV) is fitted on the
# remaining four folds.
scores = list()

for k in range(5):
    # Hold out fold k; concatenate every other fold into the training set.
    X_test, y_test = X_folds[k], y_folds[k]
    X_train = np.concatenate([fold for i, fold in enumerate(X_folds) if i != k])
    y_train = np.concatenate([fold for i, fold in enumerate(y_folds) if i != k])

    # Fit the grid search once and reuse its held-out score for both the
    # scores list and the printout (a second fit only repeats the search).
    fold_score = knn_model.fit(X_train, y_train).score(X_test, y_test)
    scores.append(fold_score)

    print ('Accuracy:', fold_score)
    print ('Parameters:', knn_model.best_params_)