In [2]:
%matplotlib inline


# Classifier comparison


A comparison of several classifiers in scikit-learn.



In [90]:
# Echo the module docstring; in this IPython session it is the auto-generated
# text shown in the cell output ("Automatically created module ...").
print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
#from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

Automatically created module for IPython interactive environment


### Loading and pre-processing data

To begin with, I replace every cell that cannot be parsed as a number (the NA's) with zero.

Later, I will see how many features were mostly "NA". If they are not important, I will probably drop them. 

In [91]:

# Directory holding the tab-separated homework data files.
DATA_DIR = "/home/lu/Documents/School/81_machine_learning/HW3/HW3"


def _read_lines(path):
    """Return the lines of the text file at ``path`` without trailing blank lines.

    Splitting a newline-terminated file on the newline character always
    produces one empty string per trailing newline. Left in place, those
    empty strings turn into bogus extra rows/labels at the end of the data,
    so they are removed here. Blank lines elsewhere in the file are kept.
    """
    with open(path, 'r') as rp:
        lines = rp.read().split("\n")
    while lines and lines[-1] == "":
        lines.pop()
    return lines


def _to_float(cell):
    """Parse one text cell as a float; cells that do not parse become 0.0."""
    try:
        return float(cell)
    except ValueError:
        return 0.0


def load_feature_rows(path, n_cols=None):
    """Load a tab-separated file as a list of rows of floats.

    Parameters
    ----------
    path : str
        File to read; one observation per line, cells separated by tabs.
    n_cols : int or None
        When given, keep only the first ``n_cols`` cells of every row.

    Returns
    -------
    list of list of float
        One inner list per line; unparseable cells are replaced by 0.0.
    """
    rows = []
    for line in _read_lines(path):
        row = [_to_float(cell) for cell in line.split("\t")]
        rows.append(row if n_cols is None else row[:n_cols])
    return rows


def load_labels(path):
    """Load one label per line from ``path``; labels are kept as raw strings."""
    return _read_lines(path)


trainX = load_feature_rows(DATA_DIR + "/trainingData.txt")
trainY = load_labels(DATA_DIR + "/trainingTruth.txt")
testX = load_feature_rows(DATA_DIR + "/testData.txt")
# Only the first 334 cells of each blind row are kept
# (presumably to match the training feature count -- TODO confirm).
blindX = load_feature_rows(DATA_DIR + "/blindData.txt", n_cols=334)

In [92]:
# --- Build NumPy arrays and hold out part of the labelled data ---
# The parsed Python lists can carry a few trailing entries that are not real
# observations (believed to be an artifact of the file parsing), so every
# list is cut to a fixed row count before it is converted to an array.
N_TRAIN_ROWS = 17377
N_TEST_ROWS = 8179
N_BLIND_ROWS = 20049

arrayX = np.array(trainX[:N_TRAIN_ROWS])
arrayY = np.array(trainY[:N_TRAIN_ROWS])
arrayTest = np.array(testX[:N_TEST_ROWS])
arrayBlind = np.array(blindX[:N_BLIND_ROWS])

# 60% of the labelled rows are used for fitting and 40% for scoring;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    arrayX,
    arrayY,
    test_size=0.4,
    random_state=0,
)

In [4]:

import operator

# Candidate models, keyed by the label printed in the comparison table.
classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(3),
    "Linear SVC": SVC(kernel="linear", C=0.5, probability=True),
    "RBF SVM": SVC(gamma=2, C=1, probability=True),
    "DecisionTree_old": DecisionTreeClassifier(max_depth=5),
    "RandomForest_old": RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1
    ),
    "DecisionTree": DecisionTreeClassifier(
        criterion='entropy', max_depth=5, min_samples_leaf=20, random_state=0
    ),
    "Rand Forest": RandomForestClassifier(
        n_estimators=30,
        criterion='entropy',
        max_depth=20,
        min_samples_leaf=20,
        bootstrap=True,
        oob_score=False,
        random_state=0,
    ),
    "Adaboost": AdaBoostClassifier(),
    "GaussianNB": GaussianNB(),
    "Logit": LogisticRegression(C=1),
    "Logit l=2": LogisticRegression(C=0.5),
    "Logit l=4": LogisticRegression(C=0.25),
    "QDA": QuadraticDiscriminantAnalysis(),
}

# Header of the pipe-separated results table.
print("Classifier | Score on Test | Score on Train ")
print("--- | --- | --- | ")

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)

    # Sanity check: class probabilities for the first ten labelled rows,
    # which can be compared by eye against trainingTruth.txt.
    prob_array = clf.predict_proba(arrayX[0:10])

    # Accuracy on the held-out rows, then on the rows used for fitting.
    test_score = clf.score(X_test, y_test)
    train_score = clf.score(X_train, y_train)
    print("%s |" % name + "|".join(["%.3f" % s for s in (test_score, train_score)]))

    # One line per sanity-check row: the probabilities followed by the
    # 1-based position of the largest one.
    for probs in prob_array:
        index, value = max(enumerate(probs), key=operator.itemgetter(1))
        print(["%.3f" % p for p in probs] + [index + 1])
    

Classifier | Score on Test | Score on Train 
--- | --- | --- | 
DecisionTree_old |0.462|0.503
['0.097', '0.137', '0.655', '0.111', 3]
['0.148', '0.282', '0.379', '0.191', 3]
['0.097', '0.137', '0.655', '0.111', 3]
['0.533', '0.196', '0.123', '0.147', 1]
['0.091', '0.254', '0.478', '0.177', 3]
['0.410', '0.300', '0.150', '0.140', 1]
['0.097', '0.137', '0.655', '0.111', 3]
['0.097', '0.137', '0.655', '0.111', 3]
['0.129', '0.141', '0.561', '0.168', 3]
['0.574', '0.157', '0.151', '0.118', 1]
RandomForest_old |0.447|0.485
['0.380', '0.215', '0.249', '0.155', 1]
['0.311', '0.221', '0.316', '0.152', 3]
['0.255', '0.194', '0.431', '0.120', 3]
['0.337', '0.180', '0.326', '0.157', 1]
['0.306', '0.207', '0.348', '0.139', 3]
['0.326', '0.238', '0.275', '0.161', 1]
['0.357', '0.207', '0.300', '0.137', 1]
['0.222', '0.352', '0.286', '0.140', 2]
['0.317', '0.224', '0.303', '0.156', 1]
['0.276', '0.209', '0.361', '0.155', 3]
Logit l=4 |0.758|0.792
['0.025', '0.024', '0.564', '0.388', 3]
['0.000', '0.

### Predict using the best classifier we currently have

From the classifier comparison notebook, the best model we currently have is Gaussian Naive Bayes (`GaussianNB`). Let's create this classifier, train it on all of the labelled training data, and then use it to predict class probabilities and a label for each observation in the testData.txt and blindData.txt files.

In [93]:
def write_predictions(model, X, out_path):
    """Write class probabilities and the predicted label for every row of ``X``.

    Each output line holds the probabilities returned by
    ``model.predict_proba(X)`` formatted to three decimals, followed by the
    1-based position of the largest probability, all separated by tabs.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    X : array of observations to predict
    out_path : str
        Destination file; overwritten if it already exists. Note that the
        content is tab-separated even when the name ends in ``.csv``.

    Returns
    -------
    The probability array returned by ``model.predict_proba(X)``.
    """
    prob_array = model.predict_proba(X)
    with open(out_path, 'w') as wp:
        for row in prob_array:
            # np.argmax returns the first index of the maximum, the same
            # tie-breaking as max(enumerate(row), key=itemgetter(1)).
            # NOTE(review): "+ 1" assumes the class labels are 1..K in sorted
            # order -- confirm, otherwise map through model.classes_.
            label = int(np.argmax(row)) + 1
            fields = ["%.3f" % x for x in row] + [str(label)]
            wp.write('\t'.join(fields) + '\n')
    return prob_array


# Refit the chosen model on all labelled rows (no hold-out) before predicting.
clf = GaussianNB()
clf.fit(arrayX, arrayY)

prob_array = write_predictions(clf, arrayTest, 'wang_ochoa_test_predictions.csv')
prob_array = write_predictions(clf, arrayBlind, 'wang_ochoa_blind_predictions.csv')
