In [2]:
%matplotlib inline


# Classifier comparison


A comparison of several classifiers in scikit-learn.



In [6]:
# Echo the module docstring; inside an IPython kernel this is the
# auto-generated "Automatically created module ..." banner.
print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20; on newer releases train_test_split and cross_val_score
# are provided by sklearn.model_selection instead.
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
# NOTE(review): the commented-out imports below are presumably unavailable in
# the installed scikit-learn version -- confirm before re-enabling them.
#from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

Automatically created module for IPython interactive environment


### Loading and pre-processing data

To begin with, I replaced NA values (any cell that cannot be parsed as a number) with zeros.

Later, I will check how many features are mostly "NA". If those features turn out not to be important, I will probably drop them.

In [14]:

# Directory that holds the tab-separated homework files.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
DATA_DIR = "/home/lu/Documents/School/81_machine_learning/HW3/HW3"


def load_feature_rows(path):
    """Read a tab-separated text file into a list of numeric rows.

    Each line becomes one row; each tab-separated cell is converted with
    ``float``. A cell that ``float`` rejects (for example ``NA`` or an empty
    field) is replaced by 0.

    The file is iterated line by line rather than split on ``"\\n"``, so a
    newline at the end of the file no longer produces a phantom final row
    containing a single 0.

    Parameters
    ----------
    path : str
        Location of the file to read.

    Returns
    -------
    list of list
        One inner list per line of the file.
    """
    rows = []
    with open(path, 'r') as rp:
        for raw_line in rp:
            parsed = []
            for cell in raw_line.rstrip("\n").split("\t"):
                try:
                    parsed.append(float(cell))
                except ValueError:
                    # Non-numeric cell (e.g. "NA"): substitute zero.
                    parsed.append(0)
            rows.append(parsed)
    return rows


def load_labels(path):
    """Read a text file with one label per line.

    Labels are returned as strings, exactly as written in the file minus the
    line terminator. As in ``load_feature_rows``, a newline at the end of the
    file does not add an empty final label.

    Parameters
    ----------
    path : str
        Location of the file to read.

    Returns
    -------
    list of str
        One label per line of the file.
    """
    with open(path, 'r') as rp:
        return [raw_line.rstrip("\n") for raw_line in rp]


trainX = load_feature_rows(DATA_DIR + "/trainingData.txt")
trainY = load_labels(DATA_DIR + "/trainingTruth.txt")
testX = load_feature_rows(DATA_DIR + "/testData.txt")

In [15]:
### Split data into training and test sets
# Only the first 3000 samples (and their labels) take part in the comparison.
subset = slice(0, 3000)
arrayX = np.array(trainX[subset])
arrayY = np.array(trainY[subset])

# Hold out 40% of that subset for evaluation; random_state is fixed at 0.
split_parts = train_test_split(arrayX, arrayY, test_size=0.4, random_state=0)
X_train, X_test, y_train, y_test = split_parts



In [32]:

# NOTE(review): no longer used by this cell (np.argmax replaced the
# operator.itemgetter lookup); kept so the name stays defined for any other
# cell that may rely on it.
import operator

# Candidate models, keyed by the name printed in the results table.
classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(3),
    "Linear SVC": SVC(kernel="linear", C=0.5, probability=True),
    "RBF SVM": SVC(gamma=2, C=1, probability=True),
    "DecisionTree_old": DecisionTreeClassifier(max_depth=5), 
    "RandomForest_old": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "DecisionTree": DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=20, random_state=0),
    "Rand Forest": RandomForestClassifier(n_estimators=30, criterion='entropy', max_depth=20, min_samples_leaf=20, 
                           bootstrap=True, oob_score=False, random_state=0 ),
    "Adaboost": AdaBoostClassifier(),
    "GaussianNB": GaussianNB(),
    "Logit": LogisticRegression(C=1), 
    "Logit l=2": LogisticRegression(C=0.5), 
    "Logit l=4": LogisticRegression(C=0.25),
    "QDA": QuadraticDiscriminantAnalysis()
}


def evaluate_classifier(clf, n_preview=10):
    """Fit ``clf`` on the training split and collect what the report prints.

    Reads ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``testX`` from
    the notebook namespace.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit``, ``score`` and ``predict_proba``.
    n_preview : int, optional
        Number of leading ``testX`` rows to compute class probabilities for.

    Returns
    -------
    scores : list
        ``[score on (X_test, y_test), score on (X_train, y_train)]``.
    preview_probs : array
        Result of ``clf.predict_proba(testX[0:n_preview])``.
    """
    clf.fit(X_train, y_train)
    scores = [
        clf.score(X_test, y_test), clf.score(X_train, y_train),  # comparing test and training on 4/6 split
    ]
    preview_probs = clf.predict_proba(testX[0:n_preview])
    return scores, preview_probs


print("Classifier | Score on Test | Score on Train ")
print("--- | --- | --- | ")

for name, clf in classifiers.items():
    # prob_array is assigned at cell level on purpose: it stays available
    # after the loop for inspection.
    scores, prob_array = evaluate_classifier(clf)
    print("%s |" % name + "|".join(["%.3f" % s for s in scores]))

    # predict_proba columns follow clf.classes_, so look the winning label up
    # there instead of assuming the labels are exactly 1..K.
    class_labels = clf.classes_.tolist()
    for probs in prob_array:
        best = int(np.argmax(probs))
        print(["%.3f" % p for p in probs] + [class_labels[best]])
    

Classifier | Score on Test | Score on Train 
--- | --- | --- | 
DecisionTree |0.432|0.538
['0.764', '0.125', '0.069', '0.042', 1]
['0.279', '0.238', '0.354', '0.129', 3]
['0.636', '0.364', '0.000', '0.000', 1]
['0.313', '0.328', '0.179', '0.179', 2]
['0.123', '0.219', '0.466', '0.192', 3]
['0.417', '0.042', '0.083', '0.458', 4]
['0.121', '0.319', '0.388', '0.172', 3]
['0.000', '0.361', '0.446', '0.193', 3]
['0.221', '0.283', '0.212', '0.283', 2]
['0.121', '0.319', '0.388', '0.172', 3]
QDA |0.368|1.000
['0.000', '0.000', '1.000', '0.000', 3]
['0.000', '0.000', '1.000', '0.000', 3]
['0.000', '0.000', '1.000', '0.000', 3]
['0.000', '0.000', '1.000', '0.000', 3]
['0.000', '0.000', '1.000', '0.000', 3]
['0.000', '0.000', '1.000', '0.000', 3]
['0.000', '0.000', '1.000', '0.000', 3]
['0.000', '0.000', '1.000', '0.000', 3]
['0.000', '0.000', '1.000', '0.000', 3]
['0.000', '0.000', '1.000', '0.000', 3]




Rand Forest |0.562|0.751
['0.192', '0.229', '0.418', '0.161', 3]
['0.179', '0.223', '0.452', '0.147', 3]
['0.146', '0.218', '0.493', '0.143', 3]
['0.201', '0.184', '0.479', '0.137', 3]
['0.178', '0.304', '0.415', '0.103', 3]
['0.255', '0.258', '0.295', '0.192', 3]
['0.183', '0.235', '0.422', '0.160', 3]
['0.214', '0.188', '0.452', '0.147', 3]
['0.278', '0.303', '0.272', '0.147', 2]
['0.209', '0.377', '0.287', '0.128', 2]
Linear SVC |0.646|0.970
['0.064', '0.082', '0.720', '0.134', 3]
['0.007', '0.115', '0.665', '0.213', 3]
['0.008', '0.071', '0.794', '0.126', 3]
['0.014', '0.069', '0.885', '0.032', 3]
['0.000', '0.070', '0.869', '0.060', 3]
['0.768', '0.046', '0.100', '0.086', 1]
['0.061', '0.110', '0.750', '0.078', 3]
['0.236', '0.360', '0.241', '0.163', 2]
['0.074', '0.198', '0.620', '0.108', 3]
['0.045', '0.723', '0.040', '0.192', 2]
Adaboost |0.574|0.670
['0.243', '0.253', '0.255', '0.249', 3]
['0.231', '0.252', '0.264', '0.253', 3]
['0.242', '0.253', '0.256', '0.249', 3]
['0.242',

In [20]:
# Print every row of `prob_array` (set by the classifier comparison cell),
# with each probability formatted to three decimals.
for probs in prob_array:
    formatted = ["%.3f" % p for p in probs]
    print(formatted)

['0.010', '0.029', '0.942', '0.019']
['0.000', '0.056', '0.844', '0.100']
['0.000', '0.022', '0.797', '0.180']
['0.002', '0.049', '0.949', '0.001']
['0.000', '0.345', '0.643', '0.012']
['0.892', '0.016', '0.006', '0.087']
['0.080', '0.010', '0.848', '0.062']
['0.099', '0.806', '0.081', '0.015']
['0.003', '0.687', '0.307', '0.002']
['0.001', '0.936', '0.000', '0.063']
