In [2]:
%matplotlib inline


# Classifier comparison


A comparison of a several classifiers in scikit-learn.



In [20]:
print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
#from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

Automatically created module for IPython interactive environment


### Loading and pre-processing data

To begin with, I replaced NA's with zeros. 

Later, I will see how many features were mostly "NA". If they are not important, I will probably drop them. 

In [98]:

trainX = [] 
with open("/home/lu/Documents/School/81_machine_learning/HW3/HW3/trainingData.txt", 'r') as rp: 
    for line in rp.read().split("\n"): 
        row = line.split("\t") 
        i = 0 
        for cell in row: 
            try: 
                row[i] = float(cell)
            except ValueError: 
                row[i] = 0 
            i+=1 
        trainX.append(row)
        
trainY = [] 
with open("/home/lu/Documents/School/81_machine_learning/HW3/HW3/trainingTruth.txt", 'r') as rp: 
    for line in rp.read().split("\n"): 
        trainY.append(line)

In [100]:
### Split data into training and tests sets 
arrayX = np.array(trainX[0:3000])
arrayY = np.array(trainY[0:3000])

X_train, X_test, y_train, y_test = train_test_split(arrayX, arrayY, test_size=0.4, random_state=0)



1800


In [90]:

### Loop through all classifiers: 

classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(3),
    "Linear SVC": SVC(kernel="linear", C=0.5),
    "RBF SVM": SVC(gamma=2, C=1),
    "DecisionTree": DecisionTreeClassifier(max_depth=5),
    "Rand Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "Adaboost": AdaBoostClassifier(),
    "GaussianNB": GaussianNB(),
    #"BernoulliNB": BernoulliNB(), 
    #"MultinomialNB": MultinomialNB(),
    "Logit": LogisticRegression(C=1), 
    "Logit l=2": LogisticRegression(C=0.5), 
    "Logit l=4": LogisticRegression(C=0.25), 
    "QDA": QuadraticDiscriminantAnalysis()
}


### Check for high bias 

If the above accuracy on test is low, then it could be high bias or high variance. 

To score how we do on training will tell us if we have high bias. 

Scoring on test and then training shows the following. Buckets are less than 0.7 for high error, 0.7 to 0.8 for moderate error, greater than 0.8 for low error. 


| low bias (0.8+ on training)  | moderate bias (0.7 to 0.8 on training) | high bias (0.7- on training) | |
| ------------- |:-------------:| ----------:| ------------------:|
|               |  |      | **low variance (  0.7+ on test)**     |
| Linear SVM     | Logit, NB   |       | **moderate variance  (0.5-0.7 on test)** |
| QDA, RBF SVM |      | Adaboost, DT, RF    | **high variance (0.5- on test)**    |


In [91]:
### Check bias and variance 

for name, classifier in  classifiers.items() : 
    clf=classifier
    clf.fit(X_train, y_train)
    print("%s accuracy on test: %f" % (name, clf.score(X_test, y_test) ))
    print("%s accuracy on training: %f \n" % (name, clf.score(X_train, y_train) ))

Nearest Neighbors accuracy on test: 0.565833
Nearest Neighbors accuracy on training: 0.704444 

QDA accuracy on test: 0.368333
QDA accuracy on training: 1.000000 





Adaboost accuracy on test: 0.574167
Adaboost accuracy on training: 0.670000 

DecisionTree accuracy on test: 0.425833
DecisionTree accuracy on training: 0.570556 

Logit accuracy on test: 0.681667
Logit accuracy on training: 0.875000 

GaussianNB accuracy on test: 0.760000
GaussianNB accuracy on training: 0.841667 

Rand Forest accuracy on test: 0.405000
Rand Forest accuracy on training: 0.511667 

RBF SVM accuracy on test: 0.368333
RBF SVM accuracy on training: 1.000000 

Logit l=4 accuracy on test: 0.683333
Logit l=4 accuracy on training: 0.853889 

Logit l=2 accuracy on test: 0.681667
Logit l=2 accuracy on training: 0.861667 

Linear SVC accuracy on test: 0.645833
Linear SVC accuracy on training: 0.970000 



### Cross validation to get more accurate assessment of variance 

In [92]:
for name, classifier in  classifiers.items() : 
    clf=classifier
    scores = cross_val_score(clf, arrayX, arrayY, cv=10)
    print("%s accuracy: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))
    

Nearest Neighbors accuracy: 0.58 (+/- 0.04)
QDA accuracy: 0.38 (+/- 0.02)
Adaboost accuracy: 0.57 (+/- 0.05)
DecisionTree accuracy: 0.45 (+/- 0.04)
Logit accuracy: 0.69 (+/- 0.06)
GaussianNB accuracy: 0.75 (+/- 0.07)
Rand Forest accuracy: 0.40 (+/- 0.02)
RBF SVM accuracy: 0.36 (+/- 0.00)
Logit l=4 accuracy: 0.69 (+/- 0.05)
Logit l=2 accuracy: 0.69 (+/- 0.05)
Linear SVC accuracy: 0.66 (+/- 0.05)


### Revisit feature selection

Try a few ways. First, see if decision tree and random forest can pick out some important features. 

Then, see if they agree with logit's feature weights. 

In [103]:
# Feature importance -> a dictionary for later use, possibly to help me select features 

names = ["DT", "RF"]
feature_importances = { "DT": [] , "RF": [] }
important_indexes = { "DT": [] , "RF": [] }

classifiers = [
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
]

print(len(X_train))

for name, classifier in zip(names, classifiers): 
    clf=classifier
    clf.fit(X_train, y_train)
    print("%s accuracy on test: %f" % (name, clf.score(X_test, y_test) ))
    feature_importances[name]=clf.feature_importances_

for name, importances in feature_importances.items(): 
    features = list(enumerate(importances))
    important_indexes[name] = [i for i, v in features if v > 0]

dtX=arrayX[:,important_indexes['DT']]
rfX=arrayX[:,important_indexes['RF']]

1800
DT accuracy on test: 0.424167
RF accuracy on test: 0.418333
3000


In [None]:

classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(3),
    "Linear SVC": SVC(kernel="linear", C=0.5),
    "RBF SVM": SVC(gamma=2, C=1),
    "DecisionTree": DecisionTreeClassifier(max_depth=5),
    "Rand Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "Adaboost": AdaBoostClassifier(),
    "GaussianNB": GaussianNB(),
    "Logit": LogisticRegression(C=1), 
    "Logit l=2": LogisticRegression(C=0.5), 
    "Logit l=4": LogisticRegression(C=0.25), 
    "QDA": QuadraticDiscriminantAnalysis()
}
print("Classifier | Score on Test | Score on Train | CV on all data | CV on DT features | CV on RF features")
print("--- | --- | --- | --- | --- | --- | --- | ")

for name, classifier in  classifiers.items() : 
    clf=classifier
    scores = cross_val_score(clf, arrayX, arrayY, cv=10)
    dt_scores = cross_val_score(clf, dtX, arrayY, cv=10)
    rf_scores = cross_val_score(clf, rfX, arrayY, cv=10)
    clf.fit(X_train, y_train)
    row = [
            clf.score(X_test, y_test) ,clf.score(X_train, y_train), #comparing test and training on 4/6 split
            scores.mean(), scores.std() * 2, #comparing cross validated scores 
            dt_scores.mean(), dt_scores.std() * 2, # using dt thinnned data
            rf_scores.mean(), dt_scores.std() * 2, # using rf thinned data 
         ]
    print(name + "|" + "|".join(["%.3f" % x for x in row]) )


Classifier | Score on Test | Score on Train | CV on all data | CV on DT features | CV on RF features
--- | --- | --- | --- | --- | --- | --- | 
Nearest Neighbors|0.566|0.704|0.578|0.035|0.468|0.059|0.499|0.059




QDA|0.368|1.000|0.385|0.019|0.549|0.029|0.490|0.029
Adaboost|0.574|0.670|0.572|0.046|0.554|0.036|0.559|0.036
DecisionTree|0.426|0.571|0.452|0.038|0.442|0.042|0.426|0.042


### Results after thinning variables according to tree feature importance 


Classifier | Score on Test | Score on Train | CV on all data | error | CV on DT features | error | CV on RF features | error
--- | --- | --- | --- | --- | --- | --- |  --- | --- | 
NearestNeighbors|0.565833333333|0.704444444444|0.577691161968|0.03508060174|0.467939945369|0.059477632684|0.499345452353|0.059477632684
QDA|0.368333333333|1.0|0.384672401604|0.0193418185832|0.548980185519|0.0294694432742|0.489634310008|0.0294694432742
Adaboost|0.574166666667|0.67|0.572308722526|0.046208040059|0.554306985154|0.0364691224832|0.558999582936|0.0364691224832
DecisionTree|0.425833333333|0.570555555556|0.452674113892|0.0380910086511|0.441619515653|0.0422924095744|0.426033432088|0.0422924095744
Logit|0.681666666667|0.875|0.693667681574|0.0552929217141|0.58599765003|0.033998966029|0.658649913935|0.033998966029
GaussianNB|0.76|0.841666666667|0.747586879902|0.0665484378079|0.590301185557|0.0505559355188|0.683608335048|0.0505559355188
Rand Forest|0.410833333333|0.462777777778|0.395681838913|0.0391783398321|0.460050160459|0.0414290518191|0.397315015645|0.0414290518191
RBF SVM|0.368333333333|1.0|0.358335115471|0.00158519469426|0.358335115471|0.00158519469426|0.358335115471|0.00158519469426
Logit l=4|0.683333333333|0.853888888889|0.690330907607|0.0526520040119|0.584987616213|0.0412230092051|0.650284205201|0.0412230092051
Logit l=2|0.681666666667|0.861666666667|0.693338778438|0.052499985639|0.587999939094|0.0365701288502|0.657289964605|0.0365701288502
Linear SVC|0.645833333333|0.97|0.659352396568|0.0535692556328|0.582683128533|0.0466345803853|0.641979354172|0.0466345803853