In [2]:
%matplotlib inline


# Classifier comparison


A comparison of a several classifiers in scikit-learn.



In [20]:
print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
#from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

Automatically created module for IPython interactive environment


### Loading and pre-processing data

To begin with, I replaced NA's with zeros. 

Later, I will see how many features were mostly "NA". If they are not important, I will probably drop them. 

In [2]:

trainX = [] 
with open("/home/lu/Documents/School/81_machine_learning/HW3/HW3/trainingData.txt", 'r') as rp: 
    for line in rp.read().split("\n"): 
        row = line.split("\t") 
        i = 0 
        for cell in row: 
            try: 
                row[i] = float(cell)
            except ValueError: 
                row[i] = 0 
            i+=1 
        trainX.append(row)
        
trainY = [] 
with open("/home/lu/Documents/School/81_machine_learning/HW3/HW3/trainingTruth.txt", 'r') as rp: 
    for line in rp.read().split("\n"): 
        trainY.append(line)

In [3]:
### Naive bayes classifier
arrayX = np.array(trainX[0:17000])
arrayY = np.array(trainY[0:17000])

X_train, X_test, y_train, y_test = train_test_split(arrayX, arrayY, test_size=0.4, random_state=0)

clf = GaussianNB()
clf.fit(X_train, y_train)

#GaussianNB(priors=None)


# check accuracy (probs won't be very accurate)
print("Gaussian Naive Bayes cross validation:" , clf.score(X_test, y_test)  )

Gaussian Naive Bayes cross validation: 0.768823529412


In [23]:

### Loop through all classifiers: 
_names = [
        "Nearest Neighbors", 
         "Linear SVM", 
         "RBF SVM",
         "Decision Tree", 
         "Random Forest", 
         "Adaboost", 
         "Naive Bayes", 
         "QDA", 
    ]
names = ["DT", "RF", "Gauss NB", "Logit", "Logit l=2", "Logit l=4" "Logit CV"]
classifiers = [
    #KNeighborsClassifier(3),
    #SVC(kernel="linear", C=0.025),
    #SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    #AdaBoostClassifier(),
    GaussianNB(),
    #BernoulliNB(), 
    #MultinomialNB(),
    LogisticRegression(C=1), 
    LogisticRegression(C=0.5), 
    LogisticRegression(C=0.25), 
    LogisticRegressionCV(), 
    #QuadraticDiscriminantAnalysis()
]

for name, classifier in zip(names, classifiers): 
    clf=classifier
    clf.fit(X_train, y_train)
    print("%s accuracy on test: %f" % (name, clf.score(X_test, y_test) ))
    print("%s accuracy on training: %f" % (name, clf.score(X_train, y_train) ))



DT accuracy on test: 0.466176
DT accuracy on training: 0.503627
RF accuracy on test: 0.459853
RF accuracy on training: 0.490490
Gauss NB accuracy on test: 0.768824
Gauss NB accuracy on training: 0.792157
Logit accuracy on test: 0.756176
Logit accuracy on training: 0.796667
Logit l=2 accuracy on test: 0.755441
Logit l=2 accuracy on training: 0.795392
Logit l=4Logit CV accuracy on test: 0.752647
Logit l=4Logit CV accuracy on training: 0.792843


### Check for high bias 

If the above accuracy on test is low, then it could be high bias or high variance. 

To score how we do on training will tell us if we have high bias. 

In [None]:
for name, classifier in zip(names, classifiers): 
    clf=classifier
    scores = cross_val_score(clf, arrayX, arrayY, cv=10)
    print("%s accuracy: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))

### Revisit feature selection

Try a few ways. First, see if decision tree and random forest can pick out some important features. 

Then, see if they agree with logit's feature weights. 

In [40]:
# Feature imoprtance 

names = ["DT", "RF"]
feature_importances = { "DT": [] , "RF": [] }
important_indexes = { "DT": [] , "RF": [] }

classifiers = [
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
]

for name, classifier in zip(names, classifiers): 
    clf=classifier
    clf.fit(X_train, y_train)
    print("%s accuracy on test: %f" % (name, clf.score(X_test, y_test) ))
    feature_importances[name]=clf.feature_importances_

for name, importances in feature_importances.items(): 
    features = list(enumerate(importances))
    important_indexes[name] = [i for i, v in features if v > 0]
    print(name, important_indexes[name])

DT accuracy on test: 0.466176
RF accuracy on test: 0.468382
DT [15, 45, 47, 69, 78, 88, 104, 108, 114, 123, 142, 146, 148, 163, 211, 235, 243, 259, 264, 268, 279, 298, 303, 325]
RF [0, 4, 7, 8, 9, 10, 14, 15, 16, 17, 18, 20, 21, 22, 25, 28, 31, 32, 33, 35, 41, 42, 43, 45, 46, 48, 49, 51, 52, 53, 54, 55, 58, 60, 61, 63, 64, 66, 67, 68, 75, 76, 79, 81, 83, 84, 85, 89, 90, 92, 96, 97, 99, 102, 105, 106, 108, 109, 110, 111, 113, 114, 115, 116, 117, 119, 120, 122, 123, 129, 133, 134, 135, 141, 142, 143, 144, 145, 151, 154, 155, 156, 158, 159, 160, 162, 163, 164, 165, 166, 169, 171, 173, 174, 176, 177, 178, 180, 181, 183, 186, 189, 191, 194, 195, 196, 198, 200, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 214, 215, 216, 217, 219, 220, 221, 224, 225, 231, 233, 235, 239, 240, 242, 244, 246, 248, 250, 252, 254, 258, 260, 261, 263, 265, 266, 268, 269, 270, 272, 274, 275, 276, 277, 279, 281, 282, 283, 286, 288, 289, 290, 293, 294, 295, 298, 299, 303, 305, 306, 307, 308, 312, 313, 314, 3

In [36]:
l = [1, 0, 0 , 2, 2]
print(list(enumerate(l)))


[(0, 1), (1, 0), (2, 0), (3, 2), (4, 2)]
