In [2]:
%matplotlib inline


# Classifier comparison


A comparison of a several classifiers in scikit-learn.



In [20]:
print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
#from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

Automatically created module for IPython interactive environment


### Loading and pre-processing data

To begin with, I replaced NA's with zeros. 

Later, I will see how many features were mostly "NA". If they are not important, I will probably drop them. 

In [123]:

trainX = [] 
with open("/home/lu/Documents/School/81_machine_learning/HW3/HW3/trainingData.txt", 'r') as rp: 
    for line in rp.read().split("\n"): 
        row = line.split("\t") 
        i = 0 
        for cell in row: 
            try: 
                row[i] = float(cell)
            except ValueError: 
                row[i] = 0 
            i+=1 
        trainX.append(row)
        
trainY = [] 
with open("/home/lu/Documents/School/81_machine_learning/HW3/HW3/trainingTruth.txt", 'r') as rp: 
    for line in rp.read().split("\n"): 
        trainY.append(line)

In [130]:
### Split data into training and tests sets 
arrayX = np.array(trainX[0:17000])
arrayY = np.array(trainY[0:17000])

X_train, X_test, y_train, y_test = train_test_split(arrayX, arrayY, test_size=0.4, random_state=0)



In [90]:

### Loop through all classifiers: 

classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(3),
    "Linear SVC": SVC(kernel="linear", C=0.5),
    "RBF SVM": SVC(gamma=2, C=1),
    "DecisionTree_old": DecisionTreeClassifier(max_depth=5), 
    "RandomForest_old": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "DecisionTree": DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=20, random_state=0),
    "Rand Forest": RandomForestClassifier(n_estimators=30, criterion='entropy', max_depth=20, min_samples_leaf=20, 
                           bootstrap=True, oob_score=False, random_state=0 ),
    "Adaboost": AdaBoostClassifier(),
    "GaussianNB": GaussianNB(),
    "Logit": LogisticRegression(C=1), 
    "Logit l=2": LogisticRegression(C=0.5), 
    "Logit l=4": LogisticRegression(C=0.25),
    "QDA": QuadraticDiscriminantAnalysis()
}
print("Classifier | Score on Test | Score on Train ")
print("--- | --- | --- | ")


for name, classifier in  classifiers.items() : 
    clf=classifier
    clf.fit(X_train, y_train)
    row = [
            clf.score(X_test, y_test) ,clf.score(X_train, y_train), #comparing test and training on 4/6 split
         ]
    print(name + "|" + "|".join(["%.3f" % x for x in row]) )

### Classifier comparison
Classifier | Score on Test | Score on Train 
--- | --- | --- | 
Nearest Neighbors|0.579|0.740
QDA|0.556|1.000
Adaboost|0.600|0.632
DecisionTree_old|0.460|0.526
DecisionTree|0.440|0.497
Logit|0.739|0.800
GaussianNB|0.756|0.799
RandomForest_old|0.389|0.436
Rand Forest|0.593|0.787
RBF SVM|0.343|1.000
Logit l=4|0.728|0.794
Logit l=2|0.737|0.799
Linear SVC|0.704|0.832


### Check for high bias 

If the above accuracy on test is low, then it could be high bias or high variance. 

To score how we do on training will tell us if we have high bias. 

Scoring on test and then training shows the following. Buckets are less than 0.7 for high error, 0.7 to 0.8 for moderate error, greater than 0.8 for low error. 


| low bias (0.8+ on training)  | moderate bias (0.7 to 0.8 on training) | high bias (0.7- on training) | |
| ------------- |:-------------:| ----------:| ------------------:|
|               |  |      | **low variance (  0.7+ on test)**     |
| Linear SVM     | Logit, NB   |       | **moderate variance  (0.5-0.7 on test)** |
| QDA, RBF SVM |      | Adaboost, DT, RF    | **high variance (0.5- on test)**    |


### Cross validation to get more accurate assessment of variance 

In [92]:
for name, classifier in  classifiers.items() : 
    clf=classifier
    scores = cross_val_score(clf, arrayX, arrayY, cv=10)
    print("%s accuracy: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))
    

Nearest Neighbors accuracy: 0.58 (+/- 0.04)
QDA accuracy: 0.38 (+/- 0.02)
Adaboost accuracy: 0.57 (+/- 0.05)
DecisionTree accuracy: 0.45 (+/- 0.04)
Logit accuracy: 0.69 (+/- 0.06)
GaussianNB accuracy: 0.75 (+/- 0.07)
Rand Forest accuracy: 0.40 (+/- 0.02)
RBF SVM accuracy: 0.36 (+/- 0.00)
Logit l=4 accuracy: 0.69 (+/- 0.05)
Logit l=2 accuracy: 0.69 (+/- 0.05)
Linear SVC accuracy: 0.66 (+/- 0.05)


### Revisit feature selection

Try a few ways. First, see if decision tree and random forest can pick out some important features. 

Then, see if they agree with logit's feature weights. 

In [119]:
# Feature importance -> a dictionary for later use, possibly to help me select features 

names = ["DT", "RF", "DT2", "RF2"]
feature_importances = { "DT": [] , "RF": [] }
important_indexes = { "DT": [] , "RF": [] }

classifiers = [
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    DecisionTreeClassifier(criterion='entropy', max_depth=5,
                           min_samples_split=2, min_samples_leaf=20, 
                           max_features=None,
                           random_state=0, 
                           ),
    RandomForestClassifier(n_estimators=100, criterion='entropy',
                           max_depth=30, min_samples_split=2, 
                           min_samples_leaf=20, 
                           max_features='auto',
                           bootstrap=True, oob_score=False, random_state=0, 
                           )
]

print(len(X_train))

for name, classifier in zip(names, classifiers): 
    clf=classifier
    clf.fit(X_train, y_train)
    print("%s accuracy on test: %f" % (name, clf.score(X_test, y_test) ))
    feature_importances[name]=clf.feature_importances_

for name, importances in feature_importances.items(): 
    features = list(enumerate(importances))
    important_indexes[name] = [i for i, v in features if v > 0]

dtX=arrayX[:,important_indexes['DT']]
rfX=arrayX[:,important_indexes['RF']]

1800
DT accuracy on test: 0.425000
RF accuracy on test: 0.417500
DT2 accuracy on test: 0.432500
RF2 accuracy on test: 0.568333


In [106]:
print("Classifier | Score on Test | Score on Train | CV on all data | CV on DT features | CV on RF features")
print("--- | --- | --- | --- | --- | --- | --- | ")

for name, classifier in  classifiers.items() : 
    clf=classifier
    scores = cross_val_score(clf, arrayX, arrayY, cv=10)
    dt_scores = cross_val_score(clf, dtX, arrayY, cv=10)
    rf_scores = cross_val_score(clf, rfX, arrayY, cv=10)
    clf.fit(X_train, y_train)
    row = [
            clf.score(X_test, y_test) ,clf.score(X_train, y_train), #comparing test and training on 4/6 split
            scores.mean(), scores.std() * 2, #comparing cross validated scores 
            dt_scores.mean(), dt_scores.std() * 2, # using dt thinnned data
            rf_scores.mean(), dt_scores.std() * 2, # using rf thinned data 
         ]
    print(name + "|" + "|".join(["%.3f" % x for x in row]) )


Classifier | Score on Test | Score on Train | CV on all data | CV on DT features | CV on RF features
--- | --- | --- | --- | --- | --- | --- | 
Nearest Neighbors|0.566|0.704|0.578|0.035|0.468|0.059|0.499|0.059




QDA|0.368|1.000|0.385|0.019|0.549|0.029|0.490|0.029
Adaboost|0.574|0.670|0.572|0.046|0.554|0.036|0.559|0.036
DecisionTree|0.426|0.571|0.452|0.038|0.442|0.042|0.426|0.042
Logit|0.682|0.875|0.694|0.055|0.586|0.034|0.659|0.034
GaussianNB|0.760|0.842|0.748|0.067|0.590|0.051|0.684|0.051
Rand Forest|0.418|0.482|0.396|0.025|0.465|0.046|0.391|0.046
RBF SVM|0.368|1.000|0.358|0.002|0.358|0.002|0.358|0.002
Logit l=4|0.683|0.854|0.690|0.053|0.585|0.041|0.650|0.041
Logit l=2|0.682|0.862|0.693|0.052|0.588|0.037|0.657|0.037
Linear SVC|0.646|0.970|0.659|0.054|0.583|0.047|0.642|0.047


### Results after thinning variables according to tree feature importance 

To speed things up, I"ve been using 3K of the given dataset. 

Classifier | Score on Test | Score on Train | CV on all data | error | CV on DT features | error | CV on RF features | error
--- | --- | --- | --- | --- | --- | --- |  --- | --- | 
Nearest Neighbors|0.566|0.704|0.578|0.035|0.468|0.059|0.499|0.059
QDA|0.368|1.000|0.385|0.019|0.549|0.029|0.490|0.029
Adaboost|0.574|0.670|0.572|0.046|0.554|0.036|0.559|0.036
DecisionTree|0.426|0.571|0.452|0.038|0.442|0.042|0.426|0.042
Logit|0.682|0.875|0.694|0.055|0.586|0.034|0.659|0.034
GaussianNB|0.760|0.842|0.748|0.067|0.590|0.051|0.684|0.051
Rand Forest|0.418|0.482|0.396|0.025|0.465|0.046|0.391|0.046
RBF SVM|0.368|1.000|0.358|0.002|0.358|0.002|0.358|0.002
Logit l=4|0.683|0.854|0.690|0.053|0.585|0.041|0.650|0.041
Logit l=2|0.682|0.862|0.693|0.052|0.588|0.037|0.657|0.037
Linear SVC|0.646|0.970|0.659|0.054|0.583|0.047|0.642|0.047


So it looks like this is not a good way to select features, though it seemed to help the Random Forest to make selections on fewer features. For my next run, I'm going to let the trees go deeper. The first time was using depth=5, which is not optimal. 

### Feature selection - dropping mostly empty columns or rows 

For this data, we have filled NA's with zeros. Perhaps that is not good for some columns that contain legitimate small numbers. Dropping features that are mostly populated with NAs might improve things. 

Similarly, we can drop rows of data that contain too many NA's to be useful. From running this model on subsets of the data, a training set of 6K of the data creates a model that is nearly as good as a model that used 10K of the data (splitting the entire dataset). Therefore, we might be able to drop observations without reducing our dataset to a point where the model accuracy suffered. 

In [136]:
some_classifiers = {
    "Linear SVC": SVC(kernel="linear", C=0.5),
    "Rand Forest": RandomForestClassifier(n_estimators=30, criterion='entropy', max_depth=20, min_samples_leaf=20, 
                           bootstrap=True, oob_score=False, random_state=0 ),
    "Adaboost": AdaBoostClassifier(),
    "GaussianNB": GaussianNB(),
    "Logit": LogisticRegression(C=1), 
}

shortX, shortY = arrayX[0:10000], arrayY[0:10000]
sX_train, sX_test, sy_train, sy_test = train_test_split(shortX, shortY, test_size=0.4, random_state=0)



In [138]:
print("Classifier | 7K Test | 10K Train |  4K Test | 6K Train ")
print("--- | --- | --- | ")

for name, classifier in  some_classifiers.items() : 
    clf=classifier
    clf.fit(X_train, y_train)
    short_clf = classifier
    short_clf.fit(sX_train, sy_train)
    row = [
            clf.score(X_test, y_test) ,clf.score(X_train, y_train), #comparing test and training on 4/6 split
            short_clf.score(sX_test, sy_test) ,short_clf.score(sX_train, sy_train)
         ]
    print(name + "|" + "|".join(["%.8f" % x for x in row]) )

Classifier | 7K Test | 10K Train |  4K Test | 6K Train 
--- | --- | --- | 
Rand Forest|0.64058824|0.67480392|0.59300000|0.78666667
Adaboost|0.60500000|0.62098039|0.59950000|0.63200000
Logit|0.76147059|0.76490196|0.73925000|0.80016667
GaussianNB|0.77338235|0.78049020|0.75650000|0.79950000
Linear SVC|0.74602941|0.76441176|0.70425000|0.83200000
