In [1]:
from IPython.display import HTML
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import precision_score
    
%matplotlib inline
import seaborn as sns

In [2]:
# Load the aggregated dataset. The first CSV column becomes the row index,
# and empty or single-space cells are parsed as NaN.
data = pd.read_csv(
    'AggredgatedData.csv',
    sep=',',
    na_values=[" ", ""],
    index_col=0,
)

In [3]:
# Predictors: every column except the first and the last one.
# NOTE(review): presumably the target '2016ODabovenatavg' is one of the
# excluded columns -- confirm, otherwise the label leaks into X.
features = data.columns[1:-1].tolist()
X = data[features]
Y = data['2016ODabovenatavg']

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=1
)

## I. SVM

Pros: 
* Effective in high dimensional spaces.
* Still effective in cases where number of dimensions is greater than the number of samples.
* Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
* Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

Cons:

* If the number of features is much greater than the number of samples, avoiding over-fitting when choosing the kernel function and the regularization term is crucial.
* SVMs do not directly provide probability estimates; these are calculated using an expensive five-fold cross-validation.

In [4]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

Linear kernel = Straight Line (hyperplane) as the decision boundary
* rarely used in practice

RBF = commonly used kernel in SVC
2 parameters:
* gamma
* C

Gamma:
*  'spread' of the kernel and therefore the decision region.
* low gamma -> the 'curve' of the decision boundary is very low and thus the decision region is very broad (underfitting)
* gamma = 10 -> the decision boundary starts to be highly affected by individual data points (i.e. variance increases).
* high gamma -> the 'curve' of the decision boundary is high, which creates islands of decision-boundaries around data points (overfitting)

C:
* penalty for misclassifying a data point
* small C -> classifier is okay with misclassified data points (high bias, low variance)
* big C -> classifier is heavily penalized for misclassified data and therefore bends over backwards to avoid any misclassified data points (low bias, high variance)

C > 10 is too slow

In [35]:
# Sweep SVC over kernel x gamma x C and report 10-fold cross-validated
# accuracy on the full data set (X, Y) for each combination.
kernels = ['linear', 'rbf', 'sigmoid']
# gamma: kernel coefficient for 'rbf' and 'sigmoid'; the 'linear' kernel ignores it.
# (If gamma is 'auto' then 1/n_features is used instead.)
gammas = (0.01, 1, 10, 100)
# C: penalty applied to misclassified training points.
penalties = (1, 10)

for kernel in kernels:
    for gamma in gammas:
        for C in penalties:
            # Every other SVC argument is left at its scikit-learn default.
            clf = SVC(kernel=kernel, gamma=gamma, C=C)
            # cross_val_score fits a fresh clone of clf on each fold, so a
            # separate clf.fit() beforehand would only add training time.
            scores = cross_val_score(clf, X, Y, cv=10)

            print("Kernel: %s | Gamma: %0.2f | C: %i" % (kernel, gamma, C))
            print(scores)
            # Mean accuracy with a +/- 2 standard deviation band across folds.
            print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Kernel: linear | Gamma: 0.01 | C: 1
[ 0.70588235  0.6875      0.5         0.75        0.8125      0.5625      0.875
  0.625       0.8         0.8       ]
Accuracy: 0.71 (+/- 0.23)
Kernel: linear | Gamma: 0.01 | C: 10
[ 0.52941176  0.8125      0.5625      0.625       0.75        0.6875      0.875
  0.625       0.6         0.53333333]
Accuracy: 0.66 (+/- 0.23)
Kernel: linear | Gamma: 1.00 | C: 1
[ 0.70588235  0.6875      0.5         0.75        0.8125      0.5625      0.875
  0.625       0.8         0.8       ]
Accuracy: 0.71 (+/- 0.23)
Kernel: linear | Gamma: 1.00 | C: 10
[ 0.52941176  0.8125      0.5625      0.625       0.75        0.6875      0.875
  0.625       0.6         0.53333333]
Accuracy: 0.66 (+/- 0.23)
Kernel: linear | Gamma: 10.00 | C: 1
[ 0.70588235  0.6875      0.5         0.75        0.8125      0.5625      0.875
  0.625       0.8         0.8       ]
Accuracy: 0.71 (+/- 0.23)
Kernel: linear | Gamma: 10.00 | C: 10
[ 0.52941176  0.8125      0.5625      0.625       0.75     

## II. Logistic Regression

binary classifier

L1 regularization (the lasso penalty: it penalizes the sum of the absolute values of the coefficients)
* pushes feature coefficients to 0, which makes it usable as a feature-selection method.
* as C decreases, more coefficients become 0.

In [7]:
from sklearn.linear_model import LogisticRegression

In [23]:
# L1-penalised logistic regression with weak regularisation (C=100),
# trained on the training split only.
clf1 = LogisticRegression(
    penalty='l1',
    C=100.0,
    solver='liblinear',
    tol=0.01,
    max_iter=100,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    multi_class='ovr',
    warm_start=False,
    verbose=0,
    n_jobs=1,
)
clf1.fit(X_train, Y_train)

# Predicted labels for the held-out test rows.
print(clf1.predict(X_test))

[ True False False  True False  True  True False False  True  True False
  True False  True  True  True  True False False  True  True False False
 False False  True  True  True False  True False  True False  True False
  True False False False False False False  True  True False  True False
  True False  True  True False]


In [24]:
# 10-fold cross-validated accuracy of clf1 over the full data set (X, Y).
scores = cross_val_score(clf1, X, Y, cv=10)
# print() call form works on both Python 2 and Python 3.
print(scores)
# Mean accuracy with a +/- 2 standard deviation band across folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.64705882  0.6875      0.5         0.625       0.6875      0.6875      0.875
  0.5625      0.6         0.8       ]
Accuracy: 0.67 (+/- 0.21)


In [10]:
# Comparison of the sparsity (% of zero coefficients) of solutions when L1 and L2 penalty
# are used for different values of C.
# We can see that large values of C give more freedom to the model.
# Conversely, smaller values of C constrain the model more.
# In the L1 penalty case, this leads to sparser solutions.

for C in (100, 1, 0.01):
    # Turn down tolerance for short training time.
    # The liblinear solver is named explicitly because it supports both the
    # L1 and the L2 penalty; relying on the library's default solver makes
    # penalty='l1' fail on releases where that default does not support L1.
    clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01, solver='liblinear')
    clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01, solver='liblinear')
    clf_l1_LR.fit(X, Y)
    clf_l2_LR.fit(X, Y)

    # Flatten the coefficient arrays to 1-D, one entry per feature.
    coef_l1_LR = clf_l1_LR.coef_.ravel()
    coef_l2_LR = clf_l2_LR.coef_.ravel()

    # coef_l1_LR contains zeros due to the L1 sparsity-inducing norm.
    # Sparsity = percentage of coefficients that are exactly zero.
    sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100
    sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100

    # NOTE: the scores below are computed on the same rows the models were
    # fitted on, so they are training accuracy, not a generalisation estimate.
    print("C=%.2f" % C)
    print("Sparsity with L1 penalty: %.2f%%" % sparsity_l1_LR)
    print("score with L1 penalty: %.4f" % clf_l1_LR.score(X, Y))
    print("Sparsity with L2 penalty: %.2f%%" % sparsity_l2_LR)
    print("score with L2 penalty: %.4f" % clf_l2_LR.score(X, Y))

C=100.00
Sparsity with L1 penalty: 5.00%
score with L1 penalty: 0.8491
Sparsity with L2 penalty: 0.00%
score with L2 penalty: 0.8302
C=1.00
Sparsity with L1 penalty: 57.50%
score with L1 penalty: 0.7799
Sparsity with L2 penalty: 0.00%
score with L2 penalty: 0.7925
C=0.01
Sparsity with L1 penalty: 95.00%
score with L1 penalty: 0.6415
Sparsity with L2 penalty: 0.00%
score with L2 penalty: 0.7421


## III. Stochastic Gradient Descent (SGD)

Pros:
* Efficiency.
* Ease of implementation (lots of opportunities for code tuning).

Cons:
* requires a number of hyperparameters such as the regularization parameter and the number of iterations.
* sensitive to feature scaling.

In [15]:
from sklearn.linear_model import SGDClassifier

In [40]:
# Linear classifier trained by stochastic gradient descent:
# hinge loss with an L2 penalty, fitted on the training split.
# random_state=None means results can differ from run to run.
clf3 = SGDClassifier(
    # objective
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    l1_ratio=0.15,
    epsilon=0.1,
    fit_intercept=True,
    class_weight=None,
    # optimisation schedule
    learning_rate='optimal',
    eta0=0.0,
    power_t=0.5,
    max_iter=None,
    n_iter=None,
    tol=None,
    shuffle=True,
    average=False,
    warm_start=False,
    # runtime
    random_state=None,
    n_jobs=1,
    verbose=0,
)

clf3.fit(X_train, Y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [41]:
# Predicted labels from the SGD classifier for the held-out test rows.
clf3.predict(X_test)

array([ True,  True, False,  True, False,  True,  True, False, False,
        True,  True, False,  True, False,  True,  True,  True,  True,
       False, False,  True,  True, False,  True, False,  True,  True,
        True, False,  True,  True, False,  True, False,  True, False,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True, False, False,  True,  True, False], dtype=bool)

In [42]:
# Inspect the fitted weights of the SGD classifier (one value per input feature).
# NOTE(review): the features are passed to SGD unscaled; the large spread of
# weight magnitudes may be a symptom of that -- consider standardising first.
clf3.coef_

array([[ -4.57815566e+01,   1.50879963e-14,   2.61608895e+01,
         -2.61608895e+01,   1.04643558e+02,   3.27011118e+01,
          2.00508997e+02,   2.95783550e+02,   2.17630119e+02,
          1.01921571e+01,   1.11889466e+01,   4.13377811e+00,
         -6.19553237e+01,   2.81719876e+01,  -1.18801893e+01,
          5.79328344e+01,  -3.62768771e+01,  -3.85361927e+01,
          7.65269047e+00,   1.33746356e+01,  -5.86051608e+00,
          1.57607692e+01,   4.35397517e-01,   3.85571200e+01,
         -1.08133255e+01,  -1.17352363e+01,   1.26833722e+01,
         -2.73213734e+01,   4.60294511e+01,   2.76116816e+01,
          4.02134762e+01,  -6.55977122e+01,   4.21165986e+01,
          4.35873120e-01,  -9.89535644e-01,  -2.82733813e+01,
          1.66400233e-01,  -5.24451644e-01,   4.33485939e+01,
          6.96464951e-02]])

In [43]:
# Inspect the fitted intercept (bias term) of the SGD classifier.
clf3.intercept_

array([-37.4209551])

In [44]:
# 10-fold cross-validated accuracy of the SGD classifier over the full data set.
# clf3 was built with random_state=None, so these numbers vary between runs.
scores = cross_val_score(clf3, X, Y, cv=10)
# print() call form works on both Python 2 and Python 3.
print(scores)
# Mean accuracy with a +/- 2 standard deviation band across folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.64705882  0.6875      0.5         0.5625      0.625       0.625       0.4375
  0.6875      0.6         0.66666667]
Accuracy: 0.60 (+/- 0.16)


## IV. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 10 entropy-split trees, fitted on the training split.
# random_state=None means the forest differs from run to run.
clf4 = RandomForestClassifier(
    # ensemble
    n_estimators=10,
    bootstrap=True,
    oob_score=False,
    # split criterion and tree growth limits
    criterion='entropy',
    max_depth=None,
    max_features='auto',
    max_leaf_nodes=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight=None,
    # runtime
    n_jobs=1,
    random_state=None,
    verbose=0,
    warm_start=False,
)

clf4.fit(X_train, Y_train)

## V. Model Selection Using Grid Search

In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [58]:
# Create a pipeline with a single 'classifier' step. The RandomForestClassifier
# given here is only a placeholder: every grid entry below swaps in its own
# estimator through the 'classifier' key.
pipe = Pipeline([('classifier', RandomForestClassifier())])

# Create space of candidate learning algorithms and their hyperparameters.
# Each dict is an independent sub-grid; 'classifier__<name>' sets <name> on
# whichever estimator currently occupies the 'classifier' step.
search_space = [
    # liblinear is named explicitly because it supports both the 'l1' and
    # 'l2' penalties searched here.
    {'classifier': [LogisticRegression(solver='liblinear')],
     'classifier__penalty': ['l1', 'l2'],
     'classifier__C': [0.01, 1, 100]},
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [10, 100, 1000],
     'classifier__max_features': [1, 2, 3]},
    {'classifier': [SGDClassifier()],
     'classifier__penalty': ['l1', 'l2'],
     'classifier__loss': ['hinge', 'log', 'modified_huber']},
    {'classifier': [SVC()],
     'classifier__kernel': ['linear', 'rbf', 'sigmoid'],
     'classifier__C': [1, 10],
     'classifier__gamma': [0.01, 1, 10, 100]},
]

# 5-fold cross-validated search over every candidate, using the training split only
# so that the test split stays untouched for the final evaluation.
clf5 = GridSearchCV(pipe, search_space, cv=5, verbose=0)
best_model = clf5.fit(X_train, Y_train)
# View best model
best_model.best_estimator_.get_params()['classifier']



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [59]:
# Predicted labels for the held-out test rows from the fitted grid search.
best_model.predict(X_test)

array([False,  True, False, False, False, False,  True, False, False,
        True,  True,  True,  True, False,  True, False, False,  True,
       False, False,  True, False, False,  True, False, False,  True,
        True,  True, False, False, False,  True, False,  True, False,
        True, False,  True,  True, False, False, False, False,  True,
        True, False, False, False, False, False,  True,  True], dtype=bool)

In [60]:
# Score of the fitted grid search on the held-out test split
# (rows that were not passed to clf5.fit).
best_model.score(X_test, Y_test)

0.90566037735849059