In [1]:
from IPython.display import HTML
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import precision_score
    
%matplotlib inline
import seaborn as sns

In [4]:
#read in the data
data = pd.read_csv('AggredgatedData.csv', sep=',', na_values=[" ", ""], index_col=0)

In [None]:
features = list(data.columns[1:-1])
X = data[features]
Y = data['2016ODabovenatavg']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=1)

##I. SVM

Pros: 
* Effective in high dimensional spaces.
* Still effective in cases where number of dimensions is greater than the number of samples.
* Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
* Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

Cons:

* If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel functions and regularization term is crucial.
* SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation

In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [49]:
kernels = ['linear', 'polynomial', 'rbf', 'sigmoid']

for i in kernels:
    for j in range(1, 9, 2):
        clf = SVC()
        clf.fit(X_train, Y_train) 
        #kernel? C?
        SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
            decision_function_shape='ovr', degree=j, gamma='auto', kernel=i,
            max_iter=-1, probability=False, random_state=None, shrinking=True,
            tol=0.001, verbose=False)
        scores = cross_val_score(clf, X, Y, cv = 10)

        print ("Kernel: %s | Degree: %i" % (i, j))
        print scores
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Kernel: linear | Degree: 1
[ 0.64705882  0.6875      0.5         0.75        0.875       0.5625
  0.8125      0.6875      0.66666667  0.73333333]
Accuracy: 0.69 (+/- 0.21)
Kernel: linear | Degree: 3
[ 0.64705882  0.6875      0.5         0.75        0.875       0.5625
  0.8125      0.6875      0.66666667  0.73333333]
Accuracy: 0.69 (+/- 0.21)
Kernel: linear | Degree: 5
[ 0.64705882  0.6875      0.5         0.75        0.875       0.5625
  0.8125      0.6875      0.66666667  0.73333333]
Accuracy: 0.69 (+/- 0.21)
Kernel: linear | Degree: 7
[ 0.64705882  0.6875      0.5         0.75        0.875       0.5625
  0.8125      0.6875      0.66666667  0.73333333]
Accuracy: 0.69 (+/- 0.21)


ValueError: 'polynomial' is not in list

##II. Logistic Regression

In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the ‘multi_class’ option is set to ‘ovr’, and uses the cross- entropy loss if the ‘multi_class’ option is set to ‘multinomial’. (Currently the ‘multinomial’ option is supported only by the ‘lbfgs’, ‘sag’ and ‘newton-cg’ solvers.)
This class implements regularized logistic regression using the ‘liblinear’ library, ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers. It can handle both dense and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit floats for optimal performance; any other input format will be converted (and copied).
The ‘newton-cg’, ‘sag’, and ‘lbfgs’ solvers support only L2 regularization with primal formulation. The ‘liblinear’ solver supports both L1 and L2 regularization, with a dual formulation only for the L2 penalty.

In [15]:
from sklearn.linear_model import LogisticRegression

In [25]:
clf1 = LogisticRegression()
clf1.fit(X_train, Y_train) 
#kernel? C?
LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', 
                   max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)

print(clf1.predict(X_test))

[False False False  True False  True  True False False  True  True False
 False False False False  True  True False False  True  True False False
 False False  True  True  True False False False  True False False False
  True False  True False False False False  True  True False  True  True
 False False  True False False]


In [26]:
scores = cross_val_score(clf1, X, Y, cv = 10)
print scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.70588235  0.625       0.375       0.6875      0.875       0.625       0.8125
  0.6875      0.66666667  0.66666667]
Accuracy: 0.67 (+/- 0.25)


In [21]:
#Comparison of the sparsity (% of zero coefficients) of solutions when L1 and L2 penalty 
#are used for different values of C. 
#We can see that large values of C give more freedom to the model. 
#Conversely, smaller values of C constrain the model more. 
#In the L1 penalty case, this leads to sparser solutions.

for i, C in enumerate((100, 1, 0.01)):
    # turn down tolerance for short training time
    clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01)
    clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01)
    clf_l1_LR.fit(X, Y)
    clf_l2_LR.fit(X, Y)

    coef_l1_LR = clf_l1_LR.coef_.ravel()
    coef_l2_LR = clf_l2_LR.coef_.ravel()

    # coef_l1_LR contains zeros due to the
    # L1 sparsity inducing norm

    sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100
    sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100

    print("C=%.2f" % C)
    print("Sparsity with L1 penalty: %.2f%%" % sparsity_l1_LR)
    print("score with L1 penalty: %.4f" % clf_l1_LR.score(X, Y))
    print("Sparsity with L2 penalty: %.2f%%" % sparsity_l2_LR)
    print("score with L2 penalty: %.4f" % clf_l2_LR.score(X, Y))

C=100.00
Sparsity with L1 penalty: 7.14%
score with L1 penalty: 0.8239
Sparsity with L2 penalty: 0.00%
score with L2 penalty: 0.8302
C=1.00
Sparsity with L1 penalty: 50.00%
score with L1 penalty: 0.7673
Sparsity with L2 penalty: 0.00%
score with L2 penalty: 0.7862
C=0.01
Sparsity with L1 penalty: 96.43%
score with L1 penalty: 0.6038
Sparsity with L2 penalty: 0.00%
score with L2 penalty: 0.7170


##III. K-Means (2 centroids)

In [22]:
from sklearn.cluster import KMeans

In [43]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_train)
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype=int32)

In [41]:
kmeans.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [42]:
kmeans.cluster_centers_

array([[  6.85714286e-01,   7.14285714e-01,   9.90476190e-01,
          1.06666667e+00,   2.66666667e-01,   1.26666667e+00,
          1.45568932e-01,   1.73476153e-01,   4.51789978e-01,
         -9.63652793e-02,  -4.64647099e-01,   2.21660688e-01,
         -7.06148276e-02,   2.77903902e-02,   3.98652509e-01,
          2.35427698e-02,  -5.50511346e-03,   8.18997065e-02,
         -2.02946285e-02,  -6.40498423e-02,  -1.85447160e-01,
          6.23701023e-02,   8.74484442e-02,   9.35764354e-02,
          1.01390090e-02,   1.15824059e-02,   8.62609524e-01,
          1.19457645e-04],
       [  1.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   1.00000000e+00,
         -1.14942529e-02,  -3.48484848e-01,   4.82758621e-01,
         -2.33333333e-01,  -3.23529412e-01,  -6.61764706e-01,
          1.65009940e-01,   1.53543307e-01,   5.85000000e+02,
          1.33924653e-03,  -1.38186319e-01,

In [45]:
##SCORING?

## IV. Stochastic Gradient Descent (SGD)

Pros:
* Efficiency.
* Ease of implementation (lots of opportunities for code tuning).

Cons:
* requires a number of hyperparameters such as the regularization parameter and the number of iterations.
* sensitive to feature scaling.

In [33]:
from sklearn.linear_model import SGDClassifier

In [34]:
clf3 = SGDClassifier(loss="hinge", penalty="l2")

clf3.fit(X_train, Y_train)
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [35]:
clf3.predict(X_test)

array([ True, False, False,  True,  True,  True,  True, False, False,
        True,  True, False,  True, False, False,  True,  True, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False,  True, False,  True,  True,  True, False,
        True,  True, False, False,  True, False,  True, False,  True,
       False,  True,  True,  True, False, False, False, False], dtype=bool)

In [37]:
clf3.coef_

array([[  1.30804447e+01,  -6.54022237e+00,   1.30804447e+01,
         -7.19424460e+01,   3.92413342e+01,  -2.61608895e+01,
          1.16163839e+02,   1.17652209e+02,   9.84066328e+01,
          7.74014089e-01,   8.80393042e+00,  -6.12056970e+00,
         -8.38990893e+00,   1.32910604e+01,  -3.38094249e+01,
          2.66363773e+01,   2.03904818e+00,  -3.77929767e+03,
          1.32031978e+00,   1.61751920e+01,  -6.74089943e+00,
         -2.15426692e+00,  -7.31040676e+00,   1.85971723e+01,
         -1.32784105e-01,  -6.50365673e-01,   2.98822760e+01,
          6.47109179e-02]])

In [38]:
clf3.intercept_   

array([-43.10928282])

In [39]:
scores = cross_val_score(clf3, X, Y, cv = 10)
print scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.64705882  0.8125      0.625       0.6875      0.5         0.6875
  0.5625      0.75        0.46666667  0.73333333]
Accuracy: 0.65 (+/- 0.21)
