In [1]:
from IPython.display import HTML
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import precision_score
    
%matplotlib inline
import seaborn as sns

In [4]:
# Load the aggregated data set. Cells that are empty or hold a single space
# are parsed as NaN, and the first CSV column becomes the row index.
data = pd.read_csv(
    'AggredgatedData.csv',
    sep=',',
    na_values=[" ", ""],
    index_col=0,
)

## I. SVM

In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [8]:
# Feature matrix: every column except the first and the last one.
# NOTE(review): this assumes the target column '2016ODabovenatavg' is not one
# of data.columns[1:-1]; if it is, the label leaks into X -- confirm.
features = list(data.columns[1:-1])
X = data[features]
Y = data['2016ODabovenatavg']

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=1)

# The kernel, C and gamma this cell previously only listed in a discarded
# SVC(...) expression are now set on the estimator that is actually fitted,
# so the run does not depend on the installed scikit-learn's defaults.
clf = SVC(kernel='rbf', C=1.0, gamma='auto')
clf.fit(X_train, Y_train)

# Predicted labels for the held-out rows.
print(clf.predict(X_test))

[False  True False  True False  True False False False  True  True False
 False False  True False False  True False False  True  True False  True
 False False  True  True  True False False False  True False False False
  True False  True False False False False False  True False False False
 False False  True False False]


In [10]:
# 10-fold cross-validation of the SVM over the full data set.
# No `scoring` argument is passed; an alternative to try is scoring='f1_macro'.
scores = cross_val_score(clf, X, Y, cv=10)

In [13]:
# Per-fold scores, followed by their mean and a +/- two-standard-deviation spread.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.70588235  0.6875      0.5         0.6875      0.875       0.625       0.8125
  0.6875      0.6         0.66666667]
Accuracy: 0.68 (+/- 0.20)


## II. Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
# The penalty, C and solver this cell previously only listed in a discarded
# LogisticRegression(...) expression are now set on the estimator that is
# actually fitted, so the run does not depend on the installed scikit-learn's
# defaults.
clf1 = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
clf1.fit(X_train, Y_train)

# Predicted labels for the held-out rows.
print(clf1.predict(X_test))

[False False False  True False  True  True False False  True  True False
 False False False False  True  True False False  True  True False False
 False False  True  True  True False False False  True False False False
  True False  True False False False False  True  True False  True  True
 False False  True False False]


In [17]:
# 10-fold cross-validation of the logistic regression over the full data set.
scores = cross_val_score(clf1, X, Y, cv=10)
# Per-fold scores, followed by their mean and a +/- two-standard-deviation spread.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.70588235  0.625       0.375       0.6875      0.875       0.625       0.8125
  0.6875      0.66666667  0.66666667]
Accuracy: 0.67 (+/- 0.25)


In [19]:
# Compare the sparsity (% of exactly-zero coefficients) of logistic regression
# fitted with an L1 versus an L2 penalty, for several values of C.
# Large values of C give more freedom to the model; smaller values of C
# constrain it more. In the L1 penalty case, this leads to sparser solutions.

for i, C in enumerate((100, 1, 0.01)):
    # Turn down tolerance for short training time. The solver is set
    # explicitly to 'liblinear', which accepts both the 'l1' and the 'l2'
    # penalty, so both models are fitted by the same solver.
    clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01, solver='liblinear')
    clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01, solver='liblinear')
    clf_l1_LR.fit(X, Y)
    clf_l2_LR.fit(X, Y)

    # Flatten the coefficient arrays to 1-D.
    coef_l1_LR = clf_l1_LR.coef_.ravel()
    coef_l2_LR = clf_l2_LR.coef_.ravel()

    # coef_l1_LR contains zeros due to the
    # L1 sparsity inducing norm

    sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100
    sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100

    # The scores below are computed on the same (X, Y) the models were fitted
    # on, i.e. they are training scores, not held-out scores.
    print("C=%.2f" % C)
    print("Sparsity with L1 penalty: %.2f%%" % sparsity_l1_LR)
    print("score with L1 penalty: %.4f" % clf_l1_LR.score(X, Y))
    print("Sparsity with L2 penalty: %.2f%%" % sparsity_l2_LR)
    print("score with L2 penalty: %.4f" % clf_l2_LR.score(X, Y))

    # One figure row per value of C: L1 on the left, L2 on the right.
    l1_plot = plt.subplot(3, 2, 2 * i + 1)
    l2_plot = plt.subplot(3, 2, 2 * (i + 1))
    if i == 0:
        l1_plot.set_title("L1 penalty")
        l2_plot.set_title("L2 penalty")

    # Draw |coefficient| as a single-row strip. reshape(1, -1) works for any
    # number of features, whereas a fixed 8x8 grid only fits exactly 64
    # coefficients and raises ValueError otherwise. aspect='auto' lets the
    # strip fill the subplot instead of collapsing to a thin line.
    l1_plot.imshow(np.abs(coef_l1_LR).reshape(1, -1), interpolation='nearest',
                   cmap='binary', vmax=1, vmin=0, aspect='auto')
    l2_plot.imshow(np.abs(coef_l2_LR).reshape(1, -1), interpolation='nearest',
                   cmap='binary', vmax=1, vmin=0, aspect='auto')
    # Label the row with its C value; an axis label does not depend on the
    # image's data coordinates.
    l1_plot.set_ylabel("C = %.2f" % C)

    l1_plot.set_xticks(())
    l1_plot.set_yticks(())
    l2_plot.set_xticks(())
    l2_plot.set_yticks(())

plt.show()

C=100.00
Sparsity with L1 penalty: 7.14%
score with L1 penalty: 0.8113
Sparsity with L2 penalty: 0.00%
score with L2 penalty: 0.8302
C=1.00
Sparsity with L1 penalty: 50.00%
score with L1 penalty: 0.7610
Sparsity with L2 penalty: 0.00%
score with L2 penalty: 0.7862
C=0.01
Sparsity with L1 penalty: 96.43%
score with L1 penalty: 0.6038
Sparsity with L2 penalty: 0.00%
score with L2 penalty: 0.7170


## III. K-Means