In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt
import sklearn

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

In [2]:
from sklearn import preprocessing, metrics
import utils
import scipy.io
import numpy as np
from linear_classifier import LinearSVM_twoclass

# load the SPAM email training dataset

X,y = utils.load_mat('data/spamTrain.mat')
yy = np.ones(y.shape)
yy[y==0] = -1

# load the SPAM email test dataset

test_data = scipy.io.loadmat('data/spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest'].flatten()

##################################################################################
#  YOUR CODE HERE for training the best performing SVM for the data above.       #
#  what should C be? What should num_iters be? Should X be scaled?               #
#  should X be kernelized? What should the learning rate be? What should the     #
#  number of iterations be?                                                      #
##################################################################################
'''
from sklearn import cross_validation
XX, XXval, yyy, yyval = cross_validation.train_test_split(X, yy, test_size=0.2)

Cvals = [0.1, 0.3, 1, 3, 10, 30]
lr_vals = [1e-2, 3e-2, 1e-1, 3e-1, 1, 3]
iter_vals = [1000, 5000, 10000, 25000]

best_acc = 0
scaler = preprocessing.StandardScaler().fit(XX)
scaleX = scaler.transform(XX)
XX = np.vstack([np.ones((scaleX.shape[0],)), scaleX.T]).T

scalerval = preprocessing.StandardScaler().fit(XXval)
scaleXval = scalerval.transform(XXval)
XXval = np.vstack([np.ones((scaleXval.shape[0],)), scaleXval.T]).T

for C in Cvals:
    for lr in lr_vals:
        for it in iter_vals:
            svm = LinearSVM_twoclass()
            svm.theta = np.zeros((XX.shape[1],))
            svm.train(XX, yyy, learning_rate=lr, reg=C, num_iters=it, verbose=True)
            yval_predict = svm.predict(XXval)
            acc = metrics.accuracy_score(yyval, yval_predict)
            print ("C {}".format(C), "learning rate {}".format(lr), "iteration {}".format(it), "accuracy {}".format(acc))
            if acc > best_acc:
                best_acc = acc
                best_C = C
                best_lr = lr
                best_it = it
print ('best', best_C, best_acc, best_lr, best_it)
'''
svm = LinearSVM_twoclass()
svm.theta = np.zeros((X.shape[1],))

best_C = 0.1
best_lr = 0.1
best_it = 10000

svm.train(X, yy, learning_rate=1e-1, reg=best_C, num_iters=best_it, verbose=True)
##################################################################################
# YOUR CODE HERE for testing your best model's perfor                            #
# what is the accuracy of your best model on the test set? On the training set?  #
##################################################################################

y_pred = svm.predict(X)
print "Accuracy of model on training data is: ", metrics.accuracy_score(yy,y_pred)

yy_test = np.ones(y_test.shape)
yy_test[y_test==0] = -1
test_pred = svm.predict(X_test)
print "Accuracy of model on test data is: ", metrics.accuracy_score(yy_test,test_pred)


##################################################################################
# ANALYSIS OF MODEL: Print the top 15 words that are predictive of spam and for  #
# ham. Hint: use the coefficient values of the learned model                     #
##################################################################################
words, inv_words = utils.get_vocab_dict()

index = np.argsort(svm.theta)[-15:]
print "Top 15 predictors of spam are: "
for i in range(-1,-16,-1):
    print words[index[i]+1]
    
##################################################################################
#                    END OF YOUR CODE                                            #
##################################################################################

iteration 0 / 10000: loss 0.100000
iteration 100 / 10000: loss 0.028190
iteration 200 / 10000: loss 0.017769
iteration 300 / 10000: loss 0.014181
iteration 400 / 10000: loss 0.012271
iteration 500 / 10000: loss 0.011083
iteration 600 / 10000: loss 0.010240
iteration 700 / 10000: loss 0.009578
iteration 800 / 10000: loss 0.009056
iteration 900 / 10000: loss 0.008641
iteration 1000 / 10000: loss 0.008296
iteration 1100 / 10000: loss 0.007994
iteration 1200 / 10000: loss 0.007751
iteration 1300 / 10000: loss 0.007537
iteration 1400 / 10000: loss 0.007343
iteration 1500 / 10000: loss 0.007172
iteration 1600 / 10000: loss 0.007016
iteration 1700 / 10000: loss 0.006875
iteration 1800 / 10000: loss 0.006746
iteration 1900 / 10000: loss 0.006625
iteration 2000 / 10000: loss 0.006511
iteration 2100 / 10000: loss 0.006402
iteration 2200 / 10000: loss 0.006300
iteration 2300 / 10000: loss 0.006200
iteration 2400 / 10000: loss 0.006104
iteration 2500 / 10000: loss 0.006011
iteration 2600 / 10000: 

In [3]:
#Cosine similarity kernels

XK = sklearn.metrics.pairwise.cosine_similarity(X, Y=None, dense_output=True)
scaler = preprocessing.StandardScaler().fit(XK)
scaleK = scaler.transform(XK)
KK = np.vstack([np.ones((scaleK.shape[0],)),scaleK]).T

XKtest = sklearn.metrics.pairwise.cosine_similarity(X_test, Y=None, dense_output=True)
scalertest = preprocessing.StandardScaler().fit(XKtest)
scaleKtest = scalertest.transform(XKtest)
KKtest = np.vstack([np.ones((scaleKtest.shape[0],)),scaleKtest]).T

svm = LinearSVM_twoclass()
svm.theta = np.zeros((XK.shape[1],))

svm.train(scaleK, yy, learning_rate=best_lr, reg=best_C, num_iters=best_it, verbose=True)

y_pred = svm.predict(scaleK)
print "Accuracy of model on training data is: ", metrics.accuracy_score(yy,y_pred)

yy_test = np.ones(y_test.shape)
yy_test[y_test==0] = -1
test_pred = svm.predict(scaleKtest)
print "Accuracy of model on test data is: ", metrics.accuracy_score(yy_test,test_pred)

iteration 0 / 10000: loss 0.100000
iteration 100 / 10000: loss 0.011278
iteration 200 / 10000: loss 0.011013
iteration 300 / 10000: loss 0.008198
iteration 400 / 10000: loss 0.008405
iteration 500 / 10000: loss 0.006487
iteration 600 / 10000: loss 0.005924
iteration 700 / 10000: loss 0.006062
iteration 800 / 10000: loss 0.006145
iteration 900 / 10000: loss 0.006678
iteration 1000 / 10000: loss 0.004963
iteration 1100 / 10000: loss 0.006848
iteration 1200 / 10000: loss 0.004501
iteration 1300 / 10000: loss 0.005626
iteration 1400 / 10000: loss 0.006181
iteration 1500 / 10000: loss 0.004133
iteration 1600 / 10000: loss 0.003925
iteration 1700 / 10000: loss 0.003911
iteration 1800 / 10000: loss 0.003558
iteration 1900 / 10000: loss 0.004517
iteration 2000 / 10000: loss 0.004601
iteration 2100 / 10000: loss 0.005298
iteration 2200 / 10000: loss 0.003560
iteration 2300 / 10000: loss 0.003343
iteration 2400 / 10000: loss 0.003224
iteration 2500 / 10000: loss 0.003342
iteration 2600 / 10000: 

ValueError: shapes (1000,1000) and (4000,) not aligned: 1000 (dim 1) != 4000 (dim 0)

In [None]:
#Gaussian Kernels

sigma = 0.02

K = np.array([utils.gaussian_kernel(x1,x2,sigma) for x1 in X for x2 in X]).reshape(X.shape[0],X.shape[0])
scaler = preprocessing.StandardScaler().fit(K)
scaleK = scaler.transform(K)
KK = np.vstack([np.ones((scaleK.shape[0],)),scaleK]).T
yy = np.ones(y.shape)
yy[y == 0] = -1
svm = LinearSVM_twoclass()
svm.theta = np.zeros((KK.shape[1],))
C = 1
svm.train(KK,yy,learning_rate=best_lr,reg=best_C,num_iters=best_it,verbose=True)
y_pred = svm.predict(KK)
print "Accuracy on training data = ", metrics.accuracy_score(yy,y_pred)
yy_test = np.ones(y_test.shape)
yy_test[y_test==0] = -1
test_pred = svm.predict(X_test)
print "Accuracy of model on test data is: ", metrics.accuracy_score(yy_test,test_pred)