In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

In [8]:
from sklearn import preprocessing, metrics
import utils
import scipy.io
import numpy as np
from linear_classifier import LinearSVM_twoclass
from sklearn.metrics.pairwise import cosine_similarity

# load the SPAM email training dataset

X_train,y_train = utils.load_mat('data/spamTrain.mat')
print X_train, y_train
print X_train.shape, y_train.shape
yy_train = np.ones(y_train.shape)
yy_train[y_train==0] = -1

# load the SPAM email test dataset

test_data = scipy.io.loadmat('data/spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest'].flatten()

##################################################################################
#  YOUR CODE HERE for training the best performing SVM for the data above.       #
#  what should C be? What should num_iters be? Should X be scaled?               #
#  should X be kernelized? What should the learning rate be? What should the     #
#  number of iterations be?                                                      #
##################################################################################

#svm = LinearSVM_twoclass()
#svm.theta = np.zeros((X.shape[1],))

from sklearn import cross_validation
XX, XXval, yy, yyval = cross_validation.train_test_split(X_train, yy_train, test_size=0.2)

C_vals = [0.1,0.3,1,3,10,30]
lr_vals = [1e-2, 3e-2, 1e-1, 3e-1, 1, 3]

# compute the kernel
K = np.array([cosine_similiarity(x) for x in XX]).reshape(XX.shape[0],XX.shape[0])
# scale the kernelized data matrix
scaler = preprocessing.StandardScaler().fit(K)
scaleK = scaler.transform(K)
# add the intercept term
KK = np.vstack([np.ones((scaleK.shape[0],)),scaleK.T]).T

# for validation set
Kval = np.array([cosine_similiarity(x) for x in XXval]).reshape(XXval.shape[0],XXval.shape[0])
scaler = preprocessing.StandardScaler().fit(Kval)
scaleKval = scaler.transform(Kval)
KKval = np.vstack([np.ones((scaleKval.shape[0],)),scaleKval.T]).T

for lr in lr_vals:
    for c in C_vals:
        svm.theta = np.zeros((KK.shape[1],))
        #train on X, y
        svm.train(KK,yy,learning_rate=lr,reg=c,num_iters=10000,verbose=False)
        #predict on Xval, yval
        y_pred = svm.predict(KKval)
        accuracy = metrics.accuracy_score(yyval, y_pred)
        print 'lr=', lr, 'c=', c, 'accuracy=', accuracy


##################################################################################
# YOUR CODE HERE for testing your best model's perfor                            #
# what is the accuracy of your best model on the test set? On the training set?  #
##################################################################################

best_C = 0.1
best_lr = 1e-1

#best_svm
best_svm = LinearSVM()


# compute the kernel
K = np.array([cosine_similiarity(x) for x in X_test]).reshape(X_test.shape[0],X_test.shape[0])
# scale the kernelized data matrix
scaler = preprocessing.StandardScaler().fit(K)
scaleK = scaler.transform(K)
# add the intercept term
KK = np.vstack([np.ones((scaleK.shape[0],)),scaleK.T]).T

best_svm.theta = np.zeros((KK.shape[1],))
best_svm.train(KK,y_test,learning_rate=best_lr,reg=best_C,num_iters=10000,verbose=False)


best_svm.train(X_train, yy_train, learning_rate=best_param['LR'], reg=best_param['reg'],
                      num_iters=1500, verbose=True)
# Evaluate the best svm on test set
y_test_pred = best_svm.predict(X_test)
test_accuracy = np.mean(y_test == y_test_pred)
print 'linear SVM on raw pixels final test set accuracy: %f' % test_accuracy



y_pred = svm.predict(X)
print "Accuracy of model on training data is: ", metrics.accuracy_score(yy,y_pred)

yy_test = np.ones(y_test.shape)
yy_test[y_test==0] = -1
test_pred = svm.predict(X_test)
print "Accuracy of model on test data is: ", metrics.accuracy_score(yy_test,test_pred)


##################################################################################
# ANALYSIS OF MODEL: Print the top 15 words that are predictive of spam and for  #
# ham. Hint: use the coefficient values of the learned model                     #
##################################################################################
words, inv_words = utils.get_vocab_dict()

index = np.argsort(svm.theta)[-15:]
print "Top 15 predictors of spam are: "
for i in range(-1,-16,-1):
    print words[index[i]+1]
##################################################################################
#                    END OF YOUR CODE                                            #
##################################################################################

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]] [1 1 0 ..., 1 0 0]
(4000, 1899) (4000,)
iteration 0 / 10000: loss 0.100000
iteration 100 / 10000: loss 0.028190
iteration 200 / 10000: loss 0.017769
iteration 300 / 10000: loss 0.014181
iteration 400 / 10000: loss 0.012271
iteration 500 / 10000: loss 0.011083
iteration 600 / 10000: loss 0.010240
iteration 700 / 10000: loss 0.009578
iteration 800 / 10000: loss 0.009056
iteration 900 / 10000: loss 0.008641
iteration 1000 / 10000: loss 0.008296
iteration 1100 / 10000: loss 0.007994
iteration 1200 / 10000: loss 0.007751
iteration 1300 / 10000: loss 0.007537
iteration 1400 / 10000: loss 0.007343
iteration 1500 / 10000: loss 0.007172
iteration 1600 / 10000: loss 0.007016
iteration 1700 / 10000: loss 0.006875
iteration 1800 / 10000: loss 0.006746
iteration 1900 / 10000: loss 0.006625
iteration 2000 / 10000: loss 0.006511
iteration 2100 / 10000: loss 0.006402
iteratio