In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

In [2]:
from sklearn import preprocessing, metrics
import utils
import scipy.io
import numpy as np
from linear_classifier import LinearSVM_twoclass

# load the SPAM email training dataset

X,y = utils.load_mat('data/spamTrain.mat')
yy = np.ones(y.shape)
yy[y==0] = -1

# load the SPAM email test dataset

test_data = scipy.io.loadmat('data/spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest'].flatten()
yytest = np.ones(y_test.shape)
yytest[y_test==0] = -1
##################################################################################
#  YOUR CODE HERE for training the best performing SVM for the data above.       #
#  what should C be? What should num_iters be? Should X be scaled?               #
#  should X be kernelized? What should the learning rate be? What should the     #
#  number of iterations be?                                                      #
##################################################################################
from sklearn.metrics.pairwise import linear_kernel

num_train = 3200;
X_train = X[:3200];
Xval = X[3200:];
yy_train = yy[:3200];
yyval = yy[3200:];

Cvals = [1,3,10,30,100,300]

max_acc = 0;
best_C = 0;
best_sigma = 0;


K = linear_kernel(X_train,X_train)
# scale the kernelized data matrix
scaler = preprocessing.StandardScaler().fit(K)
scaleK = scaler.transform(K)
# add the intercept term
KK = np.vstack([np.ones((scaleK.shape[0],)),scaleK.T]).T
    
Kval = linear_kernel(Xval,X_train)
# scale the kernelized data matrix
scale_Kval = scaler.transform(Kval)
# add the intercept term
KK_val = np.vstack([np.ones((scale_Kval.shape[0],)),scale_Kval.T]).T    

for C in Cvals:
    svm = LinearSVM_twoclass()
    svm.theta = np.zeros((KK.shape[1],))
    svm.train(KK,yy_train,learning_rate=1e-4,reg=C,num_iters=20000,verbose=False,batch_size=KK.shape[0])
    pred_val = svm.predict(KK_val)
    accuracy = np.sum((pred_val == yyval)*1)/len(yyval)
    print("C value: " + str(C),"current accuracy: " + str(accuracy),"max accuracy: " + str(max_acc))
    if (accuracy >= max_acc):
        max_acc = accuracy;
        best_C = C;
print(max_acc,best_C)    


C value: 1 current accuracy: 0.96 max accuracy: 0
C value: 3 current accuracy: 0.97 max accuracy: 0.96
C value: 10 current accuracy: 0.9775 max accuracy: 0.97
C value: 30 current accuracy: 0.97625 max accuracy: 0.9775
C value: 100 current accuracy: 0.97 max accuracy: 0.9775
C value: 300 current accuracy: 0.9725 max accuracy: 0.9775
0.9775 10


In [6]:
iterations = [100,1000,4000,10000,20000,30000]
best_iter = 0;
for iters in iterations:
    svm = LinearSVM_twoclass()
    svm.theta = np.zeros((KK.shape[1],))
    svm.train(KK,yy_train,learning_rate=1e-4,reg=best_C,num_iters=iters,verbose=False,batch_size=KK.shape[0])
    pred_val = svm.predict(KK_val)
    accuracy = np.sum((pred_val == yyval)*1)/len(yyval)
    print("iteration number: " + str(iters),"current accuracy: " + str(accuracy),"max accuracy: " + str(max_acc))
    if (accuracy >= max_acc):
        max_acc = accuracy;
        best_iter = iters;
if(best_iter == 0):
    best_iter = 20000;
print(max_acc,best_iter)  


iteration number: 100 current accuracy: 0.91875 max accuracy: 0.9775
iteration number: 1000 current accuracy: 0.955 max accuracy: 0.9775
iteration number: 4000 current accuracy: 0.97 max accuracy: 0.9775
iteration number: 10000 current accuracy: 0.9725 max accuracy: 0.9775
iteration number: 20000 current accuracy: 0.9775 max accuracy: 0.9775
iteration number: 30000 current accuracy: 0.97375 max accuracy: 0.9775
0.9775 20000


In [7]:

lrs = [1e-1,1e-2,1e-3,1e-4,1e-5,1e-6]
best_lr = 0;
for lr in lrs:
    svm = LinearSVM_twoclass()
    svm.theta = np.zeros((KK.shape[1],))
    svm.train(KK,yy_train,learning_rate=lr,reg=best_C,num_iters=best_iter,verbose=False,batch_size=KK.shape[0])
    pred_val = svm.predict(KK_val)
    accuracy = np.sum((pred_val == yyval)*1)/len(yyval)
    print("current accuracy: " + str(accuracy),"max accuracy: " + str(max_acc))
    if (accuracy >= max_acc):
        max_acc = accuracy;
        best_lr = lr;
if(best_lr == 0):
    best_lr = 1e-4;
    
print(max_acc,best_lr)  


current accuracy: 0.96875 max accuracy: 0.9775
current accuracy: 0.97 max accuracy: 0.9775
current accuracy: 0.9675 max accuracy: 0.9775
current accuracy: 0.9775 max accuracy: 0.9775
current accuracy: 0.96 max accuracy: 0.9775
current accuracy: 0.93625 max accuracy: 0.9775
0.9775 0.0001


In [9]:


Cvals = [10,30,100]
iterations = [4000,10000,20000]
lrs = [1e-3,1e-4,1e-5]
best_lr = 0;
best_C = 0;
best_iter = 0;
for C in Cvals:
    for iteration in iterations:
        for lr in lrs:
            svm = LinearSVM_twoclass()
            svm.theta = np.zeros((KK.shape[1],))
            svm.train(KK,yy_train,learning_rate=lr,reg=C,num_iters=iteration,verbose=False,batch_size=KK.shape[0])
            pred_val = svm.predict(KK_val)
            accuracy = np.sum((pred_val == yyval)*1)/len(yyval)
            print("current accuracy: " + str(accuracy),"max accuracy: " + str(max_acc))
            if (accuracy > max_acc):
                max_acc = accuracy;
                best_lr = lr;
                best_C = C;
                best_iter = iteration;
if(best_lr == 0 and best_C == 0 and best_iter == 0):
    best_lr = 1e-4;
    best_iter = 20000;
    best_C = 10;
print(max_acc,best_C,best_iter,best_lr)  

current accuracy: 0.96875 max accuracy: 0.9775
current accuracy: 0.97 max accuracy: 0.9775
current accuracy: 0.94 max accuracy: 0.9775
current accuracy: 0.97375 max accuracy: 0.9775
current accuracy: 0.9725 max accuracy: 0.9775
current accuracy: 0.95375 max accuracy: 0.9775
current accuracy: 0.9675 max accuracy: 0.9775
current accuracy: 0.9775 max accuracy: 0.9775
current accuracy: 0.96 max accuracy: 0.9775
current accuracy: 0.9675 max accuracy: 0.9775
current accuracy: 0.97375 max accuracy: 0.9775
current accuracy: 0.9575 max accuracy: 0.9775
current accuracy: 0.97375 max accuracy: 0.9775
current accuracy: 0.97375 max accuracy: 0.9775
current accuracy: 0.96875 max accuracy: 0.9775
current accuracy: 0.9725 max accuracy: 0.9775
current accuracy: 0.97625 max accuracy: 0.9775
current accuracy: 0.97 max accuracy: 0.9775
current accuracy: 0.96375 max accuracy: 0.9775
current accuracy: 0.97625 max accuracy: 0.9775
current accuracy: 0.97 max accuracy: 0.9775
current accuracy: 0.965 max accura

In [10]:
##################################################################################
# YOUR CODE HERE for testing your best model's performance                       #
# what is the accuracy of your best model on the test set? On the training set?  #
##################################################################################
print(best_C,best_iter,best_lr)
K = linear_kernel(X_train,X_train)
# scale the kernelized data matrix
scaler = preprocessing.StandardScaler().fit(K)
scaleK = scaler.transform(K)
# add the intercept term
KK = np.vstack([np.ones((scaleK.shape[0],)),scaleK.T]).T
    
Kval = linear_kernel(Xval,X_train)
# scale the kernelized data matrix
scale_Kval = scaler.transform(Kval)
# add the intercept term
KK_val = np.vstack([np.ones((scale_Kval.shape[0],)),scale_Kval.T]).T   
svm = LinearSVM_twoclass()
svm.theta = np.zeros((KK.shape[1],))
svm.train(KK,yy_train,learning_rate=best_lr,reg=best_C,num_iters=best_iter,verbose=True,batch_size=KK.shape[0])
pred_train = svm.predict(KK)
accuracy = np.sum((pred_train == yy_train)*1)/len(yy_train)
print("max accuracy: " + str(accuracy))



10 20000 0.0001
iteration 0 / 20000: loss 10.000000
iteration 100 / 20000: loss 2.456204
iteration 200 / 20000: loss 2.026511
iteration 300 / 20000: loss 1.775690
iteration 400 / 20000: loss 1.608407
iteration 500 / 20000: loss 1.505424
iteration 600 / 20000: loss 1.424841
iteration 700 / 20000: loss 1.363712
iteration 800 / 20000: loss 1.314737
iteration 900 / 20000: loss 1.274128
iteration 1000 / 20000: loss 1.236948
iteration 1100 / 20000: loss 1.205123
iteration 1200 / 20000: loss 1.175196
iteration 1300 / 20000: loss 1.148629
iteration 1400 / 20000: loss 1.123458
iteration 1500 / 20000: loss 1.101002
iteration 1600 / 20000: loss 1.078721
iteration 1700 / 20000: loss 1.059668
iteration 1800 / 20000: loss 1.041429
iteration 1900 / 20000: loss 1.025514
iteration 2000 / 20000: loss 1.006274
iteration 2100 / 20000: loss 0.990371
iteration 2200 / 20000: loss 0.977996
iteration 2300 / 20000: loss 0.963073
iteration 2400 / 20000: loss 0.949976
iteration 2500 / 20000: loss 0.937316
iterati

In [15]:
test_data = scipy.io.loadmat('data/spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest'].flatten()
yytest = np.ones(y_test.shape)
yytest[y_test==0] = -1
print(yytest.shape)
Ktest = linear_kernel(X_test,X_train)
# scale the kernelized data matrix
scale_Ktest = scaler.transform(Ktest)
# add the intercept term
KK_test = np.vstack([np.ones((scale_Ktest.shape[0],)),scale_Ktest.T]).T   

pred_test = svm.predict(KK_test)
accuracy = np.sum((pred_test == yytest)*1)/len(yytest)
print("max accuracy: " + str(accuracy))
##################################################################################
# ANALYSIS OF MODEL: Print the top 15 words that are predictive of spam and for  #
# ham. Hint: use the coefficient values of the learned model                     #
##################################################################################
words, inv_words = utils.get_vocab_dict()


print ("######## top 15 spam words ########")
w = np.dot(svm.theta[1:], X_train).argsort()[::-1]
for i in w[:15]:
    print (words[i])
##################################################################################
#                    END OF YOUR CODE                                            #
##################################################################################

(1000,)
max accuracy: 0.982
######## top 15 spam words ########
remot
clearli
otherwis
york
player
mondai
wife
night
hot
franc
young
gt
believ
info
water
