# Build your own dataset

In this exercise, we provided a preprocessed training set and test set. These datasets were created using the same functions (`processEmail` and `emailFeatures`) that you now have completed. For this optional (ungraded) exercise, you will build your own dataset using the original emails from the SpamAssassin Public Corpus.

Your task in this optional (ungraded) exercise is to download the original
files from the public corpus and extract them. After extracting them, you should run the `processEmail` and `emailFeatures` functions on each email to extract a feature vector from each email. This will allow you to build a dataset `X`, `y` of examples. You should then randomly divide up the dataset into a training set, a cross validation set and a test set.

While you are building your own dataset, we also encourage you to try building your own vocabulary list (by selecting the high frequency words that occur in the dataset) and adding any additional features that you think
might be useful. Finally, we also suggest trying to use highly optimized SVM toolboxes such as [`LIBSVM`](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) or [`scikit-learn`](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm).

In [1]:
# used for manipulating directory paths
import os

import numpy as np
# Import regular expressions to process emails
import re

from matplotlib import pyplot

# Optimization module in scipy
from scipy import optimize

# Optimize with Sklearn svm 
from sklearn import svm
from sklearn.metrics import classification_report

# will be used to load MATLAB mat datafile format
from scipy.io import loadmat

# library written for this exercise providing additional functions for assignment submission, and others
import utils

%matplotlib inline
# for expliting file name with its extention
from pathlib import Path
# for extracting files
import tarfile
# for reloading module
import importlib
# Compute execution time
import time

In [2]:
# To change directory
os.chdir('F:\Machine_Learning\ML_by_Andrew_Ng\Assignments_in_python\Ex6_week7')

In [30]:
# importing methods
import methods
importlib.reload(methods)
from methods import processEmail,emailFeatures, dataset3Params, gaussian_Kernel, gaussianKernelGramMatrix

In [None]:
# Extracting files
archives_path = 'D:/Mail Sample/Obsolete'# directory of zipped folders
extract_path = 'Extracted_files' # directory of unzipped folders
for filename in os.listdir(archives_path):
    archives = tarfile.open(os.path.join(archives_path, filename)) 
    archives.extractall(path = extract_path) 

In [None]:
# Reading files and writing files to txt
path = 'Extracted_files' # directory of unzipped folders
pathw = 'Database_' # directory for saving folders of txt files
for foldername in os.listdir():
    folderpath = os.path.join(pathw, foldername)
    os.makedirs(folderpath)
    for filename in os.listdir(os.path.join(path, foldername)):       
        with open(os.path.join(os.path.join(path, foldername), filename), encoding="latin-1") as f:
            file_content = f.read()
            filename = Path(filename).stem
            with open(os.path.join(folderpath, filename +'.txt'), 'w') as writefile:
                writefile.write(file_content)

In [None]:
# Processing and featuring mails, and saving the features in numpy array locally
for foldername in os.listdir('Database_'):
    X = []
    for filename in os.listdir(os.path.join('Database_', foldername)):
        with open(os.path.join(os.path.join('Database_', foldername), filename)) as fid:
             file_contents = fid.read()
        word_indices  = processEmail(file_contents)
        X.append(emailFeatures(word_indices))
    X_ham = np.array(X)
    np.save(os.path.join('Database', foldername + '.npy'), X_ham)

In [None]:
# loading features 
X_h1 = np.load('Database/easy_ham.npy')
X_h2 = np.load('Database/easy_ham_2.npy')
X_h3 = np.load('Database/hard_ham.npy')
X_s1 = np.load('Database/spam.npy')
X_s2 = np.load('Database/spam_2.npy')

In [None]:
# Making single vector for all feature vectors
X_h = np.concatenate((X_h1, X_h2, X_h3), axis = 0)
X_s = np.concatenate((X_s1, X_s2), axis = 0)
X_h_y = np.concatenate((X_h, np.zeros((X_h.shape[0], 1))), axis = 1)
X_s_y = np.concatenate((X_s, np.ones((X_s.shape[0],1))), axis = 1)
data = np.concatenate((X_h_y, X_s_y), axis = 0)

In [None]:
np.save(os.path.join('Database', 'data.npy'), data)

In [4]:
data = np.load('Database/data.npy')

In [6]:
for i in range(3):
    np.random.shuffle(data) 

In [7]:
data[:100, 1899]

array([0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1.,
       1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [8]:
# Dividing the data for train(60%), cross validation(20%) and test(20%)
data_train = data[:5600, :]
data_val = data[5600:7450, :]
data_test = data[7400:, :]

In [13]:
X_train, y_train = data_train[:,:1899].astype(float), data_train[:, -1]
X_val, y_val = data_val[:,:1899].astype(float), data_val[:, -1]
X_test, y_test = data_test[:,:1899].astype(float), data_test[:, -1]

In [9]:
# Training model
startTime = time.time()

print('Training Linear SVM (Spam Classification)')
print('This may take 1 to 2 minutes ...\n')

C = 0.1
model = utils.svmTrain(X_train, y_train, C, utils.linearKernel)
executionTime(startTime)

Training Linear SVM (Spam Classification)
This may take 1 to 2 minutes ...

---11 minute(s) 42.99 seconds ---


702.9907653331757

In [10]:
# Compute the training accuracy
p = utils.svmPredict(model, X_train)

print('Training Accuracy: %.2f' % (np.mean(p == y_train) * 100))

Training Accuracy: 100.00


In [55]:
# Try different SVM Parameters here

#C, sigma = dataset3Params(X_train, y_train, X_val, y_val)
#print('C = ', C, 'sigma = ', sigma)3
# Train the SVM
# model = utils.svmTrain(X, y, C, lambda x1, x2: gaussianKernel(x1, x2, sigma))

#model = utils.svmTrain(X_train, y_train, C, gaussianKernel, args=(sigma,))

# Gaussian Kernal Taking hours to train the model that's why i replaced gaussian kernel whith linear Kernal
#model = utils.svmTrain(X_train, y_train, C, utils.linearKernal)
# compute the cross-validation accuracy
#p = utils.svmPredict(model, X_train)

#print('cross-validation Accuracy: %.2f' % (np.mean(p == y_val) * 100))

In [None]:
# compute the test accuracy

print('Evaluating the trained Linear SVM on a test set ...')
p = utils.svmPredict(model, X_test)

print('Test Accuracy: %.2f' % (np.mean(p == y_test) * 100))

**Even we are using linear kernal of utils, it takes more than 10 minuthes to train model with the dataset. It will take hours to train model by utlis.svmTrain with gaussian kernal.**

## SVM model of skikit-learn library
### SVM with linear kernal 
**svm with linear kernal of skikit-learn library is very efficient. It also takes few sec to train model and accurracy is more than 99%.**

In [50]:
# Training model 
startTime = time.time()

C = 0.1
clf = svm.SVC(C = C, kernel = 'linear')

model = clf.fit(X_train,y_train)
executionTime(startTime)

--- 6.33 seconds ---


In [51]:
# compute the train accuracy

p = model.predict(X_train)

print('Training Accuracy: %.2f' % (np.mean(p == y_train) * 100))

Training Accuracy: 100.00


In [52]:
print(classification_report(y_train, p))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      4152
         1.0       1.00      1.00      1.00      1448

    accuracy                           1.00      5600
   macro avg       1.00      1.00      1.00      5600
weighted avg       1.00      1.00      1.00      5600



In [54]:
# compute the cross-validation accuracy

p = model.predict(X_val)

print('Cross-validation Accuracy: %.2f' % (np.mean(p == y_val) * 100))

Cross-validation Accuracy: 99.41


In [55]:
print(classification_report(y_val, p))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1407
         1.0       0.99      0.99      0.99       443

    accuracy                           0.99      1850
   macro avg       0.99      0.99      0.99      1850
weighted avg       0.99      0.99      0.99      1850



In [56]:
# compute the test accuracy

p = model.predict(X_test)

print('Test Accuracy: %.2f' % (np.mean(p == y_test) * 100))

Test Accuracy: 99.33


In [57]:
print(classification_report(y_test, p))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1427
         1.0       0.99      0.99      0.99       526

    accuracy                           0.99      1953
   macro avg       0.99      0.99      0.99      1953
weighted avg       0.99      0.99      0.99      1953



### SVM with gaussian kernel of sklearn  
**Even SVM with gaussian kernel of skikit-learn library is not efficient with this dataset. It takes minutes to train model**

In [41]:
# Training model
startTime = time.time()
C = 0.1
clf = svm.SVC(C = C, kernel = "precomputed")
model = clf.fit(gaussianKernelGramMatrix(X_train, X_train), y_train)

execTime = time.time() - startTime
print("--- %2d minute(s) %2.2f seconds ---" % (execTime//60, execTime%60))

---  8 minute(s) 3.24 seconds ---


In [42]:
# Training accuracy
startTime = time.time()
p = model.predict(gaussianKernelGramMatrix(X_train, X_train))

print('Training Accuracy: %.2f' % (np.mean(p == y_train) * 100))

execTime = time.time() - startTime
print("--- %2d minute(s) %2.2f seconds ---" % (execTime//60, execTime%60))

Training Accuracy: 74.14
---  8 minute(s) 16.20 seconds ---


In [49]:
model_svm_GK = model

In [48]:
print(classification_report(y_train, p))

              precision    recall  f1-score   support

         0.0       0.74      1.00      0.85      4152
         1.0       0.00      0.00      0.00      1448

    accuracy                           0.74      5600
   macro avg       0.37      0.50      0.43      5600
weighted avg       0.55      0.74      0.63      5600



  _warn_prf(average, modifier, msg_start, len(result))


After training the svm model, predicting with the trained model also takes much time and training accuracy is also very low.

In [58]:
from libsvm import *
from libsvm.svmutil import *

In [63]:
prob = svm_problem([1,-1],[[1,0,1],[-1,0,-1]])
param = svm_parameter('-q')
  ## training  the model
model_libsvm = svm_model(prob, param)
#testing the model
model_libsvm.predict([1, 1, 1])

TypeError: __init__() takes 1 positional argument but 3 were given

In [43]:
sum(y_train)/len(y_train)*100

25.857142857142858

In [44]:
sum(data[:,-1])/len(data[:,-1])*100

25.68159948679568

In [65]:
libsvm?