# Build your own dataset

In this exercise, we provided a preprocessed training set and test set. These datasets were created using the same functions (`processEmail` and `emailFeatures`) that you now have completed. For this optional (ungraded) exercise, you will build your own dataset using the original emails from the SpamAssassin Public Corpus.

Your task in this optional (ungraded) exercise is to download the original
files from the public corpus and extract them. After extracting them, you should run the `processEmail` and `emailFeatures` functions on each email to extract a feature vector from each email. This will allow you to build a dataset `X`, `y` of examples. You should then randomly divide up the dataset into a training set, a cross validation set and a test set.

While you are building your own dataset, we also encourage you to try building your own vocabulary list (by selecting the high frequency words that occur in the dataset) and adding any additional features that you think
might be useful. Finally, we also suggest trying to use highly optimized SVM toolboxes such as [`LIBSVM`](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) or [`scikit-learn`](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm).

In [36]:
# used for manipulating directory paths
import os

import numpy as np
# Import regular expressions to process emails
import re

from matplotlib import pyplot

# Optimization module in scipy
from scipy import optimize

# Optimize with Sklearn svm 
from sklearn import svm
from sklearn.svm import SVC
# will be used to load MATLAB mat datafile format
from scipy.io import loadmat

# library written for this exercise providing additional functions for assignment submission, and others
import utils

%matplotlib inline
# for expliting file name with its extention
from pathlib import Path
# for extracting files
import tarfile
# for reloading module
import importlib
# Compute execution time
import time

In [2]:
# To change directory
os.chdir('F:\Machine_Learning\ML_by_Andrew_Ng\Assignments_in_python\Ex6_week7')

In [70]:
# importing methods
import methods
importlib.reload(methods)
from methods import processEmail,emailFeatures, dataset3Params, gaussian_Kernel, executionTime, gaussianKernelGramMatrix

In [71]:
# Extracting files
archives_path = 'D:/Mail Sample/Obsolete'# directory of zipped folders
extract_path = 'Extracted_files' # directory of unzipped folders
for filename in os.listdir(archives_path):
    archives = tarfile.open(os.path.join(archives_path, filename)) 
    archives.extractall(path = extract_path) 

KeyboardInterrupt: 

In [None]:
# Reading files and writing files to txt
path = 'Extracted_files' # directory of unzipped folders
pathw = 'Database_' # directory for saving folders of txt files
for foldername in os.listdir():
    folderpath = os.path.join(pathw, foldername)
    os.makedirs(folderpath)
    for filename in os.listdir(os.path.join(path, foldername)):       
        with open(os.path.join(os.path.join(path, foldername), filename), encoding="latin-1") as f:
            file_content = f.read()
            filename = Path(filename).stem
            with open(os.path.join(folderpath, filename +'.txt'), 'w') as writefile:
                writefile.write(file_content)

In [None]:
# Processing and featuring mails, and saving the features in numpy array locally
for foldername in os.listdir('Database_'):
    X = []
    for filename in os.listdir(os.path.join('Database_', foldername)):
        with open(os.path.join(os.path.join('Database_', foldername), filename)) as fid:
             file_contents = fid.read()
        word_indices  = processEmail(file_contents)
        X.append(emailFeatures(word_indices))
    X_ham = np.array(X)
    np.save(os.path.join('Database', foldername + '.npy'), X_ham)

In [None]:
# loading features 
X_h1 = np.load('Database/easy_ham.npy')
X_h2 = np.load('Database/easy_ham_2.npy')
X_h3 = np.load('Database/hard_ham.npy')
X_s1 = np.load('Database/spam.npy')
X_s2 = np.load('Database/spam_2.npy')

In [None]:
# Making single vector for all feature vectors
X_h = np.concatenate((X_h1, X_h2, X_h3), axis = 0)
X_s = np.concatenate((X_s1, X_s2), axis = 0)
X_h_y = np.concatenate((X_h, np.zeros((X_h.shape[0], 1))), axis = 1)
X_s_y = np.concatenate((X_s, np.ones((X_s.shape[0],1))), axis = 1)
data = np.concatenate((X_h_y, X_s_y), axis = 0)

In [None]:
np.save(os.path.join('Database', 'data.npy'), data)

In [4]:
data = np.load('Database/data.npy')

In [6]:
for i in range(3):
    np.random.shuffle(data) 

In [7]:
data[:100, 1899]

array([1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1.,
       0., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
       0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
       0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1.])

In [8]:
# Dividing the data for train(60%), cross validation(20%) and test(20%)
data_train = data[:5600, :]
data_val = data[5600:7450, :]
data_test = data[7400:, :]

In [9]:
# Training model
startTime = time.time()
X_train, y_train = data_train[:,:1899].astype(float), data_train[:, -1]
print('Training Linear SVM (Spam Classification)')
print('This may take 1 to 2 minutes ...\n')

C = 0.1
model = utils.svmTrain(X_train, y_train, C, utils.linearKernel)
executionTime(startTime)

Training Linear SVM (Spam Classification)
This may take 1 to 2 minutes ...

---11 minute(s) 42.99 seconds ---


702.9907653331757

In [10]:
# Compute the training accuracy
p = utils.svmPredict(model, X_train)

print('Training Accuracy: %.2f' % (np.mean(p == y_train) * 100))

Training Accuracy: 100.00


In [55]:
# Try different SVM Parameters here
#X_val, y_val = data_val[:,:1899].astype(float), data_val[:, -1]

#C, sigma = dataset3Params(X_train, y_train, X_val, y_val)
#print('C = ', C, 'sigma = ', sigma)3
# Train the SVM
# model = utils.svmTrain(X, y, C, lambda x1, x2: gaussianKernel(x1, x2, sigma))

#model = utils.svmTrain(X_train, y_train, C, gaussianKernel, args=(sigma,))

# Gaussian Kernal Taking hours to train the model that's why i replaced gaussian kernel whith linear Kernal
#model = utils.svmTrain(X_train, y_train, C, utils.linearKernal)
# compute the cross-validation accuracy
#p = utils.svmPredict(model, X_train)

#print('cross-validation Accuracy: %.2f' % (np.mean(p == y_val) * 100))

In [None]:
# compute the test accuracy
X_test, y_test = data_test[:,:1899].astype(float), data_test[:, -1]

print('Evaluating the trained Linear SVM on a test set ...')
p = utils.svmPredict(model, X_test)

print('Test Accuracy: %.2f' % (np.mean(p == y_test) * 100))

*Even we are using linear kernal of utils, it takes more than 10 minuthes to train model with the dataset. It will take hours to train model by utlis.svmTrain with gaussian kernal.*

*Now I am using svm of sklearn library.*
*svm with gaussian kernal is also not very efficient. It also takes much time and accurracy is 75% which is also very low.*


In [72]:
# Training model using svm with linear kernal of sklearn library
startTime = time.time()
C = 0.1
clf = svm.SVC(C = C, kernel = 'linear')

model = clf.fit(X_train,y_train)
executionTime(startTime)

--- 6.47 seconds ---


In [65]:
p = model.predict(X_train)

print('Training Accuracy: %.2f' % (np.mean(p == y_train) * 100))

Training Accuracy: 100.00


In [29]:
C = 0.1
clf = svm.SVC(C = C, kernel = gaussian_Kernel)

In [51]:
model = clf.fit(gaussianKernelGramMatrix(X_train, X_train), y_train)

In [54]:
p = model.predict(gaussianKernelGramMatrix(X_train, X_train))

print('cross-validation Accuracy: %.2f' % (np.mean(p == y_train) * 100))

cross-validation Accuracy: 74.00


In [56]:
X_val, y_val = data_val[:,:1899].astype(float), data_val[:, -1]
p = model.predict(gaussianKernelGramMatrix(X_train, X_val))

print('cross-validation Accuracy: %.2f' % (np.mean(p == y_val) * 100))

ValueError: X.shape[1] = 1850 should be equal to 5600, the number of samples at training time