In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import feature_extraction, model_selection, naive_bayes, metrics, svm
from IPython.display import Image
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline  

## Dataset
Pandas provide a .read_csv() function. the paramter engine='python' was needed here.
We can see in the output that there are two columns "text" and "spam" 

In [2]:
data = pd.read_csv('/home/shreyansh/Downloads/emails.csv', encoding='latin-1', engine='python')
data.head(n=10)
print(data)

                                                   text  spam
0     Subject: naturally irresistible your corporate...     1
1     Subject: the stock trading gunslinger  fanny i...     1
2     Subject: unbelievable new homes made easy  im ...     1
3     Subject: 4 color printing special  request add...     1
4     Subject: do not have money , get software cds ...     1
5     Subject: great nnews  hello , welcome to medzo...     1
6     Subject: here ' s a hot play in motion  homela...     1
7     Subject: save your money buy getting this thin...     1
8     Subject: undeliverable : home based business f...     1
9     Subject: save your money buy getting this thin...     1
10    Subject: las vegas high rise boom  las vegas i...     1
11    Subject: save your money buy getting this thin...     1
12    Subject: brighten those teeth  get your  teeth...     1
13    Subject: wall street phenomenon reaps rewards ...     1
14    Subject: fpa notice : ebay misrepresentation o...     1
15    Su

In [3]:
f = TfidfVectorizer(stop_words = 'english')
X = f.fit_transform(data["text"])
print(np.shape(X))
print(X)
f.get_feature_names()

(5728, 36996)
  (0, 32145)	0.01781272601181811
  (0, 23219)	0.12256415839423043
  (0, 18705)	0.1432830450797234
  (0, 9986)	0.07425233664021534
  (0, 17562)	0.09302659600058759
  (0, 21006)	0.11259587569184905
  (0, 27817)	0.06817457798935368
  (0, 16546)	0.07848771947213565
  (0, 27941)	0.11391594069511475
  (0, 9223)	0.16756208708238118
  (0, 21520)	0.11577281792556753
  (0, 32408)	0.12371376538846193
  (0, 18103)	0.04504514237999522
  (0, 18751)	0.13491099920740895
  (0, 15964)	0.1033087647302782
  (0, 7986)	0.11461456851250225
  (0, 20818)	0.265157481584021
  (0, 32126)	0.12371376538846193
  (0, 31776)	0.12256415839423043
  (0, 24679)	0.09199667182622404
  (0, 35805)	0.13645440111292698
  (0, 21296)	0.09831341956555792
  (0, 32839)	0.09815099793568317
  (0, 12539)	0.09281579204902306
  (0, 26937)	0.20113998716066434
  :	:
  (5727, 24659)	0.10273386204392845
  (5727, 21490)	0.04981665317903681
  (5727, 5683)	0.557612387822799
  (5727, 30755)	0.052872641241471056
  (5727, 2807)	0.176

['00',
 '000',
 '0000',
 '000000',
 '00000000',
 '0000000000',
 '000000000003619',
 '000000000003991',
 '000000000003997',
 '000000000005168',
 '000000000005409',
 '000000000005411',
 '000000000005412',
 '000000000005413',
 '000000000005820',
 '000000000006238',
 '000000000006452',
 '000000000007494',
 '000000000007498',
 '000000000007876',
 '000000000010552',
 '000000000011185',
 '000000000012677',
 '000000000012734',
 '000000000012735',
 '000000000012736',
 '000000000012738',
 '000000000012741',
 '000000000012987',
 '000000000013085',
 '000000000013287',
 '000000000015384',
 '000000000015793',
 '000000000023619',
 '000000000024099',
 '000000000025307',
 '000000000025312',
 '000010220',
 '0000102317',
 '0000102374',
 '0000102789',
 '0000104281',
 '0000104282',
 '0000104486',
 '0000104631',
 '0000104730',
 '0000104776',
 '0000104778',
 '0000107043',
 '0000108729',
 '000066',
 '0001',
 '000166',
 '0002',
 '000202',
 '0003',
 '0004',
 '0005',
 '0006',
 '00076',
 '0009249480',
 '000924948

In [4]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, data['spam'], test_size=0.33, random_state=42)
print([np.shape(X_train), np.shape(X_test)])

[(3837, 36996), (1891, 36996)]


In [12]:
list_C = np.arange(500, 2000, 100) #100000
score_train = np.zeros(len(list_C))
score_test = np.zeros(len(list_C))
recall_test = np.zeros(len(list_C))
precision_test= np.zeros(len(list_C))
count = 0
for C in list_C:
    svc = svm.SVC(C=C,kernel='sigmoid')
    svc.fit(X_train, y_train)
    score_train[count] = svc.score(X_train, y_train)
    score_test[count]= svc.score(X_test, y_test)
    recall_test[count] = metrics.recall_score(y_test, svc.predict(X_test))
    precision_test[count] = metrics.precision_score(y_test, svc.predict(X_test))
    count = count + 1 

array([ 500,  600,  700,  800,  900, 1000, 1100, 1200, 1300, 1400, 1500,
       1600, 1700, 1800, 1900])

In [6]:
matrix = np.matrix(np.c_[list_C, score_train, score_test, recall_test, precision_test])
models = pd.DataFrame(data = matrix, columns = 
             ['C', 'Train Accuracy', 'Test Accuracy', 'Test Recall', 'Test Precision'])
models.head(n=10)

Unnamed: 0,C,Train Accuracy,Test Accuracy,Test Recall,Test Precision
0,500.0,0.771957,0.739291,0.0,0.0
1,600.0,0.771957,0.739291,0.0,0.0
2,700.0,0.771957,0.739291,0.0,0.0
3,800.0,0.771957,0.739291,0.0,0.0
4,900.0,0.771957,0.739291,0.0,0.0
5,1000.0,0.771957,0.73982,0.002028,1.0
6,1100.0,0.78134,0.749339,0.03854,1.0
7,1200.0,0.787594,0.754627,0.058824,1.0
8,1300.0,0.807402,0.776838,0.144016,1.0
9,1400.0,0.816784,0.790058,0.194726,1.0


In [7]:
best_index = models['Test Precision'].idxmax()
models.iloc[best_index, :]

C                 1000.000000
Train Accuracy       0.771957
Test Accuracy        0.739820
Test Recall          0.002028
Test Precision       1.000000
Name: 5, dtype: float64

In [8]:
models[models['Test Precision']==1].head(n=5)

Unnamed: 0,C,Train Accuracy,Test Accuracy,Test Recall,Test Precision
5,1000.0,0.771957,0.73982,0.002028,1.0
6,1100.0,0.78134,0.749339,0.03854,1.0
7,1200.0,0.787594,0.754627,0.058824,1.0
8,1300.0,0.807402,0.776838,0.144016,1.0
9,1400.0,0.816784,0.790058,0.194726,1.0


In [9]:
best_index = models[models['Test Precision']==1]['Test Accuracy'].idxmax()
svc = svm.SVC(C=list_C[best_index])
svc.fit(X_train, y_train)
models.iloc[best_index, :]

C                 1900.000000
Train Accuracy       0.835027
Test Accuracy        0.805394
Test Recall          0.253550
Test Precision       1.000000
Name: 14, dtype: float64

In [10]:
m_confusion_test = metrics.confusion_matrix(y_test, svc.predict(X_test))
pd.DataFrame(data = m_confusion_test, columns = ['Predicted 0', 'Predicted 1'],
            index = ['Actual 0', 'Actual 1'])

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1397,1
Actual 1,199,294
