Importing Libraries

In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

Load the Dataset

In [57]:

mail_data = pd.read_csv('data/dataset.csv')

In [58]:
mail_data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [59]:
mail_data.sample(5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
329,2147,ham,Subject: revised buyback\r\nthe marathon - ash...,0
4265,1640,ham,Subject: equistar nomination & scheduling chan...,0
912,2666,ham,Subject: re : revised - tenaska iv - cleburne ...,0
4047,1538,ham,Subject: re : mobil beaumont\r\nthe activity f...,0
4947,4838,spam,Subject: 86 % off for all new software . vindi...,1


Remove Null Values

In [60]:
cols = ['Unnamed: 0','label_num']

In [61]:
mail_data.drop(columns= cols, axis = 1, inplace = True)

In [62]:
mail_data = mail_data.rename(columns={'text': 'Message'})

In [63]:
mail_data['Message'].isnull().sum()

0

In [64]:
mail_data.dropna(axis = 0, inplace = True)

In [65]:
mail_data.head()

Unnamed: 0,label,Message
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [66]:
mail_data.shape

(5171, 2)

Label Encoding

In [67]:
le = LabelEncoder()
mail_data['Category'] = le.fit_transform(mail_data['label'])
mail_data.head()

Unnamed: 0,label,Message,Category
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [68]:
X = mail_data['Message']
Y = mail_data['Category']

In [69]:
print(X)

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: Message, Length: 5171, dtype: object


In [70]:
print(Y)

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: Category, Length: 5171, dtype: int32


Splitting the data into training data & test data

In [71]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

Feature Extraction

In [72]:
feature_extraction = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase = True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [73]:
print(X_train_features)

  (0, 37566)	0.03390898877474294
  (0, 26894)	0.29612657680252463
  (0, 4132)	0.5855796838151281
  (0, 35035)	0.2927898419075641
  (0, 12467)	0.19120758987148667
  (0, 27541)	0.15176632211845267
  (0, 17814)	0.15230033297810594
  (0, 26193)	0.13355578907978374
  (0, 28212)	0.13465733801362062
  (0, 56)	0.19294689361509368
  (0, 2003)	0.22022236239236187
  (0, 72)	0.11287588933297091
  (0, 64)	0.2320330901784706
  (0, 12407)	0.11614143442868556
  (0, 38670)	0.12368485263465394
  (0, 35741)	0.1797981565768853
  (0, 25011)	0.09352929013170533
  (0, 24291)	0.086819999802958
  (0, 25343)	0.1327903166471815
  (0, 16641)	0.1637097662727073
  (0, 28331)	0.100413186925558
  (0, 39295)	0.18296953062812027
  (0, 21876)	0.15452467049357102
  (0, 39122)	0.1489707406646257
  (1, 37566)	0.029771265227566405
  :	:
  (4135, 40607)	0.10560544386588379
  (4135, 38317)	0.10810759560254152
  (4135, 2822)	0.12063179979974499
  (4135, 37874)	0.1099845935208329
  (4135, 36363)	0.10387970460445373
  (4135, 122

In [74]:
print(X_test_features)

  (0, 5189)	0.23888649266387274
  (0, 6766)	0.11329022846512532
  (0, 9023)	0.19016636851698374
  (0, 10632)	0.22300582540706468
  (0, 11790)	0.29361716201333776
  (0, 11794)	0.22139080936025968
  (0, 14819)	0.20686246104990136
  (0, 15311)	0.24543639993616279
  (0, 19186)	0.16675340761764945
  (0, 21599)	0.23653284460039495
  (0, 22344)	0.30396569476135404
  (0, 23399)	0.3103132545462554
  (0, 26697)	0.18494703357610456
  (0, 29805)	0.26073650688988315
  (0, 31698)	0.28535401819878486
  (0, 31910)	0.18031545669894697
  (0, 33972)	0.18060853715912137
  (0, 35538)	0.1592808677490461
  (0, 35775)	0.2206064498199292
  (0, 37566)	0.04117763082209204
  (0, 38674)	0.09498206103970716
  (1, 0)	0.09701398661946094
  (1, 1)	0.12280140050665672
  (1, 198)	0.038702840981743925
  (1, 325)	0.07550270588931449
  :	:
  (1034, 17209)	0.0775214299246368
  (1034, 17683)	0.09205157546828185
  (1034, 17814)	0.066318222034221
  (1034, 17829)	0.0674124600271317
  (1034, 17843)	0.08401751119886577
  (1034, 2

In [75]:
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

Training the Model

In [76]:
nbb = MultinomialNB()
nbb.fit(X_train_features, Y_train)

svc = LinearSVC(loss = 'squared_hinge', random_state = 0)
svc.fit(X_train_features, Y_train)

Accuracy Score

In [77]:
print("SVC:")
svc_pred_on_training_data = svc.predict(X_train_features)
svc_accuracy_on_training_data = accuracy_score(Y_train, svc_pred_on_training_data)
print('Accuracy on training data :', svc_accuracy_on_training_data * 100)

svc_prediction_on_test_data = svc.predict(X_test_features)
svc_accuracy_on_test_data = accuracy_score(Y_test, svc_prediction_on_test_data)
print('Accuracy on test data : ', svc_accuracy_on_test_data * 100)

print("\nNaive Bayes:")
nbb_prediction_on_training_data = nbb.predict(X_train_features)
nbb_accuracy_on_training_data = accuracy_score(Y_train, nbb_prediction_on_training_data)
print('Accuracy on training data :', nbb_accuracy_on_training_data * 100)

nbb_prediction_on_test_data = nbb.predict(X_test_features)
nbb_accuracy_on_test_data = accuracy_score(Y_test, nbb_prediction_on_test_data)
print('Accuracy on test data : ', nbb_accuracy_on_test_data * 100)

SVC:
Accuracy on training data : 100.0
Accuracy on test data :  99.1304347826087

Naive Bayes:
Accuracy on training data : 96.80851063829788
Accuracy on test data :  91.88405797101449


Classification Report

In [None]:
matrix = classification_report(Y_test, svc_prediction_on_test_data)
print(matrix)

print("\n")
matrix = classification_report(Y_test, svc_prediction_on_test_data)
print(matrix)

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       732
           1       1.00      0.72      0.84       303

    accuracy                           0.92      1035
   macro avg       0.95      0.86      0.89      1035
weighted avg       0.93      0.92      0.91      1035



Predictive System

In [79]:
input_mail = ["Free entry in mall. Do it today!!Win cash and much more."]
#input_mail = ["I've been looking for the right words to thank you for this breather.You have been wonderful and a blessing at all times."]
input_data_features = feature_extraction.transform(input_mail)

prediction = svc.predict(input_data_features)

if (prediction[0] == 0):
    print(input_mail[0], '--> Ham mail')
else:
    print(input_mail[0], '--> Spam mail')

Free entry in mall. Do it today!!Win cash and much more. --> Spam mail
