In [None]:
# Mount Google Drive at /gdrive (Colab-only); the dataset is later read from
# a folder under '/gdrive/My Drive'.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Folder on the mounted Drive that holds the dataset, and the CSV inside it.
path = '/gdrive/My Drive/Lemalabs/Data/'
emails_csv = path + 'emails.csv'

# Read the e-mail corpus into a DataFrame.
data = pd.read_csv(emails_csv)

In [None]:
# Preview the first rows: a 'text' column and a 'spam' label column.
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [None]:
# (rows, columns) of the loaded frame.
data.shape

(5728, 2)

In [None]:
# Features are the raw e-mail text; the target is the 'spam' label column.
x, y = data['text'], data['spam']

In [None]:
# Inspect the label Series.
y

0       1
1       1
2       1
3       1
4       1
       ..
5723    0
5724    0
5725    0
5726    0
5727    0
Name: spam, Length: 5728, dtype: int64

In [None]:
# Inspect the text Series.
x

0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5728, dtype: object

In [None]:
# Class balance: count rows per label with a vectorised boolean sum.
spam_count = (y == 1).sum()
not_spam_count = (y == 0).sum()
print('counts of spam: {}, label is {}'.format(spam_count, '1'))
print('counts of not spam: {}, label is {}'.format(not_spam_count, '0'))

counts of spam: 1368, label is 1
counts of not spam: 4360, label is 0


In [None]:
# Convert the text column to a plain Python list of documents.
# list(x) works whether `x` is still the pandas Series or already a list, so
# this cell can be re-run safely (x.values.tolist() raised AttributeError on
# the second run because a list has no `.values`).
x = list(x)
type(x)

list

### Pre-processing of data (NLP)

In [None]:
from nltk.corpus import stopwords
import nltk
import re

# Fetch the NLTK resources; the stopword corpus is read in a later cell.
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import RegexpTokenizer
# Raw string: '\w' inside a normal string literal is an invalid escape
# sequence and triggers a warning on current Python versions.
tokenizer = RegexpTokenizer(r'\w+')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def tok(x):
  """Tokenize the string ``x`` with the notebook-level ``tokenizer``.

  Returns whatever ``tokenizer.tokenize`` produces for ``x``.
  """
  tokens = tokenizer.tokenize(x)
  return tokens

In [None]:
# English stopword list from NLTK; passed to the vectorizer as `stop_words`.
sw = stopwords.words('english')

In [None]:
def process_text(text):
  """Remove the digits 0-9 and underscores from every string in ``text``.

  Args:
    text: an iterable of strings (e.g. a list of e-mail bodies). A single
      ``str`` is treated as one document instead of being iterated
      character by character.

  Returns:
    A new list holding one cleaned string per input string, in input order.
  """
  # A bare string would otherwise be walked character by character and come
  # back as a list of single-character strings.
  if isinstance(text, str):
    text = [text]

  # One character class performs both removals in a single pass per string.
  cleaner = re.compile('[0-9_]')
  return [cleaner.sub('', doc) for doc in text]

In [None]:
# Clean every e-mail. NOTE: this rebinds `data` from the DataFrame to a list
# of cleaned strings; later cells read the cleaned corpus under this name.
data = process_text(x)

# Preview the cleaned text (not the raw `x`), limited to a couple of entries
# so the cell does not dump the entire corpus into the output.
data[:2]

["Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  m

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-level unigram counts, tokenized by `tok` and filtered with the `sw`
# stopword list.
Vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tok,
    stop_words=sw,
    ngram_range=(1, 1),
)

In [None]:
# Learn the vocabulary and build the document-term count matrix.
vector_x = Vectorizer.fit_transform(data)

# Densify only the first few rows for inspection; converting the whole
# documents-by-vocabulary matrix to a dense array just to print a truncated
# view wastes a large amount of memory.
print(vector_x[:5].toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# (number of documents, vocabulary size)
vector_x.shape

(5728, 33563)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF re-weighting step applied on top of the count matrix (default settings).
Tfid = TfidfTransformer()

In [None]:
# Re-weight the count matrix with TF-IDF.
# NOTE(review): `vector_x` is rebound from itself, so running this cell twice
# applies TF-IDF to already-weighted values — re-run the count cell first.
# NOTE(review): the transformer is fitted on the full corpus before the
# train/test split below, so the held-out rows influence the fitted weights.
vector_x = Tfid.fit_transform(vector_x)

In [None]:
# Show only the first few rows; densifying the full TF-IDF matrix just to
# print a truncated view is a needless memory spike.
print(vector_x[:5].toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation.
# random_state pins the shuffle so the reported metrics are reproducible on a
# fresh run; stratify keeps the spam / not-spam ratio the same in both splits
# (the classes are imbalanced).
x_train, x_test, y_train, y_test = train_test_split(
    vector_x,
    np.array(y),
    test_size=0.25,
    random_state=42,
    stratify=np.array(y),
)

In [None]:
# Sanity-check that feature and label splits line up row-for-row.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((4296, 33563), (1432, 33563), (4296,), (1432,))

### Classification

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix

# Report labels, in class order 0 then 1.
targets = ['not spam', 'spam']

# Fit Bernoulli naive Bayes on the training split, then score the held-out split.
classifier1 = BernoulliNB()
classifier1.fit(x_train, y_train)
y_pred1 = classifier1.predict(x_test)

print(confusion_matrix(y_test, y_pred1))
print(classification_report(y_test, y_pred1, target_names=targets))

[[1100   11]
 [  33  288]]
              precision    recall  f1-score   support

    not spam       0.97      0.99      0.98      1111
        spam       0.96      0.90      0.93       321

    accuracy                           0.97      1432
   macro avg       0.97      0.94      0.95      1432
weighted avg       0.97      0.97      0.97      1432



In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

# Report labels, in class order 0 then 1.
targets = ['not spam', 'spam']

# Gaussian naive Bayes is given dense copies of the train and test matrices.
classifier2 = GaussianNB()
classifier2.fit(x_train.toarray(), y_train)
y_pred2 = classifier2.predict(x_test.toarray())

print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2, target_names=targets))

[[1103    8]
 [  46  275]]
              precision    recall  f1-score   support

    not spam       0.96      0.99      0.98      1111
        spam       0.97      0.86      0.91       321

    accuracy                           0.96      1432
   macro avg       0.97      0.92      0.94      1432
weighted avg       0.96      0.96      0.96      1432



In [None]:
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report, confusion_matrix

# Report labels, in class order 0 then 1.
targets = ['not spam', 'spam']

# Fit Complement naive Bayes on the training split, then score the held-out split.
classifier3 = ComplementNB()
classifier3.fit(x_train, y_train)
y_pred3 = classifier3.predict(x_test)

print(confusion_matrix(y_test, y_pred3))
print(classification_report(y_test, y_pred3, target_names=targets))

[[1106    5]
 [  58  263]]
              precision    recall  f1-score   support

    not spam       0.95      1.00      0.97      1111
        spam       0.98      0.82      0.89       321

    accuracy                           0.96      1432
   macro avg       0.97      0.91      0.93      1432
weighted avg       0.96      0.96      0.95      1432



In [None]:
# Read one e-mail from the prompt and wrap it in a list, since the cleaning
# and vectorizing steps operate on collections of documents.
s = [input()]

 Tufts University Graduate Admissions <gradadmissions@tufts.edu> Thu, Sep 10, 7:32 PM (13 days ago) to me  Why is this message in spam? It is similar to messages that were identified as spam in the past. Report not spam   Tufts GSoAS College of Engineering     twitter	   Instagram	   blog campus photo Dear Revanth,  Join us for weekly virtual information sessions and tours, at our ongoing Tuesday Tours of Tufts at 10am and 2pm!  To register for an event during September, click below:  September 15, 10:00 - 11:00 am  September 15, 2:00 - 3:00 pm  September 22, 10:00 - 11:00 am  September 22, 2:00 - 3:00 pm  September 29, 10:00 - 11:00 am  September 29, 2:00 - 3:00 pm We look forward to connecting with you!  Sincerely,  Office of Graduate Admissions   Apply Now Application Deadline Request Information Can't make these sessions? Explore all events and info sessions on our events calendar.  image   Tufts University  Office of Graduate Admissions  Bendetson Hall  617.627.3395  gradadmission

In [None]:
# Apply the same cleaning used for the training corpus, then map the e-mail
# onto the already-fitted vocabulary (transform only, no refit).
s = process_text(s)
vector_email = Vectorizer.transform(s)

In [None]:
# One row; the column count should match the training matrix.
vector_email.shape

(1, 33563)

In [None]:
# Re-weight with the TF-IDF transformer fitted earlier (transform only).
tfid_email = Tfid.transform(vector_email)

In [None]:
# Classify the e-mail with the Bernoulli naive Bayes model; the labels used in
# this notebook are 1 = spam, 0 = not spam.
pred = classifier1.predict(tfid_email)
pred

array([0])

In [None]:
# Translate the predicted label (0 = not spam) into a readable message.
verdict = "The mail is not a spam" if pred[0] == 0 else "It is a spam"
print(verdict)

The mail is not a spam
