In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [3]:
mail = pd.read_csv('mail.csv', index_col=None)

In [4]:
mail.head()

Unnamed: 0.1,Unnamed: 0,labels,clean
0,0,0,"['', 'date', 'wed', '21', 'aug', '2002', '10',..."
1,1,0,"['', 'martin', 'posted', 'ntassos', 'papadopou..."
2,2,0,"['', 'man', 'threatens', 'explosion', 'moscow'..."
3,3,0,"['', 'klez', 'virus', 'die', 'nalready', 'prol..."
4,4,0,"['', 'wed', 'aug', '21', '2002', '15', '46', '..."


In [5]:
mail.drop('Unnamed: 0', axis=1, inplace =True)

In [6]:
mail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3051 entries, 0 to 3050
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   labels  3051 non-null   int64 
 1   clean   3051 non-null   object
dtypes: int64(1), object(1)
memory usage: 47.8+ KB


In [7]:
mail.describe()

Unnamed: 0,labels
count,3051.0
mean,0.163881
std,0.370228
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


### Clean Text with stemming/Lemmatizing

In [8]:
wn = WordNetLemmatizer()

In [9]:
stopword = stopwords.words('english')

In [10]:
mail['clean1'] = mail['clean'].apply(lambda x: " ".join([wn.lemmatize(i).lower() for i in re.sub('[^a-zA-Z0-9]',' ',x).split() if i not in stopword]))

In [11]:
mail

Unnamed: 0,labels,clean,clean1
0,0,"['', 'date', 'wed', '21', 'aug', '2002', '10',...",date wed 21 aug 2002 10 54 46 0500 chris garri...
1,0,"['', 'martin', 'posted', 'ntassos', 'papadopou...",martin posted ntassos papadopoulos greek sculp...
2,0,"['', 'man', 'threatens', 'explosion', 'moscow'...",man threatens explosion moscow nthursday augus...
3,0,"['', 'klez', 'virus', 'die', 'nalready', 'prol...",klez virus die nalready prolific virus ever kl...
4,0,"['', 'wed', 'aug', '21', '2002', '15', '46', '...",wed aug 21 2002 15 46 ulises ponce wrote hi co...
...,...,...,...
3046,1,"['', 'stumbling', 'nthe', 'greatest', 'way', '...",stumbling nthe greatest way marketing century ...
3047,1,"['', 'mean', 'made', 'usa', 'cna', 'hitting', ...",mean made usa cna hitting road tell ingenuity ...
3048,1,"['', 'html', 'head', 'meta', 'http', 'equiv', ...",html head meta http equiv content language con...
3049,1,"['', 'html', 'body', 'tr', 'valign', 'top', 't...",html body tr valign top td height 295 bgcolor ...


In [12]:
X_train, X_test, y_train, y_test = train_test_split(mail['clean1'],mail.labels,test_size=0.2, random_state=42)

In [13]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(2440,)
(2440,)
(611,)
(611,)


In [14]:
nb_pipeline = Pipeline([('countvect', CountVectorizer(lowercase=True)),
                       ('mnb', MultinomialNB(alpha=0.0,class_prior=[0.4, 0.6]))])

In [15]:
scores = cross_val_score(nb_pipeline, X_train, y_train, cv=5)
print(scores)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


[0.96311475 0.9795082  0.97131148 0.98360656 0.97336066]


In [16]:
model = nb_pipeline.fit(X_train, y_train)

  'setting alpha = %.1e' % _ALPHA_MIN)


In [17]:
y_pred = model.predict(X_test)

In [18]:
model.score(X_test, y_test)

0.983633387888707

In [19]:
metrics.accuracy_score(y_test, y_pred)

0.983633387888707

In [20]:
metrics.confusion_matrix(y_test, y_pred, labels=[0,1])

array([[516,   0],
       [ 10,  85]])

In [21]:
metrics.f1_score(y_test, y_pred, average=None)

array([0.99040307, 0.94444444])

In [22]:
metrics.f1_score(y_test, y_pred, average='macro')

0.9674237577308594

In [23]:
metrics.f1_score(y_test, y_pred, average='micro')

0.983633387888707

In [24]:
metrics.f1_score(y_test, y_pred, average='weighted')

0.9832572943815644

In [25]:
X_test[y_test > y_pred]

2679    result feedback form submitted jamie msn com t...
3050    uncommon exotic pleasure botanical nfeeling ma...
2859    3672cxjk2 471rlyh4856dcwl9 357kcvy4323scva9 71...
2829    xc2 x93connecting business world wide web xc2 ...
2752    greeting nyou receiving letter expressed inter...
2783    dear sir nmy name dr steven duba son mr theo n...
2931    love nworld capital group group funding source...
2784    hi job seeker nwhen create free net temp accou...
2805    nsent mail message nfrom enenkio webtv net rob...
2921    nhallo ni found email id directoric ni russian...
Name: clean1, dtype: object

In [26]:
X_test[y_test < y_pred]

Series([], Name: clean1, dtype: object)

In [27]:
nb_pipeline_tf = Pipeline([('tfidfvect', TfidfVectorizer()),
                       ('mnb', MultinomialNB(alpha=0.0,class_prior=[0.4, 0.6]))])

In [28]:
scores = cross_val_score(nb_pipeline_tf, X_train, y_train, cv=5)
print(scores)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


[0.97131148 0.98770492 0.9795082  0.98565574 0.97745902]


In [29]:
tfmodel = nb_pipeline_tf.fit(X_train, y_train)

  'setting alpha = %.1e' % _ALPHA_MIN)


In [30]:
y_pred_tf = tfmodel.predict(X_test)

In [31]:
metrics.accuracy_score(y_test, y_pred_tf)

0.9852700490998363

In [32]:
metrics.confusion_matrix(y_test, y_pred_tf, labels=[0,1])

array([[516,   0],
       [  9,  86]])

In [33]:
metrics.f1_score(y_test, y_pred, average=None)

array([0.99040307, 0.94444444])

In [34]:
metrics.f1_score(y_test, y_pred, average='micro')

0.983633387888707

In [35]:
metrics.f1_score(y_test, y_pred, average='macro')

0.9674237577308594

In [36]:
metrics.f1_score(y_test, y_pred, average='weighted')

0.9832572943815644

In [37]:
X_test[y_test > y_pred_tf]

2679    result feedback form submitted jamie msn com t...
3050    uncommon exotic pleasure botanical nfeeling ma...
2859    3672cxjk2 471rlyh4856dcwl9 357kcvy4323scva9 71...
2829    xc2 x93connecting business world wide web xc2 ...
2783    dear sir nmy name dr steven duba son mr theo n...
2931    love nworld capital group group funding source...
2784    hi job seeker nwhen create free net temp accou...
2805    nsent mail message nfrom enenkio webtv net rob...
2921    nhallo ni found email id directoric ni russian...
Name: clean1, dtype: object

In [38]:
X_test[y_test < y_pred_tf]

Series([], Name: clean1, dtype: object)

In [39]:
test = ['Get started like a pro | Ultimaker Get started like a pro Having trouble viewing this email? Click here . Hi Shailesh, More and more companies are becoming convinced that 3D printing  plays a key role in their digital transformation roadmap. But what would life at  look like if you adopted 3D printing? Where would you and your team start? To give you an idea, check out our 5 top tips to ramp-up 3D printing production. Read the blog Ultimaker - Corporate Headquarters Stationsplein 32, 3511 ED Utrecht, The Netherlands Email: info@ultimaker.com Phone: +31 88 383 4000 If youâd rather not receive these updates,']

In [40]:
nb_pipeline.named_steps['countvect'].transform(test)

<1x33994 sparse matrix of type '<class 'numpy.int64'>'
	with 46 stored elements in Compressed Sparse Row format>

In [41]:
nb_pipeline.predict(test)

array([0])

In [42]:
inbox = pd.read_csv('inBox.csv')

In [49]:
test = inbox['Message']
test

0     Our latest project allows you to visualize dat...
1     Can you answer this question? As someone who a...
2     Good News for ATM Card Holders In view of anno...
3     Dear Learners Join us for a brand new lecture ...
4     Hello students and teachers, Hope all of you a...
5     Dear Learners Join us for a brand new lecture ...
6     Dear Learners Join us for a brand new lecture ...
7     Dear Learners, Join us for a brand new lecture...
8     Dear Learners, Join us for a brand new lecture...
9     Dear Learners We are pleased to announce an on...
10    https://vc.lidolearning.com?id=300d47a1-dc9a-4...
11    IBM Developer - Artificial Intelligence This m...
12    Convert to containers while you migrate.\r\n\t...
13    How to find the right data for your projects A...
14    Four ways to add data to your projects In addi...
15    Ready to learn something new? ... Recommendati...
Name: Message, dtype: object

In [50]:
nb_pipeline.named_steps['countvect'].transform(test)

<16x33994 sparse matrix of type '<class 'numpy.int64'>'
	with 1106 stored elements in Compressed Sparse Row format>

In [51]:
nb_pipeline.predict(test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [52]:
nb_pipeline_tf.named_steps['tfidfvect'].transform(test)

<16x33994 sparse matrix of type '<class 'numpy.float64'>'
	with 1106 stored elements in Compressed Sparse Row format>

In [53]:
nb_pipeline_tf.predict(test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [55]:
test[11]

"IBM Developer - Artificial Intelligence This message contains graphics.  If you do not see the graphics, click here to view . IBM \u200c\xa0\u200c\xa0\u200c\xa0\u200c\xa0\u200c\xa0\u200c\xa0\u200c\xa0\u200c\xa0\u200c\xa0\u200c\xa0\u200c\xa0\u200c\xa0\u200c IBM Developer Artificial Intelligence April 2020 Hello, AI enthusiasts! The 2020 Call for Code Global Challenge is up and running. Once again, developers around the globe will submit coding solutions that can bring relief to those most in need. This year's Challenge takes on both climate change and COVID-19, two urgent crises facing our world today. Join the 2020 Coding Challenge, and put your ideas into the world. You may even see your code turned into a technology that changes lives. Will you accept the challenge? Visit the AI hub Watson Knowledge Studio: Advanced Rules Editor Watch the video Spotlights Get started with the Data Asset eXchange DAX offers a trusted source for open data sets for AI that are ready to use in enterpris