In [96]:
import pandas as pd
import numpy as np
import seaborn as sns

In [97]:
# Read the e-mail dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='emails.csv')

In [98]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [99]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5719, 2)

In [100]:
# Restrict the frame to its first two columns, selected by position.
df = df.iloc[:, 0:2]

In [101]:
# Display the frame; the rendered output elides the middle rows.
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5714,Subject: re : research and development charges...,0
5715,"Subject: re : receipts from visit jim , than...",0
5716,Subject: re : enron case study update wow ! a...,0
5717,"Subject: re : interest david , please , call...",0


In [102]:
# Count how many rows carry each value of the `spam` label.
df.spam.value_counts()

0    4358
1    1361
Name: spam, dtype: int64

In [103]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

text    0
spam    0
dtype: int64

In [104]:
# Drop rows that contain missing values. Rebind `df` to the returned frame
# rather than mutating in place: `df` was produced by an `.iloc` slice in an
# earlier cell, and rebinding keeps this step free of in-place edits on a
# sliced frame.
df = df.dropna()

In [105]:
# Re-check the per-column missing-value counts after the drop.
df.isna().sum()

text    0
spam    0
dtype: int64

In [106]:
# Label distribution again, to compare against the counts before the drop.
df.spam.value_counts()

0    4358
1    1361
Name: spam, dtype: int64

# Separate x and y

In [107]:
# Feature input: the values of the `text` column as an array.
x = df['text'].values

In [108]:
# Target: the values of the `spam` column as an array.
y = df['spam'].values

In [109]:
# Show the label array extracted from the `spam` column.
y

array([1, 1, 1, ..., 0, 0, 0])

# split dataset into train and test

In [110]:
from sklearn.model_selection import train_test_split

In [111]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
)

# Data preprocessing

In [112]:
from sklearn.feature_extraction.text import CountVectorizer

In [113]:
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

In [114]:
# Fit the vectorizer on the training texts only and encode them in one step.
x_train = cv.fit_transform(raw_documents=xtrain)

In [115]:
# Inspect the training document-term matrix as a dense array.
# NOTE(review): .toarray() materializes every row and vocabulary column in
# memory just for display — x_train.shape or a small row slice would likely do.
x_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# naive bayes

In [116]:
from sklearn.naive_bayes import MultinomialNB

In [117]:
# Multinomial naive Bayes classifier trained on the count features.
# The fit call stays as the last expression so the cell still displays the estimator.
model = MultinomialNB()
model.fit(X=x_train, y=ytrain)

MultinomialNB()

In [118]:
# Encode the test texts with the vectorizer already fitted on the training set
# (transform only, no refit).
x_test = cv.transform(raw_documents=xtest)

In [119]:
# Inspect the test document-term matrix as a dense array.
# NOTE(review): .toarray() builds the full dense matrix only to display it —
# x_test.shape or a small row slice would likely do.
x_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [120]:
# Predicted labels for the held-out test features.
model.predict(X=x_test)

array([1, 0, 0, ..., 0, 1, 0])

In [121]:
# True labels of the test split, shown for comparison with the predictions.
ytest

array([1, 0, 0, ..., 0, 1, 0])

In [122]:
# Score the fitted classifier on the test split.
model.score(X=x_test, y=ytest)

0.986013986013986

# prediction

In [123]:
# Two hand-written sample messages used to try the fitted model.
emails = [
    'hey i am lokking for machine learning tutorial in begali language',
    'hey you win an iphone x giveaway for free please do the survey',
]

In [124]:
# Encode the sample messages with the already-fitted count vectorizer.
cv_emails = cv.transform(raw_documents=emails)

In [125]:
# Predicted label for each sample message.
model.predict(X=cv_emails)

array([0, 1])

# performance metrics


In [126]:
# Keep the test-set predictions so the metric cells below can reuse them.
pred = model.predict(X=x_test)

In [127]:
# Show the predicted labels for the test split.
pred

array([1, 0, 0, ..., 0, 1, 0])

# accuracy

In [128]:
from sklearn.metrics import accuracy_score

In [129]:
# Accuracy of the predictions against the true test labels.
accuracy_score(y_true=ytest, y_pred=pred)

0.986013986013986

# confusion matrix

In [130]:
from sklearn.metrics import confusion_matrix

In [131]:
# Confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(y_true=ytest, y_pred=pred)

array([[840,  10],
       [  6, 288]])

# report

In [132]:
from sklearn.metrics import classification_report

In [133]:
# The report is a preformatted string, so print it to keep its column layout.
print(classification_report(y_true=ytest, y_pred=pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       850
           1       0.97      0.98      0.97       294

    accuracy                           0.99      1144
   macro avg       0.98      0.98      0.98      1144
weighted avg       0.99      0.99      0.99      1144



# Data Processing by Tf-idf Vectorizer

In [134]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings.
TV = TfidfVectorizer()

In [135]:
# Build TF-IDF features for the whole corpus.
# Read the raw text from `df` instead of from `x`: the original form consumed
# and rebound `x`, so a second run of this cell passed the sparse matrix back
# into the vectorizer and failed. Sourcing from `df` makes the cell re-runnable
# and gives the same result on the first run.
x = TV.fit_transform(df['text'].values)

# K-Fold Cross Validation

In [136]:
from sklearn.model_selection import cross_val_score

In [137]:
cv = cross_val_score(model, x, y, cv=10)

In [138]:
cv

array([0.87587413, 0.86538462, 0.88286713, 0.87762238, 0.88111888,
       0.88286713, 0.86888112, 0.85839161, 0.8951049 , 0.88791594])

In [139]:
cv.mean()

0.8776027825064603