In [62]:
#Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
import pickle
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB


In [47]:
# Load the dataset
# The path is relative, so mail_data.csv has to be in the notebook's working directory.
df = pd.read_csv("mail_data.csv")
# A bare last expression renders the frame as this cell's output.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [48]:
#Dimension of the dataframe
# Shown as a (rows, columns) tuple in the cell output.
df.shape

(5572, 2)

In [49]:
#Summary
# Prints the index range, the non-null count and dtype of each column, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [50]:
#Datatypes of each column
# Displayed as a Series with one dtype entry per column.
df.dtypes

Category    object
Message     object
dtype: object

In [51]:
#Check proportion of each class
# value_counts() reports absolute counts per label (not fractions); the gap between
# the two counts shows how imbalanced the classes are.
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [52]:
# Preview the first rows of the dataframe
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [53]:
# Preview the last rows of the dataframe
df.tail()

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [54]:
# Column labels of the dataframe
df.columns

Index(['Category', 'Message'], dtype='object')

In [55]:
# Labelling classes
# Encode the string labels as integers: ham -> 0, spam -> 1.
LABEL_CODES = {'ham': 0, 'spam': 1}
# Guard against re-running this cell: once the column already holds 0/1, mapping it
# again would find no matching keys and turn every label into NaN.
if not df['Category'].isin(list(LABEL_CODES.values())).all():
    df['Category'] = df['Category'].map(LABEL_CODES)


In [56]:
# Display the frame again to check that Category now holds the integer codes
df

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [57]:
# Features: the raw text of each message
X = df["Message"]

In [58]:
#Target
# The Category column, encoded by the labelling cell as 0 for ham and 1 for spam
y = df["Category"]

In [59]:
# Feature Extraction
cv=CountVectorizer()
X=cv.fit_transform(X)

In [60]:
#Data splitting
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [61]:
# Serialize the fitted vectorizer to Vectorizer.pkl (opened in binary write mode for pickle)
with open('Vectorizer.pkl', mode='wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

In [63]:
# Model: multinomial Naive Bayes fitted on the training features and labels
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [64]:
# Serialize the fitted classifier to Spam_Detection_model_NB.pkl (binary write mode for pickle)
with open('Spam_Detection_model_NB.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [65]:
# Cross Validation
# 10-fold accuracy computed on the training split only; the test split is not touched here.
scores1 = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
)
scores1

array([0.98206278, 0.98654709, 0.97533632, 0.99103139, 0.97533632,
       0.97982063, 0.97085202, 0.97977528, 0.98876404, 0.97752809])

In [66]:
#Prediction
# Predicted labels for the held-out test features
testing_predict = model.predict(X_test)

In [67]:
#Accuracy
# accuracy_score takes (y_true, y_pred); pass the true labels first, in the same
# order as the classification-report and confusion-matrix calls. The value is the
# same either way, since accuracy only counts matching positions.
Acc = accuracy_score(y_test, testing_predict)

# Express the fraction as a percentage before printing
Acc = Acc * 100
print(Acc)

98.56502242152466


In [68]:
# Classification Report
# Per-class precision, recall and F1 on the test split, printed as plain text
report2 = classification_report(y_true=y_test, y_pred=testing_predict)
print(report2)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.94      0.95      0.95       149

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [69]:
#Confusion matrix
# Store the result under its own name. Assigning it to `confusion_matrix` would
# shadow the imported function, and re-running this cell would then fail with
# "TypeError: 'numpy.ndarray' object is not callable".
conf_matrix = confusion_matrix(y_test, testing_predict)
conf_matrix

array([[957,   9],
       [  7, 142]], dtype=int64)

In [70]:
# Testing: encode a single hand-written message with the fitted vectorizer
msg = "Hello"
data = [msg]  # wrapped in a list because transform() is given a collection of documents
sparse_counts = cv.transform(data)
result = sparse_counts.toarray()

In [71]:
# Classify the encoded message; labels follow the 0 = ham / 1 = spam encoding
prediction = model.predict(result)
print(prediction)

[0]


In [72]:
# Testing: encode a second hand-written message with the fitted vectorizer
msg = "Free entry in 2 a wkly comp to win FA Cup final"
data = [msg]  # wrapped in a list because transform() is given a collection of documents
sparse_counts = cv.transform(data)
result = sparse_counts.toarray()

In [73]:
# Classify the encoded message; labels follow the 0 = ham / 1 = spam encoding
prediction = model.predict(result)
print(prediction)

[1]


In [None]:
# Scratch cell, intentionally left without code: a bare `p` is an undefined name
# and would raise NameError under Restart & Run All.