In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('email.csv')
df.sample(6)

Unnamed: 0,Category,Message
2799,ham,Purity of friendship between two is not about ...
1000,ham,"Aight will do, thanks again for comin out"
718,ham,Book which lesson? then you msg me... I will c...
1115,ham,No no:)this is kallis home ground.amla home to...
1722,ham,Am watching house – very entertaining – am get...
1075,ham,Aight ill get on fb in a couple minutes


# 1.Understanding the data

In [3]:
df.shape

(5573, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5573 non-null   object
 1   Message   5573 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
df.describe()

Unnamed: 0,Category,Message
count,5573,5573
unique,3,5158
top,ham,"Sorry, I'll call later"
freq,4825,30


# 1. Data Cleaning

In [6]:
df.nunique()

Category       3
Message     5158
dtype: int64

In [7]:
df['Category'].unique()

array(['ham', 'spam', '{"mode":"full"'], dtype=object)

In [8]:
df[df['Category']=='{"mode":"full"']

Unnamed: 0,Category,Message
5572,"{""mode"":""full""",isActive:false}


In [9]:
# In category column "ham" means the email is not a spam and "spam" means the email is spam.
# Since the Category column consist of a Unnecessary value i.e {"mode":"full" so we remove it

In [10]:
df['Category'].value_counts()

Category
ham               4825
spam               747
{"mode":"full"       1
Name: count, dtype: int64

In [11]:
df=df[df['Category']!='{"mode":"full"']

In [12]:
df['Category'].unique()

array(['ham', 'spam'], dtype=object)

In [13]:
df.shape

(5572, 2)

### a. Checking for null values

In [14]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [15]:
df['Message'].value_counts()

Message
Sorry, I'll call later                                                                                                                                      30
I cant pick the phone right now. Pls send a message                                                                                                         12
Ok...                                                                                                                                                       10
Ok                                                                                                                                                           4
Ok.                                                                                                                                                          4
                                                                                                                                                            ..
I gotta collect da car at 6 lei.      

### b. Checking for Duplicate values

In [16]:
df['Message'].duplicated().sum()

415

In [17]:
df=df.drop_duplicates(subset=['Message'])

In [18]:
df['Message'].duplicated().sum()

0

In [19]:
df.shape

(5157, 2)

# Converting all the text in 'Message' column to Lower case

In [20]:
def lowercase(obj):
    return obj.lower()

In [21]:
df['Message']=df['Message'].apply(lowercase)

In [22]:
df

Unnamed: 0,Category,Message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will ü b going to esplanade fr home?
5569,ham,"pity, * was in mood for that. so...any other s..."
5570,ham,the guy did some bitching but i acted like i'd...


# Steaming the data 

In [23]:
import nltk
from nltk.stem import PorterStemmer

In [24]:
ps= PorterStemmer()

In [25]:
def steamming(obj):
    l=[]
    for i in obj.split(" "):  # to convert the string to list so that steaing can be done to each word
        l.append(ps.stem(i))  # applying steaming to each word 
        
    return (" ".join(l)) # after steaming returning the list as string 
        
   
    

In [26]:
df['Message']=df['Message'].apply(steamming)

In [27]:
df

Unnamed: 0,Category,Message
0,ham,"go until jurong point, crazy.. avail onli in b..."
1,ham,ok lar... joke wif u oni...
2,spam,free entri in 2 a wkli comp to win fa cup fina...
3,ham,u dun say so earli hor... u c alreadi then say...
4,ham,"nah i don't think he goe to usf, he live aroun..."
...,...,...
5567,spam,thi is the 2nd time we have tri 2 contact u. u...
5568,ham,will ü b go to esplanad fr home?
5569,ham,"pity, * wa in mood for that. so...ani other su..."
5570,ham,the guy did some bitch but i act like i'd be i...


# Text vectorization Applying the TFIDF 

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
len(df['Message'])

5157

In [30]:
vc = TfidfVectorizer(stop_words='english',max_features=10000)

In [31]:
x=vc.fit_transform(df['Message']).toarray()

In [32]:
y=df['Category']

In [33]:
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [34]:
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Category, Length: 5157, dtype: object

# Train_test split

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.1,random_state=42)

In [37]:
x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [38]:
x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
y_train

2792     ham
757      ham
1929    spam
5399     ham
4502     ham
        ... 
4750     ham
474     spam
3273     ham
4022     ham
882      ham
Name: Category, Length: 4641, dtype: object

In [40]:
y_test

3031     ham
495      ham
2942     ham
3911     ham
3360    spam
        ... 
1941     ham
3688     ham
2361     ham
4874     ham
3617     ham
Name: Category, Length: 516, dtype: object

# Training the model using logistic regression

In [41]:
from sklearn.linear_model import LogisticRegression

In [42]:
lr = LogisticRegression()
lr.fit(x_train,y_train)

In [51]:
y_predict_lr=lr.predict(x_test)
y_predict_lr

array(['ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam',
       'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       

# Training the model using Naive Bayes

In [50]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train,y_train)

In [54]:
y_predict_mnb=mnb.predict(x_test)
y_predict_mnb

array(['ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam',
       'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
    

# Checking the Accuracy,precision and Confusion matrix of the model

In [55]:
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix


In [56]:
print('Logistic Regression \n')
acc_score_lr = accuracy_score(y_test,y_predict_lr)
pre_score_lr = precision_score(y_test,y_predict_lr,pos_label='spam')
con_matrix_lr = confusion_matrix(y_test,y_predict_lr)

print("Accuracy : ",acc_score_lr*100)
print('Precision : ',pre_score_lr*100)
print("Confusion Matrix : \n",con_matrix_lr)

Logistic Regression 

Accuracy :  94.96124031007753
Precision :  97.77777777777777
Confusion Matrix : 
 [[446   1]
 [ 25  44]]


In [57]:
print('Multinominal Naive Bayes \n')
acc_score_mnb = accuracy_score(y_test,y_predict_mnb)
pre_score_mnb = precision_score(y_test,y_predict_mnb,pos_label='spam')
con_matrix_mnb = confusion_matrix(y_test,y_predict_mnb)

print("Accuracy : ",acc_score_mnb*100)
print('Precision : ',pre_score_mnb*100)
print("Confusion Matrix : \n",con_matrix_mnb)

Multinominal Naive Bayes 

Accuracy :  96.31782945736434
Precision :  100.0
Confusion Matrix : 
 [[447   0]
 [ 19  50]]


In [58]:
def test(str):
    arr=vc.transform([str]).toarray()
    return arr

str="Subject: Invitation to a Team Meeting Dear Team, I hope this email finds you well. We are scheduling a team meeting for next Monday to discuss the upcoming project. Your presence and input would be highly valuable.Date: Monday, May 10th Time: 10:00 AM Location: Conference Room 2 Agenda: 1. Project goals and objectives 2. Task allocation 3. Deadlines Please confirm your availability for the meeting.Looking forward to seeing you all there.Best regards, Prabal Kuinkel"
arr=test(str)
print(arr)

[[0.15641133 0.         0.         ... 0.         0.         0.        ]]


In [62]:
print('Prediction using Logistic Regression: ')
print(lr.predict(arr),"\n\n")

print('Prediction using Multinominal Naive Bayes: ')
print(mnb.predict(arr))

Prediction using Logistic Regression: 
['ham'] 


Prediction using Multinominal Naive Bayes: 
['ham']


In [None]:
# Best is of MNB