# Spam Mail Prediction Using Logistic Regression and Naive Bayes

Importing libraries

In [65]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

Data Collection and Data Preprocessing

In [4]:
# Raw string keeps the Windows backslashes literal: "\M" and "\m" are invalid
# escape sequences that newer Python versions warn about.
# NOTE(review): this is a machine-specific absolute path; point it at the
# location of mail_data.csv before running.
dataset = pd.read_csv(r"E:\ML Projects\mail_data.csv")

In [7]:
# Preview the first rows to confirm the file loaded as expected.
dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Dimensions of the loaded frame as (rows, columns).
dataset.shape

(5572, 2)

In [6]:
# Per-column summary: count, number of unique values, most frequent value and its frequency.
dataset.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [9]:
# Column names, non-null counts, and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


# Label Encoding

Encode the `Category` column numerically: label 'spam' as 0 and 'ham' as 1.

In [15]:
# Encode spam as 0. The column selector limits the assignment to 'Category';
# without it every column of the matching rows (including 'Message') is overwritten.
dataset.loc[dataset['Category']=='spam', 'Category'] = 0

In [17]:
# Encode ham as 1. The column selector limits the assignment to 'Category';
# without it every column of the matching rows (including 'Message') is overwritten.
dataset.loc[dataset['Category']=='ham', 'Category'] = 1

In [18]:
# Inspect the label column after encoding.
dataset["Category"]

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

Separating text and labels

In [20]:
# Features are the raw message text; targets are the category labels.
X, Y = dataset['Message'], dataset['Category']

In [22]:
# Display the message series.
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [23]:
# Display the label series.
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

# Splitting data into Training and Testing

In [25]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [26]:
# Shapes of the full set followed by the train and test partitions.
print(*(part.shape for part in (X, X_train, X_test)))

(5572,) (4457,) (1115,)


# Feature Extraction

Convert the text data into numerical feature vectors using TF-IDF weighting.

In [28]:
# TF-IDF vectorizer: lowercases the text, drops English stop words, and keeps
# every term that appears in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [32]:
# Learn the vocabulary and IDF weights from the training messages only,
# then vectorize them.
X_train_features = feature_extraction.fit_transform(X_train)
# Vectorize the test messages with the already-fitted vectorizer (no refit),
# so nothing from the test set influences the learned features.
X_test_features = feature_extraction.transform(X_test)

In [33]:
# Cast both label series to an integer dtype before fitting the classifiers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [36]:
# Print the sparse feature matrix; each entry is listed as (row, column) with its weight.
print(X_train_features)

  (0, 3422)	0.6418008618863358
  (0, 3960)	0.40459749284424307
  (0, 4776)	0.2937599690543961
  (0, 4486)	0.4933198981059812
  (0, 3101)	0.30778739607068667
  (1, 3855)	0.4410710256765374
  (1, 4574)	0.4410710256765374
  (1, 2534)	0.4410710256765374
  (1, 814)	0.4410710256765374
  (1, 4555)	0.4205367990464199
  (1, 2902)	0.2120712188920981
  (2, 3398)	0.5133141633463273
  (2, 1317)	0.34462014146959175
  (2, 432)	0.4077104256374456
  (2, 4294)	0.36445133334144264
  (2, 2503)	0.5133141633463273
  (2, 4776)	0.2349500626979615
  (3, 1138)	0.6489221209014988
  (3, 1160)	0.44843330753299465
  (3, 3378)	0.38536596088088965
  (3, 3118)	0.3618113574629584
  (3, 3778)	0.31367701143832527
  (4, 3805)	1.0
  (5, 3731)	0.6020708068994186
  (5, 7381)	0.7984426989330436
  :	:
  (4454, 348)	0.2816333253882664
  (4454, 110)	0.3000941484572203
  (4454, 2067)	0.25658354936739225
  (4454, 4488)	0.3000941484572203
  (4454, 651)	0.3000941484572203
  (4454, 373)	0.23959800001827322
  (4454, 796)	0.28163332538

In [37]:
# Display the training labels after the integer cast.
Y_train

1114    1
3589    1
3095    1
1012    1
3320    1
       ..
4931    0
3264    1
1653    0
2607    1
2732    1
Name: Category, Length: 4457, dtype: int32

# Training the Model

Logistic Regression vs. Naive Bayes

In [38]:
# Logistic-regression classifier with the library's default hyperparameters.
model = LogisticRegression()

In [40]:
# Fit logistic regression on the training feature vectors and labels.
model.fit(X_train_features,Y_train)

LogisticRegression()

In [66]:
# Multinomial Naive Bayes classifier with the library's default hyperparameters.
model1 = MultinomialNB()

In [67]:
# Fit Naive Bayes on the same training feature vectors and labels.
model1.fit(X_train_features,Y_train)

MultinomialNB()

# Evaluating the Model

Logistic Regression

In [76]:
# Accuracy of logistic regression on the training data.
LG_X_train_pred = model.predict(X_train_features)
# accuracy_score expects (y_true, y_pred): ground-truth labels go first.
LG_training_accuracy = accuracy_score(Y_train, LG_X_train_pred)
LG_training_accuracy

0.9679156383217411

In [81]:
# Accuracy of logistic regression on the held-out test data.
LG_X_test_pred = model.predict(X_test_features)
# accuracy_score expects (y_true, y_pred): ground-truth labels go first.
LG_testing_accuracy = accuracy_score(Y_test, LG_X_test_pred)
LG_testing_accuracy

0.9668161434977578

Naive Bayes

In [82]:
# Accuracy of Naive Bayes on the training data.
NB_X_train_pred = model1.predict(X_train_features)
# accuracy_score expects (y_true, y_pred): ground-truth labels go first.
NB_training_accuracy = accuracy_score(Y_train, NB_X_train_pred)
NB_training_accuracy

0.9816019744222572

In [84]:
# Accuracy of Naive Bayes on the held-out test data.
NB_X_test_pred = model1.predict(X_test_features)
# accuracy_score expects (y_true, y_pred): ground-truth labels go first.
NB_testing_accuracy = accuracy_score(Y_test, NB_X_test_pred)
NB_testing_accuracy

0.9757847533632287

# Build a Predictive System

In [86]:
# Sample message to classify, wrapped in a list so it is passed as a collection of documents.
input_data = ['Going for dinner.msg you after.']

# Vectorize the sample with the already-fitted vectorizer (transform only, no refit).
input_data_features = feature_extraction.transform(input_data)

In [87]:
# Predict the label of the sample message with logistic regression.
prediction1 = model.predict(input_data_features)

In [88]:
# Predict the label of the sample message with Naive Bayes.
prediction2 = model1.predict(input_data_features)

In [89]:
# Display the logistic-regression prediction array.
prediction1

array([1])

In [91]:
# Display the Naive Bayes prediction array.
prediction2

array([1])

In [90]:
# Report the logistic-regression verdict: label 1 prints HAM, anything else prints SPAM.
print('The mail is HAM' if prediction1[0] == 1 else 'The mail is SPAM')

The mail is HAM


In [92]:
# Report the Naive Bayes verdict: label 1 prints HAM, anything else prints SPAM.
print('The mail is HAM' if prediction2[0] == 1 else 'The mail is SPAM')

The mail is HAM


The accuracy scores of logistic regression and Naive Bayes are close, but Naive Bayes achieves the higher accuracy on both the training set (about 0.982 vs. 0.968) and the test set (about 0.976 vs. 0.967).