In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [4]:
df=pd.read_csv('mail_data.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
df1=df.where((pd.notnull(df)),'')

In [8]:
df1.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df1.shape

(5572, 2)

Label Encoding on Category

Spam: 0<br>
non-Spam/Ham:1

In [11]:
df1.loc[df1['Category']=='spam','Category',]=0
df1.loc[df1['Category']=='ham','Category',]=1

In [47]:
X=df1['Message']

In [48]:
Y=df1['Category']

In [49]:
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [50]:
x_train, x_test, y_train, y_test=train_test_split(X,Y, test_size=0.2, random_state=3)

In [51]:
print(x_train)
print(x_test)

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object
2632    URGENT! Your mobile No 077xxx WON a £2,000 Bon...
454     Ok i will tell her to stay out. Yeah its been ...
983     Congrats! 2 mobile 3G Videophones R yours. cal...
1282        Am I the only one who doesn't stalk profiles?
4610                               Y de asking like this.
                             

Vectorizing the features for Log model

In [52]:
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

In [53]:
train_features=feature_extraction.fit_transform(x_train)

In [54]:
test_features=feature_extraction.transform(x_test)

In [None]:
print(train_features)

In [None]:
print(test_features)

In [57]:
y_train=y_train.astype('int')
y_test=y_test.astype('int')

Model application

In [22]:
log_model=LogisticRegression()

In [58]:
log_model.fit(train_features,y_train)

Model Evaluation

In [68]:
tr_preds=log_model.predict(train_features)
tr_accuracy=accuracy_score(tr_preds, y_train)
print("Training accuracy: ",round(tr_accuracy,3)*100,"%")

Training accuracy:  96.7 %


In [69]:
test_preds=log_model.predict(test_features)
test_accuracy=accuracy_score(test_preds,y_test)
print("Test accuracy: ",round(test_accuracy,3)*100,"%")

Test accuracy:  96.6 %


Predictive Model

In [71]:
input_data=["As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"]

In [73]:
input_features=feature_extraction.transform(input_data)

In [77]:
input_pred=log_model.predict(input_features)
if input_pred==1:
  print('Ham mail')
else:
  print('Spam Mail')

Ham mail


Ham mail
