In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
#Loading the data
original_data = pd.read_csv("mail_data.csv")

#Replacing NULL values with NULL string
good_data = original_data.where( (pd.notnull(original_data) ),'')

In [3]:
good_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
good_data.shape

(5572, 2)

In [5]:
#Coding the Ham mails to False and Spam mails to True
good_data.loc[good_data['Category'] == 'spam', 'Category'] = 1
good_data.loc[good_data['Category'] == 'ham', 'Category'] = 0

In [6]:
good_data.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
x = good_data['Message']
y = good_data['Category']

In [8]:
#Train and Test data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)

In [9]:
print(x.shape, x_train.shape, x_test.shape)

(5572,) (4179,) (1393,)


In [10]:
print(y_train)

710     1
3740    0
2711    1
3155    1
3748    0
       ..
905     0
5192    0
3980    0
235     1
5157    0
Name: Category, Length: 4179, dtype: object


In [11]:
#Transforming text to readable data for Logistic Regression Model
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase='True')

x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [12]:
print(x_train_features)

  (0, 1968)	0.20555770187976505
  (0, 4621)	0.2826608319517557
  (0, 6670)	0.23567651227529574
  (0, 50)	0.2826608319517557
  (0, 4245)	0.2826608319517557
  (0, 262)	0.25802160000805496
  (0, 4815)	0.16823978217870592
  (0, 2777)	0.2987551880042128
  (0, 4819)	0.2467015577941807
  (0, 1524)	0.2134897628788885
  (0, 3760)	0.21612407138649548
  (0, 5307)	0.25008953900893155
  (0, 3849)	0.20358236606353303
  (0, 4623)	0.22048220359216783
  (0, 5024)	0.22371596808895342
  (0, 3086)	0.2065881188886313
  (0, 507)	0.2826608319517557
  (1, 6542)	0.657492938833411
  (1, 6025)	0.657492938833411
  (1, 2166)	0.3679756388246497
  (2, 6611)	0.21801015986499822
  (2, 6470)	0.35233710750013614
  (2, 5086)	0.33581174761157134
  (2, 98)	0.35233710750013614
  (2, 3177)	0.22281059031897985
  :	:
  (4176, 387)	0.317680062733604
  (4176, 3876)	0.25677970808202527
  (4176, 7131)	0.23431439791927364
  (4176, 6188)	0.25152520362673875
  (4176, 4414)	0.28918871571362903
  (4177, 6371)	0.31506538554722807
  (417

In [13]:
LR = LogisticRegression()

In [14]:
LR.fit(x_train_features, y_train)

LogisticRegression()

In [15]:
prediction_training = LR.predict(x_train_features)
accuracy_training = accuracy_score(y_train, prediction_training)

In [16]:
print("Accuracy of the model on training data = ",accuracy_training)

Accuracy of the model on training data =  0.9662598707824839


In [17]:
prediction_test = LR.predict(x_test_features)
accuracy_test = accuracy_score(y_test, prediction_test)

In [18]:
print("Accuracy of the model on test data = ",accuracy_test)

Accuracy of the model on test data =  0.968413496051687


In [19]:
# check_mail = [" 07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow "]
# check_mail_features = feature_extraction.transform(check_mail)
 
# predict_mail = LR.predict(check_mail_features)

In [20]:
# if (predict_mail[0] == 1):
#     print("Spam mail")
# else:
#     print("Ham Mail")

In [21]:
import pickle

In [22]:
filename = "spam_model.pkl"
pickle.dump(LR, open(filename, 'wb') )

In [23]:
print(x.head)

<bound method NDFrame.head of 0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object>
