In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Load the labelled message corpus from the CSV file next to the notebook.
DATA_PATH = 'mail_data.csv'
mail_data = pd.read_csv(DATA_PATH)

In [5]:
# Preview the first five rows to check the columns and label values.
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Preview the last five rows to confirm the file was read to the end.
mail_data.tail(n=5)

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [7]:
# Dimensions of the loaded frame as (rows, columns).
mail_data.shape

(5572, 2)

In [10]:
# Encode the text labels in place: spam -> 0, ham -> 1.
# Values are written into the existing column, so any other label is left as-is.
is_spam = mail_data['Category'] == 'spam'
is_ham = mail_data['Category'] == 'ham'
mail_data.loc[is_spam, 'Category'] = 0
mail_data.loc[is_ham, 'Category'] = 1

In [11]:
# Split the frame into model input (message text) and target (encoded label).
X, Y = mail_data['Message'], mail_data['Category']

In [12]:
# Show the message texts that will be used as model input.
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [13]:
# Show the label column (0 = spam, 1 = ham after the encoding step).
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [22]:
# Count how many messages carry each label value to see the class balance.
Y.value_counts()

1    4825
0     747
Name: Category, dtype: int64

In [23]:
# Hold out 20% of the messages for testing. Stratifying on Y keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [25]:
# Report the sizes of the full set, the training split and the test split.
print(*(part.shape for part in (X, X_train, X_test)))

(5572,) (4457,) (1115,)


In [26]:
# Turn the raw message text into TF-IDF feature vectors.
# `lowercase` must be a real boolean: the string 'True' only worked because a
# non-empty string is truthy, and scikit-learn releases that validate
# constructor parameters reject it when the vectorizer is fitted.
feature_extraction = TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vocabulary for the test messages so no test information
# leaks into the features.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The label column is stored with object dtype after the in-place encoding;
# cast it to integers so the classifier receives numeric class labels.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [27]:
# Inspect the training split of the message texts (index shows the original row numbers).
X_train

5426        Oh yeah! And my diet just flew out the window
4724                     HELLO PEACH! MY CAKE TASTS LUSH!
536     Good afternoon, my love! How goes that day ? I...
3488                        Change windows logoff sound..
2551    Please sen :)my kind advice :-)please come her...
                              ...                        
1697    Sorry man, my stash ran dry last night and I c...
422     Someone has contacted our dating service and e...
4007    IM FINE BABES AINT BEEN UP 2 MUCH THO! SAW SCA...
3474                      You getting back any time soon?
3074           Somebody should go to andros and steal ice
Name: Message, Length: 4457, dtype: object

In [29]:
# Print the TF-IDF matrix built from the training messages; each entry is
# shown as (row, column) followed by its weight.
print(X_train_features)

  (0, 7289)	0.517250079608171
  (0, 2823)	0.517250079608171
  (0, 3764)	0.22046319970004674
  (0, 2262)	0.4931693086193514
  (0, 7438)	0.2996693624522655
  (0, 4768)	0.28858793133473676
  (1, 4136)	0.4717788963273522
  (1, 6517)	0.49481520325330863
  (1, 1558)	0.4236400720998954
  (1, 4972)	0.49481520325330863
  (1, 3317)	0.32904344933475643
  (2, 5798)	0.2821824162510531
  (2, 3835)	0.2623708342584191
  (2, 4943)	0.33789703751914013
  (2, 5837)	0.1845655907506494
  (2, 1430)	0.28509060215711635
  (2, 6641)	0.20096909705626312
  (2, 3722)	0.24768901862403342
  (2, 3935)	0.3671145612703168
  (2, 3118)	0.18009671431232455
  (2, 4269)	0.2543939099135892
  (2, 3398)	0.20665621299033204
  (2, 2136)	0.180851695270251
  (2, 3086)	0.27449720225122765
  (2, 4099)	0.186263215205624
  :	:
  (4454, 5765)	0.27366476899994313
  (4454, 4205)	0.27366476899994313
  (4454, 6404)	0.2834859847167938
  (4454, 387)	0.2598225428978842
  (4454, 865)	0.26604684225670366
  (4454, 2972)	0.2598225428978842
  (445

In [30]:
# Create the classifier, leaving every hyperparameter at the library default.
model = LogisticRegression()

In [33]:
# Fit the classifier on the TF-IDF features and integer labels of the training split.
model.fit(X=X_train_features, y=Y_train)

LogisticRegression()

In [34]:
# Score the fitted model on the data it was trained on. Taken alone this figure
# is optimistic, because the model has already seen these examples.
prediction_on_Trained_data = model.predict(X_train_features)
accuracy_of_trained_data = accuracy_score(Y_train,prediction_on_Trained_data)

# Also score the held-out split, which the model never saw during fitting.
# This is the estimate of how well the classifier generalizes to new messages.
prediction_on_test_data = model.predict(X_test_features)
accuracy_of_test_data = accuracy_score(Y_test, prediction_on_test_data)
print('Accuracy of Test Data: ', accuracy_of_test_data)

In [37]:
# Report the accuracy measured on the training split.
print('Accuracy of Trained Data: ',accuracy_of_trained_data)

Accuracy of Trained Data:  0.9672425398249944


In [40]:
# Classify one hand-written message with the fitted pipeline.
input_mail = ['Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030']

# Reuse the already-fitted vectorizer so the message is mapped onto the same
# vocabulary the model was trained with.
input_mail_features = feature_extraction.transform(input_mail)

prediction = model.predict(input_mail_features)

# Label 1 means ham; anything else is reported as spam.
verdict = 'This is a Ham mail' if prediction[0] == 1 else 'This is a Spam mail'
print(verdict)



This is a Spam mail
