In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns


In [4]:
# Read the dataset from mail_data.csv (path is relative to the working directory).
data = pd.read_csv('mail_data.csv')
# Preview the first five rows to confirm the Category/Message columns loaded.
data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.tail(5)

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [6]:
data.shape

(5572, 2)

In [7]:
data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [10]:
data.isnull().sum()

Category    0
Message     0
dtype: int64

In [13]:
data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
print(data)


     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


# Encoding the label

In [26]:
# Encode the labels as integers: spam -> 0, ham -> 1.
# A single mapped assignment gives the column a real integer dtype instead of
# writing ints one mask at a time into a string column, which leaves a mixed
# object column and can be rejected when the column uses a dedicated string dtype.
# Values other than 'spam'/'ham' pass through unchanged, so re-running this
# cell after the labels are already encoded is a no-op.
label_codes = {'spam': 0, 'ham': 1}
data['Category'] = data['Category'].map(lambda label: label_codes.get(label, label))

In [27]:
data.head(2)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...


# Separating the messages (features) from the labels

In [28]:
# Features are the message text; targets are the category labels.
x, y = data['Message'], data['Category']

In [31]:
print(x.shape)
print(y.shape)

(5572,)
(5572,)


# Splitting the data into training and test sets with train_test_split

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=4,
)

In [35]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(3900,)
(1672,)
(3900,)
(1672,)


## Feature extraction using TfidfVectorizer

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
# Fit the TF-IDF vocabulary on the training messages only, then reuse the
# fitted vectorizer for the test messages so no test-set statistics leak in.
feature_extraction = TfidfVectorizer(
    stop_words='english',
    min_df=1,
)
x_train_feature = feature_extraction.fit_transform(x_train)
x_test_feature = feature_extraction.transform(x_test)

In [38]:
# Make sure both label Series have an integer dtype before fitting the classifier.
y_train, y_test = y_train.astype('int'), y_test.astype('int')


In [39]:
print(x_train)
print(y_train)

1256    Just wait till end of march when el nino gets ...
4163    How's it going? Got any exciting karaoke type ...
1994                   Eh den sat u book e kb liao huh...
3587    I am hot n horny and willing I live local to y...
1598    URGENT! Your Mobile number has been awarded wi...
                              ...                        
3671                        Ok thanx... Take care then...
709     To review and KEEP the fantastic Nokia N-Gage ...
2487           I dont thnk its a wrong calling between us
174     Bloody hell, cant believe you forgot my surnam...
1146    Thank you, winner notified by sms. Good Luck! ...
Name: Message, Length: 3900, dtype: object
1256    1
4163    1
1994    1
3587    0
1598    0
       ..
3671    1
709     0
2487    1
174     1
1146    0
Name: Category, Length: 3900, dtype: int32


# training the Model

In [40]:
from sklearn.linear_model import LogisticRegression

In [42]:
# Logistic-regression classifier created with no arguments, i.e. the library defaults.
model = LogisticRegression()

In [43]:
# Fit the classifier on the TF-IDF training features and their integer labels.
model.fit(x_train_feature,y_train)

LogisticRegression()

# Measuring accuracy on the training data

In [44]:
from sklearn.metrics import accuracy_score

In [45]:
# Predict on the same rows the model was fitted on; this gives a training-set
# accuracy, which is typically an optimistic estimate of real performance.
predict = model.predict(x_train_feature)
accuracy_predict = accuracy_score(y_train,predict)

In [46]:
print('Accuracy of the train data is:',accuracy_predict)

Accuracy of the train data is: 0.9669230769230769


# Measuring accuracy on the test data

In [47]:
# Predict on the held-out test features. Despite its name, x_predict holds the
# predicted labels, which are compared against the true y_test labels.
x_predict = model.predict(x_test_feature)
test_accuracy = accuracy_score(y_test,x_predict)

In [48]:
print(test_accuracy)

0.9545454545454546


# Checking the model with a simple predictive system

In [51]:
# Classify one hand-picked message with the fitted vectorizer and model.
input_mail = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

# Vectorize with the already-fitted TF-IDF vocabulary (transform only, no refit).
input_data_features = feature_extraction.transform(input_mail)

prediction = model.predict(input_data_features)
print(prediction)

# A predicted label of 1 is reported as ham; anything else is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[0]
Spam mail


# saving the Model using pickle

In [52]:
import pickle

In [55]:
file = 'model2.sav'

In [56]:
# Persist the trained classifier. The context manager closes the file handle
# (flushing the pickle to disk) even if serialization raises.
# NOTE(review): only the classifier is saved; the fitted `feature_extraction`
# vectorizer is still needed to turn raw text into features at prediction time.
with open(file, 'wb') as model_file:
    pickle.dump(model, model_file)

In [57]:
# Restore the classifier; the context manager closes the file once loading is done.
# NOTE: pickle.load can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
with open('model2.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [61]:
# Repeat the single-message check, this time with the model restored from disk.
input_mail = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
input_data_features = feature_extraction.transform(input_mail)

prediction = loaded_model.predict(input_data_features)
print(prediction)

# A predicted label of 1 is reported as ham; anything else is reported as spam.
verdict = 'Ham mail' if prediction[0] == 1 else 'Spam mail'
print(verdict)

[0]
Spam mail
