In [1]:
# Importing necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the mail dataset from CSV and preview its first five rows
data = pd.read_csv(filepath_or_buffer='mail_data.csv')
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Class balance: how many ham vs. spam mails the dataset holds
data['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
# Count the missing values in each column
data.isna().sum()

Category    0
Message     0
dtype: int64

In [5]:
# Features (X) are the message texts; targets (Y) are the ham/spam labels
X, Y = data['Message'], data['Category']

In [6]:
# Converting categorical data/Messages to Numerial format
TF = TfidfVectorizer(min_df=4)
X_TF = TF.fit_transform(X)

In [7]:
# Splitting the data to train and test and feeding the data to Logistic Regression ML Model
x_train,x_test,y_train,y_test = train_test_split(X_TF,Y,random_state=2,test_size=.2,stratify=Y)
LR = LogisticRegression()
LRmodel = LR.fit(x_train,y_train)

In [8]:
# Checking Model accuracy on train data
y_pred_train = LRmodel.predict(x_train)
# accuracy_score expects (y_true, y_pred); keywords keep the roles explicit
accuracy_score(y_true=y_train, y_pred=y_pred_train)

0.9789095804352703

In [9]:
# Checking Model accuracy on test data
y_pred_test = LRmodel.predict(x_test)
# accuracy_score expects (y_true, y_pred); keywords keep the roles explicit
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.9739910313901345


# Predictive System

In [10]:
# Classify an unseen message: vectorise it with the fitted TF-IDF, then predict its label
new_data1 = TF.transform(
    ["I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."]
)
new_data1_pred = LRmodel.predict(new_data1)
print(new_data1_pred)

['ham']


In [11]:
# Classify an unseen message: vectorise it with the fitted TF-IDF, then predict its label
new_data2 = TF.transform(
    ["SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"]
)
new_data2_pred = LRmodel.predict(new_data2)
print(new_data2_pred)

['spam']


In [12]:
# Classify an unseen message: vectorise it with the fitted TF-IDF, then predict its label
new_data3 = TF.transform(
    ["Thanks for your subscription to Ringtone UK your mobile will be charged Â£5/month Please confirm by replying YES or NO. If you reply NO you will not be charged"]
)
new_data3_pred = LRmodel.predict(new_data3)
print(new_data3_pred)

['spam']


In [13]:
# Classify an unseen message: vectorise it with the fitted TF-IDF, then predict its label
new_data4 = TF.transform(
    ["Orange customer, you may now claim your FREE CAMERA PHONE upgrade for your loyalty. Call now on 0207 153 9996. Offer ends 14thMarch. T&C's apply. Opt-out availa"]
)
new_data4_pred = LRmodel.predict(new_data4)
print(new_data4_pred)

['spam']


In [14]:
# Classify an unseen message: vectorise it with the fitted TF-IDF, then predict its label
new_data5 = TF.transform(
    ["As I entered my cabin my PA said, '' Happy B'day Boss !!''. I felt special. She askd me 4 lunch. After lunch she invited me to her apartment. We went there."]
)
new_data5_pred = LRmodel.predict(new_data5)
print(new_data5_pred)

['ham']
