In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [8]:
mail_data = pd.read_csv("C:\\Users\\Risathvik\\Downloads\\mail_data.csv")
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [12]:
# replace the null values with a null string
mail_data = mail_data.where((pd.notnull(mail_data)),'')
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
mail_data.shape

(5572, 2)

In [14]:
mail_data['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [15]:
mail_data.loc[mail_data['Category']=='spam','Category']=0
mail_data.loc[mail_data['Category']=='ham','Category']=1

In [16]:
# seperating the data as texts and label
X = mail_data['Message']
Y = mail_data['Category']

In [18]:
X,Y

(0       Go until jurong point, crazy.. Available only ...
 1                           Ok lar... Joking wif u oni...
 2       Free entry in 2 a wkly comp to win FA Cup fina...
 3       U dun say so early hor... U c already then say...
 4       Nah I don't think he goes to usf, he lives aro...
                               ...                        
 5567    This is the 2nd time we have tried 2 contact u...
 5568                 Will ü b going to esplanade fr home?
 5569    Pity, * was in mood for that. So...any other s...
 5570    The guy did some bitching but I acted like i'd...
 5571                           Rofl. Its true to its name
 Name: Message, Length: 5572, dtype: object,
 0       1
 1       1
 2       0
 3       1
 4       1
        ..
 5567    0
 5568    1
 5569    1
 5570    1
 5571    1
 Name: Category, Length: 5572, dtype: object)

In [19]:
# Spitting the data into training data and test data
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=3)

In [21]:
X.shape,X_train.shape,X_test.shape

((5572,), (4457,), (1115,))

In [23]:
# transfrom the text data to feature vector 
feature_ext = TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)
X_train_feat = feature_ext.fit_transform(X_train)
X_test_feat = feature_ext.transform(X_test)


Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [26]:
X_test_feat

<1115x7431 sparse matrix of type '<class 'numpy.float64'>'
	with 7687 stored elements in Compressed Sparse Row format>

In [27]:
model = LogisticRegression()

In [28]:
# training the logestic regression with traning data
model.fit(X_train_feat,Y_train)

In [29]:
# prediction for traninng data
prediction_on_train = model.predict(X_train_feat)
train_accu = accuracy_score(prediction_on_train,Y_train)
train_accu

0.9670181736594121

In [30]:
prediction_on_test = model.predict(X_test_feat)
test_accu = accuracy_score(prediction_on_test,Y_test)
test_accu

0.9659192825112107

In [34]:
# Predicting a input mail
input_mail = ["As a valued customer, I am pleased to advise you that following recent review of your Mob No. you are awarded with a £1500 Bonus Prize, call 09066364589"]
input_data_fea = feature_ext.transform(input_mail)

# making predictions
pred = model.predict(input_data_fea)
pred

array([0])