In [None]:
import numpy as np
import pandas as pd

In [None]:
# Importing the Logistic Regression model
from sklearn.linear_model import LogisticRegression

In [None]:
# Importing the accuracy Score function
from sklearn.metrics import accuracy_score

In [None]:
# Importing train test split function
from sklearn.model_selection import train_test_split

In [None]:
# Importing TfidfVectorizer (converts text strings into numeric feature vectors)
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the mail dataset from the CSV file that sits next to the notebook.
# The frame carries a text label column (Category) and the message body (Message).
mail_data = pd.read_csv(filepath_or_buffer="mail_data.csv")
# Bare last expression -> rendered preview of the loaded frame.
mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Structural overview: info() prints column names, dtypes and non-null counts
# straight to stdout, so it can run anywhere in the cell.
mail_data.info()
# Summary statistics must be the LAST expression of the cell: a notebook only
# renders the value of the final bare expression, so calling describe() before
# info() computed the table and then silently threw it away.
mail_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Encode the target column: ham -> 1, spam -> 0.
# A dictionary lookup produces a genuinely integer column. Writing ints into
# the text column through .loc leaves it as dtype=object, and newer pandas
# releases that infer a dedicated string dtype for text refuse such an
# assignment. The identity entries (1 -> 1, 0 -> 0) keep this cell safe to
# re-run after the column has already been encoded.
label_codes = {'ham': 1, 'spam': 0, 1: 1, 0: 0}
encoded_category = mail_data['Category'].map(label_codes)

# Fail loudly on anything that is neither ham nor spam (including missing
# labels) instead of letting it leak into training unnoticed.
unmapped = encoded_category.isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected Category values: {mail_data.loc[unmapped, 'Category'].unique().tolist()}"
    )

# Every label is known at this point, so the cast to a plain integer dtype is safe.
mail_data['Category'] = encoded_category.astype('int')
mail_data

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [None]:
# Class balance of the encoded labels (1 = ham, 0 = spam), returned as a small
# two-column table rather than an indexed Series.
(
    mail_data['Category']
    .value_counts()
    .reset_index()
)

Unnamed: 0,Category,count
0,1,4825
1,0,747


In [None]:
# Model inputs: separate the input feature (X, the message text) from the
# label (Y, the encoded category).
X, Y = mail_data['Message'], mail_data['Category']

In [None]:
# Quick look at the message texts selected as the input feature
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [None]:
# Quick look at the encoded labels (1 = ham, 0 = spam)
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [None]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible across runs.
# NOTE(review): the split is not stratified although the classes are
# imbalanced -- consider stratify=Y if the class ratio should be preserved.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

# Sanity check: full / train / test sizes for the features, then the labels.
print(*(part.shape for part in (X, X_train, X_test)))
print(*(part.shape for part in (Y, Y_train, Y_test)))

(5572,) (4457,) (1115,)
(5572,) (4457,) (1115,)


In [None]:
# Build the TF-IDF vectoriser that turns raw message text into the numeric
# feature vectors fed to the logistic regression model.
feature_extraction = TfidfVectorizer(
    lowercase=True,        # case-fold the text before tokenising
    stop_words='english',  # drop scikit-learn's built-in English stop words
    min_df=1,              # keep every term that occurs in at least one message
)

In [None]:
# Fit the vectoriser on the training messages only and vectorise them in the
# same call.
X_train_num = feature_extraction.fit_transform(X_train)
# The test messages are only transformed with the already-fitted vectoriser
# (no second fit), so they are encoded in the feature space learned from the
# training split.
X_test_num  = feature_extraction.transform(X_test)

In [None]:
# Make sure both label vectors are integer-typed before they reach the model.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [None]:
# Fit a logistic regression classifier (default settings) on the vectorised
# training messages; fit() hands back the estimator itself, so it can be bound
# in the same statement.
model = LogisticRegression().fit(X_train_num, Y_train)
# Show the fitted estimator as the cell output.
model

In [None]:
# Training-set check: predict on the same vectors the model was fitted on and
# measure the fraction of labels it reproduces.
X_train_prediction = model.predict(X_train_num)
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_prediction,
)
print("training_data_accuracy", training_data_accuracy)

training_data_accuracy 0.9685887368184878


In [None]:
# Held-out check: predict on the vectorised test messages and measure the
# fraction of labels the model gets right.
X_test_prediction = model.predict(X_test_num)
testing_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_prediction,
)
print("testing_data_accuracy", testing_data_accuracy)

testing_data_accuracy 0.9533632286995516


In [None]:
# Classify one unseen message: vectorise it with the already-fitted
# vectoriser, then ask the trained model for its label.
input_data = ["Free Free free you 've won a lottery kindly share OTP"]
input_data_num = feature_extraction.transform(input_data)
prediction = model.predict(input_data_num)

# Label 1 was assigned to ham during encoding; any other prediction is spam.
print('ham' if prediction[0] == 1 else 'spam')

spam
