# Importing the Libraries and Dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Data Collection and Processing

In [2]:
# Loading the data from csv file and create Pandas DataFrame
mail_data = pd.read_csv('spam.csv',encoding="ISO-8859-1")

In [3]:
# print 5 rows of dataset.
mail_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
mail_data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1,inplace=True)

In [5]:
# print 5 rows of dataset.
mail_data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# print number of rows and columns 
mail_data.shape

(5572, 2)

In [7]:
# print the information of dataset
mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
# find the number of missing values in each column
mail_data.isnull().sum()

v1    0
v2    0
dtype: int64

In [9]:
# Replace the null values with a null string
mail_data = mail_data.where(pd.notnull(mail_data),'')

In [10]:
# Printing the first five rows of dataframe
mail_data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# label spam mail as 0 and ham mail as 1
mail_data.replace({"v1":{"ham":1,"spam":0}},inplace=True)

In [12]:
mail_data.head()

Unnamed: 0,v1,v2
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


# Separating the data as text and labels

In [13]:
x = mail_data["v2"]
y = mail_data["v1"]

In [14]:
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [15]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5572, dtype: int64

# Splitting the data into training data and test data

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 3)

In [17]:
x.shape, x_train.shape, x_test.shape

((5572,), (4457,), (1115,))

# Feature Extraction

In [18]:
# Convert the text data to feature vectors that can be used as input to the Logistic Regression

feature_extraction = TfidfVectorizer(min_df = 1,stop_words = "english", lowercase = True)
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# convert y_train and y_test values as integers

y_train = y_train.astype("int")
y_test = y_test.astype("int")

# Model Training

In [19]:
model = LogisticRegression()

In [20]:
# training the Logistic Regression model with the training data
model.fit(x_train_features, y_train)

# Evaluating Model

In [21]:
# prediction on training data
x_train_prediction = model.predict(x_train_features)
print(x_train_prediction)

[1 1 1 ... 1 1 1]


In [22]:
x_test_prediction = model.predict(x_test_features)
print(x_test_prediction)

[1 1 1 ... 1 1 1]


In [23]:
# Accuracy on training data
accuracy_of_train_data = accuracy_score(x_train_prediction, y_train)
print('Accuracy Score of Training data:',accuracy_of_train_data)

# Accuracy on test data
accuracy_of_test_data = accuracy_score(x_test_prediction, y_test)
print('Accuracy Score of Testing data:',accuracy_of_test_data)

Accuracy Score of Training data: 0.9661207089970832
Accuracy Score of Testing data: 0.9623318385650225


In [24]:
# Precision on training data
precision_of_train_data = precision_score(x_train_prediction, y_train)
print('Precision Score of Training data:',precision_of_train_data)

# Precision on test data
precision_of_test_data = precision_score(x_test_prediction, y_test)
print('Precision Score of Testing data:',precision_of_test_data)

Precision Score of Training data: 0.9989650711513584
Precision Score of Testing data: 0.9989583333333333


In [25]:
# Recall on training data
recall_of_train_data = recall_score(x_train_prediction, y_train)
print('Recall Score of Training data:',recall_of_train_data)

# Recall on test data
recall_of_test_data = recall_score(x_test_prediction, y_test)
print('Recall Score of Testing data:',recall_of_test_data)

Recall Score of Training data: 0.9633233532934131
Recall Score of Testing data: 0.959


In [26]:
# f1-score on training data
f1score_of_train_data = f1_score(x_train_prediction, y_train)
print('F1-score Score of Training data:',f1score_of_train_data)

# Accuracy on test data
f1score_of_test_data = f1_score(x_test_prediction, y_test)
print('F1-score Score of Testing data:',f1score_of_test_data)

F1-score Score of Training data: 0.9808205258478344
F1-score Score of Testing data: 0.9785714285714285


# Building a Predictive System

In [27]:
input_mail=[input("Enter a mail: ")]

f_extraction = feature_extraction.transform(input_mail)
prediction = model.predict(f_extraction)

if prediction == 1:
    print("Ham mail")
else:
    print("spam mail")

Enter a mail: URGENT! We are trying to contact you. Last weekends draw shows that you have won a å£900 prize GUARANTEED. Call 09061701939. Claim code S89. Valid 12hrs only
spam mail
