### Importing Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Reading and Analysing the Dataset

In [2]:
email_data = pd.read_csv('spam_ham_dataset.csv')

In [3]:
email_data

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


### Dropping Unnecessary Columns

In [4]:
email_data.drop(columns=["Unnamed: 0", "label"], inplace=True)

In [5]:
email_data

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...
5166,Subject: put the 10 on the ft\r\nthe transport...,0
5167,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,Subject: industrial worksheets for august 2000...,0


In [6]:
email_data.isnull().sum()

text         0
label_num    0
dtype: int64

In [7]:
email_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       5171 non-null   object
 1   label_num  5171 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 80.9+ KB


### Calculating total number of Legit Emails and Total Number of Spam Emails

In [8]:
total_spam_emails = email_data[email_data.label_num == 1]
total_legit_emails = email_data[email_data.label_num == 0]
print(f"Total Number of Spam Email: {total_spam_emails.shape}. Total Nmber of Legit Emails: {total_legit_emails.shape}")

Total Number of Spam Email: (1499, 2). Total Nmber of Legit Emails: (3672, 2)


### Taking a sample out of the original legit emails in order to solve the problem of data imbalance




In [9]:
legit_email_sample = total_legit_emails.sample(n=1499)

In [10]:
total_legit_emails_sample = legit_email_sample[legit_email_sample.label_num == 0]
total_spam_emails = email_data[email_data.label_num == 1]
print(f"Total Number of Spam Email: {total_spam_emails.shape}. Total Nmber of Legit Emails: {total_legit_emails_sample.shape}")

Total Number of Spam Email: (1499, 2). Total Nmber of Legit Emails: (1499, 2)


### Creating a new dataset out of the new legit email sample

In [11]:
new_email_dataset = pd.concat([legit_email_sample, total_spam_emails], axis=0)

In [12]:
new_email_dataset

Unnamed: 0,text,label_num
2767,"Subject: 987012\r\ndaren , i just talked to ma...",0
2913,Subject: 2000 goals & objectives\r\n- - - - - ...,0
1307,Subject: bad estimates on 28 th\r\nwe are gett...,0
2705,Subject: same day change - revision # 1 - txu ...,0
4668,Subject: customer meeting invitation\r\ngood a...,0
...,...,...
5159,Subject: pictures\r\nstreamlined denizen ajar ...,1
5161,Subject: penny stocks are about timing\r\nnoma...,1
5162,Subject: anomaly boys from 3881\r\nuosda apapr...,1
5164,Subject: slutty milf wants to meet you\r\ntake...,1


### Separating Features and Labels

In [13]:
X = new_email_dataset['text']
y = new_email_dataset['label_num']

### Applying train Test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

### Initializing CountVectorizer

In [15]:
vectorizer = CountVectorizer(min_df=1, stop_words='english', lowercase=True)

### Transforming the features from text into numeric value

In [16]:
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

### Initializing Logistic Regression Model

In [17]:
model = LogisticRegression()

### Training the Model

In [18]:
model.fit(X_train_vectorized, y_train)

### Calculating the Accuracy of the Training and Testing Data

In [19]:
prediction_X_train = model.predict(X_train_vectorized)

print(f"Accuracy of the Model: {accuracy_score(prediction_X_train, y_train)}")

Accuracy of the Model: 0.9991659716430359


In [20]:
prediction_X_test = model.predict(X_test_vectorized)

print(f"Accuracy of the Model: {accuracy_score(prediction_X_test, y_test)}")

Accuracy of the Model: 0.985


### Predicted Result VS Actual Result

In [21]:
label_names = ['Not A Spam Mail', 'Spam Mail']

y_test = y_test.reset_index(drop=True)

for i in range(len(prediction_X_test)):
    print(f"Predicted Result: {label_names[prediction_X_test[i]]}. Actual Result: {label_names[y_test[i]]}")


Predicted Result: Not A Spam Mail. Actual Result: Not A Spam Mail
Predicted Result: Not A Spam Mail. Actual Result: Not A Spam Mail
Predicted Result: Spam Mail. Actual Result: Spam Mail
Predicted Result: Spam Mail. Actual Result: Spam Mail
Predicted Result: Not A Spam Mail. Actual Result: Not A Spam Mail
Predicted Result: Not A Spam Mail. Actual Result: Not A Spam Mail
Predicted Result: Not A Spam Mail. Actual Result: Not A Spam Mail
Predicted Result: Not A Spam Mail. Actual Result: Not A Spam Mail
Predicted Result: Not A Spam Mail. Actual Result: Not A Spam Mail
Predicted Result: Not A Spam Mail. Actual Result: Not A Spam Mail
Predicted Result: Spam Mail. Actual Result: Spam Mail
Predicted Result: Spam Mail. Actual Result: Spam Mail
Predicted Result: Spam Mail. Actual Result: Spam Mail
Predicted Result: Spam Mail. Actual Result: Spam Mail
Predicted Result: Spam Mail. Actual Result: Spam Mail
Predicted Result: Spam Mail. Actual Result: Spam Mail
Predicted Result: Spam Mail. Actual Resu