# Importing all the required libraries

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Data Ingestion

Dataset link: https://www.kaggle.com/datasets/venky73/spam-mails-dataset

In [4]:
# Read the spam/ham e-mail CSV (path is relative to the working directory) into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='spam_ham_dataset1.csv')

In [5]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [6]:
dataset.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

Observation: We have 2 redundant columns (`Unnamed: 0` and `label`).

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [8]:
dataset.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [9]:
dataset.shape

(5171, 4)

In [12]:
# Drop the redundant 'Unnamed: 0' column.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises a KeyError.
dataset = dataset.drop(columns='Unnamed: 0', errors='ignore')

# Label Encoding

In [13]:
# Label encoding already exists in the `label_num` column:
# spam is encoded as 1 and ham is encoded as 0.


In [14]:
dataset.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


# Dropping the label column as well.

In [15]:
# 'label_num' already holds the numeric target, so the string 'label' column is dropped.
# Rebinding with errors='ignore' (instead of inplace=True) makes the cell safe to
# re-run: a second execution no longer raises a KeyError.
dataset = dataset.drop(columns='label', errors='ignore')

In [16]:
dataset.head()

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


# Splitting data into dependent and independent variables

In [17]:
# Independent variable: the raw e-mail text.
X = dataset.loc[:, 'text']

In [128]:
X.head()

0    Subject: enron methanol ; meter # : 988291\r\n...
1    Subject: hpl nom for january 9 , 2001\r\n( see...
2    Subject: neon retreat\r\nho ho ho , we ' re ar...
3    Subject: photoshop , windows , office . cheap ...
4    Subject: re : indian springs\r\nthis deal is t...
Name: text, dtype: object

In [18]:
# Dependent variable: the numeric class label column.
y = dataset.loc[:, 'label_num']

In [19]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 20% of the e-mails for evaluation; the fixed random_state keeps the
# split reproducible across runs.
# (A stray REPL continuation prompt "..." was removed from the argument line:
# IPython strips it silently, but it is a syntax error in plain Python.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

# Transforming the text data into numeric vectors so that we can use it in logistic regression

In [20]:
# TF-IDF featurizer: lowercases the text and removes English stop words;
# min_df=1 keeps every term that occurs in at least one document.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [23]:
# Convert the raw e-mail text of both splits into numeric TF-IDF features.
# The vectorizer is fitted on the training split only and then applied to the
# test split without refitting.
# NOTE: X_train / X_test are overwritten here, so re-running this cell would feed
# the already-vectorized data back into the vectorizer; re-run the train/test
# split cell first.
X_train = feature_extraction.fit_transform(X_train)
X_test = feature_extraction.transform(X_test)

# Training

# Logistic Regression

In [25]:
logistic_model = LogisticRegression()

In [26]:
logistic_model.fit(X_train,y_train)

LogisticRegression()

In [27]:
y_pred = logistic_model.predict(X_test)

In [29]:
# Fraction of test e-mails whose predicted class equals the true class.
score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [30]:
score

0.9903381642512077

Observation: Pretty good score

# Test with manual input from Original data

In [31]:
dataset.head()

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


In [33]:
# Vectorize a shortened copy of the "neon retreat" e-mail (row 2 above, labelled 0)
# with the already-fitted vectorizer; transform() expects an iterable of documents.
sample_email = ['Subject: neon retreat\r\nho ho h']
input_data = feature_extraction.transform(sample_email)

In [34]:
print(input_data)

  (0, 38949)	0.05244825820390441
  (0, 35164)	0.404810775239387
  (0, 29274)	0.2970755632717548
  (0, 21711)	0.8632053789036724


In [35]:
logistic_model.predict(input_data)

array([0])

Observation: The prediction (0 = ham) matches the label of this e-mail in the original data.

# SVC

In [38]:
# Fit a support-vector classifier with default settings on the TF-IDF features
# and measure its accuracy on the held-out test split.
# NOTE: y_pred and score are reused here, replacing the logistic-regression values.
svc_model = SVC().fit(X_train, y_train)
y_pred = svc_model.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)


In [39]:
score

0.991304347826087

Observation: Performance is high with SVC as well

# Pickling both models for further deployment.

In [41]:
import pickle

In [42]:
# Persist the trained logistic-regression model. The context manager guarantees
# the file handle is flushed and closed (the bare open() call left it dangling).
# NOTE(review): the fitted TfidfVectorizer (feature_extraction) is not saved here,
# yet the model was trained on its output - confirm it is persisted for deployment.
with open('logistic.pkl', 'wb') as model_file:
    pickle.dump(logistic_model, model_file)

In [43]:
# Persist the trained SVC model. The context manager guarantees the file handle
# is flushed and closed (the bare open() call left it dangling).
# NOTE(review): the fitted TfidfVectorizer (feature_extraction) is not saved here,
# yet the model was trained on its output - confirm it is persisted for deployment.
with open('svc.pkl', 'wb') as model_file:
    pickle.dump(svc_model, model_file)