# **Email Spam Detection**

The objective of this project is to create an email spam detection system that uses machine learning algorithms to classify incoming emails. By training the model on a labeled dataset of spam and non-spam emails, we aim to develop an accurate and efficient spam detector that can reliably identify and categorize emails based on their content and characteristics.

In [1]:
#import statements

import pickle

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Mount Google Drive into the Colab filesystem so the dataset can be read from it.

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Load the spam dataset from the mounted Google Drive.

dataset_path = '/content/drive/MyDrive/ColabNotebooks/Datasets/spam.csv'

# The file has values that are not compatible with the default encoding,
# so it is decoded as ISO-8859-1 instead.
df = pd.read_csv(
    dataset_path,
    encoding="ISO-8859-1",
)

# Note: per the original author, an externally converted UTF-8 copy of this file
# exists next to it as 'encoded-spam.csv'; it is not used in this notebook.

In [4]:
# Preview the first rows of the freshly loaded frame to see which columns it contains.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Deleting the unnecessary/empty data columns.
# drop(..., errors="ignore") returns a new frame and skips columns that are
# already gone, so re-running this cell no longer raises KeyError the way
# `del df[...]` does on its second execution.

df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")

In [6]:
# Preview the first rows again to check which columns remain.
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Renaming the data columns to descriptive names.
# An explicit old -> new mapping does not depend on the number or order of the
# columns, unlike assigning a positional list to df.columns, and it is a no-op
# when the cell is re-run.

df = df.rename(columns={"v1": "category", "v2": "text"})

In [8]:
# Display the frame (pandas renders a truncated head/tail view) to check the column names.
df

Unnamed: 0,category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [9]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  5572 non-null   object
 1   text      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [10]:
# Summary statistics per column: count, number of unique values, most frequent value and its frequency.
df.describe()

Unnamed: 0,category,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [11]:
# Checking for null values: count the missing entries in each column.

null_counts = df.isna().sum()
null_counts

category    0
text        0
dtype: int64

In [12]:
# Creating a numeric target column so that the labels can easily be fitted to a
# model: 1 = spam, 0 = ham.
# The labels are mapped explicitly instead of treating "anything that is not
# 'spam'" as ham, so an unexpected or misspelled category fails loudly rather
# than being silently trained as ham.

label_to_target = {"ham": 0, "spam": 1}
unexpected = set(df["category"].unique()) - set(label_to_target)
if unexpected:
    raise ValueError(f"Unexpected category values: {unexpected!r}")

df["spam"] = df["category"].map(label_to_target).astype("int64")
df.head()

Unnamed: 0,category,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [13]:
# Separate the model input (the message text) from the target (the 0/1 spam flag).

X = df["text"]
Y = df["spam"]

In [14]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split reproducible between runs.
# NOTE(review): the classes are imbalanced, so passing stratify=Y may be worth
# considering - it would change the split and therefore the reported scores.

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)

In [15]:
# Converting the textual data into bag-of-words count vectors.
# The vocabulary is fitted on the training split only and then reused to
# transform the test split.

cv = CountVectorizer()
X_train_transformed = cv.fit_transform(X_train.values)

X_test_transformed = cv.transform(X_test.values)

# Show the test matrix in dense form (this is the cell's displayed output).
# The training matrix is deliberately not densified: the former bare
# `X_train_transformed.toarray()` statement built a large dense array only to
# discard it immediately, costing time and memory for nothing.
X_test_transformed.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [16]:
# Train a multinomial Naive Bayes classifier on the training count vectors.

model = MultinomialNB()
model.fit(X=X_train_transformed, y=Y_train)

In [17]:
# Mean accuracy on the training split (the same data the model was fitted on).

model.score(X=X_train_transformed, y=Y_train)

0.9932690150325331

In [18]:
# Mean accuracy on the held-out test split.
# NOTE(review): most messages are ham (4825 of 5572 in the full dataset), so
# accuracy alone can flatter the model; consider also reporting precision and
# recall for the spam class.

model.score(X=X_test_transformed, y=Y_test)

0.9874439461883409

In [20]:
# Exporting the trained artefacts using pickle (imported in the top import cell).
# The classifier was fitted on count vectors produced by `cv`, so the fitted
# vectorizer is saved alongside the model; without it the pickled model cannot
# be applied to new raw text.
# NOTE: only unpickle these files from a trusted source - pickle.load can
# execute arbitrary code.

with open("SpamDetectionModel.pkl", "wb") as file:
    pickle.dump(model, file)

with open("SpamDetectionVectorizer.pkl", "wb") as file:
    pickle.dump(cv, file)