 # **Email Spam Detection**
  
  Dataset source: Kaggle (Email Spam Collection Dataset)

  An email spam collection dataset is a curated dataset containing a collection of emails, typically categorized into two main classes: spam and non-spam (or ham). These datasets are used for various purposes, including the development and evaluation of spam email filters, machine learning algorithms, and data analysis.

# **Importing Libraries**

In [31]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [32]:
import warnings
warnings.filterwarnings("ignore")

# **Importing Dataset**

In [33]:
# Load the raw spam collection CSV into the dataframe `df`.
# The encoding is given explicitly (presumably because the file is not valid UTF-8).
df = pd.read_csv("/content/spam.csv", encoding="ISO-8859-1")

In [34]:
# Preview the first rows of the raw dataframe to inspect its columns
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [35]:
# Count the missing values in each column
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [36]:
# Drop the three "Unnamed" columns, which are almost entirely null in this
# dataset. This removes columns, not rows; `df` itself is left untouched.
df1 = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])

In [37]:
# Confirm that only the label (v1) and message (v2) columns remain
df1.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [38]:
# Give the two remaining columns descriptive names. rename() returns a new
# frame that is assigned back to df1, rather than mutating it with inplace=True.
df1 = df1.rename(columns={"v1": "Spam or Ham", "v2": "Mail Message"})

In [39]:
# Check the renamed columns
df1.head()

Unnamed: 0,Spam or Ham,Mail Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [40]:
# (number of rows, number of columns) of the trimmed dataframe
df1.shape

(5572, 2)

# **Converting the Categorical values into numerical values**

In [41]:
# Replace the text labels in "Spam or Ham" with integer codes.
# LabelEncoder numbers the sorted class names, so for the two labels seen in
# this data ham becomes 0 and spam becomes 1.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(df1["Spam or Ham"])
df1["Spam or Ham"] = encoder.transform(df1["Spam or Ham"])

In [42]:
# Count the rows that exactly repeat an earlier row (same label and same message)
df1.duplicated().sum()

403

In [43]:
# Remove repeated rows, keeping the first occurrence of each (the default).
# The original index labels are preserved, so the index has gaps afterwards.
df1 = df1.drop_duplicates()

In [44]:
# Verify that no duplicate rows remain
df1.duplicated().sum()

0

# **Separating the features and the target**

In [45]:
# Separate the model input from the target:
# X holds the raw message text, y holds the encoded label column.
X, y = df1["Mail Message"], df1["Spam or Ham"]

# **Splitting the data into training and testing data**

In [46]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [47]:
# Convert the text data into a matrix of token counts.
# The vocabulary is learned from the training messages only (fit_transform on
# X_train); other text must go through cv.transform so it uses the same columns.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train.values)
# NOTE(review): toarray() builds the full dense matrix just for display;
# X_train_count itself is left unchanged and is what the model is trained on.
X_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# **Implementing the model**

In [48]:
# Fitting Multinomial naive Bayes on the training token counts and labels
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_count, y_train)

In [49]:
from sklearn.metrics import confusion_matrix , recall_score , precision_score
from sklearn.metrics import accuracy_score

# **Testing the model on a sample message**

In [50]:
# Testing the model on a single hand-written mail (spam/ham).
# The message is vectorised with cv.transform (not fit_transform), so it is
# encoded with the vocabulary learned from the training data.

mail_ham = ['Same. Wana plan a trip sometme then']
mail_ham_count = cv.transform(mail_ham)
y_pred = model.predict(mail_ham_count)
y_pred

array([0])

# **Finding accuracy of training and test dataset**

In [51]:
# Finding the accuracy of the model on the training dataset
model.score(X_train_count, y_train)

0.9929866989117292

In [52]:
# Finding the accuracy of the model on the test dataset.
# The test messages are only transformed (never fitted), so they are encoded
# with the vocabulary learned from the training data.
X_test_count = cv.transform(X_test)
model.score(X_test_count, y_test)

0.9816247582205029