# Email Spam Detection

In [1]:
#Importing Libraries
import numpy as np
import pandas as pd

In [2]:
#Loading the dataset
spam_df=pd.read_csv(r"spam.csv",header=0)
spam_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
spam_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
#Checking for null values
spam_df.isnull().sum()

Category    0
Message     0
dtype: int64

In [5]:
#Shape of the dataset
spam_df.shape

(5572, 2)

In [6]:
spam_df['Category']=spam_df['Category'].replace({'spam':'1','ham':'0'})
spam_df['Category']=spam_df['Category'].astype(int)

In [7]:
spam_df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
spam_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   int32 
 1   Message   5572 non-null   object
dtypes: int32(1), object(1)
memory usage: 65.4+ KB


In [9]:
#Identifying Independent and Target variables
y=spam_df['Category']
x=spam_df['Message']

In [10]:
x.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [11]:
y.head()

0    0
1    0
2    1
3    0
4    0
Name: Category, dtype: int32

In [13]:
#count the target var by 0 or 1
#and their (>=10:1 then it is imbalanced datset)
Target_count=spam_df['Category'].value_counts()
print(Target_count)
print('Class 0:',Target_count[0])
print('Class 1:',Target_count[1])
print('Proportion:',round(Target_count[0]/Target_count[1],2),':1')
print('Total email records:',len(spam_df))  #balanced dataset

0    4825
1     747
Name: Category, dtype: int64
Class 0: 4825
Class 1: 747
Proportion: 6.46 :1
Total bank records: 5572


# Splitting the data

In [14]:
# Split the data into train and test (random sampling)

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Display the shape for train & test data

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((4179,), (1393,), (4179,), (1393,))

In [15]:
y_test

3245    0
944     0
1044    0
2484    0
812     0
       ..
668     0
218     0
5536    0
1657    0
3875    0
Name: Category, Length: 1393, dtype: int32

# Count Vectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
v=CountVectorizer()
x_train_count=v.fit_transform(x_train.values)

In [17]:
x_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Naive Bayes-Multinomial Classifier

In [18]:
from sklearn.naive_bayes import MultinomialNB
modelMNB=MultinomialNB()
modelMNB.fit(x_train_count,y_train)

In [19]:
emails=[
    "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
    "You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p pÂ£3.99"
]
emails_count=v.transform(emails)
modelMNB.predict(emails_count)

array([0, 1])

In [20]:
x_test_count=v.transform(x_test)
modelMNB.score(x_test_count,y_test)

0.9885139985642498

In [21]:
y_pred=modelMNB.predict(x_test_count)

In [22]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [24]:
Results = pd.DataFrame({'Spam_A':y_test, 'Spam_P':y_pred})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = spam_df.merge(Results, left_index=True, right_index=True)

# Display 10 records randomly

ResultsFinal.sample(10)


Unnamed: 0,Category,Message,Spam_A,Spam_P
4387,0,", im .. On the snowboarding trip. I was wonder...",0,0
691,0,Was the farm open?,0,0
1488,0,I told your number to gautham..,0,0
1020,0,Don know..wait i will check it.,0,0
1777,1,Call FREEPHONE 0800 542 0578 now!,1,1
1632,0,We not watching movie already. Xy wants 2 shop...,0,0
5275,0,Oh yeah clearly it's my fault,0,0
3082,0,Have a great trip to India. And bring the ligh...,0,0
748,1,U are subscribed to the best Mobile Content Se...,1,1
3211,0,"She said,'' do u mind if I go into the bedroom...",0,0
