# Spam Email Classifier Using Decision Tree

# Import Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import math
import nltk
import string
import sklearn

# Reading Data

In [2]:
df = pd.read_csv("G:\CSE\CSE 475\Spam Email Classifier.csv", encoding="ISO-8859-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df.shape

(5572, 5)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


# Standardizing Data

In [5]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

df.rename(columns={'v1': 'label', 'v2': 'email'},inplace=True)

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])

df.head()

Unnamed: 0,label,email
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.isnull().sum()

label    0
email    0
dtype: int64

In [7]:
df.duplicated().sum()

403

In [8]:
df = df.drop_duplicates(keep='first')
df.duplicated().sum()

0

In [9]:
df.shape

(5169, 2)

In [10]:
df['label'].value_counts()

0    4516
1     653
Name: label, dtype: int64

In [11]:
df['characters'] = df['email'].apply(len)
df.head()

Unnamed: 0,label,email,characters
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


In [12]:
df['words'] = df['email'].apply(lambda x:len(nltk.word_tokenize(x)))
df.head()

Unnamed: 0,label,email,characters,words
0,0,"Go until jurong point, crazy.. Available only ...",111,24
1,0,Ok lar... Joking wif u oni...,29,8
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37
3,0,U dun say so early hor... U c already then say...,49,13
4,0,"Nah I don't think he goes to usf, he lives aro...",61,15


In [13]:
df['sentences'] = df['email'].apply(lambda x:len(nltk.sent_tokenize(x)))
df.head()

Unnamed: 0,label,email,characters,words,sentences
0,0,"Go until jurong point, crazy.. Available only ...",111,24,2
1,0,Ok lar... Joking wif u oni...,29,8,2
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2
3,0,U dun say so early hor... U c already then say...,49,13,1
4,0,"Nah I don't think he goes to usf, he lives aro...",61,15,1


In [14]:
df[['characters','words','sentences']].describe()

Unnamed: 0,characters,words,sentences
count,5169.0,5169.0,5169.0
mean,78.977945,18.455794,1.965564
std,58.236293,13.324758,1.448541
min,2.0,1.0,1.0
25%,36.0,9.0,1.0
50%,60.0,15.0,1.0
75%,117.0,26.0,2.0
max,910.0,220.0,38.0


# Processing Data

In [15]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [16]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [18]:
def process_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i)) 
    return " ".join(y)

df['processed_email'] = df['email'].apply(process_text)
df.head()

Unnamed: 0,label,email,characters,words,sentences,processed_email
0,0,"Go until jurong point, crazy.. Available only ...",111,24,2,go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,29,8,2,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,49,13,1,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",61,15,1,nah think goe usf live around though


# Implementing Spam Function 

In [19]:
spam_corpus = []
for msg in df[df['label'] ==1] ['processed_email'].tolist():
    for word in msg.split():
        spam_corpus.append(word)
len(spam_corpus)

9939

In [20]:
from collections import Counter
pd.DataFrame(Counter(spam_corpus).most_common(30))

Unnamed: 0,0,1
0,call,320
1,free,191
2,2,155
3,txt,141
4,text,122
5,u,119
6,ur,119
7,mobil,114
8,stop,104
9,repli,103


In [21]:
ham_corpus = []
for msg in df[df['label'] ==0] ['processed_email'].tolist():
    for word in msg.split():
        ham_corpus.append(word)

len(ham_corpus)

35404

In [22]:
from collections import Counter
pd.DataFrame(Counter(ham_corpus).most_common(30))

Unnamed: 0,0,1
0,u,883
1,go,404
2,get,349
3,gt,288
4,lt,287
5,2,284
6,come,275
7,got,236
8,know,236
9,like,234


# Count Vectorization

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(df['processed_email']).toarray()
X.shape

(5169, 6708)

In [24]:
y = df['label'].values
y

array([0, 0, 1, ..., 0, 0, 0])

# Train Test Split

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [26]:
X_train.shape

(4135, 6708)

In [27]:
X_test.shape

(1034, 6708)

# Decision Tree

In [28]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=5)

In [29]:
from sklearn.metrics import accuracy_score, confusion_matrix,precision_score
dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)
print('Accuracy: ')
print(accuracy_score(y_test,y_pred))

Accuracy: 
0.9235976789168279


In [30]:
print('Precision:')
print(precision_score(y_test,y_pred))

Precision:
0.9154929577464789


In [31]:
print('Confusion Matrix:')
print(confusion_matrix(y_test,y_pred))

Confusion Matrix:
[[890   6]
 [ 73  65]]


In [32]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.99      0.96       896
           1       0.92      0.47      0.62       138

    accuracy                           0.92      1034
   macro avg       0.92      0.73      0.79      1034
weighted avg       0.92      0.92      0.91      1034

