# <div style="text-align: center; background-color: #649eff; color: white; padding: 14px; line-height: 1;border-radius:10px">📮EDA & Classification on Spam Email Dataset</div>

![image](https://thumbs.dreamstime.com/b/spam-mail-printed-wooden-cube-spam-mail-printed-wooden-cubes-193211215.jpg)

<cite>Image source: https://www.dreamstime.com/photos-images/spam-mail.html

> <h2> 1. About Dataset </h2>

<br>
The dataset <b>'Spam Email'</b> contains <b>2 columns</b>, each are:
<br>

* <b>Category</b>:     Whether it is spam or ham

* <b>Message</b>:      context of message

<a id="1"></a>
# <div style="text-align: left; background-color: #78aaff; color: white; padding: 10px; line-height: 1;border-radius:20px">1. Load Necessary Libraries and Dataset</div>

In [52]:
import numpy as np
import pandas as pd
import nltk

In [53]:
df = pd.read_csv('Email Spam.csv')

In [54]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Checking for missing values

In [55]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

<a id="1"></a>
# <div style="text-align: left; background-color: #78aaff; color: white; padding: 10px; line-height: 1;border-radius:20px">2. Data Preprocessing</div>

#### Tokenization

In [56]:
from nltk.tokenize import word_tokenize

In [57]:
#nltk.download('punkt')

In [58]:
df['Message_tokenized'] = df['Message'].apply(lambda x:word_tokenize(x))

#### Removing Punctuations

In [59]:
#nltk.download('punkt')

In [60]:
import string

In [61]:
df['Message_non_punctuations'] = df['Message_tokenized'].apply(lambda x: [word for word in x if word not in string.punctuation])

In [62]:
# Removing '...' and '..' from Message_non_puncutations
dot_punctuations = ['..','...']
df['Message_non_punctuations'] = df['Message_non_punctuations'].apply(lambda x : " ".join([word for word in x if word not in dot_punctuations]))

#### Convertion to lowercase

In [63]:
df['Message_lowercased'] = df['Message_non_punctuations'].apply(lambda x: " ".join(word.lower() for word in x.split()))

<a id="1"></a>
# <div style="text-align: left; background-color: #78aaff; color: white; padding: 10px; line-height: 1;border-radius:20px">3. Text Analysis of the Data</div>

#### As of now , will be considering Message_lowercased as the column for analysis after partial text cleaning

#### Word Count

In [64]:
df['Word_Count'] = df['Message_lowercased'].apply(lambda x:len(x))

#### Average Word length

In [65]:
for i,word in enumerate(df['Message_lowercased']):
    if len(word) == 0:
        print(word,i)

 3376
 4824


In [66]:
df.iloc[3376:3377]

Unnamed: 0,Category,Message,Message_tokenized,Message_non_punctuations,Message_lowercased,Word_Count
3376,ham,:),"[:, )]",,,0


In [67]:
df.iloc[4824:4825]

Unnamed: 0,Category,Message,Message_tokenized,Message_non_punctuations,Message_lowercased,Word_Count
4824,ham,:-) :-),"[:, -, ), :, -, )]",,,0


#### These 2 records will now be treated as null , because they dont give any value for our analysis

In [68]:
condition = (df['Word_Count'] == 0)

In [69]:
df_null_removed = df[~condition]

In [70]:
def AverageLength(x):
    words = x.split()
    return sum(len(word) for word in words) / len(words)  

In [71]:
df['Average_Word_Length'] = df_null_removed['Message_lowercased'].apply(lambda x: AverageLength(x))

#### Stopwords Count and Rate

In [72]:
from nltk.corpus import stopwords

In [73]:
stop_words = stopwords.words('english')
df['stopwords_count'] = df['Message_lowercased'].apply(lambda x: len([word for word in x.split() if word in stop_words]))
df['stopwords_rate'] = df['stopwords_count'] / df['Word_Count']

<a id="1"></a>
# <div style="text-align: left; background-color: #78aaff; color: white; padding: 10px; line-height: 1;border-radius:20px">4. Data Cleaning</div>

#### Removing the null records permanently from dataframe

In [74]:
df = df[~condition]

#### Removing other punctuations that might be present

In [75]:
df['Message_punctuations_removed'] = df['Message_lowercased'].str.replace('[^\w\s]', '')

  df['Message_punctuations_removed'] = df['Message_lowercased'].str.replace('[^\w\s]', '')


#### Removing Stopwords

In [76]:
df['Message_stopwords_removed'] = df['Message_punctuations_removed'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))

#### Converting Abbreviations from Top 50 recurring words to meaninful words manually

In [77]:
pd.Series(" ".join(df['Message_stopwords_removed']).split()).value_counts()[:50]

u         1175
call       577
2          489
ur         391
get        387
nt         381
gt         318
lt         316
4          301
ok         285
go         282
free       278
know       262
got        252
like       246
good       243
come       232
time       217
day        211
love       207
want       194
send       192
text       189
one        176
ü          173
going      173
txt        169
need       167
home       163
lor        162
r          161
see        159
sorry      159
still      156
stop       155
today      153
back       153
dont       152
n          151
da         149
reply      147
hi         139
mobile     138
tell       137
new        136
take       135
later      135
please     132
think      132
pls        126
dtype: int64

#### Converting the maximum recurring words to meaningful words manually

In [78]:
# Done on based on top 50 recurring words
abbreviation_mapping = {
    'u': 'you',
    '2':'to',
    'ur': 'your',
    'n': 'and',
    'gt': 'great',
    'lt': 'little',
    'nt':'not',
    '4':'for',
    'ü':'you',
    'txt':'text',
    'r':'are',
    'da':'the',
    'pls':'please'
}

In [79]:
def RemoveAbbreviations(x):
    words = x.split()
    updated_words = [abbreviation_mapping.get(word,word) for word in words]
    return " ".join(updated_words)

In [80]:
df['Message_abreviations_treated'] = df['Message_stopwords_removed'].apply(RemoveAbbreviations)

#### Lemmatization

In [81]:
#pip install spacy

In [82]:
#!python -m spacy download en_core_web_sm : To download the model

In [83]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [84]:
def lemmatize_text(text):
    doc = nlp(text)
    lemmatized_text = " ".join([word.lemma_ for word in doc])
    return lemmatized_text

In [85]:
df['Message_lemmatized'] = df['Message_abreviations_treated'].apply(lambda x: lemmatize_text(x))

#### Converting Spam to 1 and Ham to 0 (Target column encoding)

In [86]:
df['Spam']=df['Category'].apply(lambda x:1 if x=='spam' else 0)

<a id="1"></a>
# <div style="text-align: left; background-color: #78aaff; color: white; padding: 10px; line-height: 1;border-radius:20px">5. Model Training and Evaluation</div>

In [87]:
X = df['Message_lemmatized']
y = df['Spam']

#### Text to Vector conversion

In [88]:
from sklearn.feature_extraction.text import CountVectorizer

In [89]:
cv = CountVectorizer()
cv_fit = cv.fit_transform(X)
X_df = cv_fit.toarray()

#### Train and Test Data split

In [90]:
from sklearn.model_selection import train_test_split

#### Since the dataset is imbalanced , we are using stratify = y , to split the data equally between train and test based on distribution of target column

In [91]:
X_train,X_test,y_train,y_test=train_test_split(X_df,y,test_size=0.2,random_state=42,stratify=y)

#### Model Dictionary

In [92]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [128]:
models = {
    
    'Logistic regressor' : LogisticRegression(),
    'Multinomial Naive Bayes' : MultinomialNB(),
    'Support Vector Classifier' : SVC()
    
}

#### Evaluation metrics used for getting the best model

In [133]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [134]:
def evaluate_model(models):
    evaluate_report = {}
    for model_name, model in models.items():
        model.fit(X_train,y_train)
        preds = model.predict(X_test)
        roc_score = roc_auc_score(preds,y_test)
        F1 = f1_score(preds,y_test)
        evaluate_report[model_name] = {
            'ROC_AUC score' : roc_score,
            'F1 score' : F1
        }
        
    return evaluate_report

In [135]:
report = evaluate_model(models)

In [136]:
print(report)

{'Logistic regressor': {'ROC_AUC score': 0.9908443540183113, 'F1 score': 0.9357142857142857}, 'Multinomial Naive Bayes': {'ROC_AUC score': 0.9435813854332145, 'F1 score': 0.9066666666666667}, 'Support Vector Classifier': {'ROC_AUC score': 0.9878665318503539, 'F1 score': 0.9124087591240876}}


#### On checking the scores, it is evident that Logistic Regression is the best model for prediction