In [None]:
import re
from warnings import simplefilter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

simplefilter(action='ignore', category=FutureWarning)

# Load the datasets

* Three different datasets will be loaded into DataFrames
* The datasets can be downloaded from https://www.kaggle.com/nitishabharathi/email-spam-dataset

In [None]:
# Read the three CSV files into DataFrames, in the order they are named below.
dataset1, dataset2, dataset3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/email-spam-dataset/completeSpamAssassin.csv',
        '../input/email-spam-dataset/enronSpamSubset.csv',
        '../input/email-spam-dataset/lingSpam.csv',
    )
)

In [None]:
# Report the (rows, columns) shape of each loaded dataset.
for number, frame in enumerate((dataset1, dataset2, dataset3), start=1):
    print('Dataset {} shape:'.format(number), frame.shape)

In [None]:
# Preview the first rows of dataset1 (loaded from completeSpamAssassin.csv).
dataset1.head()

In [None]:
# Preview the first rows of dataset2 (loaded from enronSpamSubset.csv).
dataset2.head()

In [None]:
# Preview the first rows of dataset3 (loaded from lingSpam.csv).
dataset3.head()

### Sample email

In [None]:
# Body text of the first row of dataset1, shown as a sample email.
dataset1.iloc[0]['Body']

# Data Preparation

* Drop unnecessary columns
* Concatenate the three datasets
* Check for null values

### Drop unnecessary columns

In [None]:
# Keep only the email text and its label from each dataset.
kept_columns = ['Body', 'Label']

dataset1_1 = dataset1.loc[:, kept_columns]
dataset2_1 = dataset2.loc[:, kept_columns]
dataset3_1 = dataset3.loc[:, kept_columns]

In [None]:
# First three rows of dataset1 reduced to the Body and Label columns.
dataset1_1.head(3)

In [None]:
# First three rows of dataset2 reduced to the Body and Label columns.
dataset2_1.head(3)

In [None]:
# First three rows of dataset3 reduced to the Body and Label columns.
dataset3_1.head(3)

### Concatenate the three datasets

In [None]:
# Stack the three reduced datasets on a fresh 0..n-1 index, then rename the
# columns to the names used by the rest of the notebook.
stacked = pd.concat([dataset1_1, dataset2_1, dataset3_1], ignore_index=True)
df = stacked.rename(columns={'Body': 'Email', 'Label': 'Spam'})

In [None]:
# (rows, columns) of the combined DataFrame.
df.shape

### Check for null values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Discard every row that has a missing value in any column, then show the
# resulting (rows, columns) shape.

df = df.dropna(axis=0, how='any')
df.shape

In [None]:
# Preview the combined data after rows with missing values were dropped.
df.head()

# Feature Engineering

* New feature for email length
* New feature for number of special characters in email
* New feature for number of digits in email
* New feature for number of spam trigger words present in email

In [None]:
# Work on an independent copy: plain assignment would only alias `df`, so every
# feature column added to `df2` below would silently appear on `df` as well.
df2 = df.copy()

### New feature for email length

In [None]:
# Character count of every email, stored as a new column.
email_length = [len(email) for email in df2['Email']]

df2['Email length'] = email_length

### New feature for number of special characters in email

In [None]:
# For each email, count the characters that are neither letters nor digits
# (str.isalnum() is False for them, so whitespace and punctuation are included).
special_characters = [
    sum(1 for char in email if not char.isalnum())
    for email in df2['Email']
]

df2['Special characters'] = special_characters

### New feature for digit count in email

In [None]:
# For each email, count the characters for which str.isdigit() is True.
digits = [
    sum(1 for char in email if char.isdigit())
    for email in df2['Email']
]

df2['Digits'] = digits

### New feature for number of spam trigger words present in email

In [None]:
# Words treated as spam triggers; every occurrence in an email is counted.
spam_trigger_words = ['buy','click','get','free','order','save','limited']

# Match whole words only, ignoring case. Word boundaries are used instead of
# splitting on whitespace so that tokens with punctuation attached
# (e.g. "FREE!" or "click,") are counted too; a plain split() misses them.
trigger_word_pattern = re.compile(
    r'\b(?:' + '|'.join(map(re.escape, spam_trigger_words)) + r')\b',
    flags=re.IGNORECASE,
)

# One count per email, in row order.
trigger_words = [
    len(trigger_word_pattern.findall(email))
    for email in df2['Email']
]

df2['Trigger words'] = trigger_words

In [None]:
# Preview the data together with the engineered feature columns.
df2.head()

# Exploratory Data Analysis

* Email distribution
* Email length of spam and not spam emails
* Special characters in emails
* Digit count in spam and not spam emails
* Spam trigger words present in spam and not spam emails

### Email distribution

In [None]:
# Count of emails per value of the Spam column; the two tick positions are
# relabelled with readable class names.
ax = sns.countplot(data=df2, x='Spam')
ax.set_title('Email distribution')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not spam', 'Spam'])
ax.set_xlabel('Emails')
plt.show()

* We can see that the two classes are reasonably balanced

### Email length of spam and not spam emails

In [None]:
# From here on, display floats without decimals (this is a global pandas
# display option), then summarise the email lengths.
pd.options.display.float_format = lambda value: '%.0f' % value

df2['Email length'].describe()

In [None]:
# Average number of characters

# One bar per value of the Spam column; the tick positions are relabelled
# with readable class names.
ax = sns.barplot(data=df2, x='Spam', y='Email length')
ax.set(title='Average number of characters', xlabel='Emails')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not spam', 'Spam'])
plt.show()

We can see that some emails are very long, especially among the non-spam emails.

### Special characters in emails

In [None]:
# Summary statistics of the per-email special-character counts.
df2['Special characters'].describe()

In [None]:
# Average number of special characters

# One bar per value of the Spam column; the tick positions are relabelled
# with readable class names.
ax = sns.barplot(data=df2, x='Spam', y='Special characters')
ax.set(title='Average number of special characters', xlabel='Emails')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not spam', 'Spam'])
plt.show()

### Digit count of spam and not spam emails

In [None]:
# Summary statistics of the per-email digit counts.
df2['Digits'].describe()

In [None]:
# Average digit count

# One bar per value of the Spam column; the tick positions are relabelled
# with readable class names.
ax = sns.barplot(data=df2, x='Spam', y='Digits')
ax.set(title='Average digit count', xlabel='Emails')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not spam', 'Spam'])
plt.show()

We can see that a large number of non-alphanumeric characters and digits are present in the emails.

### Spam trigger words present in spam and not spam emails

In [None]:
# Summary statistics of the per-email trigger-word counts.
df2['Trigger words'].describe()

In [None]:
# Trigger words in emails

# One bar per value of the Spam column; the tick positions are relabelled
# with readable class names.
ax = sns.barplot(data=df2, x='Spam', y='Trigger words')
ax.set(title='Trigger words in email', xlabel='Emails')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not spam', 'Spam'])
plt.show()

We can see that spam trigger words are definitely present in spam emails.

# Data Preprocessing

* Remove subject, tabs and new lines
* Remove special characters and digits
* Convert emails into lower case
* Tokenize the emails by words / split by words
* Remove stopwords
* Lemmatize words
* Build corpus of emails
* Remove emails that are too short or too long
* Create vectors using TF-IDF

In [None]:
# Importing essential libraries for data preprocessing and nlp
import re
import nltk
# Download only the corpora used below. A bare nltk.download() opens the
# interactive downloader, which stalls a non-interactive "Run All".
# 'omw-1.4' is included because some NLTK releases need it alongside 'wordnet'
# for lemmatization; downloading it is harmless where it is not required.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Keep only emails whose length is strictly greater than 15 and strictly
# less than 2000 characters, then show the resulting shape.

is_long_enough = df2['Email length'] > 15
is_short_enough = df2['Email length'] < 2000

df3 = df2[is_short_enough & is_long_enough]
df3.shape

In [None]:
# Data Cleaning
# Builds `corpus`: one cleaned, space-joined string per row of df3, in row order.
corpus = []
lemmatizer = WordNetLemmatizer()
# Load the stopword list once, as a set. Calling stopwords.words('english') for
# every token repeats the corpus lookup and a linear list search each time,
# which makes the loop needlessly slow.
english_stopwords = set(stopwords.words('english'))

for email in df3['Email']:
    # Remove subject, tabs and new lines
    # ('|' is a literal inside a character class, so it is not needed here)
    removed_tabs_newline = re.sub('[\n\t]',' ',email)
    removed_subject = re.sub('Subject:',' ',removed_tabs_newline)

    # Remove special characters and digits (anything that is not an ASCII letter)
    removed_spchar_digits = re.sub('[^a-zA-Z]',' ',removed_subject)

    # Convert emails into lower case
    lower_case_email = removed_spchar_digits.lower()

    # Tokenize the emails by words / split by words
    tokenized_email = lower_case_email.split()

    # Remove stopwords
    filtered_words = [word for word in tokenized_email if word not in english_stopwords]

    # Lemmatize words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    # Build corpus of emails
    corpus.append(' '.join(lemmatized_words))

In [None]:
# Creating vectors using TF-IDF

# Vocabulary is capped at 5000 terms; the sparse result is converted to a dense array.
tfidf = TfidfVectorizer(max_features=5000)
vectors = tfidf.fit_transform(corpus).toarray()
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name only where the new one is missing.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Extracting independent and dependent variables from the dataset
X = pd.DataFrame(vectors, columns=feature_names)
# X gets a fresh 0..n-1 index, while df3 keeps the index labels of the rows that
# survived filtering. Reset y's index so X and y are aligned by label as well as
# by position (corpus was built in df3 row order).
y = df3['Spam'].reset_index(drop=True)

# Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Hold out 30% of the rows for testing; random_state=0 fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [None]:
# Fit a multinomial Naive Bayes model on the training split and report the
# value of its score() method on the held-out test split.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

test_score = classifier.score(X_test, y_test)
print('Model score on test data:', test_score)

In [None]:
# 5-fold cross-validation on the training split. The held-out test split is
# reserved for the final evaluation, so it must not be used to fit the fold models.
cross_val_score(classifier, X_train, y_train, cv=5)

### Confusion matrix and Classification Report

In [None]:
# Predicted labels for the held-out test split.
y_predicted = classifier.predict(X_test)

In [None]:
# Confusion matrix of actual (first argument) versus predicted (second argument)
# labels on the test split.
confusion_matrix_result = confusion_matrix(y_test,y_predicted)
confusion_matrix_result

In [None]:
# Draw the confusion matrix as an annotated heatmap with readable class names
# on both axes.
labels = ['Not spam','Spam']
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(
    confusion_matrix_result,
    annot=True,
    fmt='.0f',
    cmap='Reds',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_xlabel('Predicted values')
ax.set_ylabel('Actual values')
plt.show()

In [None]:
# Text report of the classification metrics, computed from the actual and
# predicted test labels.
classification_report_result = classification_report(y_test,y_predicted)
print(classification_report_result)