In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import re
import contractions # I'll => I will

from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim 

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer
from nltk import WordNetLemmatizer

In [None]:
data = pd.read_csv("./data/completeSpamAssassin.csv")

# Label encoding: 1 = spam, 0 = ham.
# Quick overview of the raw frame: row count, shape, per-column null counts
# and the first few rows, printed in that order.
for overview_item in (
    f"-- Number of examples: {len(data)}",
    f"-- Shape: {data.shape}",
    f"-- Null elements: {data.isnull().sum()}",
    data.head(),
):
    print(overview_item)

In [None]:
# Show the rows whose Body is missing before discarding incomplete rows.
missing_body_mask = data.Body.isna()
print(data[missing_body_mask])
# The dropna() defaults, spelled out: remove every row that has a NaN in any column.
data = data.dropna(axis=0, how="any")

In [None]:
# Summary statistics of the Body column (displayed as the cell output).
data["Body"].describe()

In [None]:
print(type(data.Body))

# Positional access: after dropna() the index keeps its original labels (with
# gaps), so the label lookup data.Body[0] raises KeyError whenever row 0 was
# one of the dropped rows. .iloc[0] always returns the first remaining body.
data.Body.iloc[0]

In [None]:
# Class balance: number of examples carrying each label (1 = spam, 0 = ham).
spam_count = len(data[data.Label == 1])
ham_count = len(data[data.Label == 0])
print(f"Spam: {spam_count}")
print(f"Ham: {ham_count}")

# Two-bin histogram of the label column: first bar is label 0, second is label 1.
bins = ["ham", "spam"]  # NOTE: not passed to hist() below, which uses bins=2
fig, ax = plt.subplots(1, 1)
ax.hist(data.Label, bins=2, facecolor='green', edgecolor='gray')
ax.set_ylabel("Nb of examples")
# Recolour the two bars individually and reuse them as legend handles.
ham_bar, spam_bar = ax.patches[0], ax.patches[1]
ham_bar.set_facecolor("blue")
spam_bar.set_facecolor("yellow")
ax.legend(handles=[ham_bar, spam_bar], labels=['Ham (0)', 'Spam (1)'])
ax.set_title("Spam/Ham")
plt.show();

### Text Cleaning

In [None]:
nltk.download("stopwords")
# Read the word list through nltk.corpus instead of the imported `stopwords`
# name: this cell rebinds `stopwords` to a plain list, so the original
# `stopwords.words(...)` raised AttributeError when the cell was run a second
# time. The variable name is kept because later cells pass it along.
stopwords = nltk.corpus.stopwords.words("english")
print(stopwords)

In [None]:
def cleaning_text(data, stopwords):
    """Normalise a Series of raw email bodies into space-joined, stemmed tokens.

    Pipeline, applied to every element in this order:
      1. replace simple HTML tags (``<br>``, ``<p/>`` ...) with a space,
      2. replace every non-alphabetic character (digits included) with a space,
      3. tokenize with ``word_tokenize``,
      4. drop tokens found in ``stopwords`` (case-insensitive),
      5. Porter-stem each token,
      6. WordNet-lemmatize each token,
    and finally join the tokens back into one string.

    Args:
        data: pandas Series of strings (one email body per element).
        stopwords: iterable of words to remove; ``None`` or empty disables
            stop-word filtering.

    Returns:
        pandas Series with the same index as ``data`` holding the cleaned text.
    """
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    # Lower-cased set for fast, case-insensitive membership tests.
    stop_set = {word.lower() for word in stopwords} if stopwords else set()

    # Raw strings avoid the invalid-escape warnings that '\w' / '\d' trigger.
    html_tag_re = re.compile(r'(<[\w\s]*/?>)')
    non_alpha_re = re.compile(r'[^a-zA-Z]')

    operations = [
        # Tags must go first: once '<' and '>' have been blanked by the
        # non-alphabetic pass, the tag pattern can never match. Substituting a
        # space (not "") keeps the words on either side of a tag separated.
        lambda text: html_tag_re.sub(" ", text),
        # Also removes digits, so no separate digit pass is required.
        lambda text: non_alpha_re.sub(" ", text),
        word_tokenize,
        # Filter before stemming: stemming alters words ("this" -> "thi"), so
        # stemmed tokens would no longer match the stop-word list.
        lambda tokens: [tok for tok in tokens if tok.lower() not in stop_set],
        lambda tokens: [ps.stem(tok) for tok in tokens],
        lambda tokens: [lemmatizer.lemmatize(tok) for tok in tokens],
    ]

    cleaned = data
    for op in tqdm(operations):
        cleaned = cleaned.apply(op)

    return cleaned.apply(" ".join)

In [None]:
# Run the cleaning pipeline over every email body and confirm the result type.
cleaned_body = cleaning_text(data["Body"], stopwords)
print(type(cleaned_body))

In [None]:
# Pair each cleaned body with its label. The labels are passed as a plain list,
# so they are matched to the bodies by position and the frame takes its index
# from cleaned_body.
processed_columns = {
    "Body": cleaned_body,
    "Label": data.Label.tolist(),
}
data_processed = pd.DataFrame(processed_columns)
data_processed

In [None]:
def data_body_to_string(data, spam):
    """Concatenate the bodies of all spam (or all ham) rows into one string.

    Args:
        data: DataFrame with a string ``Body`` column and a ``Label`` column
            in which 1 marks spam and 0 marks ham.
        spam: truthy to select the spam rows, falsy to select the ham rows.

    Returns:
        The selected bodies joined by single spaces; ``""`` when no row matches.
    """
    # Normalise the flag to the integer encoding used by the Label column. The
    # original computed this value but then filtered on the raw `spam` argument.
    label = 1 if spam else 0
    selected_bodies = data.loc[data.Label == label, "Body"]
    return " ".join(selected_bodies.tolist())

In [None]:
# One long string per class (spam first, then ham), used as word-cloud input.
data_spam_string, data_ham_string = (
    data_body_to_string(data_processed, spam=is_spam) for is_spam in (True, False)
)

In [None]:
# Both clouds share the same settings: same stop-word list, collocations enabled.
wordcloud_options = dict(stopwords=stopwords, collocations=True)
wc_spam = WordCloud(**wordcloud_options).generate(data_spam_string)
wc_ham = WordCloud(**wordcloud_options).generate(data_ham_string)

In [None]:
# Render the spam word cloud on a square canvas with the axes hidden.
plt.figure(figsize=(12,12))
# 'bilinear' is the documented spelling; the mixed-case 'bilInear' was accepted
# only because matplotlib lowercases the interpolation name.
plt.imshow(wc_spam, interpolation='bilinear')
plt.title("Spam Email words")
plt.axis('off')
plt.show()

In [None]:
# Render the ham word cloud on a square canvas with the axes hidden.
plt.figure(figsize=(12,12))
# 'bilinear' is the documented spelling; the mixed-case 'bilInear' was accepted
# only because matplotlib lowercases the interpolation name.
plt.imshow(wc_ham, interpolation='bilinear')
plt.title("Ham Email words")
plt.axis('off')
plt.show()

### Training

In [None]:
# Stratified 70/30 split so both partitions keep the spam/ham ratio.
# random_state is pinned so a Restart & Run All reproduces the same partition;
# without it every run trains and evaluates on different rows.
X_train, X_test, y_train, y_test = train_test_split(
    data_processed.Body,
    data_processed.Label,
    stratify=data_processed.Label,
    test_size=0.3,
    random_state=42,
)

### Model 
embedding layer => bidirectional => GRU

#### Embedding Layer
![Embedding](img/embedding_layer.png) <br>
The embedding layer is a lookup table that maps each token index to a vector; this vector represents the word in a higher-dimensional space.

---

#### Bidirectional
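Two recurrent networks (e.g. 2 LSTM networks) that read the sequence in opposite directions, one forward and one backward.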

---

#### GRU
Gated Recurrent Units. 
