In this notebook, I build a deep learning model using the Keras API.

I use an Embedding layer, which learns dense word vectors (word2vec-style embeddings) for the text features, followed by an LSTM layer and a fully connected layer. The accuracy of the model on new data is 89%.



### Importing the required modules/packages

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import re
import nltk
from nltk.corpus import stopwords
import string
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
#from mlxtend.feature_selection import ColumnSelector
from sklearn.compose import ColumnTransformer
nltk.download('stopwords')
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gunne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`

### Loading file and looking into the dimensions of data

In [None]:
# Widen the column display so long SMS texts stay readable in previews.
pd.set_option("display.max_colwidth", 100)

# Column names are supplied explicitly: first column is the label, second the message.
raw_data = pd.read_csv("SMSSpamData.csv", names=["label", "text"])

print(f"Shape of Data --> {raw_data.shape}\n")
raw_data.head()

In [None]:
### Label distribution as proportions (ham = messages which are not spam)
pd.crosstab(raw_data['label'],columns = 'label',normalize=True)
#raw_data.head()

## **Data Processing**

### Functions to Create new features and cleaning the data

In [None]:
## Percentage Count
def punct_pc(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Args:
        text: The message to inspect.

    Returns:
        A float between 0 and 100. Returns 0.0 when ``text`` contains no
        non-space characters (empty or spaces only), where the plain formula
        would divide by zero.
    """
    # Only plain spaces are excluded from the denominator.
    non_space = len(text) - text.count(' ')
    if non_space == 0:
        return 0.0
    punct_count = sum(1 for char in text if char in string.punctuation)
    return (punct_count / non_space) * 100

## Stem
def clean_data(text):
    """Lower-case, strip punctuation, tokenize, drop stop words and stem.

    Args:
        text: Raw message string.

    Returns:
        List of Porter-stemmed tokens that are not English stop words.
    """
    # Iterating a string yields characters, so this drops punctuation characters.
    no_punct = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: '\W' inside a normal literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # Build the stop-word set and the stemmer once per call rather than once per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

## CAPS 
def count_caps(text):
    """Return the number of upper-case characters in ``text``.

    NOTE(review): the original definition had no body (a syntax error). This
    implementation follows the function name - confirm it is the intended feature.
    """
    return sum(1 for char in text if char.isupper())





### Train and Test Split 

In [None]:
## Hold out 20% of the rows as the test set (fixed random_state for repeatable splits)
X_train, X_test, y_train, y_test = train_test_split(
    raw_data[['text']],
    raw_data['label'],
    test_size=0.2,
    random_state=23,
)

In [None]:
# Display the training labels.
y_train

In [None]:
## Convert the target to binary: 1 where the label is 'spam', 0 otherwise
y_train, y_test = (
    np.where(labels == "spam", 1, 0) for labels in (y_train, y_test)
)

In [None]:
######################## Create new features for Train and Test Data ########################
#- Two new features are created -
#- 1) text_length (number of non-space characters in the text)
#- 2) punct_pc (the percentage of punctuation among those characters)

# The frames returned by train_test_split are derived from raw_data and pandas
# may flag them as copies of a slice; take explicit copies so the column
# assignments below do not raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

## Train Data
X_train["punct_pc"] = X_train["text"].apply(punct_pc)
X_train["text_length"] = X_train["text"].apply(lambda x: len(x) - x.count(' '))

## Test Data
X_test["punct_pc"] = X_test["text"].apply(punct_pc)
X_test["text_length"] = X_test["text"].apply(lambda x: len(x) - x.count(' '))



### Pipeline to process the Text Data

- Tokenization
- Cleaning
- Normalization
- Lemmatization
- Stemming

Custom Transformer to select the Columns

In [None]:
## Custom transformer to select features
class ColumnExtractor(TransformerMixin):
    """Stateless transformer that pulls ``cols`` out of a DataFrame as a Series."""

    def __init__(self, cols):
        # Column label used to index the incoming DataFrame in transform().
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing to learn; return self so the object can be used in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X[self.cols]`` wrapped in a ``pd.Series`` (X is assumed to be a DataFrame)."""
        selected = X[self.cols]
        return pd.Series(selected)


In [None]:
import numpy as np
import multiprocessing as mp

import string
import spacy 
import en_core_web_sm
from nltk.tokenize import word_tokenize
from sklearn.base import TransformerMixin, BaseEstimator


# Load the spaCy `en_core_web_sm` pipeline once; TextPreprocessor._preprocess_text uses this global.
nlp = en_core_web_sm.load()


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that cleans a pandas Series of raw texts.

    Each text is optionally normalised, parsed with the module-level spaCy
    pipeline ``nlp``, stripped of punctuation and stop-word tokens, and
    returned as a space-joined string of lemmas.
    """

    def __init__(self,
                 variety="BrE",
                 user_abbrevs=None,
                 n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Text normalization
            2. Punctuation removal
            3. Stop words removal
            4. Lemmatization

        variety - format of date (AmE - american type, BrE - british format)
        user_abbrevs - dict of user abbreviations mappings (from normalise
            package); None (the default) means no user abbreviations
        n_jobs - parallel jobs to run: -1 or lower uses all cores, 0 runs
            in-process, a positive value is capped at the number of cores
        """
        self.variety = variety
        # Stored as given (None instead of a shared mutable {} default);
        # resolved to a dict at use time in _normalize().
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """No-op: the transformer is stateless. Returns self."""
        return self

    def transform(self, X, *_):
        """Return a Series with every text in ``X`` preprocessed.

        ``X`` is copied first. When only one partition is needed the work is
        done in-process; otherwise ``X`` is split into partitions that are
        processed by a multiprocessing pool and concatenated again.
        """
        X_copy = X.copy()

        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            partitions = 1
        else:
            partitions = min(self.n_jobs, cores)

        # A single partition gains nothing from a worker pool, so skip the
        # process start-up and pickling overhead entirely.
        if partitions == 1:
            return X_copy.apply(self._preprocess_text)

        data_split = np.array_split(X_copy, partitions)
        # One worker per partition is enough; always release the pool, even
        # when a worker raises.
        pool = mp.Pool(partitions)
        try:
            data = pd.concat(pool.map(self._preprocess_part, data_split))
        finally:
            pool.close()
            pool.join()

        return data

    def _preprocess_part(self, part):
        """Preprocess every text of one partition."""
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        """Normalise, parse with spaCy, drop punctuation and stop words, lemmatise."""
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)

        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        """Best-effort normalisation; falls back to the unchanged text on any error."""
        # NOTE(review): `normalise` is not imported anywhere in the visible
        # file; unless it is provided elsewhere the call raises NameError and
        # the raw text is returned.
        abbrevs = {} if self.user_abbrevs is None else self.user_abbrevs
        try:
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=abbrevs, verbose=False))
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.
            return text

    def _remove_punct(self, doc):
        """Keep tokens whose text is not a substring of ``string.punctuation``."""
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        """Keep tokens that spaCy does not flag as stop words."""
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        """Join the lemmas of the remaining tokens with single spaces."""
        return ' '.join([t.lemma_ for t in doc])



# Numeric branch: standardise the two engineered features and drop every other column.
num_cols = ["punct_pc", "text_length"]
Column_trans = ColumnTransformer(
    transformers=[("scaler", StandardScaler(), num_cols)],
    remainder="drop",
)

num_pipe = Pipeline(steps=[("scaler", Column_trans)])

In [None]:
# Preview the first rows of the training frame.
X_train.head()

In [None]:
# Text branch: select the "text" column, then run it through TextPreprocessor.
Preprocess_text = Pipeline(steps=[
    ("select_text", ColumnExtractor(cols="text")),
    ("preprocess", TextPreprocessor()),
])

# Combined pipeline: a FeatureUnion of the text branch and the numeric branch.
preprocessing_pipeline = Pipeline(steps=[
    ("feat_union", FeatureUnion(transformer_list=[
        ("text_pipeline", Preprocess_text),
        ("num_pipeline", num_pipe),
    ])),
])

**Preprocess the Train and Test data using the pipeline**

In [None]:
# Fit the pipeline on the training data only; the test data is merely
# transformed so that nothing is ever learned from held-out rows.
X_train_preprocessed = Preprocess_text.fit_transform(X_train)
X_test_preprocessed = Preprocess_text.transform(X_test)

Calculate the size of the vocabulary (i.e. the number of unique words in the entire training corpus). This is needed for the `input_dim` parameter of the Embedding layer.

In [None]:
from collections import Counter
# Count unique words
def counter_word(text):
    """Count how often each whitespace-separated word occurs across ``text``.

    Args:
        text: Object exposing ``.values`` as an iterable of strings
            (e.g. a pandas Series of sentences).

    Returns:
        ``collections.Counter`` mapping each word to its number of occurrences.
    """
    return Counter(word for sentence in text.values for word in sentence.split())

# Word frequencies over the preprocessed training texts; the number of
# distinct words is the vocabulary size.
counter = counter_word(X_train_preprocessed)
vocab_size = len(counter)

In [None]:
# The maximum sequence length is 30: sequences above this length are trimmed
# and sequences below this length are padded with zeros.
max_length = 30
# Used as Tokenizer(num_words=...) and as the Embedding input size:
# the vocabulary size plus a margin of 1000.
num_words = vocab_size + 1000

Tokenize the sentences using the Keras `Tokenizer` class. The tokenizer assigns a unique integer ID to each word in the entire training set.

In [None]:
from keras.preprocessing.text import Tokenizer
# Build the word -> id mapping from the preprocessed training texts.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train_preprocessed)
# Replace each training text with its list of word ids.
X_train_sequences = tokenizer.texts_to_sequences(X_train_preprocessed)

In [None]:
## The word -> id mapping created by the tokenizer can be inspected via word_index
word_index = tokenizer.word_index
#print(word_index)

Use `pad_sequences` to pad (or truncate) the sequences so that they all have the same length.

In [None]:
from keras.preprocessing.sequence import pad_sequences
# padding="post" / truncating="post": zeros are appended at the end and
# over-long sequences are cut at the end, so every row has max_length ids.
X_train_padded = pad_sequences(
    X_train_sequences, maxlen=max_length, padding="post", truncating="post"
)

In [None]:
## Apply the same transformation to the test set, reusing the tokenizer
## fitted on the training texts and the same padding settings
test_sequences = tokenizer.texts_to_sequences(X_test_preprocessed)
test_padded = pad_sequences(
    test_sequences, maxlen=max_length, padding="post", truncating="post"
)

## Deep Learning model training

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam

# Embedding (32-dim vectors) -> LSTM (64 units) -> one sigmoid output unit.
model = Sequential([
    Embedding(num_words, 32, input_length=max_length),
    LSTM(64, dropout=0.1),
    Dense(1, activation="sigmoid"),
])

optimizer = Adam(learning_rate=3e-4)

# Binary cross-entropy matches the single sigmoid output.
model.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Train for 10 epochs; the held-out test split doubles as validation data.
# Trained_model holds the return value of fit(), not the model itself.
Trained_model = model.fit(
    X_train_padded, y_train, epochs=10, validation_data=(test_padded, y_test)
)

In [None]:
# Persist the trained model to disk.
model.save("trained_model.h5")

# **Predict on New Test Data**

In [None]:
# Load the new evaluation set with the same label/text column names (file is read as ISO-8859-1).
test_data = pd.read_csv("Test_Emails.csv",names=['label','text'], encoding='ISO-8859-1')

In [None]:
# (rows, columns) of the new evaluation set.
test_data.shape

In [None]:
# Features: keep "text" as a one-column DataFrame (the pipeline selects it by name).
new_X_test = test_data.loc[:,["text"]]
# Target: the first column, i.e. "label".
new_Y_test = test_data.iloc[:,0]

In [None]:
# (rows, columns) of the new feature frame.
new_X_test.shape

In [None]:
# Encode the new labels as 1 (spam) / 0 (not spam). The source must be
# new_Y_test: no variable named `Y_test` is defined in this notebook.
new_Y_test = np.where(new_Y_test == 'spam', 1, 0)

**Preprocess the new test data by running it through the preprocessing pipeline**

In [None]:
## Run the new texts through the preprocessing pipeline (transform only, no fitting)
new_X_test_processes = Preprocess_text.transform(new_X_test)

from keras.preprocessing.text import Tokenizer
# Reuse the tokenizer that was fitted on the training texts. Fitting a fresh
# Tokenizer on the new data would assign word ids that differ from the ones
# the Embedding layer was trained on, making the predictions meaningless.
new_X_test_sequences = tokenizer.texts_to_sequences(new_X_test_processes)


from keras.preprocessing.sequence import pad_sequences
new_X_test_padded = pad_sequences(
    new_X_test_sequences, maxlen=max_length, padding="post", truncating="post"
)

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single
# sigmoid output, threshold the predicted probability at 0.5 instead.
predict = (model.predict(new_X_test_padded) > 0.5).astype("int32")

In [None]:
## Performance metrics: classification report for the predictions on the new test data
from sklearn.metrics import classification_report
print(classification_report(new_Y_test,predict))

In [None]:
from sklearn.metrics import confusion_matrix
# First argument is the true labels, second the predicted labels.
print(confusion_matrix(new_Y_test,predict))