In [None]:
import re
import nltk
import spacy
import gensim
import itertools

from nltk import ngrams
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Importing model architecture
from bilstm_model_architecture import BiLSTMClassifier

In [None]:
torch.cuda.is_available()

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU so the notebook still runs end to end
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
df = pd.read_csv('../0. Sample Datasets/spam_sample.csv')

In [None]:
pd.set_option('display.max_column', None) 
df.head()

In [None]:
df.info()

In [None]:
df.v1.value_counts()

## We will clean all data (regardless of test/val/train) with the same process before proceeding

In [None]:
# Lemmatizer shared by every call to the text-cleaning function below
lem = nltk.WordNetLemmatizer()

# Stored as a set: the cleaning function tests every token for membership,
# and a set lookup is constant-time where a list would be scanned linearly.
stop_words = set(nltk.corpus.stopwords.words('english'))

def preprocessing(sentence):
    """Clean one raw message and return it as a single space-joined string.

    Steps, in order: lower-case, strip a fixed set of punctuation symbols,
    drop any whitespace-delimited chunk containing '@', drop digit runs,
    split on whitespace, remove stop words and lemmatise what is left.

    Relies on the module-level ``lem`` (lemmatizer) and ``stop_words``.

    Args:
        sentence: the raw text to clean (must support ``.lower()``).

    Returns:
        The cleaned text; an empty string if nothing survives the filters.
    """
    sent = sentence.lower()

    # Removing selected symbols (digits are left alone here and stripped further down).
    # Raw strings are used for all patterns so the regex escapes are not
    # interpreted as (invalid) Python string escapes.
    sent = re.sub(r"[()/\-#!?.,\"']", "", sent)

    # Removing emails: any run of non-space characters containing '@'
    sent = re.sub(r"\S*@\S*\s?", "", sent)

    # Removing numbers
    sent = re.sub(r"\d+", "", sent)

    words = sent.split()  # Splitting on whitespace

    # Lemmatisation and stopword removal
    kept = [lem.lemmatize(word) for word in words if word not in stop_words]

    return " ".join(kept)

In [None]:
print('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')

In [None]:
preprocessing('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')

In [None]:
preprocessing('Ok lar... Joking wif u oni...')

## Split data

In [None]:
X = df.v2
y = df.v1

In [None]:
# Fixed seed so the train/val/test partition is the same on every run
SPLIT_SEED = 42

# First hold out 20% of the data, then halve that hold-out into validation and test.
# Both splits are stratified so each subset keeps the class proportions of its parent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test, random_state=SPLIT_SEED)

In [None]:
y_train.value_counts()

In [None]:
y_val.value_counts()

In [None]:
y_test.value_counts()

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
# Replace the shuffled original row labels with a fresh 0..n-1 index on every split,
# so inputs and targets are easy to line up by position in the cells that follow.
X_train, X_val, X_test = (split.reset_index(drop=True) for split in (X_train, X_val, X_test))
y_train, y_val, y_test = (split.reset_index(drop=True) for split in (y_train, y_val, y_test))

In [None]:
X_train.head()

In [None]:
y_train.head()

## Binarising the targets

In [None]:
lb = LabelBinarizer()
y_train_ohe = lb.fit_transform(y_train)
y_val_ohe = lb.transform(y_val)
y_test_ohe = lb.transform(y_test)

In [None]:
lb.classes_

In [None]:
y_train.head()

In [None]:
y_train_ohe[:5]

## Preprocessing all input text data

In [None]:
X_train = [preprocessing(i) for i in X_train]
X_val = [preprocessing(i) for i in X_val]
X_test = [preprocessing(i) for i in X_test]

In [None]:
X_test[:2]

## Tokenising input text

In [None]:
def tokenizer_padding(input_series, max_len, vocabulary=None, train=False):
    """Turn whitespace-tokenised texts into a fixed-width matrix of token ids.

    Each text is split on whitespace, every token is mapped to its id in the
    vocabulary (unknown tokens map to id 1, '<UNK>'), and each sequence is
    left-padded with id 0 ('<PAD>') or truncated to its LAST ``max_len``
    tokens.

    Args:
        input_series: list of strings, or any object exposing ``.tolist()``.
        max_len: number of columns in the output; must be >= 1.
        vocabulary: token -> id mapping; required when ``train`` is False and
            ignored (rebuilt from ``input_series``) when ``train`` is True.
        train: when True, build the vocabulary from ``input_series``.

    Returns:
        ``train=True``:  (ids array of shape (n_texts, max_len), vocabulary, vocabulary size)
        ``train=False``: ids array of shape (n_texts, max_len)

    Raises:
        ValueError: if ``max_len`` is smaller than 1.
        AssertionError: if no (or an empty) vocabulary is available.
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    if not isinstance(input_series, list):
        input_series = input_series.tolist()

    tokenized = [text.split() for text in input_series]

    if train:
        # Building vocabulary.
        # The words are sorted so the token -> id mapping is identical on every
        # run (set iteration order for strings changes between interpreter
        # sessions, which would silently invalidate saved embedding weights).
        # The special tokens are excluded from the corpus words so ids stay
        # contiguous even if a text happens to contain them.
        special_tokens = ['<PAD>', '<UNK>']
        corpus_words = set(itertools.chain.from_iterable(tokenized)) - set(special_tokens)
        unique_words = special_tokens + sorted(corpus_words)
        vocabulary = {word: idx for idx, word in enumerate(unique_words)}

    assert vocabulary, "a non-empty vocabulary is required when train=False"

    pad_id, unk_id = 0, 1

    # Encoding and padding
    document = []
    for tokens in tokenized:
        ids = [vocabulary.get(token, unk_id) for token in tokens]
        if len(ids) <= max_len:
            # Left-pad so the real tokens sit at the end of the sequence
            ids = [pad_id] * (max_len - len(ids)) + ids
        else:
            # Keep only the last max_len tokens
            ids = ids[-max_len:]
        document.append(ids)

    # reshape keeps the (n_texts, max_len) contract even when there are no texts
    padded = np.array(document, dtype=int).reshape(-1, max_len)

    if train:
        return padded, vocabulary, len(vocabulary)
    return padded

In [None]:
# Defining parameters for modeling
# max_length is the fixed sequence length passed to tokenizer_padding as max_len.
# NOTE(review): seq_len and n_units are not referenced in any other visible cell —
# presumably intended for the model architecture; confirm or remove.
max_length = seq_len = n_units = 150
# NOTE(review): d_features is not referenced in any other visible cell; the model is
# built with d_features=embedding_matrix.shape[1] instead — confirm whether this is stale.
d_features = 32

In [None]:
X_train_padded_doc, X_train_vocab, X_train_vocab_size = tokenizer_padding(X_train, max_len=max_length, train=True)
X_val_padded_doc = tokenizer_padding(X_val, max_len=max_length, vocabulary=X_train_vocab)
X_test_padded_doc = tokenizer_padding(X_test, max_len=max_length, vocabulary=X_train_vocab)

In [None]:
X_train_padded_doc[:1]

In [None]:
X_test_padded_doc[:1]

In [None]:
# Convert list to tensors
train_X = torch.tensor(X_train_padded_doc)
train_y = torch.tensor(y_train_ohe.astype(float))

test_X = torch.tensor(X_test_padded_doc)
test_y = torch.tensor(y_test_ohe.astype(float))

val_X = torch.tensor(X_val_padded_doc)
val_y = torch.tensor(y_val_ohe.astype(float))

## Preparing data for training

In [None]:
# Define batch size
batch_size = 16

# FOR TRAINING
# Wrap tensors
train_data = TensorDataset(train_X, train_y)

# Sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# Dataloader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)



# FOR VALIDATING
# Wrap tensors
val_data = TensorDataset(val_X, val_y)

# Sampler for sampling the data during validation for training
val_sampler = SequentialSampler(val_data)

# Dataloader for val set
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

## Loading pre-trained Word2Vec embeddings (no resampling; class imbalance is handled later through the loss `pos_weight`)

In [None]:
# Forward slashes work on Windows as well as POSIX and avoid the invalid
# backslash escape sequences ('\P', '\w', '\G', ...) of a non-raw string.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../../../Pre-Trained Models/word2vec/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
# Getting embedding matrix for pre-trained Word2Vec model
# Row i holds the pre-trained vector of the training-vocabulary word whose id is i.
# Words missing from Word2Vec keep an all-zero row.
# Vectors are looked up directly in the KeyedVectors object, so the full
# pre-trained vocabulary is never copied into a separate Python dict.
embedding_dim = w2v_model.vector_size

# Create a weight matrix for words in training docs
embedding_matrix = np.zeros((X_train_vocab_size, embedding_dim))
for word, idx in X_train_vocab.items():
    if word in w2v_model.key_to_index:
        embedding_matrix[idx] = w2v_model.get_vector(word)

embedding_matrix.shape

In [None]:
X_train_vocab_size

## Defining model

In [None]:
model = BiLSTMClassifier(d_features=embedding_matrix.shape[1], embedding_matrix=embedding_matrix, vocab_size=X_train_vocab_size)

In [None]:
model = model.to(device)

In [None]:
print (model)

In [None]:
from torch.optim import AdamW

# Define optimiser
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
y_train.value_counts()

In [None]:
# pos_weight for the loss = (number of negative samples) / (number of positive samples).
# Counted from the binarised targets so "positive" is exactly the class encoded as 1,
# rather than relying on integer keys into a label-indexed value_counts() result.
n_positive = int(y_train_ohe.sum())
n_negative = len(y_train_ohe) - n_positive
weight = np.array(n_negative / n_positive)

In [None]:
weight

In [None]:
# Converting list of class weights to a tensor
weights = torch.tensor(weight, dtype=torch.float)

# Push weights to GPU
weights = weights.to(device)

# Define loss function
cross_entropy = nn.BCEWithLogitsLoss(pos_weight=weights)

# No of training epochs
epochs = 30

## Define Training & Evaluation Functions

In [None]:
def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the module-level ``model``, ``optimizer``, ``cross_entropy`` loss
    and ``device``. Progress is printed every 50 batches.

    Returns:
        (avg_loss, total_preds): the mean per-batch loss over the epoch and
        the raw model outputs for every sample, concatenated along axis 0
        in the order the batches were drawn.
    """
    model.train()

    total_loss = 0

    # Empty list to save model predictions
    total_preds = []

    # Iterate over batches
    for step, batch in enumerate(train_dataloader):
        # Progress update for every 50 batches
        if step % 50 == 0 and step != 0:
            print ('Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))

        # Push batch to the selected device
        sent_id, labels = [tensor.to(device) for tensor in batch]

        # Clear previously calculated gradients
        model.zero_grad()

        # Get model predictions for the current batch
        preds = model(sent_id)

        # Compute loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # Add on to the total loss
        total_loss += loss.item()

        # Backward pass to calculate gradients
        loss.backward()

        # Clip gradient norm to 1.0 to guard against exploding gradients.
        # clip_grad_norm_ is the in-place successor of the deprecated clip_grad_norm.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters
        optimizer.step()

        # Detach from the graph and move to host memory before storing
        total_preds.append(preds.detach().cpu().numpy())

    # Compute training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    # Stack the per-batch outputs into one array with a row per sample
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
def evaluate(dataloader=None):
    """Compute loss and raw model outputs over a dataloader without updating weights.

    Uses the module-level ``model``, ``cross_entropy`` loss and ``device``.
    Progress is printed every 50 batches.

    Args:
        dataloader: iterable of (inputs, labels) batches to score.
            Defaults to the module-level ``val_dataloader``.

    Returns:
        (avg_loss, total_preds): the mean per-batch loss and the raw model
        outputs for every sample, concatenated along axis 0.
    """
    if dataloader is None:
        dataloader = val_dataloader

    print ('\nEvaluating...')

    # Deactivate dropout layers
    model.eval()

    total_loss = 0

    # Empty list to save model predictions
    total_preds = []

    # Autograd is switched off for the whole pass: nothing here needs gradients
    with torch.no_grad():
        # Iterate over batches
        for step, batch in enumerate(dataloader):
            # Progress update for every 50 batches
            if step % 50 == 0 and step != 0:
                print ('Batch {:>5,} of {:>5,}.'.format(step, len(dataloader)))

            # Push batch to the selected device
            sent_id, labels = [tensor.to(device) for tensor in batch]

            # Model predictions
            preds = model(sent_id)

            # Compute the loss between actual and predicted values
            loss = cross_entropy(preds, labels)
            total_loss += loss.item()

            total_preds.append(preds.cpu().numpy())

    # Compute the average loss over all batches
    avg_loss = total_loss / len(dataloader)

    # Stack the per-batch outputs into one array with a row per sample
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

## Iterate through training loop

In [None]:
import os

# Set initial loss to infinite
best_valid_loss = float('inf')

# Where the best weights are written. The folder is created up front so the
# first checkpoint does not fail after a full epoch of training.
checkpoint_path = 'saved_model_weights/pytorch_bilstm.pt'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Empty lists to store training and validation loss of each epoch
train_losses = []
valid_losses = []

# For each epoch
for epoch in range(epochs):
    print ('\nEpoch {:}/ {:}'.format(epoch+1, epochs))

    # Train model
    train_loss, _ = train()

    # Evaluate model
    valid_loss, _ = evaluate()

    # Save the best model (lowest validation loss seen so far)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), checkpoint_path)

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print (f"\nTraining Loss: {train_loss:.5f}")
    print (f"Validation Loss: {valid_loss:.5f}")

## Visualise training and validation loss

In [None]:
import matplotlib.pyplot as plt

# One point per epoch; labelled so the figure can be read on its own
fig, ax = plt.subplots()
ax.plot(train_losses, 'g', label='Training loss')
ax.plot(valid_losses, 'r', label='Validation loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training vs validation loss')
ax.legend();

## Model evaluation

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

In [None]:
model_path = 'saved_model_weights/pytorch_bilstm.pt'

# map_location remaps the stored tensors onto the device in use, so weights
# saved on a GPU can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))

In [None]:
# Inference only: make sure dropout is off and skip building the autograd graph,
# which would otherwise hold activations for the whole test set in memory.
model.eval()
with torch.no_grad():
    y_preds = model(test_X.to(device))

In [None]:
# The model outputs pre-sigmoid scores (during training the sigmoid was applied
# inside the loss function), so convert them to probabilities here.
y_preds = torch.sigmoid(y_preds)

In [None]:
y_preds = y_preds.detach().cpu().numpy()

In [None]:
y_preds[:5]

In [None]:
# Threshold the probabilities at 0.5: values >= 0.5 become class 1, everything else class 0
y_hat = (y_preds>=0.5).astype(int)

In [None]:
print (classification_report(y_test_ohe, y_hat, target_names=lb.classes_))

In [None]:
# ROC AUC measures ranking quality, so it must be given the predicted probabilities;
# thresholded 0/1 labels would reduce the curve to a single operating point.
roc_auc_score(y_test_ohe.ravel(), y_preds.ravel())

In [None]:
accuracy_score(y_test_ohe, y_hat)