# Simple RNN, LSTM and GRU implementation. 
# A pipeline for solving a spam-detection task using pretrained GloVe embeddings.

## Data preparation

In [None]:
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn
import re
import fasttext.util
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

In [None]:
# Load the SMS spam CSV, keeping only the label column (v1) and the message text (v2).
INPUT_PATH = '/kaggle/input/sms-spam-collection-dataset/spam.csv'
# NOTE(review): this Kaggle CSV is commonly reported not to be valid UTF-8, in which case
# the default decoder raises UnicodeDecodeError -- confirm against the file. latin-1 accepts
# every byte, and bytes outside ASCII never decode to a-z under it, so they are stripped by
# the regex below anyway.
data = pd.read_csv(INPUT_PATH, encoding='latin-1')[['v1', 'v2']]
# Binary target: 1 where the label is 'spam', 0 otherwise.
data['target'] = (data['v1'] == 'spam').astype(int)
# Lower-case each message, then collapse every run of non a-z characters into one space.
data['v2'] = data['v2'].apply(lambda w: re.sub(r'[^a-z]+', ' ', w.lower()))

In [None]:
# Preview the first rows (DataFrame.head shows 5 rows by default).
data.head()

In [None]:
# Boolean mask of the messages that still contain text after cleaning.
not_null_ind = data['v2'].apply(lambda text: bool(text.strip()))
# Surviving message texts and their labels, kept in the same order.
sentences = data.loc[not_null_ind, 'v2'].values
pre_target = data.loc[not_null_ind, 'target'].values.astype(int)
# Whitespace-tokenise every message and keep only non-empty token lists.
words = [tokens for tokens in (sentence.split() for sentence in sentences) if tokens]

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so that the
# `.to(device)` calls in the rest of the notebook do not fail on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
%%time
# Parse the GloVe text file into {word: float32 vector}. Each line holds a word followed
# by its whitespace-separated vector components.
embeddings_dict = {}
# The encoding is passed explicitly so parsing does not depend on the platform default.
# NOTE(review): the file is presumably UTF-8 encoded -- confirm.
with open("/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.50d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [None]:
# Turn every tokenised message into a (num_known_words, 1, embedding_dim) float32 tensor
# on `device`, and collect the label of each message that is kept.
embeddings = []
target = []
for sentence, label in zip(words, pre_target):
    # Look each word up in GloVe; words without a vector are skipped.
    looked_up = (embeddings_dict.get(word) for word in sentence)
    sent_emb = [vector for vector in looked_up if vector is not None]
    if not sent_emb:
        # No known word in this message: drop it together with its label.
        continue
    # np.stack builds one contiguous (num_known_words, embedding_dim) array, which
    # torch.from_numpy wraps without copying. Passing a nested list of ndarrays to
    # torch.Tensor(...) produces the same values but is far slower.
    sentence_tensor = torch.from_numpy(np.stack(sent_emb)).unsqueeze(1)
    embeddings.append(sentence_tensor.to(device))
    target.append(label)

In [None]:
# Sanity check: number of sentence tensors, number of labels, shape of the first tensor.
(len(embeddings), len(target), embeddings[0].size())

In [None]:
class CustomDataset(Dataset):
    """Pair a sequence of samples ``X`` with a label tensor built from ``y``.

    The labels are converted to a float tensor and moved to the module-level
    ``device``; the samples are stored as given.
    """

    def __init__(self, X, y):
        labels = torch.Tensor(y)
        self.X, self.y = X, labels.to(device)

    def __len__(self):
        # One entry per label.
        return self.y.shape[0]

    def __getitem__(self, idx):
        sample, label = self.X[idx], self.y[idx]
        return sample, label

In [None]:
# Wrap the sentence tensors and labels, then iterate over them one message at a time
# in shuffled order.
training_data = CustomDataset(X=embeddings, y=target)
train_dataloader = DataLoader(dataset=training_data, batch_size=1, shuffle=True)

## Simple RNN

In [None]:
class Head(nn.Module):
    """Classification head: a single linear unit followed by a sigmoid.

    Maps an input whose last dimension is ``input_size`` to one value in (0, 1).
    """

    def __init__(self, input_size):
        super().__init__()
        self.layer = nn.Linear(in_features=input_size, out_features=1)
        self.act = nn.Sigmoid()

    def forward(self, x):
        pre_activation = self.layer(x)
        return self.act(pre_activation)

In [None]:
class RNN(nn.Module):
    """Single recurrent step: tanh of a linear map of the input joined with the hidden state.

    ``forward(x, hidden)`` concatenates ``x`` and ``hidden`` along dim 1, so both
    must be 2-D with the same first dimension.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        joint_size = input_size + hidden_size
        self.Lo = nn.Linear(joint_size, output_size)
        self.tanh = nn.Tanh()

    def forward(self, x, hidden):
        joined = torch.cat((x, hidden), dim=1)
        pre_activation = self.Lo(joined)
        return self.tanh(pre_activation)

In [None]:
# Hyper-parameters and model objects for the simple RNN experiment.
hidden_size = 50
epochs = 3
# The embedding dimension is the size of the last axis of any sentence tensor.
rnn_model = RNN(embeddings[0].shape[-1], hidden_size, hidden_size).to(device)
head_model = Head(hidden_size).to(device)
# One optimizer updates the recurrent cell and the classification head together.
optimizer = torch.optim.Adam( list(rnn_model.parameters()) + list(head_model.parameters()) )
# Head already ends in nn.Sigmoid, so its output is a probability. BCEWithLogitsLoss
# would apply a second sigmoid to it; BCELoss is the loss that expects probabilities.
criterion = nn.BCELoss()

In [None]:
# Train the simple RNN and its head, taking one optimisation step per message.
for epoch in range(epochs):

    losses = []
    for X_in, y_in in train_dataloader:

        rnn_model.zero_grad()
        head_model.zero_grad()

        # Every message starts from a fresh zero hidden state.
        ht = torch.zeros(1, hidden_size).to(device)

        # X_in[0] drops the batch axis added by the DataLoader; the cell is then fed
        # one word vector per step.
        for x_in in X_in[0]:
            ht = rnn_model(x_in, ht)

        # Classify the message from the final hidden state.
        out = head_model(ht)
        loss = criterion( out, y_in.unsqueeze(1) )

        loss.backward()
        optimizer.step()

        # Read the scalar once; NaN losses are left out of the epoch average.
        loss_value = loss.item()
        if not np.isnan(loss_value):
            losses.append(loss_value)

    # Mean training loss over the epoch.
    print(np.mean(losses))

## LSTM

In [None]:
class LSTM(nn.Module):
    """Gated recurrent cell with a cell state and a built-in sigmoid classification head.

    ``forward(x, hidden, cell)`` concatenates ``x`` and ``hidden`` along dim 1 and
    returns ``(new_hidden, new_cell_state, output)``, where ``output`` is the head
    applied to ``new_hidden``.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        joint_dim = input_dim + hidden_dim

        def block(in_features, activation):
            # Linear layer to hidden_dim followed by the given activation.
            return nn.Sequential(nn.Linear(in_features, hidden_dim), activation)

        # Gates and candidate are computed from the joined [x, hidden] vector.
        self.forget_gate = block(joint_dim, nn.Sigmoid())
        self.input_gate = block(joint_dim, nn.Sigmoid())
        self.new_info = block(joint_dim, nn.Tanh())
        self.output_gate = block(joint_dim, nn.Sigmoid())
        # Transforms the updated cell state before the output gate is applied.
        self.new_hidden = block(hidden_dim, nn.Tanh())

        self.head = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid())

    def forward(self, x, hidden, cell):
        joint = torch.cat((x, hidden), dim=1)

        # Part of the previous cell state that is kept.
        kept = self.forget_gate(joint) * cell
        # Gated candidate values that are added to it.
        added = self.input_gate(joint) * self.new_info(joint)
        new_cell_state = kept + added

        exposed = self.output_gate(joint)
        new_hidden = exposed * self.new_hidden(new_cell_state)
        return new_hidden, new_cell_state, self.head(new_hidden)

In [None]:
# Hyper-parameters and model objects for the LSTM experiment.
hidden_size = 50
epochs = 3
# The embedding dimension is the size of the last axis of any sentence tensor.
lstm_model = LSTM(embeddings[0].shape[-1], hidden_size).to(device)
optimizer = torch.optim.Adam( lstm_model.parameters() )
# LSTM.head already ends in nn.Sigmoid, so the model emits probabilities.
# BCEWithLogitsLoss would apply a second sigmoid; BCELoss expects probabilities.
criterion = nn.BCELoss()

In [None]:
def train(model, optimizer, train_dataloader):
    """Train a cell with hidden and cell state, one message per optimisation step.

    Reads the module-level ``epochs``, ``hidden_size``, ``device`` and ``criterion``.
    After each epoch the mean of the non-NaN losses is printed; 'nan' is printed
    for every NaN loss.

    Returns:
        ``(model, optimizer, losses)`` where ``losses`` holds the non-NaN losses
        of the last epoch only.
    """
    for _ in range(epochs):
        losses = []
        for X_in, y_in in train_dataloader:
            model.zero_grad()

            # Fresh zero hidden and cell states for every message.
            state_shape = (1, hidden_size)
            ht = torch.zeros(*state_shape).to(device)
            ct = torch.zeros(*state_shape).to(device)

            # X_in[0] drops the batch axis; only the output of the last step is used.
            for x_in in X_in[0]:
                ht, ct, output = model(x_in, ht, ct)

            loss = criterion(output, y_in.unsqueeze(1))
            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            if np.isnan(loss_value):
                print('nan')
            else:
                losses.append(loss_value)

        print(np.mean(losses))

    return model, optimizer, losses

In [None]:
# Train the LSTM; the returned (model, optimizer, losses) tuple is not needed here.
_ = train(model=lstm_model, optimizer=optimizer, train_dataloader=train_dataloader)

## GRU

In [None]:
class GRU(nn.Module):
    """Gated recurrent cell with a built-in sigmoid classification head.

    ``forward(x, hidden)`` concatenates ``x`` and ``hidden`` along dim 1 and
    returns ``(new_hidden, output)``, where ``output`` is the head applied to
    ``new_hidden``.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        joint_dim = input_dim + hidden_dim

        def block(activation):
            # Linear layer from the joined [x, hidden] size to hidden_dim, then activation.
            return nn.Sequential(nn.Linear(joint_dim, hidden_dim), activation)

        self.reset_gate = block(nn.Sigmoid())
        self.new_info = block(nn.Tanh())
        self.update_gate = block(nn.Sigmoid())

        self.head = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid())

    def forward(self, x, hidden):
        joint = torch.cat((x, hidden), dim=1)

        # The candidate is computed from x and the reset-scaled hidden state.
        reset = self.reset_gate(joint)
        candidate_input = torch.cat((x, reset * hidden), dim=1)

        # Blend the previous hidden state with the candidate.
        update = self.update_gate(joint)
        new_hidden = hidden * (1 - update) + self.new_info(candidate_input) * update

        return new_hidden, self.head(new_hidden)

In [None]:
# Hyper-parameters and model objects for the GRU experiment.
hidden_size = 50
epochs = 3
# The embedding dimension is the size of the last axis of any sentence tensor.
gru_model = GRU(embeddings[0].shape[-1], hidden_size).to(device)
optimizer = torch.optim.Adam( gru_model.parameters() )
# GRU.head already ends in nn.Sigmoid, so the model emits probabilities.
# BCEWithLogitsLoss would apply a second sigmoid; BCELoss expects probabilities.
criterion = nn.BCELoss()

In [None]:
def train(model, optimizer, train_dataloader):
    """Train a cell with a single hidden state, one message per optimisation step.

    Reads the module-level ``epochs``, ``hidden_size``, ``device`` and ``criterion``.
    After each epoch the mean of the non-NaN losses is printed; 'nan' is printed
    for every NaN loss.

    Returns:
        ``(model, optimizer, losses)`` where ``losses`` holds the non-NaN losses
        of the last epoch only.
    """
    for _epoch in range(epochs):
        losses = []
        for X_in, y_in in train_dataloader:
            model.zero_grad()

            # Fresh zero hidden state for every message.
            ht = torch.zeros(1, hidden_size).to(device)

            # X_in[0] drops the batch axis; only the output of the last step is used.
            for x_in in X_in[0]:
                ht, output = model(x_in, ht)

            loss = criterion(output, y_in.unsqueeze(1))
            loss.backward()
            optimizer.step()

            current = loss.item()
            if np.isnan(current):
                print('nan')
                continue
            losses.append(current)

        print(np.mean(losses))

    return model, optimizer, losses

In [None]:
# Train the GRU; the returned (model, optimizer, losses) tuple is not needed here.
_ = train(model=gru_model, optimizer=optimizer, train_dataloader=train_dataloader)

## Attention

This is only a sketch of how attention can be added to an RNN. To make it runnable you need to define a vocabulary, implement the `get_word_by_prob` and `get_embedding_by_word` functions, and reshape the training data into sequences of words.

In [None]:
def get_word_by_prob(probs):
    """Placeholder that currently returns ``None``; to be implemented by the user.

    NOTE(review): presumably meant to pick a vocabulary word from ``probs`` -- confirm.
    """
    return None
def get_embedding_by_word(word):
    """Placeholder that currently returns ``None``; to be implemented by the user.

    NOTE(review): presumably meant to return the embedding of ``word`` -- confirm.
    """
    return None

In [None]:
class AttentionGRU(nn.Module):
    """Sketch of a GRU encoder-decoder with attention over the encoder states.

    The encoder reads ``seq_x`` and stores one hidden state per step. At each
    decoder step the decoder hidden state is projected to weights over the
    ``seq_len`` encoder states, the weighted sum of those states is projected to
    a distribution over ``vocab_size`` entries, and ``get_word_by_prob`` /
    ``get_embedding_by_word`` turn it into the next decoder input.

    Relies on the module-level ``device``, ``GRU``, ``get_word_by_prob`` and
    ``get_embedding_by_word``.

    Args:
        input_size: size of one input embedding.
        hidden_size: size of the encoder and decoder hidden states.
        seq_len: number of encoder steps, and number of decoder steps.
        vocab_size: size of the output distribution.
    """

    def __init__(self, input_size, hidden_size, seq_len, vocab_size):
        super(AttentionGRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.vocab_size = vocab_size

        self.enc = GRU(input_size, hidden_size)
        self.dec = GRU(input_size, hidden_size)
        # nn.Softmax takes the axis to normalise over, not the number of classes;
        # both projections are normalised over their last axis.
        self.proj = nn.Sequential(
            nn.Linear(hidden_size, seq_len),
            nn.Softmax(dim=-1)
        )
        self.vocab_proj = nn.Sequential(
            nn.Linear(hidden_size, vocab_size),
            nn.Softmax(dim=-1)
        )

    def forward(self, seq_x, start_emb):
        """Encode ``seq_x`` and decode ``seq_len`` words, starting from ``start_emb``.

        Each element of ``seq_x`` and ``start_emb`` must be 2-D with first
        dimension 1, because GRU.forward concatenates it with a (1, hidden_size)
        state along dim 1.

        Returns:
            The list of ``seq_len`` values returned by ``get_word_by_prob``.
        """
        assert len(seq_x) == self.seq_len
        hs = []
        hidden = torch.zeros((1, self.hidden_size)).to(device)

        for x in seq_x:
            # GRU.forward returns (new_hidden, head_output); only the hidden state is needed.
            hidden, _ = self.enc(x, hidden)
            hs.append(hidden)

        # Join the per-step (1, hidden_size) states into one (seq_len, hidden_size)
        # tensor; torch.cat keeps them attached to the autograd graph.
        hs = torch.cat(hs, dim=0)

        hidden = torch.zeros((1, self.hidden_size)).to(device)
        prev = start_emb
        outputs = []
        for _ in range(self.seq_len):
            hidden, _ = self.dec(prev, hidden)
            # (1, seq_len) attention weights over the encoder steps.
            input_mask = self.proj(hidden)
            # Weighted sum of the encoder states: (1, seq_len) @ (seq_len, hidden_size).
            att_vector = input_mask @ hs
            # (1, vocab_size) distribution.
            output = self.vocab_proj(att_vector)

            word = get_word_by_prob(output)
            outputs.append(word)

            # The embedding of the chosen word becomes the next decoder input.
            prev = get_embedding_by_word(word)

        return outputs