# Recurrent Neural Network

![LSTM](imgs/LSTM3-chain.png)

![LSTM](imgs/LSTM2-notation.png)



![LSTM](imgs/LSTM3-focus-f.png)

![LSTM](imgs/LSTM3-focus-i.png)

![LSTM](imgs/LSTM3-focus-C.png)

![LSTM](imgs/LSTM3-focus-o.png)

## From pytorch documentation

\begin{array}{ll} \\
    f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
    i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
    g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
    c_t = f_t * c_{(t-1)} + i_t * g_t \\
    o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\    
    h_t = o_t * \tanh(c_t) \\
\end{array}

In [113]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

batch_size = 2
seq_len = 10
input_size = 3
hidden_size = 4 

inputs = torch.randn(seq_len, batch_size, input_size)
inputs

tensor([[[-0.8835, -0.0504,  0.8493],
         [ 0.9167, -1.7890, -0.0660]],

        [[-1.3275, -2.4630, -0.1197],
         [-0.8243, -1.2423, -0.8760]],

        [[ 1.4885,  0.6106,  1.1932],
         [-1.1535,  0.9402,  1.0332]],

        [[ 0.5672, -0.8871,  0.5998],
         [ 1.1069, -1.5476, -0.7041]],

        [[ 1.0619,  0.9808, -0.3712],
         [-0.5278, -0.4647, -0.9003]],

        [[ 0.8746,  0.8302,  1.2621],
         [-0.6655, -0.3673, -1.1719]],

        [[ 0.6300,  1.9727,  0.4929],
         [ 0.1673,  0.0748, -0.8296]],

        [[-0.1490,  1.3435, -0.3468],
         [ 0.6468,  0.2840, -2.7029]],

        [[ 0.4469, -0.4357,  0.4136],
         [ 2.3348, -0.6733, -0.7259]],

        [[ 1.5137,  0.0275,  0.6132],
         [ 0.3686,  0.1394,  0.9901]]])

In [114]:
lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size)

In [115]:
## hidden_0 = (h_0, c_0)
hidden_0 = (torch.zeros(1, batch_size, hidden_size), torch.zeros(1, batch_size, hidden_size))

In [116]:
lstm_out, lstm_hidden = lstm(inputs)
lstm_out.shape

torch.Size([10, 2, 4])

In [117]:
lstm_out

tensor([[[ 0.0640,  0.0464,  0.1051,  0.0191],
         [ 0.2720,  0.0936,  0.0765, -0.0414]],

        [[ 0.1944,  0.1321,  0.0993, -0.0421],
         [ 0.1642,  0.0201,  0.0750, -0.0731]],

        [[ 0.2065,  0.1581,  0.0218,  0.1140],
         [ 0.0220, -0.0575,  0.1398, -0.0065]],

        [[ 0.2827,  0.2223,  0.0993,  0.0716],
         [ 0.2915,  0.0311,  0.0617, -0.0182]],

        [[ 0.0796,  0.0630, -0.0630,  0.2852],
         [ 0.0971, -0.0696,  0.0629, -0.0192]],

        [[ 0.1165,  0.1402,  0.0299,  0.2328],
         [ 0.0239, -0.1590,  0.0522, -0.0334]],

        [[ 0.0026,  0.0230, -0.0890,  0.2620],
         [ 0.0248, -0.1396,  0.0399,  0.0643]],

        [[-0.0547, -0.0997, -0.0248,  0.2094],
         [-0.1517, -0.1526, -0.0122,  0.1687]],

        [[ 0.1376,  0.0074,  0.0915,  0.1350],
         [ 0.2518, -0.0673,  0.0108,  0.2985]],

        [[ 0.2623,  0.0973,  0.0545,  0.2693],
         [ 0.1425,  0.0413,  0.1222,  0.1845]]], grad_fn=<StackBackward>)

If we want to put hidden inputs to zeros, there is no need to provide them.

In [118]:
lstm_out, lstm_hidden = lstm(inputs)
lstm_out

tensor([[[ 0.0640,  0.0464,  0.1051,  0.0191],
         [ 0.2720,  0.0936,  0.0765, -0.0414]],

        [[ 0.1944,  0.1321,  0.0993, -0.0421],
         [ 0.1642,  0.0201,  0.0750, -0.0731]],

        [[ 0.2065,  0.1581,  0.0218,  0.1140],
         [ 0.0220, -0.0575,  0.1398, -0.0065]],

        [[ 0.2827,  0.2223,  0.0993,  0.0716],
         [ 0.2915,  0.0311,  0.0617, -0.0182]],

        [[ 0.0796,  0.0630, -0.0630,  0.2852],
         [ 0.0971, -0.0696,  0.0629, -0.0192]],

        [[ 0.1165,  0.1402,  0.0299,  0.2328],
         [ 0.0239, -0.1590,  0.0522, -0.0334]],

        [[ 0.0026,  0.0230, -0.0890,  0.2620],
         [ 0.0248, -0.1396,  0.0399,  0.0643]],

        [[-0.0547, -0.0997, -0.0248,  0.2094],
         [-0.1517, -0.1526, -0.0122,  0.1687]],

        [[ 0.1376,  0.0074,  0.0915,  0.1350],
         [ 0.2518, -0.0673,  0.0108,  0.2985]],

        [[ 0.2623,  0.0973,  0.0545,  0.2693],
         [ 0.1425,  0.0413,  0.1222,  0.1845]]], grad_fn=<StackBackward>)

Finally the last output is the output of RRR. We can get it by

In [119]:
lstm_out[-1]

tensor([[0.2623, 0.0973, 0.0545, 0.2693],
        [0.1425, 0.0413, 0.1222, 0.1845]], grad_fn=<SelectBackward>)

It is often convient to have batches as the first dimension of the input. One can do it by adding `batch_first=True` parameter.

In [124]:
lstm_batch_first = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True) 
inputs_batch_first = torch.randn(batch_size, seq_len, input_size)
inputs_batch_first

tensor([[[ 0.4287, -0.8089,  0.6496],
         [ 1.0766, -0.9408,  0.4500],
         [-2.0297,  0.4132, -0.3924],
         [-0.6662,  1.3396, -0.6382],
         [-0.7902, -1.3808, -0.3269],
         [ 0.4475,  1.4489,  0.6695],
         [ 1.4372, -0.7656,  0.5237],
         [ 1.3693, -0.2232,  0.7301],
         [ 1.9571, -1.6995,  0.5775],
         [-0.0383,  0.9943, -0.8937]],

        [[-0.3910,  2.0286, -0.5208],
         [ 0.9066, -1.7970,  0.0912],
         [-0.4176, -0.2242, -0.8004],
         [ 0.3201, -0.6448,  0.0149],
         [ 1.4970,  0.3302,  0.4920],
         [ 0.3469,  1.1633,  0.5200],
         [ 0.4284, -1.3784,  0.3542],
         [-0.8269,  0.3485, -0.8807],
         [ 0.7595, -0.8450,  0.4487],
         [ 0.1381, -1.2362, -1.0107]]])

In [125]:
lstm_out, lstm_hidden = lstm_batch_first(inputs_batch_first)
lstm_out

tensor([[[ 0.1009,  0.0273,  0.0387, -0.0461],
         [ 0.1351,  0.0650,  0.0433, -0.1063],
         [ 0.1947, -0.1088,  0.0679, -0.0051],
         [ 0.0817, -0.0986,  0.1297,  0.1314],
         [ 0.1716, -0.2346,  0.1086,  0.0577],
         [ 0.0385, -0.0090,  0.1567,  0.0923],
         [ 0.0777,  0.0757,  0.1092, -0.0861],
         [ 0.0860,  0.1483,  0.0588, -0.1190],
         [ 0.1517,  0.2252, -0.0215, -0.1544],
         [ 0.1559,  0.0661,  0.0597,  0.0148]],

        [[ 0.0413,  0.0129,  0.0749,  0.1803],
         [ 0.1303, -0.0385,  0.0931, -0.0588],
         [ 0.1456, -0.1233,  0.1383,  0.0170],
         [ 0.1230, -0.1054,  0.1387, -0.0338],
         [ 0.0651,  0.0693,  0.1362, -0.0918],
         [ 0.0652,  0.0854,  0.1089,  0.0176],
         [ 0.1615,  0.0134,  0.0784, -0.0625],
         [ 0.1523, -0.1066,  0.1237,  0.0600],
         [ 0.1207, -0.0580,  0.0993, -0.0582],
         [ 0.1892, -0.1685,  0.1514, -0.0690]]], grad_fn=<TransposeBackward0>)

Then we get the finial output by:

In [126]:
lstm_out[:, -1]

tensor([[ 0.1559,  0.0661,  0.0597,  0.0148],
        [ 0.1892, -0.1685,  0.1514, -0.0690]], grad_fn=<SelectBackward>)

## Embedings

In [109]:
dict_size = 100
sentences = torch.randint(dict_size, (batch_size, seq_len))
sentences

tensor([[39, 19, 19, 26, 91, 40, 57, 78, 89, 31],
        [46,  5, 71, 60, 56, 35, 97, 78, 19, 78]])

In [110]:
embedding_dim = 3
embedding = nn.Embedding(dict_size, embedding_dim)

In [127]:
sentences_embedded = embedding(sentences)
sentences_embedded

tensor([[[ 0.1624,  1.3295, -3.2450],
         [ 0.1561, -0.2889, -0.1083],
         [ 0.1561, -0.2889, -0.1083],
         [-1.4842,  0.3282,  0.0872],
         [ 1.2635,  2.7697, -0.3629],
         [-0.6307,  0.6052,  0.2908],
         [ 0.5322, -0.7821, -0.0670],
         [ 0.5202, -0.4762, -0.6227],
         [-0.4210,  0.3746,  0.2844],
         [-0.5235, -1.5446,  1.5872]],

        [[ 1.1131,  0.4510,  0.8111],
         [-0.8521, -0.2145,  0.1316],
         [ 0.3181, -0.9739,  0.3885],
         [-1.0910, -2.0403, -0.7762],
         [ 0.1402, -0.5982, -0.6415],
         [-0.8607, -1.0654, -1.3223],
         [-0.4741, -0.8455, -0.5330],
         [ 0.5202, -0.4762, -0.6227],
         [ 0.1561, -0.2889, -0.1083],
         [ 0.5202, -0.4762, -0.6227]]], grad_fn=<EmbeddingBackward>)

In [129]:
lstm_out, _ = lstm_batch_first(sentences_embedded)
lstm_out[:, -1]

tensor([[ 0.1401, -0.0974,  0.0119, -0.0081],
        [ 0.0828, -0.0992,  0.2193, -0.0599]], grad_fn=<SelectBackward>)

### Procesamiento del lenguaje natural

Tenemos un dataset que tiene textos que queremos evaluar si son tóxicos o no tóxicos. Este dataset puedes bajarlo del siguiente enlace 
[aquí](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge). 


In [86]:
import pandas as pd

comments_df = pd.read_csv("data/jigsaw-toxic-comment-classification-challenge/train.csv")[:1000]
comments_df.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0


In [87]:
from sklearn.model_selection import train_test_split
label_colnames = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

X_train, X_test, y_train, y_test = \
    train_test_split(comments_df[['comment_text']], comments_df[label_colnames], random_state=667)
X_train.head(2)

Unnamed: 0,comment_text
744,"""\n\nNo not really. We may ask that the mentio..."
981,"""\nHaha, you're fine. I mean, you're allowed t..."


In [88]:
import re

import nltk
from nltk.stem import SnowballStemmer

BAD_SYMBOLS_RE = re.compile('[^0-9a-z ]')
STEMMER = SnowballStemmer('english')

class TextPreprocessor:
        
    def transfrom_text(self, text):
        text = text.lower()
        text = re.sub(BAD_SYMBOLS_RE, " ", text) # process bad symbols
        # text = " ".join([STEMMER.stem(word) for word in text.split()])
        return text
    
    def transform(self, series):
        return series.apply(lambda text: self.transfrom_text(text))

In [89]:
preprocessor = TextPreprocessor()
X_train_preprocessed = preprocessor.transform(X_train['comment_text'])
X_test_preprocessed = preprocessor.transform(X_test['comment_text'])

In [92]:
print(X_train["comment_text"].iloc[0])
print(X_train_preprocessed.iloc[0])

"

No not really. We may ask that the mention of fat being the fire source of the cremation of millions be reconsidered though - along with a few other items. The fat cremation ""wiki fact"" is citable ( www.hdot - Emory U no less, Lipstadt) but doubtful. If the same science was applied to the holocaust as say the tinfoilers or flat earthers the deniers would be overjoyed. Be careful as to who gets the nutty fringe tinfoil label in the end. You get the permits and we'll bring the shovels. 159.105.80.141  "
   no not really  we may ask that the mention of fat being the fire source of the cremation of millions be reconsidered though   along with a few other items  the fat cremation   wiki fact   is citable   www hdot   emory u no less  lipstadt  but doubtful  if the same science was applied to the holocaust as say the tinfoilers or flat earthers the deniers would be overjoyed  be careful as to who gets the nutty fringe tinfoil label in the end  you get the permits and we ll bring the sho

In [93]:
print(X_train["comment_text"].iloc[1])
print(X_train_preprocessed.iloc[1])

"
Haha, you're fine. I mean, you're allowed to do it, but I'm just selfish, I guess. =) I really appreciate your kindness, though. And I really respect that you asked, because when other signatures that were borrowed, no one let me know or gave me any credit! So I feel badly that since you asked, you'd feel really badly about doing it now, haha. But I can help you figure out a nice one or pick out some fun colors. Have a great day, and happy Wikying! τ "
  haha  you re fine  i mean  you re allowed to do it  but i m just selfish  i guess     i really appreciate your kindness  though  and i really respect that you asked  because when other signatures that were borrowed  no one let me know or gave me any credit  so i feel badly that since you asked  you d feel really badly about doing it now  haha  but i can help you figure out a nice one or pick out some fun colors  have a great day  and happy wikying     


In [94]:
def create_dicts(text):
    word_set = set()
    words = text.split()
    for word in words:
        word_set.add(word)
    word_list = ["<UNK>", "<PAD>"] + sorted(list(word_set))
    word2idx = {word_list[idx]: idx for idx in range(len(word_list))}
    idx2word = {idx: word_list[idx] for idx in range(len(word_list))}
    return word2idx, idx2word

class Tokenizer:
    
    def __init__(self):
        self.word2idx = None
        self.idx2word = None
        
    def fit(self, X):
        text = " ".join(X)
        self.word2idx, self.idx2word = create_dicts(text)
    
    def transform(self, X):
        return [self.transform_line(line) for line in X]
        
    def transform_line(self, line):
        return [self.word2idx.get(word, 0) for word in line.split()]

In [95]:
tokenizer = Tokenizer()
tokenizer.fit(X_train_preprocessed)

In [96]:
X_train_tokenized = tokenizer.transform(X_train_preprocessed)

In [97]:
class Cutter:

    def __init__(self, size=150):
        self.size = size
        
    def transform(self, X):
        new_X = []
        for line in X:
            new_line = line[:self.size]
            new_line = new_line + [1] * (self.size - len(new_line))
            new_X.append(new_line)
        return new_X    

In [98]:
cutter = Cutter()
X_train_cutted = cutter.transform(X_train_tokenized)

In [167]:
labels = torch.from_numpy(y_train.values)
labels

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        ...,
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [168]:
train_data = TensorDataset(torch.tensor(X_train_cutted), torch.from_numpy(y_train.values).float())

batch_size = 32

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

In [169]:
class LSTMModel(nn.Module):
    
    def __init__(self, dict_size, output_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(dict_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        embeded = self.embedding(x)
        lstm_out, _ = self.lstm(embeded)
        lstm_out = lstm_out[:, -1]        
        logits = self.fc(lstm_out)
        out = self.sigmoid(logits)
        return out

In [170]:
dict_size = len(tokenizer.word2idx)
output_size = len(label_colnames)
embedding_dim = 3
hidden_dim = 4

lstm_model = LSTMModel(dict_size, output_size, embedding_dim, hidden_dim)

In [171]:
X_train_torch = torch.tensor(X_train_cutted)
X_train_torch.shape

torch.Size([750, 150])

In [172]:
lstm_model(X_train_torch)

tensor([[0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4708, 0.4680, 0.4677, 0.4272, 0.5332, 0.5480],
        ...,
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4881, 0.4844, 0.4511, 0.4564, 0.5862, 0.5416]],
       grad_fn=<SigmoidBackward>)

In [173]:
dataiter = iter(train_loader)
input_data, labels = dataiter.next()
lstm_model(input_data)

tensor([[0.4743, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.4744, 0.4811, 0.4592, 0.4354, 0.5314, 0.5572],
        [0.474

In [174]:
lr=0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=lr)

In [177]:
n_epoch = 10

for i in range(n_epoch):
    for batch_i, (input_data, labels) in enumerate(train_loader):
        # Zero gradients (just in case)
        optimizer.zero_grad()

        # Forward pass, calculate predictions
        output = lstm_model(input_data) 
        # Calculate loss
        loss = criterion(output, labels)
        ## Backward propagation
        loss.backward()
        ## Upade weights
        optimizer.step()

In [179]:
dataiter = iter(train_loader)
input_data, labels = dataiter.next()
lstm_model(input_data) >= 0.5

tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=torch.uint8)

In [180]:
labels

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0