## Train LSTM model

In this notebook we will train a LSTM model for Sentiment Analysis in English

In [2]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
from datasets import Dataset, Value, ClassLabel, Features
from pysentimiento.preprocessing import preprocess_tweet
from pysentimiento.emotion import load_datasets


train_dataset, dev_dataset, test_dataset = load_datasets(preprocessing_args={
    "user_token": "USUARIO",
    "url_token": "URL",
    "hashtag_token": "hashtag",
    "emoji_wrapper": "",
})


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import torch
from pysentimiento.emotion import id2label
from sklearn.utils.class_weight import compute_class_weight

class_weight = torch.Tensor(
    compute_class_weight('balanced', list(id2label), y=train_dataset["label"])
)



In [4]:
from collections import Counter
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
import unidecode

tokenizer = get_tokenizer("spacy", "es_core_news_sm")

counter = Counter()

"""
No overfitting, just taking the embeddings
"""
for dataset in [train_dataset, dev_dataset, test_dataset]:
    for example in dataset:
        tokens = tokenizer(unidecode.unidecode(example["text"].lower()))
        counter.update(tokens)

# Meto todas
vocab = Vocab(counter, min_freq=1)

len(vocab)


16313

In [5]:

def tokenize(batch):
    text = unidecode.unidecode(batch['text'].lower())
    tokens = tokenizer(text)
    token_ids = [vocab.stoi[t] for t in tokens]
    return {"input_ids": token_ids}

batch_size = 32

eval_batch_size = 16

train_dataset = train_dataset.map(tokenize, batched=False)
dev_dataset = dev_dataset.map(tokenize, batched=False)
test_dataset = test_dataset.map(tokenize, batched=False)

HBox(children=(FloatProgress(value=0.0, max=5049.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1683.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1677.0), HTML(value='')))




In [6]:
def format_dataset(dataset):
    dataset = dataset.map(lambda examples: {'labels': examples['label']})
    dataset.set_format(type='torch', columns=['input_ids', 'labels'])
    return dataset

train_dataset = format_dataset(train_dataset)
dev_dataset = format_dataset(dev_dataset)
test_dataset = format_dataset(test_dataset)

HBox(children=(FloatProgress(value=0.0, max=5049.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1683.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1677.0), HTML(value='')))




In [7]:
import torch
import fasttext

fasttext_model = fasttext.load_model("../../embeddings/cc.es.300.bin")


DIM = fasttext_model.get_word_vector("random").shape[0]
emb_matrix = torch.randn(len(vocab), DIM)
UNK_IDX = vocab.stoi["<unk>"]
PAD_IDX = vocab.stoi["<pad>"]

# emb_matrix[UNK_IDX] = 0
emb_matrix[PAD_IDX] = 0

for i, word in enumerate(vocab.itos):
    if i == UNK_IDX or i == PAD_IDX:
        # Let them unmodified
        pass
    else:
        emb_matrix[i] = torch.tensor(fasttext_model.get_word_vector(word))



In [8]:
import torch 
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

PAD_IDX = vocab.stoi["<pad>"]

def collate_batch(batch):
    labels = [t["labels"] for t in batch]
    input_ids = [t["input_ids"] for t in batch]

    # Return text, text_lens, labels
    return pad_sequence(input_ids, padding_value=PAD_IDX, batch_first=True), torch.tensor([len(t) for t in input_ids]), torch.tensor(labels)


train_dataloader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_batch)
dev_dataloader = DataLoader(dev_dataset, batch_size=16, collate_fn=collate_batch)
test_dataset = DataLoader(test_dataset, batch_size=16, collate_fn=collate_batch)


In [9]:
import pytorch_lightning as pl
import torch.nn.functional as F
from pysentimiento.emotion import id2label
from pysentimiento.metrics import get_metrics
from torch import nn

class RNNModel(pl.LightningModule):
    def __init__(self, vocab_size, embedding_dim, pad_idx, rnn_units, num_labels, num_layers=1,
                 bidirectional=False, dropout=0.25, embedding_matrix=None, freeze_embeddings=True):

        super().__init__()

        if embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(
                embedding_matrix, padding_idx=pad_idx,
                freeze=freeze_embeddings
            )
        else:
            self.embedding = nn.Embedding(
                vocab_size, embedding_dim,
                padding_idx = pad_idx)

        self.rnn = nn.GRU(embedding_dim,
                   rnn_units,
                   num_layers=num_layers,
                   bidirectional=bidirectional, batch_first=True)

        self.dropout = nn.Dropout(dropout)

        factor = 2 if bidirectional else 1

        self.fc = nn.Linear(rnn_units * factor, num_labels)

    def forward(self, text, text_lens):
        #text = [batch_size, text len]
        #permuted = text.permute(1, 0)
        # permuted shape [batch_size, sent len]
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            # WTF no sé por qué hago esto de cpu
            embedded, text_lens.to("cpu"), batch_first=True, enforce_sorted=False)

        packed_output, _ = self.rnn(packed_embedded)
        # hidden is the last state of the

        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        output = self.dropout(output)
        # output is shape [seq, batch, hid]
        s = output.permute(1, 0, 2)
        # now [batch, seq, hid]
        mean = s.sum(dim=1) / text_lens.view(-1, 1)

        return self.fc(mean)

    def training_step(self, batch, batch_idx):
        x, lens, y = batch
        outs = self.forward(x, lens)
        loss = F.cross_entropy(outs, y, weight=class_weight.to(x.device))
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, lens, y = batch
        outs = self.forward(x, lens)
        loss = F.cross_entropy(outs, y, weight=class_weight.to(x.device))
        preds = outs.argmax(-1).cpu()
        metrics = get_metrics(preds, y.cpu(), id2label)
        self.log('val_loss', loss, prog_bar=True, on_epoch=True)

        for k, v in metrics.items():
            self.log("val_"+k, v, prog_bar=True, on_epoch=True)

    def test_step(self, batch, batch_idx):
        x, lens, y = batch
        outs = self.forward(x, lens)
        preds = outs.argmax(-1).cpu()
        metrics = get_metrics(preds, y.cpu(), id2label)

        for k, v in metrics.items():
            self.log("test_"+k, v, prog_bar=True, on_epoch=True)

    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

  rank_zero_deprecation(


In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = RNNModel(
    vocab_size=len(vocab), embedding_dim=DIM, pad_idx=PAD_IDX, rnn_units=256, embedding_matrix=emb_matrix,
    freeze_embeddings=True, num_labels=7,
)

trainer = pl.Trainer(
    max_epochs=5, 
    gpus=1
)
trainer.fit(model, train_dataloader, dev_dataloader)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.9 M 
1 | rnn       | GRU       | 428 K 
2 | dropout   | Dropout   | 0     
3 | fc        | Linear    | 1.8 K 
----------------------------------------
430 K     Trainable params
4.9 M     Non-trainable params
5.3 M     Total params
21.297    Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  rank_zero_warn(
  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [11]:
trainer.test(model, test_dataset)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.5211687684059143,
 'test_anger_f1': 0.16684989631175995,
 'test_anger_precision': 0.1739974021911621,
 'test_anger_recall': 0.21719539165496826,
 'test_disgust_f1': 0.052338361740112305,
 'test_disgust_precision': 0.04770423471927643,
 'test_disgust_recall': 0.07791692018508911,
 'test_fear_f1': 0.051096536219120026,
 'test_fear_precision': 0.040548600256443024,
 'test_fear_recall': 0.08109720051288605,
 'test_joy_f1': 0.4242527186870575,
 'test_joy_precision': 0.4065885543823242,
 'test_joy_recall': 0.4828502833843231,
 'test_macro_f1': 0.20564809441566467,
 'test_macro_precision': 0.23136986792087555,
 'test_macro_recall': 0.2187984585762024,
 'test_micro_f1': 0.5211687684059143,
 'test_others_f1': 0.517051637172699,
 'test_others_precision': 0.6873546242713928,
 'test_others_recall': 0.44072675704956055,
 'test_sadness_f1': 0.15013666450977325,
 'test_sadness_pr

[{'test_others_f1': 0.517051637172699,
  'test_others_precision': 0.6873546242713928,
  'test_others_recall': 0.44072675704956055,
  'test_joy_f1': 0.4242527186870575,
  'test_joy_precision': 0.4065885543823242,
  'test_joy_recall': 0.4828502833843231,
  'test_sadness_f1': 0.15013666450977325,
  'test_sadness_precision': 0.1970866322517395,
  'test_sadness_recall': 0.13051065802574158,
  'test_anger_f1': 0.16684989631175995,
  'test_anger_precision': 0.1739974021911621,
  'test_anger_recall': 0.21719539165496826,
  'test_surprise_f1': 0.07781090587377548,
  'test_surprise_precision': 0.0663088858127594,
  'test_surprise_recall': 0.10129199177026749,
  'test_disgust_f1': 0.052338361740112305,
  'test_disgust_precision': 0.04770423471927643,
  'test_disgust_recall': 0.07791692018508911,
  'test_fear_f1': 0.051096536219120026,
  'test_fear_precision': 0.040548600256443024,
  'test_fear_recall': 0.08109720051288605,
  'test_micro_f1': 0.5211687684059143,
  'test_macro_f1': 0.20564809441566

## Twitter Embeddings

In [12]:
import torch
import fasttext

fasttext_model = fasttext.load_model("../../embeddings/tweet_dim_300_ws_5.bin")


DIM = fasttext_model.get_word_vector("random").shape[0]
emb_matrix = torch.randn(len(vocab), DIM)
UNK_IDX = vocab.stoi["<unk>"]
PAD_IDX = vocab.stoi["<pad>"]

# emb_matrix[UNK_IDX] = 0
emb_matrix[PAD_IDX] = 0

for i, word in enumerate(vocab.itos):
    if i == UNK_IDX or i == PAD_IDX:
        # Let them unmodified
        pass
    else:
        emb_matrix[i] = torch.tensor(fasttext_model.get_word_vector(word))



In [13]:
model = RNNModel(
    vocab_size=len(vocab), embedding_dim=DIM, pad_idx=PAD_IDX, rnn_units=256, embedding_matrix=emb_matrix,
    freeze_embeddings=True, num_labels=7,
)

trainer = pl.Trainer(
    max_epochs=5, 
    gpus=1
)
trainer.fit(model, train_dataloader, dev_dataloader)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.9 M 
1 | rnn       | GRU       | 428 K 
2 | dropout   | Dropout   | 0     
3 | fc        | Linear    | 1.8 K 
----------------------------------------
430 K     Trainable params
4.9 M     Non-trainable params
5.3 M     Total params
21.297    Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  rank_zero_warn(
  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [14]:
trainer.test(model, test_dataset)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.5557543039321899,
 'test_anger_f1': 0.19131353497505188,
 'test_anger_precision': 0.1866428107023239,
 'test_anger_recall': 0.22623974084854126,
 'test_disgust_f1': 0.04770423471927643,
 'test_disgust_precision': 0.04531902074813843,
 'test_disgust_recall': 0.05485986918210983,
 'test_fear_f1': 0.04727261886000633,
 'test_fear_precision': 0.0367322601377964,
 'test_fear_recall': 0.08348240703344345,
 'test_joy_f1': 0.45676466822624207,
 'test_joy_precision': 0.4410092532634735,
 'test_joy_recall': 0.5313382148742676,
 'test_macro_f1': 0.22454555332660675,
 'test_macro_precision': 0.24968823790550232,
 'test_macro_recall': 0.24108773469924927,
 'test_micro_f1': 0.5557543039321899,
 'test_others_f1': 0.48737478256225586,
 'test_others_precision': 0.7017671465873718,
 'test_others_recall': 0.3992690145969391,
 'test_sadness_f1': 0.2416417896747589,
 'test_sadness_prec

[{'test_others_f1': 0.48737478256225586,
  'test_others_precision': 0.7017671465873718,
  'test_others_recall': 0.3992690145969391,
  'test_joy_f1': 0.45676466822624207,
  'test_joy_precision': 0.4410092532634735,
  'test_joy_recall': 0.5313382148742676,
  'test_sadness_f1': 0.2416417896747589,
  'test_sadness_precision': 0.25631389021873474,
  'test_sadness_recall': 0.2392943948507309,
  'test_anger_f1': 0.19131353497505188,
  'test_anger_precision': 0.1866428107023239,
  'test_anger_recall': 0.22623974084854126,
  'test_surprise_f1': 0.09974727779626846,
  'test_surprise_precision': 0.0800333246588707,
  'test_surprise_recall': 0.15313057601451874,
  'test_disgust_f1': 0.04770423471927643,
  'test_disgust_precision': 0.04531902074813843,
  'test_disgust_recall': 0.05485986918210983,
  'test_fear_f1': 0.04727261886000633,
  'test_fear_precision': 0.0367322601377964,
  'test_fear_recall': 0.08348240703344345,
  'test_micro_f1': 0.5557543039321899,
  'test_macro_f1': 0.22454555332660675