## Train LSTM model

In this notebook we will train an LSTM-style recurrent model (`RNNModel`) for Sentiment Analysis in Spanish, using the TASS dataset.

In [1]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
from datasets import Dataset, Value, ClassLabel, Features, concatenate_datasets
from pysentimiento.preprocessing import preprocess_tweet
from pysentimiento.tass import load_datasets


# Load the TASS data; the result is unpacked here as the train / dev / test splits.
train_dataset, dev_dataset, test_dataset = load_datasets()


4802 2443 7264


In [2]:
from torchtext.data.utils import get_tokenizer
from pysentimiento.baselines.utils import build_vocab

# Tokenizer configuration: spaCy backend with the `es_core_news_sm` pipeline.
tokenizer_name = "spacy"
language = "es_core_news_sm"
tokenizer = get_tokenizer(tokenizer_name, language)

# The vocabulary is built over all three splits (train, dev and test) at once.
all_splits = concatenate_datasets([train_dataset, dev_dataset, test_dataset])
vocab = build_vocab(all_splits, tokenizer)

In [4]:
import unidecode

# Lookup tables taken from the vocabulary: `stoi` is indexed by token to get
# its id, `itos` is the sequence of tokens ordered by id.
stoi = vocab.get_stoi()
itos = vocab.get_itos()

def tokenize(batch):
    """Convert one example's text into a list of vocabulary ids.

    Despite the parameter name, this function receives a single example
    (it is mapped with ``batched=False``), so ``batch['text']`` is one string.

    The text is lower-cased and passed through ``unidecode`` before being
    split by the module-level ``tokenizer``. Tokens that are missing from
    ``stoi`` are mapped to the ``"<unk>"`` id when the vocabulary has one,
    instead of aborting the whole ``Dataset.map`` call with a ``KeyError``.

    Args:
        batch: mapping with a ``'text'`` entry holding the raw string.

    Returns:
        ``{"input_ids": [...]}`` with one id per token, in token order.

    Raises:
        KeyError: if a token is unknown and ``stoi`` has no ``"<unk>"`` entry.
    """
    text = unidecode.unidecode(batch['text'].lower())
    unk_id = stoi.get("<unk>")

    token_ids = []
    for token in tokenizer(text):
        token_id = stoi.get(token, unk_id)
        if token_id is None:
            # No "<unk>" entry to fall back on: keep the original failure mode.
            raise KeyError(token)
        token_ids.append(token_id)
    return {"input_ids": token_ids}

# Batch sizes intended for training and for evaluation (dev / test).
batch_size = 32
eval_batch_size = 16

# Add an `input_ids` column to every split, processing one example at a time.
train_dataset, dev_dataset, test_dataset = (
    split.map(tokenize, batched=False)
    for split in (train_dataset, dev_dataset, test_dataset)
)

  0%|          | 0/4802 [00:00<?, ?ex/s]

  0%|          | 0/2443 [00:00<?, ?ex/s]

  0%|          | 0/7264 [00:00<?, ?ex/s]

In [5]:
def format_dataset(dataset):
    """Expose a split as torch tensors restricted to the model's columns.

    A ``labels`` column is created as a copy of the existing ``label``
    column, then the output format is switched to ``torch`` and limited to
    ``input_ids`` and ``labels``.

    Args:
        dataset: object providing ``map`` and ``set_format``.

    Returns:
        The dataset returned by ``map``, after ``set_format`` was applied to it.
    """
    def _copy_label(example):
        return {'labels': example['label']}

    with_labels = dataset.map(_copy_label)
    with_labels.set_format(type='torch', columns=['input_ids', 'labels'])
    return with_labels

# Apply the same formatting to the three splits, in train / dev / test order.
train_dataset, dev_dataset, test_dataset = map(
    format_dataset, (train_dataset, dev_dataset, test_dataset)
)

  0%|          | 0/4802 [00:00<?, ?ex/s]

  0%|          | 0/2443 [00:00<?, ?ex/s]

  0%|          | 0/7264 [00:00<?, ?ex/s]

In [7]:
import fasttext
from pysentimiento.baselines.utils import build_embedding_matrix

# Pretrained fastText binary (Spanish, judging by the `cc.es.300` file name).
FASTTEXT_ES_PATH = "../../embeddings/cc.es.300.bin"

# The loaded fastText model is passed straight in and not bound to a name.
emb_matrix = build_embedding_matrix(vocab, fasttext.load_model(FASTTEXT_ES_PATH))



In [12]:
import torch 
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# Vocabulary id of the padding token; used as the fill value when sequences
# of different lengths are padded into one batch tensor.
PAD_IDX = stoi["<pad>"]

def collate_batch(batch):
    """Turn a list of examples into padded model inputs.

    Args:
        batch: sequence of mappings, each with an ``"input_ids"`` tensor and
            a ``"labels"`` value.

    Returns:
        A ``(text, lens, labels)`` tuple: the id sequences padded with
        ``PAD_IDX`` and stacked batch-first, the unpadded length of every
        sequence, and the labels as a tensor.
    """
    sequences, targets = [], []
    for example in batch:
        targets.append(example["labels"])
        sequences.append(example["input_ids"])

    # Pad on the right up to the longest sequence in this batch.
    padded = pad_sequence(sequences, batch_first=True, padding_value=PAD_IDX)
    lengths = torch.tensor(list(map(len, sequences)))
    return padded, lengths, torch.tensor(targets)


# Batch sizes come from `batch_size` / `eval_batch_size` (defined next to the
# tokenization step) instead of being repeated here as literals.
# Training batches are reshuffled every epoch; the evaluation loaders keep
# the dataset order.
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)
dev_dataloader = DataLoader(
    dev_dataset, batch_size=eval_batch_size, collate_fn=collate_batch
)

# `test_dataset` is rebound to its DataLoader because the evaluation cells
# pass that name to `trainer.test`. The guard keeps this cell safe to re-run:
# without it a second execution would wrap a DataLoader in another DataLoader.
if not isinstance(test_dataset, DataLoader):
    test_dataset = DataLoader(
        test_dataset, batch_size=eval_batch_size, collate_fn=collate_batch
    )


In [None]:
# Bind the `pl` alias that the Trainer calls in this notebook rely on.
import pytorch_lightning as pl
from pysentimiento.baselines.models import RNNModel

device = "cuda" if torch.cuda.is_available() else "cpu"

model = RNNModel(
    vocab_size=len(vocab),
    # Read the embedding width from the matrix itself rather than from `DIM`,
    # which is only defined in a later cell.
    # Assumes `emb_matrix` is 2-D with shape (vocab, dim) — TODO confirm.
    embedding_dim=emb_matrix.shape[1],
    pad_idx=PAD_IDX,
    rnn_units=256,
    embedding_matrix=emb_matrix,
    freeze_embeddings=True,
    num_labels=3,
)

trainer = pl.Trainer(
    max_epochs=10,
    # Only request a GPU when one is available so the cell also runs on CPU.
    gpus=1 if device == "cuda" else 0,
)
trainer.fit(model, train_dataloader, dev_dataloader)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 7.1 M 
1 | rnn       | GRU       | 428 K 
2 | dropout   | Dropout   | 0     
3 | fc        | Linear    | 771   
----------------------------------------
429 K     Trainable params
7.1 M     Non-trainable params
7.5 M     Total params
29.933    Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  rank_zero_warn(
  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [11]:
# NOTE: despite its name, `test_dataset` is the test DataLoader at this point
# (the name is rebound in the cell that builds the DataLoaders).
trainer.test(model, test_dataset)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.607929527759552,
 'test_macro_f1': 0.5443885326385498,
 'test_macro_precision': 0.5734540224075317,
 'test_macro_recall': 0.5794802904129028,
 'test_micro_f1': 0.607929527759552,
 'test_neg_f1': 0.6783047914505005,
 'test_neg_precision': 0.6344955563545227,
 'test_neg_recall': 0.7742480039596558,
 'test_neu_f1': 0.3349578082561493,
 'test_neu_precision': 0.4272288978099823,
 'test_neu_recall': 0.329830139875412,
 'test_pos_f1': 0.6199030876159668,
 'test_pos_precision': 0.658637523651123,
 'test_pos_recall': 0.6343626379966736}
--------------------------------------------------------------------------------


[{'test_neg_f1': 0.6783047914505005,
  'test_neg_precision': 0.6344955563545227,
  'test_neg_recall': 0.7742480039596558,
  'test_neu_f1': 0.3349578082561493,
  'test_neu_precision': 0.4272288978099823,
  'test_neu_recall': 0.329830139875412,
  'test_pos_f1': 0.6199030876159668,
  'test_pos_precision': 0.658637523651123,
  'test_pos_recall': 0.6343626379966736,
  'test_micro_f1': 0.607929527759552,
  'test_macro_f1': 0.5443885326385498,
  'test_macro_precision': 0.5734540224075317,
  'test_macro_recall': 0.5794802904129028,
  'test_acc': 0.607929527759552}]

## Twitter Embeddings

In [12]:
import torch
import fasttext

fasttext_model = fasttext.load_model("../../embeddings/tweet_dim_300_ws_5.bin")

# Use the same vocabulary accessors as the tokenization cell
# (`get_stoi()` / `get_itos()`) rather than the `.stoi` / `.itos` attributes.
stoi = vocab.get_stoi()
itos = vocab.get_itos()

# Embedding width, read off an arbitrary word vector.
DIM = fasttext_model.get_word_vector("random").shape[0]
UNK_IDX = stoi["<unk>"]
PAD_IDX = stoi["<pad>"]

# Start from random rows: `<unk>` keeps its random vector, `<pad>` is zeroed,
# and every other row is overwritten with the fastText vector of its token.
emb_matrix = torch.randn(len(vocab), DIM)
emb_matrix[PAD_IDX] = 0

for i, word in enumerate(itos):
    if i in (UNK_IDX, PAD_IDX):
        # Leave the special tokens as initialised above.
        continue
    emb_matrix[i] = torch.tensor(fasttext_model.get_word_vector(word))



In [13]:
# Bind the `pl` alias locally so this cell does not depend on leftover kernel state.
import pytorch_lightning as pl

# Same architecture as the first run; only the embedding matrix (and the
# `DIM` derived from it in the previous cell) differs.
model = RNNModel(
    vocab_size=len(vocab),
    embedding_dim=DIM,
    pad_idx=PAD_IDX,
    rnn_units=256,
    embedding_matrix=emb_matrix,
    freeze_embeddings=True,
    num_labels=3,
)

trainer = pl.Trainer(
    max_epochs=10,
    # Only request a GPU when one is available so the cell also runs on CPU.
    gpus=1 if torch.cuda.is_available() else 0,
)
trainer.fit(model, train_dataloader, dev_dataloader)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 7.1 M 
1 | rnn       | GRU       | 428 K 
2 | dropout   | Dropout   | 0     
3 | fc        | Linear    | 771   
----------------------------------------
429 K     Trainable params
7.1 M     Non-trainable params
7.5 M     Total params
29.933    Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  rank_zero_warn(
  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [14]:
# Evaluate the model trained in the previous cell (Twitter embeddings).
# NOTE: despite its name, `test_dataset` is the test DataLoader at this point.
trainer.test(model, test_dataset)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.6153634190559387,
 'test_macro_f1': 0.5710034966468811,
 'test_macro_precision': 0.5967069268226624,
 'test_macro_recall': 0.6014646291732788,
 'test_micro_f1': 0.6153634190559387,
 'test_neg_f1': 0.6628380417823792,
 'test_neg_precision': 0.6905190944671631,
 'test_neg_recall': 0.678328275680542,
 'test_neu_f1': 0.4081096351146698,
 'test_neu_precision': 0.43777358531951904,
 'test_neu_recall': 0.45257747173309326,
 'test_pos_f1': 0.6420629024505615,
 'test_pos_precision': 0.6618279814720154,
 'test_pos_recall': 0.6734883189201355}
--------------------------------------------------------------------------------


[{'test_neg_f1': 0.6628380417823792,
  'test_neg_precision': 0.6905190944671631,
  'test_neg_recall': 0.678328275680542,
  'test_neu_f1': 0.4081096351146698,
  'test_neu_precision': 0.43777358531951904,
  'test_neu_recall': 0.45257747173309326,
  'test_pos_f1': 0.6420629024505615,
  'test_pos_precision': 0.6618279814720154,
  'test_pos_recall': 0.6734883189201355,
  'test_micro_f1': 0.6153634190559387,
  'test_macro_f1': 0.5710034966468811,
  'test_macro_precision': 0.5967069268226624,
  'test_macro_recall': 0.6014646291732788,
  'test_acc': 0.6153634190559387}]