Nome: Rebeca Andrade Medeiros

---
Matrícula: 11500782


Projeto final: Classifição de dataset de tweets com LSTM.
Código com base no exemplo 'lstm.ipynb'



O objetivo era realizar a classificação de dois subsets do [dataset tweet_eval](https://https://huggingface.co/datasets/tweet_eval). O primeiro é um subset de emoções, com 4 classes. E o segundo é um subset de emojis, com 20 classes.

In [3]:
pip install datasets



In [5]:
import datasets
import torch
import torch.nn as nn
import torchtext as ttxt
from torch.utils.data import Dataset as ds
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

#### Dataset de emoções

In [6]:
dataset_emotion = datasets.load_dataset('tweet_eval', 'emotion')

Downloading builder script:   0%|          | 0.00/9.72k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/30.4k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/21.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/134k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183 [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/6 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3257 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1421 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/374 [00:00<?, ? examples/s]

In [7]:
dataset_emotion

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})

Label:

* 0 - Anger
* 1 - Joy
* 2 - Optimism
* 3 - Sadness



In [8]:
dataset_emotion['train'][0]

{'text': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry",
 'label': 2}

In [9]:
dataset_emotion['test'][0]

{'text': '#Deppression is real. Partners w/ #depressed people truly dont understand the depth in which they affect us. Add in #anxiety &amp;makes it worse',
 'label': 3}

In [10]:
dataset_emotion['validation'][0]

{'text': '@user @user Oh, hidden revenge and anger...I rememberthe time,she rebutted you.',
 'label': 0}

In [37]:
tokenizer = ttxt.data.utils.get_tokenizer('basic_english')

print(tokenizer(dataset_emotion['train']['text'][0]))

['“worry', 'is', 'a', 'down', 'payment', 'on', 'a', 'problem', 'you', 'may', 'never', 'have', "'", '.', 'joyce', 'meyer', '.', '#motivation', '#leadership', '#worry']


In [38]:
def split_tokens(row):
  row['tokens'] = tokenizer(row['text'])
  return row

In [13]:
dataset_emotion = dataset_emotion.map(split_tokens)

Map:   0%|          | 0/3257 [00:00<?, ? examples/s]

Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

In [14]:
dataset_emotion

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'tokens'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label', 'tokens'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label', 'tokens'],
        num_rows: 374
    })
})

In [15]:
# Vocabulário

vocab_emotion = ttxt.vocab.build_vocab_from_iterator(dataset_emotion['train']['tokens'], min_freq = 2, specials = [''])

len(vocab_emotion)

3851

In [44]:
vocab_emotion.set_default_index(vocab_emotion[''])

In [17]:
# IDs a partir dos tokens

def vocabulario(tokens):
  resultado = []
  for t in tokens:
    resultado.append(vocab_emotion[t])
  return resultado

In [18]:
def token_id(row):
  row['tokens_id'] = vocabulario(row['tokens'])
  return row

In [19]:
dataset_emotion = dataset_emotion.map(token_id)

Map:   0%|          | 0/3257 [00:00<?, ? examples/s]

Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

In [20]:
dataset_emotion['train'][0]

{'text': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry",
 'label': 2,
 'tokens': ['“worry',
  'is',
  'a',
  'down',
  'payment',
  'on',
  'a',
  'problem',
  'you',
  'may',
  'never',
  'have',
  "'",
  '.',
  'joyce',
  'meyer',
  '.',
  '#motivation',
  '#leadership',
  '#worry'],
 'tokens_id': [0,
  11,
  8,
  154,
  0,
  22,
  8,
  564,
  12,
  291,
  112,
  30,
  3,
  1,
  0,
  0,
  1,
  1709,
  1115,
  701]}

Dataset e Dataloader

In [31]:
class Tweets_emotions(ds):
  def __init__(self, dataset_emotion):
    self.data = dataset_emotion['tokens_id']
    self.labels = dataset_emotion['label']

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    return torch.tensor(self.data[idx]), torch.tensor(self.labels[idx])

In [22]:
emotion = Tweets_emotions(dataset_emotion['train'])

In [23]:
dataloader_emotion = DataLoader(Tweets_emotions(dataset_emotion['train']), shuffle = True)

In [24]:
emotion[0]

(tensor([   0,   11,    8,  154,    0,   22,    8,  564,   12,  291,  112,   30,
            3,    1,    0,    0,    1, 1709, 1115,  701]),
 tensor(2))

In [25]:
embed_len = 32
hidden_dim = 16
n_layers = 4

In [26]:
class Classificador(nn.Module):
  def __init__(self):
    super(Classificador, self).__init__()

    self.embedding_layer = nn.Embedding(num_embeddings=len(vocab_emotion),
                                        embedding_dim=embed_len)
    self.rnn = nn.LSTM(input_size=embed_len, hidden_size=hidden_dim,
                       num_layers=n_layers)
    self.linear = nn.Linear(hidden_dim, 4)
    self.soft = nn.Softmax()

  def forward(self, input):
    embeddings = self.embedding_layer(input)
    output, (final_hidden, final_cell) = self.rnn(embeddings)
    return self.soft(self.linear(final_hidden[0]))

In [28]:
x = Classificador()
device = torch.device('cuda')
x.to(device)

RuntimeError: ignored

#####Treinamento e teste

In [None]:
lr = 1e-3
epochs = 5
loss_lstm = nn.BCELoss()
optimizer = torch.optim.Adam(x.parameters(), lr=lr)

In [None]:
def train(dataloader):
  size = len(dataloader_emotion.dataset)

  train_loss = 0
  for data, label in enumerate(tqdm(dataloader_emotion)):
    data, label = data.to(device), label.to(device)
    print("Data:", data)
    print("Label:", label)

    out = x(data[0])
    print("Out:", out)

    loss = loss_lstm(out.to(torch.float32), label.to(torch.float32))

    print("Loss:", loss)

    train_loss += loss.item()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  train_loss /= size

  return train_loss

In [None]:
def test(dataloader):
  size = len(dataloader_emotion.dataset)
  test_loss = 0
  correct = 0

  with torch.no_grad():

    for data, label in tqdm(dataloader_emotion):
      data, label = data.to(device), label.to(device)
      out = x(data[0])
      loss = loss_lstm(out.to(torch.float32), label.to(torch.float32))
      test_loss += loss.item()

  test_loss /= size
  correct /= size

  return test_loss

In [None]:
teste = DataLoader(Tweets_emotions(dataset_emotion['test']), shuffle=True)

In [None]:
train_losses = []
test_losses = []
for t in range(epochs):
    print("Epochs", t)

    print('Treino')
    train_losses.append(train(dataloader_emotion))

    print('Test')
    test_losses.append(test(teste))

Epochs 0
Train


  0%|          | 0/3257 [00:00<?, ?it/s]

Data: tensor([[   2,    2,    0,  280,  880,  451, 1000,   46,   70,   74,  786,  280,
          547]], device='cuda:0')
Label: tensor([0], device='cuda:0')
Out: tensor([0.2922, 0.2608, 0.2417, 0.2052], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)


ValueError: ignored

#### Dataset com **emojis**

In [30]:
dataset_emojis = datasets.load_dataset('tweet_eval', 'emoji')

Downloading data files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/33.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/159k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/6 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/45000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [32]:
dataset_emojis

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [35]:
dataset_emojis['train'].unique('label')

[12, 19, 0, 2, 11, 7, 1, 9, 4, 8, 5, 10, 14, 16, 18, 6, 13, 3, 15, 17]

Labels:

0. ❤
1. 😍
2. 😂
3. 💕
4. 🔥
5. 😊
6. 😎
7. ✨
8. 💙
9. 😘
10. 📷
11. 🇺🇸
12. ☀
13. 💜
14. 😉
15. 💯
16. 😁
17. 🎄
18. 📸
19. 😜


In [36]:
dataset_emojis['train'][0]

{'text': 'Sunday afternoon walking through Venice in the sun with @user ️ ️ ️ @ Abbot Kinney, Venice',
 'label': 12}

In [39]:
def split_tokens_emojis(row):
  row['tokens'] = tokenizer(row['text'])
  return row

In [40]:
dataset_emojis = dataset_emojis.map(split_tokens_emojis)

Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [42]:
dataset_emojis['train'][0]

{'text': 'Sunday afternoon walking through Venice in the sun with @user ️ ️ ️ @ Abbot Kinney, Venice',
 'label': 12,
 'tokens': ['sunday',
  'afternoon',
  'walking',
  'through',
  'venice',
  'in',
  'the',
  'sun',
  'with',
  '@user',
  '️',
  '️',
  '️',
  '@',
  'abbot',
  'kinney',
  ',',
  'venice']}

In [43]:
# Novo vocabulário
vocab_emojis = ttxt.vocab.build_vocab_from_iterator(dataset_emojis['train']['tokens'], min_freq = 2, specials = [''])

len(vocab_emojis)

19199

In [45]:
vocab_emojis.set_default_index(vocab_emojis[''])

In [46]:
# Token para id

def vocabulario_emojis(tokens):
  resultado_emojis = []
  for t in tokens:
    resultado_emojis.append(vocab_emojis[t])
  return resultado_emojis

def token_id_emojis(row):
  row['tokens_id'] = vocabulario(row['tokens'])
  return row

In [47]:
dataset_emojis = dataset_emojis.map(token_id_emojis)

Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [48]:
dataset_emojis['train'][0]

{'text': 'Sunday afternoon walking through Venice in the sun with @user ️ ️ ️ @ Abbot Kinney, Venice',
 'label': 12,
 'tokens': ['sunday',
  'afternoon',
  'walking',
  'through',
  'venice',
  'in',
  'the',
  'sun',
  'with',
  '@user',
  '️',
  '️',
  '️',
  '@',
  'abbot',
  'kinney',
  ',',
  'venice'],
 'tokens_id': [1621,
  583,
  1646,
  206,
  0,
  15,
  5,
  1273,
  25,
  2,
  0,
  0,
  0,
  775,
  0,
  0,
  7,
  0]}