# Fine-Tuning Homework

В этой домашней работе вы самостоятельно воспользуетесь библиотекой **transformers**, загрузите предобученную модель и обучите модель классификации тональности твитов

https://www.kaggle.com/datasets/kazanova/sentiment140

In [21]:
!pip install -q transformers
!pip install -q pytorch-lightning

In [2]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [3]:
!unzip -q /content/gdrive/My\ Drive/ABBYY\ Homeworks/tweets.zip

### Загрузка данных

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split



DATASET_COLUMNS=['target','ids','date','flag','user','text']
DATASET_ENCODING = "ISO-8859-1"

tweets_data = pd.read_csv("training.1600000.processed.noemoticon.csv", names=DATASET_COLUMNS, encoding=DATASET_ENCODING)

df_train, df_test = train_test_split(tweets_data.sample(frac=1).head(400000), test_size=0.2, random_state=42)

In [24]:
device = 'cuda'

### Датасет (2 балла)

В этой части вам нужно с помощью библиотеки **Transformers** сделать обработку предложений для последующей передачи в качестве аргументов модели.

Что нужно сделать:
1. Токенизировать текст 
2. Конвертировать токены в input_ids (не забудьте специальные токены!)
3. Создать attention_mask (маска для механизма внимания из 0 и 1, где 1 ставится всем токенам, кроме токена "[PAD]")

In [25]:
import torch 
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer


class TweetDataset(Dataset):
    def __init__(self, data: pd.DataFrame) -> None:
        self.data = data

        self.tweets = data.text.tolist()
        self.labels = data.target.tolist()

        self.tokenizer = BertTokenizer.from_pretrained(
                "bert-base-uncased", do_lower_case=True)
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, item: int):
        text = self.tweets[item]
        label = self.labels[item]/2

        # make tokenization, convert to token_ids, get attention mask 
        input_ids = self.tokenizer.encode(text, add_special_tokens=True)
        encoded_dict = self.tokenizer.encode_plus(text, add_special_tokens = True, return_attention_mask = True)
        
        return torch.IntTensor(encoded_dict['input_ids']), torch.IntTensor(encoded_dict['attention_mask']), torch.LongTensor([label])      

    def paddings(self, batch):
        tokens, attention_masks, labels = list(zip(*batch)) #, document_ids, sentences_ids = list(zip(*batch))

        tokens = pad_sequence(tokens, batch_first=True, padding_value=0)
        attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)
        labels = pad_sequence(labels, batch_first=True, padding_value=-100)
        
        return tokens, labels, attention_masks #, words_ids, document_ids, sentences_ids  

In [None]:
train_data = TweetDataset(df_train)
test_data = TweetDataset(df_test)

In [None]:
train_data[9][2]

tensor([0])

In [None]:
len(train_data)

80000

### Модель (2 балла)
Теперь нам нужна модель. Мы можем использовать уже готовые варианты под конкретные задачи, однако в этой части, вам нужно написать свой класс модели, в основе которой будет лежать модель BERT. Для этого, реализуйте методы **__init__()** и **forward()** и добавьте нужные слои (для начала можно просто использовать nn.Linear() поверх BertModel())

In [26]:
from transformers import BertPreTrainedModel, BertConfig, BertModel
import torch.nn as nn
import torch

class BERT(BertPreTrainedModel):
    def __init__(self, config: BertConfig):
        super(BERT, self).__init__(config)

        self.bert = BertPreTrainedModel(config)

        # <your code>
        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(768, 3)
        self.relu = nn.ReLU()

    def forward(self, input_ids, mask=None):
        _, _, output = self.bert(input_ids= input_ids, attention_mask=mask,return_dict=False)
        output = self.relu(self.dropout(self.linear(output[0][:, 0, :])))

        return output

### Обучение (3 балла)
Здесь вам предстоит написать свой класс-trainer с помощью PyTorch-Lightning. Мы уже разбирали его на 5-ом семинаре по ведению экспериментов. Семинарский ноутбук в папке. **В качестве метрики, используйте F1 Score.**

В этом задании вам нужно обучать все веса модели (т.е. файнтюнить и BERT и остальные слои).

Отдельно отмечу следующее. Поскольку ваша модель будет изначально выдавать векторные представления токенов, а классифицировать нужно весь текст. Поэтому для классификации берем "[CLS]" токен..

In [27]:
from sklearn.metrics import f1_score
import numpy as np

class LightningBERT(pl.LightningModule):
    def __init__(self, params):
        super().__init__()

        self.model = BERT(BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True))

        # other members

        self.params = params

        self.train_data = TweetDataset(df_train)
        self.test_data = TweetDataset(df_test)

        self.train_epoch_labels = []
        self.train_epoch_predictions = []
        self.train_epoch_losses = []

        self.test_epoch_labels = []
        self.test_epoch_predictions = []
        self.test_epoch_losses = []

        self.criterion = nn.CrossEntropyLoss(ignore_index=-100)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
        return optimizer

    def compute_loss(self, logits, labels):
        loss = self.criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        return loss

    def training_step(self, batch, _):
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)
        words_mask = batch[2].to(device)

        logits = self.model(input_ids, words_mask)
        loss = self.compute_loss(logits, labels)

        preds = torch.argmax(torch.softmax(logits, dim=1), dim=1)

        self.train_epoch_predictions = np.concatenate((self.train_epoch_predictions, preds.detach().cpu().numpy()))
        self.train_epoch_labels = np.concatenate((self.train_epoch_labels, labels.detach().cpu().numpy().T[0]))

        self.train_epoch_losses.append(loss.item())

        return loss

    def training_epoch_end(self, _):
        epoch_metric = f1_score(self.train_epoch_labels, self.train_epoch_predictions, average='macro')

        print("Epoch loss: ", np.mean(self.train_epoch_losses))
        print("Epoch metric: ", epoch_metric)

        self.train_epoch_labels = []
        self.train_epoch_predictions = []
        self.train_epoch_losses = []

    def test_step(self, batch, _):
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)
        words_mask = batch[2].to(device)

        logits = self.model(input_ids, words_mask)
        loss = self.compute_loss(logits, labels)

        preds = torch.argmax(torch.softmax(logits, dim=1), dim=1)

        self.test_epoch_predictions = np.concatenate((self.test_epoch_predictions, preds.detach().cpu().numpy()))
        self.test_epoch_labels = np.concatenate((self.test_epoch_labels, labels.detach().cpu().numpy().T[0]))

        self.test_epoch_losses.append(loss.item())

        return loss

    def test_epoch_end(self, _):
        epoch_metric = f1_score(self.test_epoch_labels, self.test_epoch_predictions, average='macro')

        print("Test loss: ", np.mean(self.test_epoch_losses))
        print("Test metric: ", epoch_metric)

        self.test_epoch_labels = []
        self.test_epoch_predictions = []
        self.test_epoch_losses = []

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_data, 
                                           self.params['batch_size'], 
                                           shuffle=self.params['shuffle_train_eval'], 
                                           collate_fn=self.train_data.paddings)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test_data, 
                                           self.params['batch_size'], 
                                           shuffle=self.params['shuffle_train_eval'], 
                                           collate_fn=self.test_data.paddings)

### Запуск обучения

Запустим обучение всего Берта на части датасета в 100000 твитов

In [16]:
from pytorch_lightning import Trainer
import warnings
warnings.filterwarnings('ignore')

params = {
    "batch_size": 32,
    "shuffle_train_eval": True,
    "lowercase": False
}

lightning_model = LightningBERT(params).to(device)

trainer = Trainer(
    gpus=1,
    accelerator='gpu',
    max_epochs=2,
    gradient_clip_val=1.0,
    val_check_interval=0.5)

trainer.fit(lightning_model)
trainer.test(lightning_model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /content/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | BERT             | 109 M 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.938   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Epoch loss:  1.0986081229388713
Epoch metric:  0.22288490798705204
Epoch loss:  1.0986120700836182
Epoch metric:  0.3328593855217978


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

Test loss:  1.0986120700836182
Test metric:  0.33311659622710715


[{}]

### Эксперимент заморозкой весов (3 балла)
В этом эксперименте, вам нужно заморозить веса BERT и использовать его выход (последнее скрытое состояние) в качестве признаков для другой модели (например, простой линейный слой). Сравните качество с предыдущим экспериментом. 

In [16]:
class FreezedBERT(BertPreTrainedModel):
    def __init__(self, config: BertConfig):
        super(FreezedBERT, self).__init__(config)

        self.bert = BertModel(config)

        for param in self.bert.parameters():
          param.requires_grad = False

        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(768, 3)
        self.relu = nn.ReLU()

    def forward(self, input_ids, mask=None):
        _, _, output = self.bert(input_ids= input_ids, attention_mask=mask,return_dict=False)
        output = self.linear(output[0][:,0,:])

        return output

In [17]:
class LightningFreezedBERT(pl.LightningModule):
    def __init__(self, params):
        super().__init__()

        self.model = FreezedBERT(BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True))

        # other members

        self.params = params

        self.train_data = TweetDataset(df_train)
        self.test_data = TweetDataset(df_test)

        self.train_epoch_labels = []
        self.train_epoch_predictions = []
        self.train_epoch_losses = []

        self.test_epoch_labels = []
        self.test_epoch_predictions = []
        self.test_epoch_losses = []

        self.criterion = nn.CrossEntropyLoss(ignore_index=-100)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
        return optimizer

    def compute_loss(self, logits, labels):
        loss = self.criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        return loss

    def training_step(self, batch, _):
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)
        words_mask = batch[2].to(device)

        logits = self.model(input_ids, words_mask)
        loss = self.compute_loss(logits, labels)

        preds = torch.argmax(torch.softmax(logits, dim=1), dim=1)

        self.train_epoch_predictions = np.concatenate((self.train_epoch_predictions, preds.detach().cpu().numpy()))
        self.train_epoch_labels = np.concatenate((self.train_epoch_labels, labels.detach().cpu().numpy().T[0]))

        self.train_epoch_losses.append(loss.item())

        return loss

    def training_epoch_end(self, _):
        epoch_metric = f1_score(self.train_epoch_labels, self.train_epoch_predictions, average='macro')

        print("Epoch loss: ", np.mean(self.train_epoch_losses))
        print("Epoch metric: ", epoch_metric)

        self.train_epoch_labels = []
        self.train_epoch_predictions = []
        self.train_epoch_losses = []

    def test_step(self, batch, _):
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)
        words_mask = batch[2].to(device)

        logits = self.model(input_ids, words_mask)
        loss = self.compute_loss(logits, labels)

        preds = torch.argmax(torch.softmax(logits, dim=1), dim=1)

        self.test_epoch_predictions = np.concatenate((self.test_epoch_predictions, preds.detach().cpu().numpy()))
        self.test_epoch_labels = np.concatenate((self.test_epoch_labels, labels.detach().cpu().numpy().T[0]))

        self.test_epoch_losses.append(loss.item())

        return loss

    def test_epoch_end(self, _):
        epoch_metric = f1_score(self.test_epoch_labels, self.test_epoch_predictions, average='macro')

        print("Test loss: ", np.mean(self.test_epoch_losses))
        print("Test metric: ", epoch_metric)

        self.test_epoch_labels = []
        self.test_epoch_predictions = []
        self.test_epoch_losses = []

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_data, 
                                           self.params['batch_size'], 
                                           shuffle=self.params['shuffle_train_eval'], 
                                           collate_fn=self.train_data.paddings)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test_data, 
                                           self.params['batch_size'], 
                                           shuffle=self.params['shuffle_train_eval'], 
                                           collate_fn=self.test_data.paddings)

In [20]:
from pytorch_lightning import Trainer
import warnings
warnings.filterwarnings('ignore')

params = {
    "batch_size": 32,
    "shuffle_train_eval": True,
    "lowercase": False
}

lightning_model = LightningFreezedBERT(params).to(device)

trainer = Trainer(
    gpus=1,
    accelerator='gpu',
    max_epochs=3,
    gradient_clip_val=1.0,
    val_check_interval=0.5)

trainer.fit(lightning_model)
trainer.test(lightning_model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /content/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | FreezedBERT      | 109 M 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
2.3 K     Trainable params
109 M     Non-trainable params
109 M     Total params
437.938   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Epoch loss:  0.7222418060839176
Epoch metric:  0.5009355500298862
Epoch loss:  0.722265169507265
Epoch metric:  0.5005531139211549
Epoch loss:  0.7249546636521816
Epoch metric:  0.5003402874905991


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

Test loss:  0.7162020359992981
Test metric:  0.33313327331532794


[{}]

Ячейка для результатов:

CLS : 

MEAN:

### Бонус (2 балла)

В качестве бонуса, вы можете попробовать поиграться с скрытыми состояниями BERT.
Вы можете либо взять среднее от N последних скрытых состояний (например 4), либо их конкатенацию и посмотреть, как ведет себя модель. 

In [28]:
class FreezedLastBERT(BertPreTrainedModel):
    def __init__(self, config: BertConfig):
        super(FreezedLastBERT, self).__init__(config)

        self.bert = BertModel(config)

        for param in self.bert.parameters():
          param.requires_grad = False

        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(768, 3)
        self.relu = nn.ReLU()

    def forward(self, input_ids, mask=None):
        _, _, output = self.bert(input_ids= input_ids, attention_mask=mask,return_dict=False)
        output = (output[0][:, 0, :] + output[1][:, 0, :] + output[2][:, 0, :] + output[3][:, 0, :])/4
        output = self.linear(output)

        return output

In [29]:
class LightningFreezedLastBERT(pl.LightningModule):
    def __init__(self, params):
        super().__init__()

        self.model = FreezedLastBERT(BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True))

        # other members

        self.params = params

        self.train_data = TweetDataset(df_train)
        self.test_data = TweetDataset(df_test)

        self.train_epoch_labels = []
        self.train_epoch_predictions = []
        self.train_epoch_losses = []

        self.test_epoch_labels = []
        self.test_epoch_predictions = []
        self.test_epoch_losses = []

        self.criterion = nn.CrossEntropyLoss(ignore_index=-100)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
        return optimizer

    def compute_loss(self, logits, labels):
        loss = self.criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        return loss

    def training_step(self, batch, _):
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)
        words_mask = batch[2].to(device)

        logits = self.model(input_ids, words_mask)
        loss = self.compute_loss(logits, labels)

        preds = torch.argmax(torch.softmax(logits, dim=1), dim=1)

        self.train_epoch_predictions = np.concatenate((self.train_epoch_predictions, preds.detach().cpu().numpy()))
        self.train_epoch_labels = np.concatenate((self.train_epoch_labels, labels.detach().cpu().numpy().T[0]))

        self.train_epoch_losses.append(loss.item())

        return loss

    def training_epoch_end(self, _):
        epoch_metric = f1_score(self.train_epoch_labels, self.train_epoch_predictions, average='macro')

        print("Epoch loss: ", np.mean(self.train_epoch_losses))
        print("Epoch metric: ", epoch_metric)

        self.train_epoch_labels = []
        self.train_epoch_predictions = []
        self.train_epoch_losses = []

    def test_step(self, batch, _):
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)
        words_mask = batch[2].to(device)

        logits = self.model(input_ids, words_mask)
        loss = self.compute_loss(logits, labels)

        preds = torch.argmax(torch.softmax(logits, dim=1), dim=1)

        self.test_epoch_predictions = np.concatenate((self.test_epoch_predictions, preds.detach().cpu().numpy()))
        self.test_epoch_labels = np.concatenate((self.test_epoch_labels, labels.detach().cpu().numpy().T[0]))

        self.test_epoch_losses.append(loss.item())

        return loss

    def test_epoch_end(self, _):
        epoch_metric = f1_score(self.test_epoch_labels, self.test_epoch_predictions, average='macro')

        print("Test loss: ", np.mean(self.test_epoch_losses))
        print("Test metric: ", epoch_metric)

        self.test_epoch_labels = []
        self.test_epoch_predictions = []
        self.test_epoch_losses = []

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_data, 
                                           self.params['batch_size'], 
                                           shuffle=self.params['shuffle_train_eval'], 
                                           collate_fn=self.train_data.paddings)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test_data, 
                                           self.params['batch_size'], 
                                           shuffle=self.params['shuffle_train_eval'], 
                                           collate_fn=self.test_data.paddings)

In [None]:
from pytorch_lightning import Trainer
import warnings
warnings.filterwarnings('ignore')

params = {
    "batch_size": 32,
    "shuffle_train_eval": True,
    "lowercase": False
}

lightning_model = LightningFreezedLastBERT(params).to(device)

trainer = Trainer(
    gpus=1,
    accelerator='gpu',
    max_epochs=3,
    gradient_clip_val=1.0,
    val_check_interval=0.5)

trainer.fit(lightning_model)
trainer.test(lightning_model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | FreezedLastBERT  | 109 M 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
2.3 K     Trainable params
109 M     Non-trainable params
109 M     Total params
437.938   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Удивительная модель!
