In [1]:
import os, sys
import json

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

import transformers
from transformers import RobertaModel, RobertaTokenizer
from transformers import get_linear_schedule_with_warmup
from sklearn.utils.class_weight import compute_class_weight

from torch.optim.lr_scheduler import ReduceLROnPlateau

device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    torch.cuda.empty_cache()
    torch.set_float32_matmul_precision('high')
    
from utils import read_process_data


def additional_preprocessing(df):

    columns_map = { "text": "body", "sentiment": "emotion"}
    df = df.drop(columns = [column for column in df.columns if column not in columns_map.keys()])
    df.columns = df.columns.map(columns_map)

    df = df.dropna(axis=0, how="any").reset_index(drop=True)

    return df


TRAIN_DATA_PATH = "data/train.csv"
TEST_DATA_PATH = "data/test.csv"

train_data = read_process_data(TRAIN_DATA_PATH)
test_data = read_process_data(TEST_DATA_PATH)

train_data = additional_preprocessing(train_data) 
test_data = additional_preprocessing(test_data) 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lucap\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lucap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lucap\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
bert_model = RobertaModel.from_pretrained(model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
class_weights = compute_class_weight(class_weight="balanced", 
                                     classes= ["negative", "neutral", "positive"], 
                                     y=train_data["emotion"])

class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
class_weights

tensor([1.1773, 0.8239, 1.0674], device='cuda:0')

In [4]:
class SentimentDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data.to_dict()
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_encoder = {
            "negative": 0,
            "neutral": 1,
            "positive": 2
        }

    def __len__(self):
        return len(self.data["body"])

    def __getitem__(self, idx):
        text, label = self.data["body"][idx], self.label_encoder[self.data["emotion"][idx]]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(label, dtype=torch.float)
        }

# Tokenize and prepare the dataset
max_len = 128  # or another appropriate length
train_dataset = SentimentDataset(train_data, tokenizer, max_len)
train_loader = DataLoader(train_dataset, batch_size=128, num_workers=0, shuffle=True)


valid_dataset = SentimentDataset(test_data, tokenizer, max_len)
valid_loader = DataLoader(valid_dataset, batch_size=128, num_workers=0, shuffle=False)


In [5]:
class SentimentClassifier(nn.Module):
    def __init__(self, bert_model):
        super(SentimentClassifier, self).__init__()
        self.bert = bert_model

        for param in list(self.bert.parameters())[:-40]:
            param.requires_grad = False

        self.dropout = nn.Dropout(0.3)

        self.linear = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 256),
            nn.Dropout(0.3),
            nn.LeakyReLU(0.01)
        )
        self.classifier = nn.Linear(256, 3)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )
        dropout_output = self.dropout(pooled_output)
        
        first_layer = self.linear(dropout_output)
        return self.classifier(first_layer)

model = SentimentClassifier(bert_model)

In [6]:
def calculate_accuracy(preds, y):
    max_preds = preds.argmax(dim=1)  # get the index of the max probability for each sample
    correct = max_preds.eq(y)
    return correct.sum().cpu() / torch.FloatTensor([y.shape[0]])


In [7]:
epochs = 100

optimizer = AdamW(model.parameters())
total_steps = len(train_loader) * epochs
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss(weight=class_weights)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = criterion.to(device)

In [8]:
RUN_ID = "roberta-finetune-logs-full-unfrozen"
logger = TensorBoardLogger(RUN_ID)
checkpoint_callback = ModelCheckpoint(
    dirpath=f"results/{RUN_ID}/checkpoints",
    filename="model-{epoch:02d}-{valid_loss:.5f}",
    save_top_k=1,
    monitor="valid_loss",
    mode="min",
    verbose=True
)

In [9]:

class SentimentClassifier(pl.LightningModule):
    def __init__(self, model=model, criterion=criterion, class_weights=class_weights):
        super(SentimentClassifier, self).__init__()
        self.model = model
        self.criterion = criterion
        self.class_weights = class_weights
    
    def calculate_accuracy(self, preds, y):
        max_preds = preds.argmax(dim=1)  # get the index of the max probability for each sample
        correct = max_preds.eq(y)
        return correct.sum().cpu() / torch.FloatTensor([y.shape[0]])


    def forward(self, ids, mask):
        return self.model(ids, mask)

    def training_step(self, batch, batch_idx):
        ids, mask, targets = batch['ids'], batch['mask'], batch['targets']
        outputs = self(ids, mask)
        loss = self.criterion(outputs, targets.long())
        self.log('train_loss', loss, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        ids, mask, targets = batch['ids'], batch['mask'], batch['targets']
        outputs = self(ids, mask)
        loss = self.criterion(outputs, targets.long())
        acc = self.calculate_accuracy(outputs, targets)
        self.log('valid_loss', loss, on_epoch=True, prog_bar=True)
        self.log('valid_acc', acc, on_epoch=True, prog_bar=True)
        # print(f"Valid {loss=}  {acc=}")
        return loss

    def configure_optimizers(self):
        optimizers_dict = {}
        optimizer = AdamW(self.model.parameters(), lr=0.0005)
        optimizers_dict["optimizer"] = optimizer
        
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.25, patience=2, min_lr=0.000005)
        optimizers_dict["lr_scheduler"] = scheduler
        optimizers_dict["monitor"] = "valid_loss"
        return optimizers_dict

In [10]:
sentiment_classifier = SentimentClassifier(model, criterion, class_weights)

# Configure Trainer
trainer = Trainer(max_epochs=epochs,
                  logger=logger)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [11]:
trainer.fit(sentiment_classifier, train_loader, valid_loader)

Missing logger folder: roberta-finetune-logs-full-unfrozen\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                | Params
--------------------------------------------------
0 | model     | SentimentClassifier | 124 M 
1 | criterion | CrossEntropyLoss    | 0     
--------------------------------------------------
19.7 M    Trainable params
105 M     Non-trainable params
124 M     Total params
499.373   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\ProgramData\anaconda3\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
c:\ProgramData\anaconda3\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

c:\ProgramData\anaconda3\Lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [12]:
model_trained = SentimentClassifier.load_from_checkpoint(checkpoint_path="results/roberta-finetune-logs-full-unfrozen/lightning_logs/version_0/checkpoints/epoch=11-step=10308.ckpt")

In [13]:
with torch.no_grad():
    preds, act = [], []
    for batch in valid_loader:
        ids, mask, targets = batch['ids'], batch['mask'], batch['targets']
        cnt_preds = model_trained(ids.to(device), mask.to(device))
        preds.append(cnt_preds.argmax(dim=1))
        act.append(targets)

In [14]:
preds_cat = torch.cat(preds, dim=0).cpu()
act_cat = torch.cat(act, dim=0).cpu()
acc = preds_cat.eq(act_cat).sum() / preds_cat.shape[0]

print(f"Final accuracy: {acc}")

Final accuracy: 0.757498562335968
