<a href="https://colab.research.google.com/github/sheldor07/2d-collider-game/blob/main/finetuning_roberta_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!pip install transformers accelerate -U



In [167]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import torch
import transformers
import torch.nn as nn
from transformers import AdamW


In [168]:
# Step 2: Load and preprocess data
df = pd.read_csv('/content/essayData.csv')

# Converting labels from comma separated string to list of strings
df['terms'] = df['terms'].apply(lambda x: x.split(','))

# Strip leading/trailing whitespaces from the labels
df['terms'] = df['terms'].apply(lambda x: [i.strip() for i in x])

# Binarize the labels
mlb = MultiLabelBinarizer()
df['labels'] = list(mlb.fit_transform(df['terms']))

# Checking the classes we have
print(mlb.classes_)

# Split dataframe
train_data, test_data = train_test_split(df, test_size=0.4, random_state=42)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)


['business' 'humanities' 'sciences' 'technology']


In [169]:
print(df)

     Essay  Section                                           ABSTRACT  \
0        1        1  Four in five Singaporeans say that they can co...   
1        1        1  While the terms ÃfalsehoodsÃ and Ãfake news...   
2        1        2  The digital era is characterised by easy creat...   
3        1        2  It is important to be critical of fake news du...   
4        1        3  Fake news has played an enormous role during C...   
..     ...      ...                                                ...   
231     10       10  The average temperature of Singapore has been ...   
232     10       11  The plants along the facade require large amou...   
233     10       12  Currently, AI regulates the release of water b...   
234     10       13  We would like to merge and build upon these ex...   
235     10       14  We have analysed the severity of global water ...   

                                terms        labels  
0                        [humanities]  [0, 1, 0, 0]  
1  

In [176]:
# Set some parameters
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
LEARNING_RATE = 1e-05
tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

# Define the class for the dataset
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        title = str(self.data.ABSTRACT[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']  # Add this line

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),  # Add this line
            'targets': torch.tensor(self.data.labels[index], dtype=torch.float)
        }


    def __len__(self):
        return self.len

# Create DataLoaders
training_set = Triage(train_data, tokenizer, MAX_LEN)
testing_set = Triage(test_data, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)


In [177]:
import torch.nn
class CustomRoberta(nn.Module):
    def __init__(self):
        super(CustomRoberta, self).__init__()
        self.l1 = transformers.RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [178]:
model = CustomRoberta()
model.to(device)

# Creating the loss function and optimizer
loss_function = torch.nn.BCEWithLogitsLoss()  # Change to BCEWithLogitsLoss for multi-label classification
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Function to calcuate the accuracy of the model
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx == targets.max(dim=1)[1]).sum().item()
    return n_correct


def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _, data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if _ % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct * 100) / nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return


for epoch in range(EPOCHS):
    train(epoch)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  4.59it/s]

Training Loss per 5000 steps: 0.7036645412445068
Training Accuracy per 5000 steps: 0.0


18it [00:05,  3.56it/s]


The Total Accuracy for Epoch 0: 23.404255319148938
Training Loss Epoch: 0.6875591609213088
Training Accuracy Epoch: 23.404255319148938


1it [00:00,  4.54it/s]

Training Loss per 5000 steps: 0.636780858039856
Training Accuracy per 5000 steps: 50.0


18it [00:05,  3.51it/s]


The Total Accuracy for Epoch 1: 20.56737588652482
Training Loss Epoch: 0.6453358464770846
Training Accuracy Epoch: 20.56737588652482


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.6465994715690613
Training Accuracy per 5000 steps: 37.5


18it [00:05,  3.52it/s]


The Total Accuracy for Epoch 2: 21.27659574468085
Training Loss Epoch: 0.6204099820719825
Training Accuracy Epoch: 21.27659574468085


1it [00:00,  4.57it/s]

Training Loss per 5000 steps: 0.5607369542121887
Training Accuracy per 5000 steps: 25.0


18it [00:05,  3.48it/s]


The Total Accuracy for Epoch 3: 29.78723404255319
Training Loss Epoch: 0.5673788024319543
Training Accuracy Epoch: 29.78723404255319


1it [00:00,  4.60it/s]

Training Loss per 5000 steps: 0.5462499856948853
Training Accuracy per 5000 steps: 37.5


18it [00:05,  3.46it/s]


The Total Accuracy for Epoch 4: 37.5886524822695
Training Loss Epoch: 0.5110354456636641
Training Accuracy Epoch: 37.5886524822695


1it [00:00,  4.56it/s]

Training Loss per 5000 steps: 0.44903069734573364
Training Accuracy per 5000 steps: 25.0


18it [00:05,  3.46it/s]


The Total Accuracy for Epoch 5: 52.4822695035461
Training Loss Epoch: 0.45272703137662673
Training Accuracy Epoch: 52.4822695035461


1it [00:00,  4.51it/s]

Training Loss per 5000 steps: 0.3179827928543091
Training Accuracy per 5000 steps: 50.0


18it [00:05,  3.46it/s]


The Total Accuracy for Epoch 6: 56.737588652482266
Training Loss Epoch: 0.3933553712235557
Training Accuracy Epoch: 56.737588652482266


1it [00:00,  4.43it/s]

Training Loss per 5000 steps: 0.39760732650756836
Training Accuracy per 5000 steps: 87.5


18it [00:05,  3.49it/s]


The Total Accuracy for Epoch 7: 59.57446808510638
Training Loss Epoch: 0.35029176539844936
Training Accuracy Epoch: 59.57446808510638


1it [00:00,  4.33it/s]

Training Loss per 5000 steps: 0.33122408390045166
Training Accuracy per 5000 steps: 62.5


18it [00:05,  3.50it/s]


The Total Accuracy for Epoch 8: 66.66666666666667
Training Loss Epoch: 0.3122323461704784
Training Accuracy Epoch: 66.66666666666667


1it [00:00,  4.56it/s]

Training Loss per 5000 steps: 0.2454569935798645
Training Accuracy per 5000 steps: 62.5


18it [00:05,  3.50it/s]

The Total Accuracy for Epoch 9: 66.66666666666667
Training Loss Epoch: 0.2651233540640937
Training Accuracy Epoch: 66.66666666666667





In [179]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if _ % 5000 == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                print(f"Validation Loss per 5000 steps: {loss_step}")
                print(f"Validation Accuracy per 5000 steps: {accu_step}")

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

acc = valid(model, testing_loader)



1it [00:00,  7.88it/s]

Validation Loss per 5000 steps: 0.6245239973068237
Validation Accuracy per 5000 steps: 75.0


24it [00:01, 16.70it/s]

Validation Loss Epoch: 0.4991012793034315
Validation Accuracy Epoch: 45.26315789473684





In [180]:
output_model_file = 'pytorch_roberta_sentiment.bin'
output_vocab_file = './'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed
