<a href="https://colab.research.google.com/github/sheldor07/Essay-Evaluator-Research-Assistantship/blob/main/finetuning_bert_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!pip install transformers accelerate -U



In [56]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import torch
import transformers
import torch.nn as nn
from transformers import AdamW


In [131]:
# Step 2: Load and preprocess data
df = pd.read_csv('/content/essayData.csv')

# Converting labels from comma separated string to list of strings
df['terms'] = df['terms'].apply(lambda x: x.split(','))

# Strip leading/trailing whitespaces from the labels
df['terms'] = df['terms'].apply(lambda x: [i.strip() for i in x])

# Binarize the labels
mlb = MultiLabelBinarizer()
df['labels'] = list(mlb.fit_transform(df['terms']))

# Checking the classes we have
print(mlb.classes_)

# Split dataframe
train_data, test_data = train_test_split(df, test_size=0.4, random_state=42)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)


['business' 'humanities' 'sciences' 'technology']


In [132]:
print(df)

     Essay  Section                                           ABSTRACT  \
0        1        1  Four in five Singaporeans say that they can co...   
1        1        1  While the terms ÃfalsehoodsÃ and Ãfake news...   
2        1        2  The digital era is characterised by easy creat...   
3        1        2  It is important to be critical of fake news du...   
4        1        3  Fake news has played an enormous role during C...   
..     ...      ...                                                ...   
231     10       10  The average temperature of Singapore has been ...   
232     10       11  The plants along the facade require large amou...   
233     10       12  Currently, AI regulates the release of water b...   
234     10       13  We would like to merge and build upon these ex...   
235     10       14  We have analysed the severity of global water ...   

                                terms        labels  
0                        [humanities]  [0, 1, 0, 0]  
1  

In [152]:
# Set some parameters
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 20
LEARNING_RATE = 1e-05
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

# Define the class for the dataset
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        title = str(self.data.ABSTRACT[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']  # Add this line

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),  # Add this line
            'targets': torch.tensor(self.data.labels[index], dtype=torch.float)
        }


    def __len__(self):
        return self.len

# Create DataLoaders
training_set = Triage(train_data, tokenizer, MAX_LEN)
testing_set = Triage(test_data, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)


In [153]:
class CustomBERT(nn.Module):
    def __init__(self):
        super(CustomBERT, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained("bert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 4)  # Change the output size to 4

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output


In [154]:
model = CustomBERT()
model.to(device)

# Creating the loss function and optimizer
loss_function = torch.nn.BCEWithLogitsLoss()  # Change to BCEWithLogitsLoss for multi-label classification
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Function to calcuate the accuracy of the model
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx == targets.max(dim=1)[1]).sum().item()
    return n_correct


def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _, data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if _ % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct * 100) / nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return


for epoch in range(EPOCHS):
    train(epoch)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
1it [00:00,  5.35it/s]

Training Loss per 5000 steps: 0.689335286617279
Training Accuracy per 5000 steps: 25.0


18it [00:05,  3.55it/s]


The Total Accuracy for Epoch 0: 18.43971631205674
Training Loss Epoch: 0.6650684873263041
Training Accuracy Epoch: 18.43971631205674


1it [00:00,  4.65it/s]

Training Loss per 5000 steps: 0.6136883497238159
Training Accuracy per 5000 steps: 37.5


18it [00:05,  3.53it/s]


The Total Accuracy for Epoch 1: 19.148936170212767
Training Loss Epoch: 0.6214131977823045
Training Accuracy Epoch: 19.148936170212767


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.5295875072479248
Training Accuracy per 5000 steps: 37.5


18it [00:05,  3.54it/s]


The Total Accuracy for Epoch 2: 24.822695035460992
Training Loss Epoch: 0.5852986474831899
Training Accuracy Epoch: 24.822695035460992


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.5707253217697144
Training Accuracy per 5000 steps: 12.5


18it [00:05,  3.55it/s]


The Total Accuracy for Epoch 3: 31.914893617021278
Training Loss Epoch: 0.5396412644121382
Training Accuracy Epoch: 31.914893617021278


1it [00:00,  4.56it/s]

Training Loss per 5000 steps: 0.4747523069381714
Training Accuracy per 5000 steps: 50.0


18it [00:05,  3.54it/s]


The Total Accuracy for Epoch 4: 45.39007092198582
Training Loss Epoch: 0.4996279196606742
Training Accuracy Epoch: 45.39007092198582


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.4510856866836548
Training Accuracy per 5000 steps: 37.5


18it [00:05,  3.57it/s]


The Total Accuracy for Epoch 5: 49.645390070921984
Training Loss Epoch: 0.459328497449557
Training Accuracy Epoch: 49.645390070921984


1it [00:00,  4.58it/s]

Training Loss per 5000 steps: 0.38370761275291443
Training Accuracy per 5000 steps: 75.0


18it [00:05,  3.57it/s]


The Total Accuracy for Epoch 6: 53.90070921985816
Training Loss Epoch: 0.423645771212048
Training Accuracy Epoch: 53.90070921985816


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.42064252495765686
Training Accuracy per 5000 steps: 37.5


18it [00:05,  3.56it/s]


The Total Accuracy for Epoch 7: 58.86524822695036
Training Loss Epoch: 0.3886156032482783
Training Accuracy Epoch: 58.86524822695036


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.3764053285121918
Training Accuracy per 5000 steps: 62.5


18it [00:04,  3.61it/s]


The Total Accuracy for Epoch 8: 58.156028368794324
Training Loss Epoch: 0.35020240975750816
Training Accuracy Epoch: 58.156028368794324


1it [00:00,  4.68it/s]

Training Loss per 5000 steps: 0.42847704887390137
Training Accuracy per 5000 steps: 62.5


18it [00:04,  3.60it/s]


The Total Accuracy for Epoch 9: 59.57446808510638
Training Loss Epoch: 0.3134437981579039
Training Accuracy Epoch: 59.57446808510638


1it [00:00,  4.66it/s]

Training Loss per 5000 steps: 0.29217201471328735
Training Accuracy per 5000 steps: 50.0


18it [00:05,  3.59it/s]


The Total Accuracy for Epoch 10: 64.53900709219859
Training Loss Epoch: 0.27835561003949905
Training Accuracy Epoch: 64.53900709219859


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.25941383838653564
Training Accuracy per 5000 steps: 37.5


18it [00:04,  3.61it/s]


The Total Accuracy for Epoch 11: 63.12056737588652
Training Loss Epoch: 0.24930295762088564
Training Accuracy Epoch: 63.12056737588652


1it [00:00,  4.59it/s]

Training Loss per 5000 steps: 0.20411933958530426
Training Accuracy per 5000 steps: 37.5


18it [00:05,  3.59it/s]


The Total Accuracy for Epoch 12: 68.08510638297872
Training Loss Epoch: 0.22378453529543346
Training Accuracy Epoch: 68.08510638297872


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.17091527581214905
Training Accuracy per 5000 steps: 75.0


18it [00:04,  3.60it/s]


The Total Accuracy for Epoch 13: 63.829787234042556
Training Loss Epoch: 0.20203787088394165
Training Accuracy Epoch: 63.829787234042556


1it [00:00,  4.70it/s]

Training Loss per 5000 steps: 0.16904273629188538
Training Accuracy per 5000 steps: 75.0


18it [00:05,  3.60it/s]


The Total Accuracy for Epoch 14: 61.702127659574465
Training Loss Epoch: 0.17571299895644188
Training Accuracy Epoch: 61.702127659574465


1it [00:00,  4.61it/s]

Training Loss per 5000 steps: 0.1507468968629837
Training Accuracy per 5000 steps: 75.0


18it [00:05,  3.38it/s]


The Total Accuracy for Epoch 15: 62.4113475177305
Training Loss Epoch: 0.1609327851070298
Training Accuracy Epoch: 62.4113475177305


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.1545194685459137
Training Accuracy per 5000 steps: 50.0


18it [00:05,  3.58it/s]


The Total Accuracy for Epoch 16: 65.2482269503546
Training Loss Epoch: 0.1453128270804882
Training Accuracy Epoch: 65.2482269503546


1it [00:00,  4.63it/s]

Training Loss per 5000 steps: 0.1423581838607788
Training Accuracy per 5000 steps: 25.0


18it [00:05,  3.58it/s]


The Total Accuracy for Epoch 17: 62.4113475177305
Training Loss Epoch: 0.13400823457373512
Training Accuracy Epoch: 62.4113475177305


1it [00:00,  4.56it/s]

Training Loss per 5000 steps: 0.12187384814023972
Training Accuracy per 5000 steps: 75.0


18it [00:05,  3.56it/s]


The Total Accuracy for Epoch 18: 68.08510638297872
Training Loss Epoch: 0.12503968386186493
Training Accuracy Epoch: 68.08510638297872


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.09130550175905228
Training Accuracy per 5000 steps: 87.5


18it [00:05,  3.59it/s]

The Total Accuracy for Epoch 19: 68.08510638297872
Training Loss Epoch: 0.11290984228253365
Training Accuracy Epoch: 68.08510638297872





In [166]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if _ % 5000 == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                print(f"Validation Loss per 5000 steps: {loss_step}")
                print(f"Validation Accuracy per 5000 steps: {accu_step}")

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

acc = valid(model, testing_loader)



2it [00:00,  9.76it/s]

Validation Loss per 5000 steps: 0.655745267868042
Validation Accuracy per 5000 steps: 25.0


24it [00:01, 12.90it/s]

Validation Loss Epoch: 0.5850999560207129
Validation Accuracy Epoch: 40.0





In [157]:
output_model_file = 'pytorch_roberta_sentiment.bin'
output_vocab_file = './'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed
