# Upload training, validation, and test sets to your drive then run this to mount your drive and have access to files from the notebook

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install latest version of Huggingface and import necessary packages

In [2]:
!pip install -q git+https://github.com/huggingface/transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [3]:
!pip install transformers



In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel

#Q1.

In [5]:
lst=['CS','ECE','MAE','Medical','MAE','biochemistry','Psychology','Psychology','Medical','CS','Medical','CS','MAE','MAE','Psychology','Medical','Civil','Medical','Civil','MAE']
lst

['CS',
 'ECE',
 'MAE',
 'Medical',
 'MAE',
 'biochemistry',
 'Psychology',
 'Psychology',
 'Medical',
 'CS',
 'Medical',
 'CS',
 'MAE',
 'MAE',
 'Psychology',
 'Medical',
 'Civil',
 'Medical',
 'Civil',
 'MAE']

In [6]:
len(lst)

20

In [7]:
from google.colab import files
uploaded=files.upload()

Saving test_llm.csv to test_llm.csv


In [8]:
df_test=pd.read_csv('test_llm.csv')
df_test

Unnamed: 0,Domain,area,keywords,Abstract
0,CS,Data structures,In-memory XPath processing; NESTOR; Set-based...,XML is a pervasive technology for representing...
1,Civil,Ambient Intelligence,Home energy management; persuasive interface;...,The integration of renewable energy sources in...
2,ECE,Electrical generator,non-standard electrical machine; generation o...,The original free-swinging piston engine with ...
3,Medical,Hepatitis C,complications; patient engagement; patient-ce...,Barriers to access and long-term complications...
4,ECE,Control engineering,force feedback haptic interface; virtual real...,This paper is to present a technological solut...
5,CS,Bioinformatics,Bioinformatics; genomics,Transposable elements (TEs) constitute the mos...
6,Medical,Weight Loss,Obesity; weight loss; moral work; body projec...,Cultural notions equating greater morality and...
7,Psychology,Leadership,Data-based decision making; school improvemen...,Although data-based decision making can lead t...
8,Psychology,Seasonal affective disorder,Ramelteon; sleep; agomelatine; depression; in...,Insomnia is common among elderly people and ne...
9,CS,Data structures,Succinct dynamic data structures; Succinct tr...,Cardinal trees (or tries of degree ) are a fun...


In [9]:
lst_actual=df_test['Domain'].str.strip().tolist()
lst_actual

['CS',
 'Civil',
 'ECE',
 'Medical',
 'ECE',
 'CS',
 'Medical',
 'Psychology',
 'Psychology',
 'CS',
 'Medical',
 'CS',
 'MAE',
 'ECE',
 'Psychology',
 'Medical',
 'Civil',
 'biochemistry',
 'Civil',
 'Civil']

In [10]:
total=len(lst_actual)
total

20

In [11]:
correct=0
for i in range(total):
  if lst_actual[i]==lst[i]:
    correct+=1
correct

11

In [12]:
accuracy=correct/total
accuracy

0.55

https://chat.openai.com/share/e6806eaa-8386-4453-8a07-c4af2538daab

## Q2.

In [13]:
lst_trained=['CS', 'ECE', 'MAE', 'Medical', 'ECE', 'biochemistry', 'Psychology', 'Psychology', 'Medical', 'CS', 'Medical', 'CS', 'MAE', 'MAE', 'Psychology', 'Medical', 'Civil', 'Medical', 'MAE', 'MAE']
lst_trained

['CS',
 'ECE',
 'MAE',
 'Medical',
 'ECE',
 'biochemistry',
 'Psychology',
 'Psychology',
 'Medical',
 'CS',
 'Medical',
 'CS',
 'MAE',
 'MAE',
 'Psychology',
 'Medical',
 'Civil',
 'Medical',
 'MAE',
 'MAE']

In [14]:
len(lst_trained)

20

In [15]:
lst_actual

['CS',
 'Civil',
 'ECE',
 'Medical',
 'ECE',
 'CS',
 'Medical',
 'Psychology',
 'Psychology',
 'CS',
 'Medical',
 'CS',
 'MAE',
 'ECE',
 'Psychology',
 'Medical',
 'Civil',
 'biochemistry',
 'Civil',
 'Civil']

In [16]:
total = len(lst_actual)
total

20

In [17]:
correct_trained=0
for i in range(total):
  if lst_trained[i]==lst_actual[i]:
    correct_trained+=1
correct_trained

11

In [18]:
accuracy_trained=correct_trained/total
accuracy_trained

0.55

https://chat.openai.com/share/33d4ef90-0d5a-494b-a3ad-a88f68954427

# Question #3: Prepare the data for fine-tuning using OpenAI Playground

In [19]:
# Load training, test, and validation sets
def load_data(PATH):
  try:
    data=pd.read_csv(PATH)
    return data
  except:
    raise NotImplementedError

In [20]:
df_train=load_data('/content/drive/My Drive/training.csv')
df_train.head()

Unnamed: 0,Domain,area,keywords,Abstract
0,Medical,Hepatitis C,Feasibility study; hepatitis C risk behaviour...,Aims: This study aimed to develop and test the...
1,CS,Distributed computing,Agent Architecture; Mobile Agent; Agent Cloni...,Mobile agent technology is becoming more popul...
2,ECE,Control engineering,educational software tool; multivariable cont...,This paper presents an educational software to...
3,Psychology,False memories,judgment; metamemory; accuracy; eyewitness me...,"Different researchers have reported positive, ..."
4,Psychology,Leadership,Implementation support; Co-occurring disorder...,Background: Incorporating evidence-based integ...


In [21]:
df_val=load_data('/content/drive/My Drive/validation.csv')
df_val.head()

Unnamed: 0,Domain,area,keywords,Abstract
0,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,(2 + 1)-dimensional non-linear optical waves t...
1,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,(beta-amyloid (A beta) and tau pathology becom...
2,Civil,Green Building,LED lighting system; PV system; Distributed l...,(D)ecreasing of energy consumption and environ...
3,ECE,Electric motor,NdFeB magnets; Electric motor; Electric vehic...,(Hybrid) electric vehicles are assumed to play...
4,Medical,Parkinson's Disease,Parkinson's disease; dyskinesia; adenosine A(...,"(L)-3,4-Dihydroxyphenylalanine ((L)-DOPA) rema..."


In [22]:
df_test=load_data('/content/drive/My Drive/test_llm.csv')
df_test.head()

Unnamed: 0,Domain,area,keywords,Abstract
0,CS,Data structures,In-memory XPath processing; NESTOR; Set-based...,XML is a pervasive technology for representing...
1,Civil,Ambient Intelligence,Home energy management; persuasive interface;...,The integration of renewable energy sources in...
2,ECE,Electrical generator,non-standard electrical machine; generation o...,The original free-swinging piston engine with ...
3,Medical,Hepatitis C,complications; patient engagement; patient-ce...,Barriers to access and long-term complications...
4,ECE,Control engineering,force feedback haptic interface; virtual real...,This paper is to present a technological solut...


In [23]:
# Format the training and validation sets
# to make them usable for Open AI playground fine-tuning
# Format:
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
# Use gpt-3.5-turbo
# See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
def make_finetuning_data_gpt(dataset):
  try:
    formatted_data = []
    for index, row in dataset.iterrows():
        # Structure the prompt with the abstract as a user input
        user_message = {"role": "user", "content": row["Abstract"]}

        # Structure the response with the domain as an assistant output
        assistant_message = {"role": "assistant", "content": row["Domain"]}

        # Combine into a single data point
        data_point = {"messages": [user_message, assistant_message]}

        formatted_data.append(data_point)

    return formatted_data
  except:
    raise NotImplementedError

In [24]:
formatted_training_data = make_finetuning_data_gpt(df_train)
formatted_validation_data = make_finetuning_data_gpt(df_val)

In [25]:
import json
with open('formatted_training_data.jsonl', 'w') as file_train:
    for entry in formatted_training_data:
      json.dump(entry, file_train)
      file_train.write('\n')

In [26]:
import json
with open('formatted_validation_data.jsonl', 'w') as file_val:
    for entry in formatted_validation_data:
      json.dump(entry, file_val)
      file_val.write('\n')

In [27]:
from google.colab import files
files.download('formatted_training_data.jsonl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
files.download('formatted_validation_data.jsonl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Without Prompt Engineering

In [29]:
lst_determined=['CS', 'Civil', 'MAE', 'Medical', 'ECE', 'biochemistry', 'Psychology', 'Psychology', 'Medical', 'CS', 'Medical', 'CS', 'Civil', 'MAE', 'Psychology', 'Medical', 'Civil', 'Medical', 'ECE', 'MAE']
lst_determined

['CS',
 'Civil',
 'MAE',
 'Medical',
 'ECE',
 'biochemistry',
 'Psychology',
 'Psychology',
 'Medical',
 'CS',
 'Medical',
 'CS',
 'Civil',
 'MAE',
 'Psychology',
 'Medical',
 'Civil',
 'Medical',
 'ECE',
 'MAE']

In [30]:
len(lst_determined)

20

In [31]:
df_test

Unnamed: 0,Domain,area,keywords,Abstract
0,CS,Data structures,In-memory XPath processing; NESTOR; Set-based...,XML is a pervasive technology for representing...
1,Civil,Ambient Intelligence,Home energy management; persuasive interface;...,The integration of renewable energy sources in...
2,ECE,Electrical generator,non-standard electrical machine; generation o...,The original free-swinging piston engine with ...
3,Medical,Hepatitis C,complications; patient engagement; patient-ce...,Barriers to access and long-term complications...
4,ECE,Control engineering,force feedback haptic interface; virtual real...,This paper is to present a technological solut...
5,CS,Bioinformatics,Bioinformatics; genomics,Transposable elements (TEs) constitute the mos...
6,Medical,Weight Loss,Obesity; weight loss; moral work; body projec...,Cultural notions equating greater morality and...
7,Psychology,Leadership,Data-based decision making; school improvemen...,Although data-based decision making can lead t...
8,Psychology,Seasonal affective disorder,Ramelteon; sleep; agomelatine; depression; in...,Insomnia is common among elderly people and ne...
9,CS,Data structures,Succinct dynamic data structures; Succinct tr...,Cardinal trees (or tries of degree ) are a fun...


In [32]:
lst_actual=df_test['Domain'].str.strip().tolist()
lst_actual

['CS',
 'Civil',
 'ECE',
 'Medical',
 'ECE',
 'CS',
 'Medical',
 'Psychology',
 'Psychology',
 'CS',
 'Medical',
 'CS',
 'MAE',
 'ECE',
 'Psychology',
 'Medical',
 'Civil',
 'biochemistry',
 'Civil',
 'Civil']

In [33]:
total=len(lst_actual)
total

20

In [34]:
correct=0
for i in range(total):
  if lst_actual[i]==lst_determined[i]:
    correct+=1
correct

11

In [35]:
accuracy=correct/total
accuracy

0.55

https://platform.openai.com/playground/p/0BX4x6Ak7zaUDudqjODwZbwl?model=ft:gpt-3.5-turbo-1106:personal:gpt-fine-tuned:9672aibL&mode=chat

## With Prompt Engineering

In [36]:
lst_determined1=['CS','MAE','ECE','Medical','ECE','biochemistry','Psychology','Psychology','Medical','CS','Medical','CS','MAE','MAE','Psychology','Medical','Civil','Medical','ECE','MAE']
lst_determined1

['CS',
 'MAE',
 'ECE',
 'Medical',
 'ECE',
 'biochemistry',
 'Psychology',
 'Psychology',
 'Medical',
 'CS',
 'Medical',
 'CS',
 'MAE',
 'MAE',
 'Psychology',
 'Medical',
 'Civil',
 'Medical',
 'ECE',
 'MAE']

In [37]:
lst_actual

['CS',
 'Civil',
 'ECE',
 'Medical',
 'ECE',
 'CS',
 'Medical',
 'Psychology',
 'Psychology',
 'CS',
 'Medical',
 'CS',
 'MAE',
 'ECE',
 'Psychology',
 'Medical',
 'Civil',
 'biochemistry',
 'Civil',
 'Civil']

In [38]:
total=len(lst_actual)
total

20

In [39]:
correct=0
for i in range(total):
  if lst_actual[i]==lst_determined1[i]:
    correct+=1
correct

12

In [40]:
accuracy=correct/total
accuracy

0.6

https://platform.openai.com/playground/p/0pkZyS46MVZ20wosRgfW3pdZ?model=gpt-3.5-turbo&mode=chat

# Question #4: Fine-tune a Distilbert model on the training set


1) Prepare the data and load the pre-trained model

In [75]:
training_set = load_data('/content/drive/My Drive/training.csv')
validation_set = load_data('/content/drive/My Drive/validation.csv')
test_set = load_data('/content/drive/My Drive/test_llm.csv')

In [76]:
# Format the training, validation, and test sets
# to use them for fine-tuning and evaluation using Distilbert
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
def make_finetuning_data_distilbert(dataset, tokenizer, max_length=512):
    inputs = tokenizer(dataset["Abstract"].tolist(), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    labels = pd.get_dummies(dataset["Domain"]).values.argmax(axis=1)
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": torch.tensor(labels)}

MAX_LENGTH = 512
training_set = make_finetuning_data_distilbert(training_set, tokenizer, MAX_LENGTH)
validation_set = make_finetuning_data_distilbert(validation_set, tokenizer, MAX_LENGTH)
test_set = make_finetuning_data_distilbert(test_set, tokenizer, MAX_LENGTH)

In [77]:
# use the distilbert tokenizer and pre-trained model
#tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")

2) Prepare the dataloader

In [78]:
MAX_LENGTH = MAX_LENGTH
BATCH_SIZE_TRAIN = 8
BATCH_SIZE_VAL = 8

class FTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        # Here, 'encodings' is a dictionary containing 'input_ids' and 'attention_mask'
        item = {key: val[index] for key, val in self.encodings.items()}
        item['labels'] = self.labels[index]
        return item

# Correctly passing the entire dictionaries
dataset_train = FTDataset(training_set, training_set['labels'])
dataloader_train = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE_TRAIN)
dataset_val = FTDataset(validation_set, validation_set['labels'])
dataloader_val = DataLoader(dataset=dataset_val, batch_size=BATCH_SIZE_VAL)

In [79]:
training_set

{'input_ids': tensor([[ 101, 8704, 1024,  ...,    0,    0,    0],
         [ 101, 4684, 4005,  ...,    0,    0,    0],
         [ 101, 2023, 3259,  ...,    0,    0,    0],
         ...,
         [ 101, 1015, 1012,  ...,    0,    0,    0],
         [ 101, 1015, 1012,  ...,    0,    0,    0],
         [ 101, 1037, 2186,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([4, 0, 2, 5, 5, 0, 5, 2, 0, 4, 4, 1, 6, 4, 2, 4, 1, 4, 3, 5, 4, 4, 0, 2,
         6, 6, 4, 6, 6, 5, 6, 0, 2, 6, 5, 6, 2, 6, 6, 6, 6, 1, 1, 1, 3, 4, 6, 6,
         4, 3])}

2) Add (a) trainable layer(s) on top of DistilBert

In [80]:
path= '/content/drive/My Drive/'

In [81]:
df_training_for_labels = pd.read_csv(path+'training.csv')
df_training_for_labels.head()

Unnamed: 0,Domain,area,keywords,Abstract
0,Medical,Hepatitis C,Feasibility study; hepatitis C risk behaviour...,Aims: This study aimed to develop and test the...
1,CS,Distributed computing,Agent Architecture; Mobile Agent; Agent Cloni...,Mobile agent technology is becoming more popul...
2,ECE,Control engineering,educational software tool; multivariable cont...,This paper presents an educational software to...
3,Psychology,False memories,judgment; metamemory; accuracy; eyewitness me...,"Different researchers have reported positive, ..."
4,Psychology,Leadership,Implementation support; Co-occurring disorder...,Background: Incorporating evidence-based integ...


In [82]:
num_labels=df_training_for_labels['Domain'].nunique()
num_labels

7

In [83]:
class FTModel(nn.Module):
    def __init__(self, num_labels):
      try:
        super(FTModel, self).__init__()
        self.bert_model = bert_model
        self.classifier = nn.Linear(self.bert_model.config.dim, num_labels)
      except:
        raise NotImplementedError

    def forward(self,input_ids,attention_mask):
      try:
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        pred = self.classifier(cls_output)
        return pred
      except:
        raise NotImplementedError

model = FTModel(num_labels)

In [84]:
LEARNING_RATE = 0.001
# Use cross-entropy loss
loss_fn = nn.CrossEntropyLoss()

# Initialize Optimizer
optimizer= optim.Adam(model.parameters(),lr= LEARNING_RATE)

In [85]:
# Freeze parameters of the pre-trained Distilbert model
for parameters_1 in model.bert_model.parameters():
    parameters_1.requires_grad = False

3) Write the fine-tuning and evaluation functions

In [86]:
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

def eval_model(model, data_loader, loss_fn, device):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

In [87]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
VAL_FREQUENCY = 1
EPOCHS = 3
for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}/{EPOCHS}")
    train_acc, train_loss = train_epoch(model, dataloader_train, loss_fn, optimizer, device)
    print(f"Train loss {train_loss} accuracy {train_acc}")

    if epoch % VAL_FREQUENCY == 0 or epoch == EPOCHS:
        val_acc, val_loss = eval_model(model, dataloader_val, loss_fn, device)
        print(f"Val loss {val_loss} accuracy {val_acc}")


Epoch 1/3
Train loss 2.0483247893197194 accuracy 0.08
Val loss 1.7680703571864538 accuracy 0.34
Epoch 2/3
Train loss 1.750523294721331 accuracy 0.26
Val loss 1.5295626095363073 accuracy 0.4
Epoch 3/3
Train loss 1.651096957070487 accuracy 0.36
Val loss 1.4328575730323792 accuracy 0.38


# Report accuracy on test set using sklearn.metrics.accuracy_score

In [88]:
from sklearn.metrics import accuracy_score

def evaluate_on_test(model, data_loader, device):
    model = model.eval()
    predictions = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)

            predictions.extend(preds.tolist())
            real_values.extend(labels.tolist())

    return accuracy_score(real_values, predictions)

In [89]:
dataset_test = FTDataset(test_set, test_set['labels'])
dataloader_test = DataLoader(dataset=dataset_test, batch_size=BATCH_SIZE_TRAIN)
evaluate_on_test(model, dataloader_test, device)

0.2

# Question 5: Fine-tune all parameters (requires GPU)


In [106]:
training_set = load_data('/content/drive/My Drive/training.csv')
validation_set = load_data('/content/drive/My Drive/validation.csv')
test_set = load_data('/content/drive/My Drive/test_llm.csv')

In [107]:
# Format the training, validation, and test sets
# to use them for fine-tuning and evaluation using Distilbert
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
def make_finetuning_data_distilbert(dataset, tokenizer, max_length=512):
    inputs = tokenizer(dataset["Abstract"].tolist(), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    labels = pd.get_dummies(dataset["Domain"]).values.argmax(axis=1)
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": torch.tensor(labels)}

MAX_LENGTH = 512
training_set = make_finetuning_data_distilbert(training_set, tokenizer, MAX_LENGTH)
validation_set = make_finetuning_data_distilbert(validation_set, tokenizer, MAX_LENGTH)
test_set = make_finetuning_data_distilbert(test_set, tokenizer, MAX_LENGTH)

In [108]:
MAX_LENGTH = MAX_LENGTH
BATCH_SIZE_TRAIN = 8
BATCH_SIZE_VAL = 8

class FTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        # Here, 'encodings' is a dictionary containing 'input_ids' and 'attention_mask'
        item = {key: val[index] for key, val in self.encodings.items()}
        item['labels'] = self.labels[index]
        return item

# Correctly passing the entire dictionaries
dataset_train = FTDataset(training_set, training_set['labels'])
dataloader_train = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE_TRAIN)
dataset_val = FTDataset(validation_set, validation_set['labels'])
dataloader_val = DataLoader(dataset=dataset_val, batch_size=BATCH_SIZE_VAL)

In [109]:
training_set

{'input_ids': tensor([[ 101, 8704, 1024,  ...,    0,    0,    0],
         [ 101, 4684, 4005,  ...,    0,    0,    0],
         [ 101, 2023, 3259,  ...,    0,    0,    0],
         ...,
         [ 101, 1015, 1012,  ...,    0,    0,    0],
         [ 101, 1015, 1012,  ...,    0,    0,    0],
         [ 101, 1037, 2186,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([4, 0, 2, 5, 5, 0, 5, 2, 0, 4, 4, 1, 6, 4, 2, 4, 1, 4, 3, 5, 4, 4, 0, 2,
         6, 6, 4, 6, 6, 5, 6, 0, 2, 6, 5, 6, 2, 6, 6, 6, 6, 1, 1, 1, 3, 4, 6, 6,
         4, 3])}

In [110]:
bert_model_fullft = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [111]:
path='/content/drive/My Drive/training.csv'
df_training_for_labels = pd.read_csv(path)
df_training_for_labels.head()

Unnamed: 0,Domain,area,keywords,Abstract
0,Medical,Hepatitis C,Feasibility study; hepatitis C risk behaviour...,Aims: This study aimed to develop and test the...
1,CS,Distributed computing,Agent Architecture; Mobile Agent; Agent Cloni...,Mobile agent technology is becoming more popul...
2,ECE,Control engineering,educational software tool; multivariable cont...,This paper presents an educational software to...
3,Psychology,False memories,judgment; metamemory; accuracy; eyewitness me...,"Different researchers have reported positive, ..."
4,Psychology,Leadership,Implementation support; Co-occurring disorder...,Background: Incorporating evidence-based integ...


In [112]:
num_labels=df_training_for_labels['Domain'].nunique()
num_labels

7

In [113]:
class FTModelFull(nn.Module):
    def __init__(self,num_labels):
      try:
        super(FTModelFull, self).__init__()
        self.bert_model_fullft = bert_model_fullft
        self.classifier = nn.Linear(self.bert_model_fullft.config.dim, num_labels)
      except:
        raise NotImplementedError

    def forward(self,input_ids,attention_mask):
      try:
        outputs = self.bert_model_fullft(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        pred = self.classifier(cls_output)
        return pred
      except:
        raise NotImplementedError

model = FTModelFull(num_labels)

In [114]:
# Initialize model but don't freeze Distilbert parameters
#model_full_ft = FTModelFull(num_labels).to('cuda')

In [115]:
from transformers import AdamW

In [116]:
loss_fn = nn.CrossEntropyLoss()

# Choose parameters wisely!
learning_rate = 0.001
adam_epsilon = 0.1

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.2},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)



In [117]:
# Unfreeze parameters of the pre-trained Distilbert model
for parameters in model.bert_model_fullft.parameters():
    parameters.requires_grad = True

In [118]:
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

def eval_model(model, data_loader, loss_fn, device):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

In [119]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
VAL_FREQUENCY = 1
EPOCHS = 3
for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}/{EPOCHS}")
    train_acc, train_loss = train_epoch(model, dataloader_train, loss_fn, optimizer, device)
    print(f"Train loss {train_loss} accuracy {train_acc}")

    if epoch % VAL_FREQUENCY == 0 or epoch == EPOCHS:
        val_acc, val_loss = eval_model(model, dataloader_val, loss_fn, device)
        print(f"Val loss {val_loss} accuracy {val_acc}")

Epoch 1/3
Train loss 1.917412621634347 accuracy 0.2
Val loss 1.8569792338779993 accuracy 0.34
Epoch 2/3
Train loss 1.8940377916608537 accuracy 0.18
Val loss 1.8071059840066093 accuracy 0.34
Epoch 3/3
Train loss 1.8606900147029333 accuracy 0.24
Val loss 1.7540547507149833 accuracy 0.34


# Report accuracy on test set using sklearn.metrics.accuracy_score

In [120]:
from sklearn.metrics import accuracy_score

def evaluate_on_test(model, data_loader, device):
    model = model.eval()
    predictions = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)

            predictions.extend(preds.tolist())
            real_values.extend(labels.tolist())

    return accuracy_score(real_values, predictions)

In [121]:
dataset_test = FTDataset(test_set, test_set['labels'])
dataloader_test = DataLoader(dataset=dataset_test, batch_size=BATCH_SIZE_TRAIN)
evaluate_on_test(model, dataloader_test, device)

0.15