In [None]:
# Step 1: Uninstall all conflicting or broken packages
!pip uninstall -y torch torchvision torchaudio transformers numpy

# Step 2: Reinstall numpy (downgrade to compatible version <2.0)
!pip install numpy==1.24.4

# Step 3: Install PyTorch that matches your CUDA version (Colab uses CUDA 11.8)
!pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118



#!pip install accelerate
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


In [None]:
# Step 4: Reinstall transformers
!pip install transformers==4.37.2  # Or latest stable if needed

In [None]:
!pip uninstall -y torch torchvision torchaudio transformers numpy accelerate

# Step 5: Install PyTorch that matches your CUDA version (Colab uses CUDA 11.8)

!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Step 6: Install transformers and accelerate

#!pip install transformers accelerate

In [1]:
import numpy as np
import pandas as pd
import re,warnings, gc,torch
import torch.nn as nn
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertForSequenceClassification,DistilBertTokenizerFast, RobertaTokenizerFast, RobertaForSequenceClassification, RobertaModel, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
import os

In [3]:

pharm_data = pd.read_csv("/content/classification_data.csv")

label_encoder = LabelEncoder()
#Encoding our labels to numbers
pharm_data['label_encoded'] = label_encoder.fit_transform(pharm_data['label'])

In [4]:
#converting text to lower case because roberta base is case sensitive.
pharm_data['text']= pharm_data.text.str.lower()
pharm_data.head()

Unnamed: 0,id,text,label,label_encoded
0,1,drugz caused severe rashes in some participants.,Adverse Effect,0
1,2,no significant side effects were observed duri...,Positive Outcome,2
2,3,participants were monitored every two weeks.,Neutral Observation,1
3,4,increased liver enzymes were noted post-treatm...,Adverse Effect,0
4,5,patients were instructed to maintain a food di...,Neutral Observation,1


In [17]:
# shuffling the data and splitting into training, testing datasts
pharm_data = pharm_data.sample(frac=1.0)
x_train,x_val,y_train,y_val = train_test_split(pharm_data['text'].tolist(),pharm_data['label_encoded'].tolist(),test_size = 0.01,shuffle=True,random_state = 42)

In [18]:
# garbase collection to free up cache
gc.collect()

159

In [19]:
# again split training dataset to save some original texts for prediction.
train,val,train_label,val_label = train_test_split(x_train,y_train,test_size = 0.15,shuffle=True,random_state = 42)

In [20]:
len(train)

841

In [21]:
# Tokenize the inputs and create a dataset class for converting the embeddings to tensor and collect input embeddings, attention masks and labels for each text in the corpus
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
train_encodings = tokenizer(train, truncation=True, padding=True)
val_encodings = tokenizer(val, truncation=True, padding=True)


torch.backends.cuda.matmul.allow_tf32 = True

class DataPrep(torch.utils.data.Dataset):
    def __init__(self, embed, labels):
        self.embed = embed
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.embed.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = DataPrep(train_encodings, train_label)
val_dataset = DataPrep(val_encodings, val_label)

In [22]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_encoder.classes_))
lora_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,  # Token Classification
    inference_mode=False,
    r=8,              # Low-rank dimension
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 887,811 || all params: 125,535,750 || trainable%: 0.7072


In [23]:
# initiate the model and supply training arguments like learning rate, number of epochs and batch size so that the data will be processed in batches and does not take much time for fine tuning,
# After initiating arguments, pass them along with the model to trainer for fine tuning the LLM.


training_args = TrainingArguments(
    output_dir="/tmp/results",

    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="/tmp/logs",
    logging_steps=10,
    save_strategy="no",
    load_best_model_at_end=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()



No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,1.1027
20,1.0963
30,1.0832
40,1.0825
50,1.0859
60,1.0655
70,1.0601
80,1.0355
90,1.0036
100,0.9652


TrainOutput(global_step=265, training_loss=0.6697901786498304, metrics={'train_runtime': 464.6775, 'train_samples_per_second': 9.049, 'train_steps_per_second': 0.57, 'total_flos': 32749817899500.0, 'train_loss': 0.6697901786498304, 'epoch': 5.0})

In [24]:
# Evaluate the model's performance of validation dataset.
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.13370844721794128, 'eval_runtime': 7.8279, 'eval_samples_per_second': 19.035, 'eval_steps_per_second': 1.277, 'epoch': 5.0}


In [25]:
# Custome function to predict the label of each text. First tokenize the text, pass the encodings to the same device where model is running and collect the outputs.
def predict(text):
  with torch.no_grad():
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    device = model.device
    # Move the input tensors to the same device as the model
    inputs = {key: val.to(device) for key, val in inputs.items()}
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
  return label_encoder.inverse_transform([predicted_class])[0]



In [26]:
# Display outputs.
predicted = []
for i in range(len(x_val)):
  label = predict(x_val[i])
  le = label_encoder.transform([label])
  predicted.append(le)
  print(f"The text = {x_val[i]}, the predicted label of the text = {label}  , label's encoding = {le[0]}, and the original label = {y_val[i]}")

The text = enrollment criteria included age and weight specifications., the predicted label of the text = Neutral Observation  , label's encoding = 1, and the original label = 1
The text = the patient experienced nausea after taking drugx., the predicted label of the text = Adverse Effect  , label's encoding = 0, and the original label = 0
The text = patients were instructed to maintain a food diary., the predicted label of the text = Neutral Observation  , label's encoding = 1, and the original label = 1
The text = marked improvement in blood pressure control was achieved with druge., the predicted label of the text = Positive Outcome  , label's encoding = 2, and the original label = 2
The text = severe allergic reactions were observed following drugb administration., the predicted label of the text = Adverse Effect  , label's encoding = 0, and the original label = 0
The text = severe allergic reactions were observed following drugb administration., the predicted label of the text = A

In [27]:
#Evaluation metrics
print("accuracy of the classification = ",accuracy_score(y_val,predicted))
print("F1 score = ",f1_score(y_val,predicted,average='weighted'))
print("precision score = ",precision_score(y_val,predicted,average='weighted'))
print("recall score = ",recall_score(y_val,predicted,average='weighted'))

accuracy of the classification =  1.0
F1 score =  1.0
precision score =  1.0
recall score =  1.0


#Named Entity Recognition

In [36]:
warnings.filterwarnings("ignore")

#Named Entity Recognition - Same as above, we are reading, splitting and tokenizing the data. However, we need to recognize the specific parts of input rather than classification.
# Thus, in data preparation, we are trying to match each label to the word for fine tuning LLM

ner_data = pd.read_csv("/content/ner_data.csv")


ner_data['tag'].value_counts()

# Group data by sentence_id to reconstruct sentences and their tags
grouped_data = ner_data.groupby('sentence_id').agg(
    sentence=('word', lambda x: ' '.join(x)),
    tags=('tag', list)
).reset_index()

# Use LabelEncoder on the unique tags
le = LabelEncoder()
# Fit on ALL unique tags including 'O'
all_tags = sorted(list(ner_data['tag'].unique()))
le.fit(all_tags)

# Apply encoding to the grouped tags
grouped_data['encoded_tags'] = grouped_data['tags'].apply(lambda x: le.transform(x).tolist())

# Split data at the sentence level
train_sentences, val_sentences, train_tags, val_tags = train_test_split(
    grouped_data['sentence'].tolist(),
    grouped_data['encoded_tags'].tolist(),
    test_size=0.005,
    shuffle=True,
    random_state=42
)
actual_train,actual_val,actual_train_label,actual_val_label = train_test_split(train_sentences,train_tags,test_size = 0.15,shuffle=True,random_state = 42)

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# Tokenize the sentences and align labels
def tokenize_and_align_labels(sentences, tags, tokenizer):
    tokenized_inputs = tokenizer(sentences, max_length=50, truncation=True, padding=True, is_split_into_words=False,return_offsets_mapping=True)

    labels = []
    for i, label in enumerate(tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # Initialize with -100 for all tokens
        label_ids = [-100] * len(word_ids)

        previous_word_idx = None
        for token_idx, word_idx in enumerate(word_ids):
            if word_idx is not None:
                # Only label the first token of a word
                if word_idx != previous_word_idx:

                    if word_idx < len(label):
                         label_ids[token_idx] = label[word_idx]
                    else:
                         pass # or set to -100, or the O tag if available


            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


train_encodings_ner = tokenize_and_align_labels(actual_train, actual_train_label, tokenizer)
val_encodings_ner = tokenize_and_align_labels(actual_val, actual_val_label, tokenizer)

torch.backends.cuda.matmul.allow_tf32 = True

class DataPrepNER(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):

        item = {}
        for key, val in self.encodings.items():
             if isinstance(val[idx], list):
                 item[key] = torch.tensor(val[idx])
             else:
                 item[key] = val[idx]

        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


ner_train_df = DataPrepNER(train_encodings_ner)
ner_val_df = DataPrepNER(val_encodings_ner)


auto_model = AutoModelForTokenClassification.from_pretrained("roberta-base", num_labels=len(le.classes_))
# Pass id2label and label2id to the model config
id2label = {i: tag for i, tag in enumerate(all_tags)}
label2id = {tag: i for i, tag in enumerate(all_tags)}

auto_model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(le.classes_),
    id2label=id2label,
    label2id=label2id
)


ner_train_args = TrainingArguments(
    output_dir="/tmp/results",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="/tmp/logs",
    logging_steps=10,
    save_strategy="no",
    load_best_model_at_end=False,
    # Add evaluation strategy

)
ner_train = Trainer(
    model = auto_model,
    args = ner_train_args,
    train_dataset = ner_train_df,
    eval_dataset = ner_val_df
)
ner_train.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.5295
20,0.1071
30,0.0136
40,0.0025
50,0.0017


TrainOutput(global_step=53, training_loss=0.1235515240070252, metrics={'train_runtime': 4.8888, 'train_samples_per_second': 172.844, 'train_steps_per_second': 10.841, 'total_flos': 6899929863840.0, 'train_loss': 0.1235515240070252, 'epoch': 1.0})

In [37]:
# Evaluating Model on the validation dataset..
ner_train.evaluate()

{'eval_loss': 0.0005763536901213229,
 'eval_runtime': 0.1512,
 'eval_samples_per_second': 992.032,
 'eval_steps_per_second': 66.135,
 'epoch': 1.0}

In [38]:
# Custome function to predict the label of each word
def ner_predict(text):
  with torch.no_grad():
    inputs = tokenizer(text, max_length = "max_length",padding=True, truncation=True, return_tensors="pt")
    device = auto_model.device
    # Move the input tensors to the same device as the model
    inputs = {key: val.to(device) for key, val in inputs.items()}
    outputs = auto_model(**inputs)
    logits = outputs.logits
    return logits

In [39]:
# Trying to tokenize the input text and send the embeddings to the same device where model is running. Further, we are trying to get the label of each word in the input sentence.
def word_label_predict(text):
  inputs = tokenizer(text, return_tensors="pt", return_offsets_mapping=True, truncation=True)
  offsets = inputs.pop("offset_mapping")  # Not needed for model but needed for word alignment
  device = auto_model.device
  # Move the input tensors to the same device as the model
  inputs = {key: val.to(device) for key, val in inputs.items()}
  # Run model
  with torch.no_grad():
      outputs = auto_model(**inputs)

      predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()

  # Convert token predictions to labels
  predicted_labels = le.inverse_transform(predictions)

  # Align tokens with original words
  tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
  word_mapping = tokenizer(text, return_offsets_mapping=True)["offset_mapping"]
  print("word tokens = ",tokens)
  print("word mapping = ",word_mapping)
  print("predicted labels = ",predicted_labels)
  h_m = {}
  for token, offset, label in zip(tokens, word_mapping, predicted_labels):
      if token.startswith("Ġ"):  # Roberta adds "Ġ" to indicate word starts
          token = token[1:]
      if offset != (0, 0):  # Skip special tokens like <s>, </s>
          word = text[offset[0]:offset[1]]
          h_m[word] = label
  return h_m


In [40]:
# The outputs represents word tokens, the tokenized indexes (word mapping) and predicted label for each tokenized word.
for i in val_sentences:
  dictionary = word_label_predict(i)

  print(f"The sentence = {i}")
  for key in dictionary:
    print(f"{key} : {dictionary[key]}")
  print("-------------------------------------------------")

word tokens =  ['<s>', 'Pat', 'ients', 'Ġexperienced', 'Ġdizz', 'iness', 'Ġafter', 'Ġtaking', 'Ġ500', 'mg', 'Ġof', 'ĠDrug', 'A', '</s>']
word mapping =  [(0, 0), (0, 3), (3, 8), (9, 20), (21, 25), (25, 30), (31, 36), (37, 43), (44, 47), (47, 49), (50, 52), (53, 57), (57, 58), (0, 0)]
predicted labels =  ['O' 'O' 'O' 'O' 'B-SYMPTOM' 'B-SYMPTOM' 'O' 'O' 'B-DOSAGE' 'O' 'O' 'O'
 'O' 'O']
The sentence = Patients experienced dizziness after taking 500mg of DrugA
Pat : O
ients : O
experienced : O
dizz : B-SYMPTOM
iness : B-SYMPTOM
after : O
taking : O
500 : B-DOSAGE
mg : O
of : O
Drug : O
A : O
-------------------------------------------------
word tokens =  ['<s>', 'Pat', 'ients', 'Ġexperienced', 'Ġnausea', 'Ġafter', 'Ġadministration', 'Ġof', 'Ġ500', 'mg', 'Ġof', 'ĠDrug', 'B', '</s>']
word mapping =  [(0, 0), (0, 3), (3, 8), (9, 20), (21, 27), (28, 33), (34, 48), (49, 51), (52, 55), (55, 57), (58, 60), (61, 65), (65, 66), (0, 0)]
predicted labels =  ['O' 'O' 'O' 'O' 'B-SYMPTOM' 'O' 'O' 'O' '

# *Transparency*
## Google Colab uses AI to automatically generate the code and I took the help of code generated in such way for Named Entity Recognition Task.

