#Dependanceis

In [1]:
!nvidia-smi

Tue Dec 31 03:38:16 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8             10W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [2]:
!pip install --quiet transformers
!pip install --quiet pytorch-lightning
!pip install --quiet tokenizers

In [3]:
from typing import List, Dict
import tqdm.notebook as tq
from tqdm.notebook import tqdm
import json
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import DataLoader,Dataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer,
    get_linear_schedule_with_warmup
    )



In [4]:
pl.seed_everything(42)

42

In [5]:
!pip --quiet install datasets

In [6]:
from datasets import load_dataset
dataset = load_dataset("race",'all')

README.md:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/37.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4934 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/87866 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4887 [00:00<?, ? examples/s]

#Explore the Dataset

> Add blockquote



In [7]:
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 4934
    })
    train: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 87866
    })
    validation: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 4887
    })
})


In [8]:
# dataset structre


# 1- article
print("Main article \n",dataset['train']['article'][2])

# 2- question
print("Question \n",dataset['train']['question'][2])

# 3- options
print("Options \n",dataset['train']['options'][2])

# 4- answer
print("Answer \n",dataset['train']['answer'][2])

Main article 
 Last week I talked with some of my students about what they wanted to do after they graduated, and what kind of job prospects  they thought they had.
Given that I teach students who are training to be doctors, I was surprised do find that most thought that they would not be able to get the jobs they wanted without "outside help". "What kind of help is that?" I asked, expecting them to tell me that they would need a   or family friend to help them out.
"Surgery ," one replied.
I was pretty alarmed by that response. It seems that the graduates of today are increasingly willing to go under the knife to get ahead of others when it comes to getting a job .
One girl told me that she was considering surgery to increase her height. "They break your legs, put in special extending screws, and slowly expand the gap between the two ends of the bone as it re-grows, you can get at least 5 cm taller!"
At that point, I was shocked. I am short, I can't deny that, but I don't think I woul

In [9]:
print(list(set(dataset['train']['answer'])))

['A', 'C', 'D', 'B']


In [10]:
dataset['train']['options'][2]

['everyone should purchase perfection, whatever the cost',
 "it's right for graduates to ask for others to help them out in hunting for jobs",
 "it is one's appearance instead of skills that really matters in one's career",
 'media are to blame for misleading young people in their seeking for surgery']

#Create The dataset for training

In [11]:
def create_dataset(old_dataset):
    correct_answer_indices = [ord(x) - 65 for x in old_dataset['answer']]
    correct_answers = [options.pop(idx) for options, idx in zip(old_dataset['options'], correct_answer_indices)]
    new_data = pd.DataFrame({
        'context': old_dataset['article'],
        'question': old_dataset['question'],
        'correct': correct_answers,
        'incorrect1': [options[0] for options in old_dataset['options']],
        'incorrect2': [options[1] for options in old_dataset['options']],
        'incorrect3': [options[2] for options in old_dataset['options']]
    })
    return new_data

In [12]:
race_train_df = create_dataset(dataset['train'])
race_val_df = create_dataset(dataset['validation'])
race_test_df = create_dataset(dataset['test'])

In [13]:
train_df = race_train_df
val_df = race_val_df
test_df = race_test_df

train_df.to_csv('race_train_df.csv', index=False)
val_df.to_csv('race_val_df.csv', index=False)
test_df.to_csv('race_test_df.csv', index=False)

# Max Token Size

In [14]:
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]



In [None]:
context_token_lens = []
question_token_lens = []
answer_token_lens = []
incorrect_token_lens = []

for i in tq.tqdm(range(len(train_df))):
    context_token_lens.append(len(tokenizer(train_df.iloc[i]['context'])['input_ids']))
    question_token_lens.append(len(tokenizer(train_df.iloc[i]['question'])['input_ids']))
    answer_token_lens.append(len(tokenizer(train_df.iloc[i]['correct'])['input_ids']))
    incorrect_token_lens.append(len(tokenizer(train_df.iloc[i]['incorrect1'] + train_df.iloc[i]['incorrect2'] + train_df.iloc[i]['incorrect3'])['input_ids']))

for i in tq.tqdm(range(len(test_df))):
    context_token_lens.append(len(tokenizer(test_df.iloc[i]['context'])['input_ids']))
    question_token_lens.append(len(tokenizer(test_df.iloc[i]['question'])['input_ids']))
    answer_token_lens.append(len(tokenizer(test_df.iloc[i]['correct'])['input_ids']))
    incorrect_token_lens.append(len(tokenizer(test_df.iloc[i]['incorrect1'] + test_df.iloc[i]['incorrect2'] + test_df.iloc[i]['incorrect3'])['input_ids']))


for i in tq.tqdm(range(len(val_df))):
    context_token_lens.append(len(tokenizer(val_df.iloc[i]['context'])['input_ids']))
    question_token_lens.append(len(tokenizer(val_df.iloc[i]['question'])['input_ids']))
    answer_token_lens.append(len(tokenizer(val_df.iloc[i]['correct'])['input_ids']))
    incorrect_token_lens.append(len(tokenizer(val_df.iloc[i]['incorrect1'] + val_df.iloc[i]['incorrect2'] + val_df.iloc[i]['incorrect3'])['input_ids']))

In [None]:
#Context
pd.DataFrame(context_token_lens).describe()

In [None]:
#Question
pd.DataFrame(question_token_lens).describe()

In [None]:
#Correct Answer
pd.DataFrame(answer_token_lens).describe()

In [None]:
#Incorrect Answer
pd.DataFrame(incorrect_token_lens).describe()

# Training

In [15]:
train_df.head()

Unnamed: 0,context,question,correct,incorrect1,incorrect2,incorrect3
0,Last week I talked with some of my students ab...,We can know from the passage that the author w...,teacher,doctor,model,teacher
1,Last week I talked with some of my students ab...,Many graduates today turn to cosmetic surgery ...,get an advantage over others in job-hunting,marry a better man/woman,become a model,get an advantage over others in job-hunting
2,Last week I talked with some of my students ab...,"According to the passage, the author believes ...",media are to blame for misleading young people...,"everyone should purchase perfection, whatever ...",it's right for graduates to ask for others to ...,it is one's appearance instead of skills that ...
3,Last week I talked with some of my students ab...,Which' s the best title for the passage?.,Young Graduates Look to Surgery for Better Jobs,Young Graduates Have Higher Expectations,Young Graduates Look to Surgery for Better Jobs,Young Graduates' Opinion About Cosmetic Surgery
4,"YUZHOU, HENAN -An accident in a central China ...",What could be the best title for this passage?,A Coal Mine Accident in Central China,Death Toll Rises in an Accident in China,A Coal Mine Accident in Central China,An Accident in Central China


## Pytorch Modules

In [16]:
SEP_TOKEN = '<sep>'

In [20]:
class QGDataset(Dataset):
    def __init__(self, data: pd.DataFrame, tokenizer: T5Tokenizer, source_max_token_len: int, target_max_token_len: int):
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]

        source = f"{data_row['question']} {SEP_TOKEN} {data_row['context']} {SEP_TOKEN} {data_row['correct']}"
        target = f"{data_row['incorrect1']} {SEP_TOKEN} {data_row['incorrect2']} {SEP_TOKEN} {data_row['incorrect3']}"

        source_encoding = self.tokenizer(
            source,
            max_length=self.source_max_token_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        target_encoding = self.tokenizer(
            target,
            max_length=self.target_max_token_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        labels = target_encoding["input_ids"]
        labels[labels == 0] = -100

        return {
            "input_ids": source_encoding["input_ids"].squeeze(),
            "attention_mask": source_encoding["attention_mask"].squeeze(),
            "labels": labels.squeeze(),
        }


In [21]:
class QGDataModule(pl.LightningDataModule):

    def __init__(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size,
        source_max_token_len: int,
        target_max_token_len: int
        ):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self,stage=None):
        if stage == 'fit' or stage is None:
            self.train_dataset = QGDataset(self.train_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
            self.val_dataset = QGDataset(self.val_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        if stage == 'test' or stage is None:
            self.test_dataset = QGDataset(self.test_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size = self.batch_size, shuffle=True, num_workers = 2)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=2)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=2)

In [22]:
# Hyperparameters
MODEL_NAME = 't5-base'
SOURCE_MAX_TOKEN_LEN = 512
TARGET_MAX_TOKEN_LEN = 64

N_EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 2e-5

MODEL_SAVE_NAME = '100200'
RUN_TRAINING_CELLS=True

In [23]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
print('tokenizer len before: ', len(tokenizer))
tokenizer.add_tokens(SEP_TOKEN)
print('tokenizer len after: ', len(tokenizer))
TOKENIZER_LEN = len(tokenizer)

data_module = QGDataModule(train_df, val_df, test_df, tokenizer, BATCH_SIZE, SOURCE_MAX_TOKEN_LEN, TARGET_MAX_TOKEN_LEN)
data_module.setup()

tokenizer len before:  32100
tokenizer len after:  32101


In [24]:
class QGModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
        self.model.resize_token_embeddings(TOKENIZER_LEN) #resizing after adding new tokens to the tokenizer

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, output = self(input_ids, attention_mask, labels)
        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, output = self(input_ids, attention_mask, labels)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, output = self(input_ids, attention_mask, labels)
        self.log('test_loss', loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.999), eps=1e-08)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=self.trainer.estimated_stepping_batches
        )
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

In [25]:
checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    filename='Distractor-generation-checkpoints',
    save_top_k=-1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

# Define the Trainer
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    accelerator='gpu',
    devices=1,
    enable_progress_bar=True,
)

In [26]:
!pip install gdown
!gdown --id "1_7I3EpkaTDZDVxCO4I3oJBIjsokN0f_k"

Downloading...
From (original): https://drive.google.com/uc?id=1_7I3EpkaTDZDVxCO4I3oJBIjsokN0f_k
From (redirected): https://drive.google.com/uc?id=1_7I3EpkaTDZDVxCO4I3oJBIjsokN0f_k&confirm=t&uuid=6f73b38a-a6a8-4a76-9879-14542748c4a7
To: /kaggle/working/Distractor-generation-checkpoints-v4.ckpt
100%|███████████████████████████████████████| 2.67G/2.67G [00:20<00:00, 130MB/s]


In [32]:
checkpoint_path = "/kaggle/working/Distractor-generation-checkpoints-v4.ckpt"
model = QGModel.load_from_checkpoint(checkpoint_path)
model.freeze()
model.eval()
#checkpoint = "Distractor-generation-checkpoints.ckpt"
#trainer.fit(model, data_module,ckpt_path=checkpoint)
# trainer.fit(model, data_module)

QGModel(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32101, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32101, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_features=30

In [None]:
!ls checkpoints

In [1]:
from IPython.display import FileLink
FileLink(r'checkpoints.tar.gz')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
model = QGModel()
checkpoint='/content/drive/MyDrive/Distractors_generation/checkpoints/Distractor-generation-checkpoints-v1.ckpt'
trainer.fit(model, data_module)

In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/checkpoints/model.pth')

In [None]:
model = QGModel()
model.load_state_dict(torch.load('/content/drive/MyDrive/checkpoints/model.pth'))
model.eval()

#Test The Model

In [33]:
tokenizer.add_tokens(SEP_TOKEN)

0

In [55]:
def generate(disGModel: QGModel, question: str, context: str, correct: str) -> str:
   
    source = f"{question} {SEP_TOKEN} {context} {SEP_TOKEN} {correct}"

    source_encoding = tokenizer(
        source,
        max_length=SOURCE_MAX_TOKEN_LEN,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )

    source_encoding = {key: val.to(disGModel.device) for key, val in source_encoding.items()}

    generated_ids = disGModel.model.generate(
        input_ids=source_encoding['input_ids'],
        attention_mask=source_encoding['attention_mask'],
        num_beams=3,
        max_length=TARGET_MAX_TOKEN_LEN,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True
    )

    preds = [
        tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for generated_id in generated_ids
    ]

    return ''.join(preds)  


In [56]:
def show_result(generated: str, answer: str, context:str, incorrect: List[str] = [], question: str = ''):
    print('Context:')
    print(context)
    print()

    if question: print('Question: ', question)
    print('Answer : ', answer)

    print()
    print('Original : ', incorrect)
    print('Generated: ', generated)
    print('-----------------------------')

In [62]:
sample = {
    'context': "The theory of evolution by natural selection was first articulated by Charles Darwin in 1859. Darwin proposed that species evolve over time through the process of natural selection, where organisms with traits better suited to their environment are more likely to survive and reproduce.",
    'question': "Charles Darwin is best known for developing the theory of _ .",
    'correct': "Evolution by Natural Selection",
    'incorrect1': "Genetic Engineering and Modification",
    'incorrect2': "Lamarckism and Inheritance of Acquired Traits",
    'incorrect3': "The Geocentric Theory of the Universe"
}






generated = generate(model, sample['question'], sample['context'] , sample['correct'])
show_result(generated, sample['correct'], sample['context'], [sample['incorrect1'], sample['incorrect2'], sample['incorrect3']], sample['question'])

Context:
The theory of evolution by natural selection was first articulated by Charles Darwin in 1859. Darwin proposed that species evolve over time through the process of natural selection, where organisms with traits better suited to their environment are more likely to survive and reproduce.

Question:  Charles Darwin is best known for developing the theory of _ .
Answer :  Evolution by Natural Selection

Original :  ['Genetic Engineering and Modification', 'Lamarckism and Inheritance of Acquired Traits', 'The Geocentric Theory of the Universe']
Generated:  Natural Selection<sep> Evolution by Natural Selection<sep> Evolution by Natural Selection
-----------------------------


In [37]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluate_bleu_as_table(model, dataloader, tokenizer):
   
    model.eval()
    smoothing_function = SmoothingFunction().method1
    bleu_scores_per_distractor = {1: {1: [], 2: [], 3: [], 4: []},
                                  2: {1: [], 2: [], 3: [], 4: []},
                                  3: {1: [], 2: [], 3: [], 4: []}}

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = batch['labels'].to(model.device)

            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=TARGET_MAX_TOKEN_LEN)

            decoded_preds = [tokenizer.decode(pred, skip_special_tokens=True).split(SEP_TOKEN) for pred in outputs]
            decoded_refs = [
                tokenizer.decode(label[label != -100], skip_special_tokens=True).split(SEP_TOKEN) for label in labels
            ]

            for pred_distractors, ref_distractors in zip(decoded_preds, decoded_refs):
                for idx, (pred, ref) in enumerate(zip(pred_distractors, ref_distractors)):
                    pred_tokens = pred.split()
                    ref_tokens = [ref.split()]

                    for n in range(1, 5):
                        score = sentence_bleu(ref_tokens, pred_tokens, weights=tuple([1.0 / n] * n), smoothing_function=smoothing_function)
                        bleu_scores_per_distractor[idx + 1][n].append(score)

    # Compute average BLEU scores for each distractor
    table_data = {
        "Distractor": [],
        "BLEU-1": [],
        "BLEU-2": [],
        "BLEU-3": [],
        "BLEU-4": []
    }

    for distractor, scores_by_n in bleu_scores_per_distractor.items():
        table_data["Distractor"].append(f"Distractor {distractor}")
        for n in range(1, 5):
            avg_score = sum(scores_by_n[n]) / len(scores_by_n[n])
            table_data[f"BLEU-{n}"].append(avg_score * 100)  # Convert to percentage format

    return pd.DataFrame(table_data)


In [42]:
test_dataset =QGDataset(test_df, tokenizer, 512, 64)
test_dataloader = DataLoader(test_dataset, batch_size=32, num_workers=2)
# Evaluate BLEU scores and get a table
bleu_table = evaluate_bleu_as_table(model.model, test_dataloader, tokenizer)

# Print the table
print(bleu_table)

     Distractor     BLEU-1     BLEU-2     BLEU-3     BLEU-4
0  Distractor 1  29.590329  21.547915  17.861560  15.752239
1  Distractor 2  25.209694  16.805715  12.999345  10.781881
2  Distractor 3  23.987300  15.782022  12.349592  10.516922
