### Load the dataset from csv

In [1]:
from tqdm import tqdm
from datasets import load_dataset
from sklearn.utils import shuffle
import pandas as pd
import os

def load_squad_dataset(csv_file_path):
    # Read the CSV file into a DataFrame
    dataset = pd.read_csv(csv_file_path)
    
    # Initialize an empty DataFrame with specified columns
    df_dataset = pd.DataFrame(columns=['context', 'question', 'answer'])
    num_of_answer = 0
    
    # Iterate over the rows of the DataFrame
    for index, row in tqdm(dataset.iterrows(), total=dataset.shape[0]):
        context = row['context']
        question = row['question']
        answer = str(row['answer'])
        number_of_words = len(answer.split())
        df_dataset.loc[num_of_answer] = [context] + [question] + [answer]
        num_of_answer += 1
    
    return df_dataset


pd.set_option('display.max_colwidth', None)
print("Loading Dataset...")
df = load_squad_dataset("./merged_dataset.csv")

# Split the dataframe into train and validation
print("Splitting into Train and Validation...")
train_size = int(0.95 * len(df))
val_size = len(df) - train_size
df_train = df.iloc[:train_size]
df_validation = df.iloc[train_size:]

print('Shuffling DataFrames...')
df_train = shuffle(df_train)
df_validation = shuffle(df_validation)

Loading Dataset...


100%|██████████| 362399/362399 [56:17<00:00, 107.28it/s]


Splitting into Train and Validation...
Shuffling DataFrames...


### Save the dataset as CSV

In [2]:
print('Saving dataset as csv...')
dataset_save_path = 'datasets/'
if not os.path.exists(dataset_save_path):
    os.makedirs(dataset_save_path)
df_train.to_csv(dataset_save_path + 'squad_train.csv', index=False)
df_validation.to_csv(dataset_save_path + 'squad_validation.csv', index=False)
print('All done.')

Saving dataset as csv...
All done.


## Training a QG-Algorithm, Model: T5

### Imports & Initial Settings

In [1]:
import os
import time
import copy
import argparse
import torch
import pytorch_lightning as pl
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
from torch.optim import AdamW


train_file_path = 'datasets/squad_train.csv'
validation_file_path = 'datasets/squad_validation.csv'
save_model_path = 'model/'
save_tokenizer_path = 'tokenizer/'
pretrained_model = 't5-large'

# Set training arguments
args = {
    'num_workers': 2,
    'batch_size': 16,  # Original: 4
    'learning_rate': 1e-4,  # Original: 3e-5
    'eps': 1e-8,
    'weight_decay': 0.0
}


### Define the dataset class


In [2]:
class QGDataset(Dataset):
    def __init__(self, tokenizer, file_path, max_len_input=512, max_len_output=128):
        self.tokenizer = tokenizer
        self.data = pd.read_csv(file_path)
        self.max_len_input = max_len_input
        self.max_len_output = max_len_output
        self.context_column = 'context'
        self.answer_column = 'answer'
        self.question_column = 'question'
        self.inputs = []
        self.targets = []
        self._load_data()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]['input_ids'].squeeze()
        target_ids = self.targets[index]['input_ids'].squeeze()
        source_mask = self.inputs[index]['attention_mask'].squeeze()
        target_mask = self.targets[index]['attention_mask'].squeeze()
        labels = copy.deepcopy(target_ids)
        labels[labels == 0] = -100
        return {'source_ids': source_ids, 'source_mask': source_mask, 'target_ids': target_ids, 'target_mask': target_mask, 'labels': labels}

    def _load_data(self):
        for idx in tqdm(range(len(self.data))):
            context, answer, target = self.data.loc[idx, self.context_column], self.data.loc[idx, self.answer_column], self.data.loc[idx, self.question_column]
            input_text = '<answer> %s <context> %s ' % (answer, context)
            target = str(target)

            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_text],
                max_length=self.max_len_input,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target],
                max_length=self.max_len_output,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

### Define the model class


#### New (chrF loss included)

In [3]:
import torch
import torch.nn.functional as F
from torchmetrics.text import CHRFScore
import pytorch_lightning as pl
from torch.optim import AdamW
import wandb
from pytorch_lightning.loggers import WandbLogger

class T5FineTuner(pl.LightningModule):
    def __init__(self, model, tokenizer, args, chrf_weight=0.1):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.args = args
        self.chrf_weight = chrf_weight
        self.chrf_metric = CHRFScore()
        self.save_hyperparameters(ignore=['model', 'tokenizer'])

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

    def training_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch['source_ids'],
            attention_mask=batch['source_mask'],
            labels=batch['labels']
        )
        loss = outputs.loss

        # Generate questions
        generated_ids = self.model.generate(
            input_ids=batch['source_ids'],
            attention_mask=batch['source_mask'],
            max_length=128,
            num_return_sequences=1,
        )

        # Decode generated questions and ground truth
        generated_questions = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        # true_questions = self.tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)
        true_questions = self.tokenizer.batch_decode(batch['labels'][batch['labels'] != -100], skip_special_tokens=True)

        max_length = max(len(generated_questions), len(true_questions))
        generated_questions = generated_questions + [''] * (max_length - len(generated_questions))
        true_questions = true_questions + [''] * (max_length - len(true_questions))
        
        # Calculate CHRF score
        chrf_score = self.chrf_metric(generated_questions, true_questions)

        # Combine losses
        combined_loss = (1 - self.chrf_weight) * loss + self.chrf_weight * chrf_score

        # Log metrics
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
        self.log('train_chrf', chrf_score, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
        self.log('train_combined_loss', combined_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)

        return combined_loss

    def validation_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch['source_ids'],
            attention_mask=batch['source_mask'],
            labels=batch['labels']
        )
        loss = outputs.loss

        # Generate questions
        generated_ids = self.model.generate(
            input_ids=batch['source_ids'],
            attention_mask=batch['source_mask'],
            max_length=128,
            num_return_sequences=1,
        )
        
        # Decode generated questions and ground truth
        generated_questions = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        # true_questions = self.tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)
        true_questions = self.tokenizer.batch_decode(batch['labels'][batch['labels'] != -100], skip_special_tokens=True)

        max_length = max(len(generated_questions), len(true_questions))
        generated_questions = generated_questions + [''] * (max_length - len(generated_questions))
        true_questions = true_questions + [''] * (max_length - len(true_questions))
        
        # Calculate CHRF score
        chrf_score = self.chrf_metric(generated_questions, true_questions)

        # Combine losses
        combined_loss = (1 - self.chrf_weight) * loss + self.chrf_weight * chrf_score

        # Log metrics
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
        self.log('val_chrf', chrf_score, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
        self.log('val_combined_loss', combined_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)

        return combined_loss

    def train_dataloader(self):
        return DataLoader(train_dataset, batch_size=self.args['batch_size'], num_workers=self.args['num_workers'])

    def val_dataloader(self):
        return DataLoader(validation_dataset, batch_size=self.args['batch_size'], num_workers=self.args['num_workers'])

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.args['learning_rate'], eps=self.args['eps'])

    def save_model(self, save_model_path):
        # Ensure the directory exists
        if not os.path.exists(save_model_path):
            os.makedirs(save_model_path)
        # Save the model
        self.model.save_pretrained(save_model_path)
        
    def save_tokenizer(self, save_tokenizer_path):
        # Ensure the directory exists
        if not os.path.exists(save_tokenizer_path):
            os.makedirs(save_tokenizer_path)
        # Save the tokenizer
        self.tokenizer.save_pretrained(save_tokenizer_path)

#### Old

In [9]:
class T5FineTuner(pl.LightningModule):
    def __init__(self, model, tokenizer, args):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.args = args

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

    def training_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch['source_ids'],
            attention_mask=batch['source_mask'],
            labels=batch['labels']
        )
        loss = outputs.loss
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch['source_ids'],
            attention_mask=batch['source_mask'],
            labels=batch['labels']
        )
        loss = outputs.loss
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def train_dataloader(self):
        return DataLoader(train_dataset, batch_size=self.args['batch_size'], num_workers=self.args['num_workers'])

    def val_dataloader(self):
        return DataLoader(validation_dataset, batch_size=self.args['batch_size'], num_workers=self.args['num_workers'])

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.args['learning_rate'], eps=self.args['eps'])
    
    def save_model(self, save_model_path):
        # Ensure the directory exists
        if not os.path.exists(save_model_path):
            os.makedirs(save_model_path)
        # Save the model
        self.model.save_pretrained(save_model_path)
        
    def save_tokenizer(self, save_tokenizer_path):
        # Ensure the directory exists
        if not os.path.exists(save_tokenizer_path):
            os.makedirs(save_tokenizer_path)
        # Save the tokenizer
        self.tokenizer.save_pretrained(save_tokenizer_path)


### Train

In [4]:
wandb.init(project="t5-question-generation", name="training-run-pl-3")

[34m[1mwandb[0m: Currently logged in as: [33msrhnylmz14[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
start_time = time.time()
pl.seed_everything(99)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

print('Loading pre-trained model...')
tokenizer = T5Tokenizer.from_pretrained(pretrained_model, model_max_length=512)
tokenizer.add_special_tokens(
    {'additional_special_tokens': ['<answer>', '<context>']}
)
model = T5ForConditionalGeneration.from_pretrained(pretrained_model).to(device)
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)


[rank: 0] Seed set to 99


Using device: cuda
Loading pre-trained model...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
print('Preparing dataset...')
train_dataset = QGDataset(tokenizer, train_file_path)
validation_dataset = QGDataset(tokenizer, validation_file_path)

print('train_dataset: ', len(train_dataset))
print('validation_dataset: ', len(validation_dataset))

Preparing dataset...


100%|██████████| 344279/344279 [15:21<00:00, 373.48it/s]
100%|██████████| 18120/18120 [00:36<00:00, 491.05it/s]

train_dataset:  344279
validation_dataset:  18120





In [7]:
torch.set_float32_matmul_precision('high') # original: highest - available: highest, medium, high
# see more on that: https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html

In [8]:
print('Initializing model...')
model = T5FineTuner(model, tokenizer, args, chrf_weight=0.2)
# model = T5FineTuner(model, tokenizer, args)

wandb_logger = WandbLogger(project="t5-question-generation")

trainer = pl.Trainer(
    max_epochs=3,
    accelerator='gpu',
    devices=1,
    callbacks=[EarlyStopping(monitor="val_loss")],
    logger=wandb_logger  # Add this line
)

Initializing model...


/cta/users/serhan.yilmaz/.conda/envs/unsloth_env/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /cta/users/serhan.yilmaz/.local/lib/python3.10/site- ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
print('Fine tuning...')
trainer.fit(model)

/cta/users/serhan.yilmaz/.conda/envs/unsloth_env/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                       | Params
-----------------------------------------------------------
0 | model       | T5ForConditionalGeneration | 737 M 
1 | chrf_metric | CHRFScore                  | 0     
-----------------------------------------------------------
737 M     Trainable params
0         Non-trainable params
737 M     Total params
2,950.566 Total estimated model params size (MB)


Fine tuning...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  total_n_grams[n] = tensor(sum(n_grams_counts[n].values()))
  matching_n_grams[n] = tensor(


Training: |          | 0/? [00:00<?, ?it/s]

In [18]:
print('Saving model...')
if not os.path.exists(save_model_path):
    os.makedirs(save_model_path)
if not os.path.exists(save_tokenizer_path):
    os.makedirs(save_tokenizer_path)
model.model.save_pretrained(save_model_path)
tokenizer.save_pretrained(save_tokenizer_path)

end_time = time.time() - start_time
print('Total time: %s hours' % (end_time / 60 / 60))
wandb.finish()
print('All done.')

Saving model...
Total time: 44.03426079471906 hours
All done.


## Generating Questions

### Sentence Transformers

In [19]:
from sentence_transformers import SentenceTransformer, util

trained_model = 'all-distilroberta-v1'
# trained_model = 'all-roberta-large-v1'
class SentenceEmbeddings:

    def __init__(self):
        self.embedder = SentenceTransformer(trained_model)

    def encode(self, text):
        return self.embedder.encode(text, convert_to_tensor=True)

    def get_most_similar(self, context:str, qa_list:list):
        context_embeddings = self.encode(context)
        top1 = {'idx': 0, 'score': float('-inf')}
        for i in range(len(qa_list)):
            qa_str = qa_list[i]['question'] + ' ' + qa_list[i]['answer']
            qa_embeddings = self.encode(qa_str)
            cos_score = util.pytorch_cos_sim(context_embeddings, qa_embeddings)
            # print(cos_score[0][0], qa_list[i])
            if cos_score[0][0] > top1['score']:
                top1['score'] = cos_score[0][0]
                top1['idx'] = i
        return qa_list[top1['idx']]

### QG-Class

In [20]:

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

trained_model_path = save_model_path
trained_tokenizer_path = save_tokenizer_path

class QuestionGeneration:

    def __init__(self, model_dir=None):
        self.model = T5ForConditionalGeneration.from_pretrained(trained_model_path)
        self.tokenizer = T5Tokenizer.from_pretrained(trained_tokenizer_path)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)
        self.model.eval()

    def generate(self, answer: str, context: str):
        input_text = '<answer> %s <context> %s ' % (answer, context)
        encoding = self.tokenizer.encode_plus(
            input_text,
            return_tensors='pt'
        ).to(self.device)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            num_beams = 3,
            num_return_sequences = 1
        )
        question_list = []
        for output in outputs:
            question = self.tokenizer.decode(
                output,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
            question_list.append({'question': question, 'answer': answer, 'context': context})
        return question_list

In [21]:
context = '''
Serhan has fine-tuned T5 on SQuAD dataset for question generation.
'''
answer_list = ['Serhan', 'SQuAD', 'question generation']

QG = QuestionGeneration()
SE = SentenceEmbeddings()

for answer in answer_list:
    qa_pair_list = QG.generate(answer, context)
    most_similar = SE.get_most_similar(context, qa_pair_list)
    print(most_similar)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'question': 'Who has fine-tuned T5 on SQuAD?', 'answer': 'Serhan', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}
{'question': 'What is the name of the dataset that Serhan uses for question generation?', 'answer': 'SQuAD', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}
{'question': 'What is T5 used for?', 'answer': 'question generation', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}


In [22]:
context = '''
Serhan has fine-tuned T5 on SQuAD dataset for question generation.
'''
answer_list = ['Serhan', 'SQuAD', 'question generation', 'Serhan']

QG = QuestionGeneration()
SE = SentenceEmbeddings()

for answer in answer_list:
    qa_pair_list = QG.generate(answer, context)
    most_similar = SE.get_most_similar(context, qa_pair_list)
    print(most_similar)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'question': 'Who has fine-tuned T5 on SQuAD?', 'answer': 'Serhan', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}
{'question': 'What is the name of the dataset that Serhan uses for question generation?', 'answer': 'SQuAD', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}
{'question': 'What is T5 used for?', 'answer': 'question generation', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}
{'question': 'Who has fine-tuned T5 on SQuAD?', 'answer': 'Serhan', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}


In [27]:
class QuestionGeneration:
    def __init__(self, model_dir=None):
        self.model = T5ForConditionalGeneration.from_pretrained(trained_model_path)
        self.tokenizer = T5Tokenizer.from_pretrained(trained_tokenizer_path)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)
        self.model.eval()

    def generate(self, answer: str, context: str, temperature: float = 2.5):
        input_text = '<answer> %s <context> %s ' % (answer, context)
        encoding = self.tokenizer.encode_plus(
            input_text,
            return_tensors='pt'
        ).to(self.device)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            num_beams=3,
            num_return_sequences=1,
            temperature=temperature,  # Add temperature parameter
            do_sample=True  # Enable sampling
        )
        question_list = []
        for output in outputs:
            question = self.tokenizer.decode(
                output,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
            question_list.append({'question': question, 'answer': answer, 'context': context})
        return question_list

In [28]:
context = '''
Serhan has fine-tuned T5 on SQuAD dataset for question generation.
'''
answer_list = ['Serhan', 'SQuAD', 'question generation', 'Serhan']

QG = QuestionGeneration()
SE = SentenceEmbeddings()

for answer in answer_list:
    qa_pair_list = QG.generate(answer, context)
    most_similar = SE.get_most_similar(context, qa_pair_list)
    print(most_similar)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'question': 'Which of the following people is not an entity: Serhan, the data, or question generation', 'answer': 'Serhan', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}
{'question': 'What platform is T5 built on?', 'answer': 'SQuAD', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}
{'question': 'What was T5 used for in the final step of development?', 'answer': 'question generation', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}
{'question': 'Who fine-tuned T5 on SQuAD?', 'answer': 'Serhan', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}


In [29]:
context = '''
Serhan has fine-tuned T5 on SQuAD dataset for question generation.
'''
answer_list = ['Serhan', 'SQuAD', 'question generation', 'Serhan']

QG = QuestionGeneration()
SE = SentenceEmbeddings()

for answer in answer_list:
    qa_pair_list = QG.generate(answer, context)
    most_similar = SE.get_most_similar(context, qa_pair_list)
    print(most_similar)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'question': 'Who has fine tuned the T5 on the SQuAD dataset?', 'answer': 'Serhan', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}
{'question': 'What was the dataset used for question generation?', 'answer': 'SQuAD', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}
{'question': 'what uses the data?', 'answer': 'question generation', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}
{'question': 'Which organization has adjusted and updated T5 on SQuAD?', 'answer': 'Serhan', 'context': '\nSerhan has fine-tuned T5 on SQuAD dataset for question generation.\n'}
