In [6]:
!pip install datasets peft transformers torch accelerate #The code is installing Python packages necessary for working with datasets, parameter-efficient fine-tuning, transformers, PyTorch, and model acceleration.

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

This code reads a CSV file containing questions and answers, shuffles the data, adds a prefix to each question, creates four variants of the dataset with different sizes, converts these variants into datasets suitable for use with the Hugging Face library, and prints the sizes of these datasets.

In [1]:
import pandas as pd
from datasets import Dataset
import torch
from transformers import  T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig

csv_file = "/content/cleaned_output.csv"
qa_df = pd.read_csv(csv_file)


assert 'Question' in qa_df.columns and 'Answer' in qa_df.columns, "CSV file must contain 'Question' and 'Answer' columns"

qa_df = qa_df.sample(frac=1).reset_index(drop=True)

qa_df['Question'] = qa_df['Question'].astype(str).fillna('')

prefix = 'Answer the question: '
for i in range(len(qa_df)):
    qa_df.loc[i, 'Question'] = prefix + qa_df.loc[i, 'Question']


variant_1 = qa_df.iloc[:2500]
variant_2 = qa_df.iloc[:5000]
variant_3 = qa_df.iloc[:7500]
variant_4 = qa_df.iloc[:10000]

print(f"Variant 1 size: {len(variant_1)}")
print(f"Variant 2 size: {len(variant_2)}")
print(f"Variant 3 size: {len(variant_3)}")
print(f"Variant 4 size: {len(variant_4)}")


dataset_1 = Dataset.from_pandas(variant_1)
dataset_2 = Dataset.from_pandas(variant_2)
dataset_3 = Dataset.from_pandas(variant_3)
dataset_4 = Dataset.from_pandas(variant_4)

print(f"Dataset 1 size: {len(dataset_1)}")
print(f"Dataset 2 size: {len(dataset_2)}")
print(f"Dataset 3 size: {len(dataset_3)}")
print(f"Dataset 4 size: {len(dataset_4)}")


Variant 1 size: 2500
Variant 2 size: 5000
Variant 3 size: 7500
Variant 4 size: 10000
Dataset 1 size: 2500
Dataset 2 size: 5000
Dataset 3 size: 7500
Dataset 4 size: 10000


This code loads the T5 tokenizer and model, defines a function to clean text by removing unwanted characters and extra whitespace, and a tokenization function that concatenates, cleans, and tokenizes questions and answers from the datasets. It then applies this tokenization function to four variants of the dataset, created previously, using the map method to process each example in the datasets. Finally, the code sets the format of these tokenized datasets to be compatible with PyTorch by specifying the necessary columns (input_ids, attention_mask, and labels), thereby preparing the datasets for training a T5 model for text generation tasks.

In [2]:
from transformers import  T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import re
model_name = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)



def clean_text(text):
    if text is None:
        return ""
    cleaned_text = re.sub(r'[^A-Za-z0-9\s.,?!;:]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

def tokenize_function(examples):
    concatenated_examples = [
        clean_text(q) + " " + clean_text(a) for q, a in zip(examples['Question'], examples['Answer'])
    ]
    tokenized = tokenizer(concatenated_examples, padding="max_length", truncation=True, max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset_1 = dataset_1.map(tokenize_function, batched=True)
tokenized_dataset_2 = dataset_2.map(tokenize_function, batched=True)
tokenized_dataset_3 = dataset_3.map(tokenize_function, batched=True)
tokenized_dataset_4 = dataset_4.map(tokenize_function, batched=True)

tokenized_dataset_1.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_dataset_2.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_dataset_3.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_dataset_4.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

The code imports the LoraConfig and get_peft_model functions from the peft library and sets up a configuration for Low-Rank Adaptation (LoRA) with specified parameters: a scaling factor (lora_alpha) of 16, a dropout rate (lora_dropout) of 0.1, and a rank (lora_r) of 64. The bias parameter is set to "none," indicating no additional bias terms are used, and the task_type is set to "CAUSAL_LM" for causal language modeling tasks. This configuration (peft_config) is designed to apply LoRA to a model, facilitating efficient fine-tuning with lower computational costs by utilizing low-rank matrices and controlled dropout.

In [3]:
from peft import LoraConfig, get_peft_model

# Define the parameters for LoRA (Low-Rank Adaptation)
lora_alpha = 16  # Scaling factor for the LoRA weights
lora_dropout = 0.1  # Dropout rate to be applied to the LoRA weights
lora_r = 64  # Rank of the low-rank matrices used in LoRA

# Create a configuration object for LoRA with the specified parameters
peft_config = LoraConfig(
    lora_alpha=lora_alpha,  # Set the scaling factor
    lora_dropout=lora_dropout,  # Set the dropout rate
    r=lora_r,  # Set the rank of the low-rank matrices
    bias="none",  # Specify that no additional bias terms are to be used
    task_type="CAUSAL_LM"  # Specify the task type as causal language modeling
)

In [5]:
from trl import SFTTrainer

def fine_tune_and_save(dataset, output_dir, model, tokenizer, peft_config):
    # Print the size of the dataset being used for fine-tuning
    print(f"Fine-tuning on dataset with size: {len(dataset)}")

    # Set up training arguments for the fine-tuning process
    training_args = TrainingArguments(
        output_dir='./results',  # Directory where the model checkpoints and other outputs will be saved
        num_train_epochs=1,  # Number of epochs to train the model
        per_device_train_batch_size=4,  # Batch size per device (GPU/CPU)
        save_steps=500,  # Number of steps between each model checkpoint save
        save_total_limit=2,  # Maximum number of checkpoints to keep; older ones will be deleted
        logging_dir='./newlogs',  # Directory where the logs will be saved
        logging_steps=100,  # Number of steps between each log write
        evaluation_strategy="no",  # Evaluation strategy; "no" means no evaluation during training
        optim="paged_adamw_32bit",  # Optimizer to be used; here, AdamW with 32-bit precision
    )

    # Initialize the SFTTrainer for fine-tuning the model
    trainer = SFTTrainer(
        model=model,  # The model to be fine-tuned
        train_dataset=dataset,  # The dataset to be used for training
        peft_config=peft_config,  # The configuration for parameter-efficient fine-tuning (LoRA)
        dataset_text_field="text",  # The field in the dataset containing the text data
        max_seq_length=1024,  # Maximum sequence length for the input text
        tokenizer=tokenizer,  # The tokenizer to be used for encoding the text
        args=training_args,  # The training arguments defined above
    )

    # Train the model using the specified dataset and training arguments
    trainer.train()
    # Save the trained model to the specified output directory
    trainer.save_model(output_dir)


In [6]:
fine_tune_and_save(tokenized_dataset_1, './checkpoint_2500', model, tokenizer,peft_config)
fine_tune_and_save(tokenized_dataset_2, './checkpoint_5000', model, tokenizer,peft_config)
fine_tune_and_save(tokenized_dataset_3, './checkpoint_7500', model, tokenizer,peft_config)
fine_tune_and_save(tokenized_dataset_4, './checkpoint_10000', model, tokenizer,peft_config)

Fine-tuning on dataset with size: 2500




Step,Training Loss
100,1.0452
200,0.6513
300,0.4039
400,0.2726
500,0.1633
600,0.1196




Fine-tuning on dataset with size: 5000


Step,Training Loss
100,1.0335
200,0.6228
300,0.3579
400,0.1497
500,0.0901
600,0.0725
700,0.0661
800,0.0637
900,0.0546
1000,0.0605




Fine-tuning on dataset with size: 7500


Step,Training Loss
100,1.0136
200,0.5819
300,0.3052
400,0.1189
500,0.0772
600,0.0722
700,0.0607
800,0.0547
900,0.0474
1000,0.0427




Fine-tuning on dataset with size: 10000


Step,Training Loss
100,1.0631
200,0.5889
300,0.3007
400,0.1151
500,0.0824
600,0.0592
700,0.061
800,0.0513
900,0.0498
1000,0.0465




In [9]:
validation_df_1 = qa_df.iloc[10000:10210]
validation_df_2 = qa_df.iloc[20000:20420]
validation_df_3 = qa_df.iloc[30000:30800]
validation_df_4 = qa_df.iloc[58000:]


validation_dataset_1 = Dataset.from_pandas(validation_df_1)
validation_dataset_2 = Dataset.from_pandas(validation_df_2)
validation_dataset_3 = Dataset.from_pandas(validation_df_3)
validation_dataset_4 = Dataset.from_pandas(validation_df_4)


tokenized_val_dataset_1 = validation_dataset_1.map(tokenize_function, batched=True)
tokenized_val_dataset_2 = validation_dataset_2.map(tokenize_function, batched=True)
tokenized_val_dataset_3 = validation_dataset_3.map(tokenize_function, batched=True)
tokenized_val_dataset_4 = validation_dataset_4.map(tokenize_function, batched=True)

tokenized_val_dataset_1.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val_dataset_2.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val_dataset_3.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val_dataset_4.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/2067 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_metric
import csv

# Define a function to compute evaluation metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Load the ROUGE metric for text summarization evaluation
    rouge = load_metric("rouge")
    # Compute ROUGE scores for the predictions compared to the reference labels
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Extract and return the F-measure from the computed ROUGE scores
    result = {key: value.mid.fmeasure for key, value in result.items()}
    return result

# Define a function to evaluate the model and save evaluation metrics to a CSV file
def evaluate_model(model_dir, tokenizer, eval_dataset):
    # Load the trained model from the specified directory
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    # Define training arguments for evaluation
    training_args = Seq2SeqTrainingArguments(
        output_dir='./results',  # Directory to save evaluation results
        evaluation_strategy="no",  # Disable evaluation during training
        per_device_eval_batch_size=8,  # Batch size for evaluation
    )

    # Initialize the Seq2SeqTrainer for evaluation
    trainer = Seq2SeqTrainer(
        model=model,  # The model to be evaluated
        args=training_args,  # Training arguments for evaluation
        compute_metrics=compute_metrics,  # Use the compute_metrics function for evaluation
        tokenizer=tokenizer,  # Tokenizer for encoding text
    )

    # Evaluate the model on the provided evaluation dataset
    results = trainer.evaluate(eval_dataset)

    # Write evaluation results to a CSV file
    with open('/content/eval_metrics.csv', mode='a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=results.keys())
        if file.tell() == 0:
            writer.writeheader()
        writer.writerow(results)

    # Print and return evaluation results
    print(f"Evaluation results for {model_dir}: {results}")
    return results


In [17]:
import torch  # Import the PyTorch library
import os  # Import the os module for operating system-related functions

torch.cuda.empty_cache()  # Clear the CUDA memory cache to free up GPU memory

# Enable gradient checkpointing in the model for memory-efficient training
model.gradient_checkpointing_enable()

# Set an environment variable to configure CUDA memory allocation to use expandable segments
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [None]:
# Evaluate the model trained up to 2500 checkpoints using the first validation dataset
evaluate_model('/content/checkpoint_2500', tokenizer, tokenized_val_dataset_1)

# Evaluate the model trained up to 5000 checkpoints using the second validation dataset
evaluate_model('/content/checkpoint_5000', tokenizer, tokenized_val_dataset_2)

# Evaluate the model trained up to 7500 checkpoints using the third validation dataset
evaluate_model('/content/checkpoint_7500', tokenizer, tokenized_val_dataset_3)

# Evaluate the model trained up to 10000 checkpoints using the fourth validation dataset
evaluate_model('/content/checkpoint_10000', tokenizer, tokenized_val_dataset_4)
