# If you are using google colab or otehr supported environment you can use the follwoing code view the available resources used by the system

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Jan 20 01:41:52 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0              48W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


# Fine tuning LLA using PEFT/LoRA

In [3]:
"""
# Upgrades the pip package installer to the latest version.
# This ensures that the latest features and security patches are used when installing packages.
%pip install --upgrade pip

# Installs specific versions of the 'torch' and 'torchdata' packages quietly without showing the progress bars.
# 'torch' is a scientific computing framework with wide support for machine learning algorithms,
# and 'torchdata' provides data loading utilities.
# The '--disable-pip-version-check' option is used to disable the check for pip's own updates, speeding up the installation process.
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet
"""

# Installs specific versions of various machine learning and natural language processing libraries quietly.
# 'transformers' provides state-of-the-art general-purpose architectures for Natural Language Understanding and Generation.
# 'datasets' is a library for easily accessing and sharing datasets for machine learning tasks.
# 'evaluate' is a library for evaluating machine learning models.
# 'rouge_score' is used for evaluating text summarization and machine translation.
# 'loralib' and 'peft' are libraries that might be specific to your project or domain.
%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet


In [4]:
# Imports the 'load_dataset' function from the 'datasets' library.
# This function is used to easily load and preprocess datasets for machine learning and natural language processing tasks.
from datasets import load_dataset

# Imports several classes from the 'transformers' library:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer

# Imports the 'torch' library, which is a scientific computing framework with broad support for machine learning algorithms.
import torch

# Imports the 'time' module, which provides various time-related functions.
import time

# Imports the 'evaluate' module from the 'evaluate' library.
# This is used for evaluating models with various metrics.
import evaluate

# Imports the 'pandas' library and gives it the alias 'pd'.
# Pandas is a powerful data manipulation and analysis tool, particularly useful for working with structured data (like CSV, SQL, etc.).
import pandas as pd

# Imports the 'numpy' library and gives it the alias 'np'.
# NumPy is a fundamental package for scientific computing in Python, particularly useful for working with arrays and matrices.
import numpy as np


In [5]:
# Assigns the string "knkarthick/dialogsum" to the variable 'huggingface_dataset_name'.
# This string represents the name of a dataset available on the Hugging Face dataset hub.
# In this case, "dialogsum" is a dataset for dialogue summarization, hosted by the user 'knkarthick' on Hugging Face.
huggingface_dataset_name = "knkarthick/dialogsum"

# Loads the dataset specified by 'huggingface_dataset_name' using the 'load_dataset' function from the 'datasets' library.
# The 'load_dataset' function fetches the dataset from the Hugging Face dataset hub and prepares it for use.
# The loaded dataset is stored in the variable 'dataset'.
dataset = load_dataset(huggingface_dataset_name)

# This line outputs the structure and details of the loaded dataset.
# When you run this, it shows information like the number of samples, the structure of each sample, dataset splits (e.g., train, test), etc.
dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [6]:
# Filter the dataset to only include examples where the index is divisible by 100.
# This is done to reduce the dataset size for quicker processing or to select a specific subset.
# 'with_indices=True' passes the index of each example to the filter function.
# The lambda function checks if the index modulo 100 is 0, which means every 100th sample is selected.
datasets = dataset.filter(lambda example, index: index % 10 == 0, with_indices=True)

# The filtered dataset is stored in the 'dataset' variable.
# This dataset can now be used for further processing, training, or evaluation.
# Remember, the filtering process itself is not GPU-accelerated as it's a data preprocessing step.
datasets



DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1246
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 150
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 50
    })
})

In [7]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Define the model name from the Hugging Face Transformers library.
# In this case, 'google/flan-t5-base' is a pre-trained model from Google's FLAN (Few-shot Learning with Adversarial Networks) series.
model_name = 'google/flan-t5-base'

# Load the original model using the specified model name.
# The 'torch_dtype=torch.bfloat16' is used to load the model in bfloat16 precision, which is beneficial for memory efficiency on GPUs that support it.
# Note: bfloat16 is typically supported on newer NVIDIA GPUs (like the A100, V100). Ensure your GPU supports bfloat16.
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# Load the tokenizer corresponding to your model.
# Tokenizers are responsible for converting text into a format that's understandable by the model (e.g., converting text to token IDs).
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [8]:
def print_number_of_trainable_model_parameters(model):
    # Initialize counters for trainable and total model parameters.
    trainable_model_params = 0
    all_model_params = 0

    # Iterate over all the parameters in the model.
    for _, param in model.named_parameters():
        # Add the total number of elements in the parameter to the total count.
        all_model_params += param.numel()

        # Check if the parameter is trainable (i.e., if gradients are calculated for it).
        if param.requires_grad:
            # Add the total number of elements in the trainable parameter to the trainable count.
            trainable_model_params += param.numel()

    # Return a formatted string showing the number of trainable parameters, total parameters,
    # and the percentage of parameters that are trainable.
    return (f"trainable model parameters: {trainable_model_params}\n"
            f"all model parameters: {all_model_params}\n"
            f"percentage of trainable model parameters: "
            f"{100 * trainable_model_params / all_model_params:.2f}%")

# Calls the function with the loaded model 'original_model'.
# This will calculate and print the total number of parameters in the model,
# the number of trainable parameters, and the percentage of parameters that are trainable.
print(print_number_of_trainable_model_parameters(original_model))


trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


# Test the model with zero-shot inference

In [9]:
# Selecting an index to pick a specific example from the 'test' split of the dataset.
index = 200

# Extracting the dialogue and the summary for the selected index from the test dataset.
dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

# Creating a prompt by appending a request to summarize the conversation to the dialogue.
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# Tokenizing the prompt to convert it into a format suitable for the model.
# 'return_tensors='pt'' indicates that the output will be PyTorch tensors.
inputs = tokenizer(prompt, return_tensors='pt')

# Move the tokenized inputs to the same device as the model (GPU or CPU).
inputs = inputs.to(original_model)

# Generating the summary using the model.
# 'max_new_tokens=200' specifies the maximum length of the new tokens to be generated.
# The output is decoded into human-readable text, skipping special tokens like padding or EOS.
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0],
    skip_special_tokens=True
)

# A dashed line string for better output readability.
dash_line = '-' * 100
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')


Attempting to cast a BatchEncoding to type T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_fea

----------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

------------------------------------------------------------------

In [10]:
def tokenize_function(example):
    # This function will tokenize each example in the dataset.

    # Define the start and end prompts for the summarization task.
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '

    # Construct the full prompt for each dialogue in the batch.
    # This combines the start prompt, the dialogue, and the end prompt.
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

    # Tokenize the prompts. Padding and truncation are applied to handle variable lengths.
    # 'return_tensors="pt"' returns PyTorch tensors which are suitable for model input.
    # The tokenized input IDs are stored in the 'input_ids' key.
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids

    # Tokenize the summaries (labels) with the same padding and truncation strategy.
    # These are the target outputs for the summarization task.
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example

# Apply the tokenization function to all splits of the dataset in a batched manner.
# This will process the data in batches for efficiency.
# 'batched=True' enables processing multiple examples at once, which is faster than one-by-one.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove columns that are not needed after tokenization.
# This includes original text fields like 'id', 'topic', 'dialogue', and 'summary'.
# After this step, the dataset will only contain the processed 'input_ids' and 'labels'.
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

# At this point, the 'tokenized_datasets' is ready for model training or evaluation.
# Remember to move batches of this tokenized data to the GPU during model training or inference.




Map:   0%|          | 0/1500 [00:00<?, ? examples/s]



In [11]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (12460, 2)
Validation: (500, 2)
Test: (1500, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
})


# Parameter efficient fine tuning (PEFT) uisng Low-Rank Adaptation (LoRA)

In [12]:
from peft import LoraConfig, get_peft_model, TaskType

# Create a LoraConfig object for configuring the LoRA (Low-Rank Adaptation) layers.
# LoRA is a parameter-efficient training method that modifies only a small portion of the model during fine-tuning.
lora_config = LoraConfig(
    r=32,  # Rank: Determines the size of the low-rank matrices in LoRA.
    lora_alpha=32,  # LoRA alpha: A scaling factor for the LoRA matrices.
    target_modules=["q", "v"],  # Target modules: Parts of the model to be adapted, here 'q' and 'v' denote query and value.
    lora_dropout=0.05,  # LoRA dropout: Dropout rate applied to the LoRA layers.
    bias="none",  # Bias mode for LoRA layers.
    task_type=TaskType.SEQ_2_SEQ_LM  # Task type: Specifies the model type, here sequence-to-sequence language model.
)


In [13]:
# Apply the LoRA configuration to the original model.
# This function modifies the original model according to the LoRA settings specified in 'lora_config'.
# The result is a new model ('peft_model') that is ready for parameter-efficient fine-tuning.
import copy
peft_model = copy.deepcopy(original_model)
peft_model = get_peft_model(peft_model, lora_config)

# Print the number of trainable parameters in the modified model.
# This function calculates and prints the total number of parameters,
# the number of trainable parameters, and the percentage of trainable parameters in 'peft_model'.
# It's useful to understand the impact of applying LoRA on the model's parameter efficiency.
print(print_number_of_trainable_model_parameters(peft_model))


trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [14]:
# Define the directory where the training outputs (like model checkpoints) will be saved.
# The directory name includes a timestamp to make it unique.
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

# Set up the training arguments.
# 'TrainingArguments' configures various aspects of the training process.
peft_training_args = TrainingArguments(
    output_dir=output_dir,        # Directory for saving outputs.
    auto_find_batch_size=True,    # Automatically find an appropriate batch size.
    learning_rate=1e-3,           # Set a specific learning rate, higher than typical fine-tuning as it's PEFT.
    num_train_epochs=3,           # Number of training epochs.
    logging_steps=1,              # Log metrics every this many steps.
    # max_steps=1                   # Maximum number of training steps. Set to 1 for a quick test.
)

# Initialize the Trainer.
# 'Trainer' is a Hugging Face utility that simplifies the training process.
peft_trainer = Trainer(
    model=peft_model,                     # The PEFT model for training.
    args=peft_training_args,              # The training configuration.
    train_dataset=tokenized_datasets["train"],  # The training dataset.
)

In [15]:
# Start training the model.
# The 'train()' method of the 'peft_trainer' object begins the training process.
# This method will use the training arguments, model, and dataset defined earlier.
# The training process includes forward and backward passes, parameter updates, and logging metrics.
peft_trainer.train()

# Define the path where the trained model checkpoint will be saved.
# The path includes a directory name indicating that it's a checkpoint for the PEFT dialogue summary model.
peft_model_path = "./peft-dialogue-summary-checkpoint-local"

# Save the trained model to the specified path.
# This step saves the model's weights that have been fine-tuned during the training process.
# The saved model can be reloaded later for inference or further training.
peft_trainer.model.save_pretrained(peft_model_path)

# Save the tokenizer associated with the model to the same path.
# It's important to save the tokenizer as it needs to be consistent with the model.
# The tokenizer contains the vocabulary and rules for pre-processing text that the model was trained with.
tokenizer.save_pretrained(peft_model_path)




Step,Training Loss
1,50.0
2,46.25
3,41.0
4,38.0
5,33.0
6,30.25
7,27.125
8,25.75
9,22.125
10,19.375


('./peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local/tokenizer.json')

# Human centric evaluation

In [16]:
index = 200
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

# Tokenize the prompt and ensure the tensor is on the same device as the model.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(original_model.device)  # Move input_ids to the same device as the model

# Now generate the outputs using the original model.
original_model_outputs = original_model.generate(input_ids=input_ids, max_new_tokens=200, num_beams=1)
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

input_ids = input_ids.to(peft_model.device)  # Move input_ids to the same device as the model
peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

----------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
----------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
#Person1#: I'm thinking of upgrading my computer.
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
PEFT MODEL: #Person2# wants to upgrade the system and #Person1# suggests adding a painting program to #Person2#'s software. #Person2# also wants to add a CD-ROM drive.


In [17]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """


    # Tokenize the prompt and ensure the tensor is on the same device as the model.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    input_ids = input_ids.to(original_model.device)  # Move input_ids to the same device as the model

    human_baseline_text_output = human_baseline_summaries[idx]

    # Now generate the outputs using the original model.
    original_model_outputs = original_model.generate(input_ids=input_ids, max_new_tokens=200, num_beams=1)
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    input_ids = input_ids.to(peft_model.device)  # Move input_ids to the same device as the model
    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    peft_model_summaries.append(peft_model_text_output)



In [18]:
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1#: I need to take a dictation for you.,#Person1# asks Ms. Dawson to take a dictation ...
1,In order to prevent employees from wasting tim...,#Person1#: I need to take a dictation for you.,#Person1# asks Ms. Dawson to take a dictum to ...
2,Ms. Dawson takes a dictation for #Person1# abo...,#Person1#: I need to take a dictation for you.,#Person1# tells Ms. Dawson that the memo shoul...
3,#Person2# arrives late because of traffic jam....,The traffic jam at the Carrefour intersection ...,#Person2# got stuck in traffic and #Person1# s...
4,#Person2# decides to follow #Person1#'s sugges...,The traffic jam at the Carrefour intersection ...,#Person2# got stuck in traffic and got stuck i...
5,#Person2# complains to #Person1# about the tra...,The traffic jam at the Carrefour intersection ...,#Person2# got stuck in traffic and got stuck i...
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are getting divorced.,#Person1# tells Kate Masha and Hero are gettin...
7,#Person1# tells Kate that Masha and Hero are g...,Masha and Hero are getting divorced.,Kate tells #Person2# they are having a separat...
8,#Person1# and Kate talk about the divorce betw...,Masha and Hero are getting divorced.,Kate tells #Person1# that Masha and Hero are g...
9,#Person1# and Brian are at the birthday party ...,"#Person1#: Happy birthday, Brian. #Person2#: I...",Brian invites #Person1# to the party and Brian...


In [19]:
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)


peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)

print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.24089921652421653, 'rouge2': 0.11769053708439897, 'rougeL': 0.22001958689458687, 'rougeLsum': 0.22134175465057818}
PEFT MODEL:
{'rouge1': 0.4183235826421081, 'rouge2': 0.12759352304430877, 'rougeL': 0.28904303757386635, 'rougeLsum': 0.2911334800444658}
