In [None]:
!pip install accelerate==0.21.0
!pip install peft==0.4.0
!pip install bitsandbytes==0.40.2
!pip install transformers==4.31.0
!pip install trl==0.4.7
!pip install evaluate
!pip install rouge_score

Collecting accelerate==0.21.0
  Downloading accelerate-0.21.0-py3-none-any.whl.metadata (17 kB)
Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.1.1
    Uninstalling accelerate-1.1.1:
      Successfully uninstalled accelerate-1.1.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
trl 0.12.1 requires accelerate>=0.34.0, but you have accelerate 0.21.0 which is i

In [None]:
import os
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from evaluate import load
from tqdm import tqdm

## Loading Dataset

In [None]:
# Read the three SAMSum splits from CSV files in the working directory, in
# train / validation / test order, into pandas DataFrames.
train_dataset, val_dataset, test_dataset = map(
    pd.read_csv,
    ("samsum-train.csv", "samsum-validation.csv", "samsum-test.csv"),
)

## Formatting the Dataset

In [None]:
def format_samsum_prompt(dialogue, summary=None):
    """Wrap a dialogue in the [INST] summarization prompt used throughout this notebook.

    Args:
        dialogue: The conversation text to summarize.
        summary: Optional reference summary. When truthy, it is appended on a new
            line followed by a closing "</s>" (training form). When omitted, None,
            or empty, only the instruction part is returned (inference form).

    Returns:
        The assembled prompt string.
    """
    pieces = [f"<s>[INST] Summarize the following dialogue:\n\n{dialogue} [/INST]"]
    if summary:
        pieces.append(f"\n{summary}</s>")
    return "".join(pieces)

def _format_rows(frame):
    """Return one formatted "prompt + reference summary" string per row of `frame`.

    The 'dialogue' and 'summary' columns are paired positionally and each pair is
    passed through format_samsum_prompt. Iterating the two columns directly avoids
    building a Series per row, which is what DataFrame.iterrows() does.
    """
    return [
        format_samsum_prompt(dialogue, summary)
        for dialogue, summary in zip(frame['dialogue'], frame['summary'])
    ]


# Formatted training strings for each split (same module-level names as before).
train_texts = _format_rows(train_dataset)
val_texts = _format_rows(val_dataset)
test_texts = _format_rows(test_dataset)

# Attach the strings as a 'text' column; SFTTrainer reads this column
# (dataset_text_field="text").
train_dataset['text'] = train_texts
val_dataset['text'] = val_texts
test_dataset['text'] = test_texts

# Rebind the split names from pandas DataFrames to datasets.Dataset objects.
# NOTE: after this the names no longer hold DataFrames, so re-run the CSV-loading
# cell before re-running this one.
train_dataset = Dataset.from_pandas(train_dataset)
val_dataset = Dataset.from_pandas(val_dataset)
test_dataset = Dataset.from_pandas(test_dataset)

## Loading Model

In [None]:
# Hub id of the base checkpoint that is quantized and fine-tuned below.
model_name = "NousResearch/Llama-2-7b-chat-hf"
# Local directory name under which the adapter, tokenizer and config are saved.
new_model = "Llama-2-7b-TextSummarization-finetune"
# Alternative base checkpoint (disabled):
# model_name = "meta-llama/Llama-2-7b"

## QLoRA Parameters

In [None]:
# Passed to LoraConfig as `r` (rank of the LoRA update matrices).
lora_r = 64
# Passed to LoraConfig as `lora_alpha` (scaling parameter for the LoRA update).
lora_alpha = 16
# Passed to LoraConfig as `lora_dropout` (dropout probability in the LoRA layers).
lora_dropout = 0.1

## Bits and Bytes Parameters

In [None]:
# Passed to BitsAndBytesConfig as load_in_4bit.
use_4bit = True
# Name of the torch dtype used for computation; resolved with getattr(torch, ...) later.
bnb_4bit_compute_dtype = "float16"
# Passed to BitsAndBytesConfig as bnb_4bit_quant_type.
bnb_4bit_quant_type = "nf4"
# Passed to BitsAndBytesConfig as bnb_4bit_use_double_quant (nested quantization off).
use_nested_quant = False

## SFT Parameters

In [None]:
# Passed to SFTTrainer as max_seq_length; None leaves the choice to the trainer.
max_seq_length = None
# Passed to SFTTrainer as packing (example packing disabled).
packing = False
# Passed to from_pretrained as device_map.
# NOTE(review): presumably places the whole model on GPU 0 — confirm for multi-GPU hosts.
device_map = {"": 0}

## Training Arguments

In [None]:
# Directory handed to TrainingArguments as output_dir.
output_dir = "./results"
num_train_epochs = 1
# Mixed-precision switches forwarded to TrainingArguments; both are off here.
fp16 = False
bf16 = False
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
# NOTE(review): defined here but not passed to the TrainingArguments call in this notebook.
gradient_checkpointing = True
# Forwarded to TrainingArguments as max_grad_norm (gradient clipping threshold).
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
# -1 leaves the run length to num_train_epochs.
max_steps = -1
warmup_ratio = 0.03
# Forwarded to TrainingArguments as group_by_length.
group_by_length = True
# NOTE(review): the TrainingArguments call passes a literal save_steps=200, so this
# value is not the one used for checkpointing.
save_steps = 0
logging_steps = 25

## Loading Model and Tokenizer

In [None]:
# Turn the dtype name from the config cell into the actual torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings handed to from_pretrained when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    load_in_4bit=use_4bit,
)

# Informational hint only: when running 4-bit with float16 compute on a GPU whose
# major compute capability is 8 or higher, suggest switching training to bf16.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print(
            "=" * 80,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            "=" * 80,
            sep="\n",
        )

Your GPU supports bfloat16: accelerate training with bf16=True


In [None]:
# Load the base checkpoint with the quantization config built above, placed
# according to device_map.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# Turn off the key/value cache flag on the model config for training.
model.config.use_cache = False
# NOTE(review): presumably selects the standard (non tensor-parallel) linear-layer
# path for Llama — confirm against the Llama config documentation.
model.config.pretraining_tp = 1

# Tokenizer from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the EOS token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Setting Arguments

In [None]:
# LoRA adapter configuration built from the QLoRA parameters defined earlier;
# no bias terms are trained and the task type is causal language modeling.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Trainer hyperparameters, taken from the configuration cells earlier in the notebook.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    # Forward the configured evaluation batch size; without this argument the
    # evaluation passes silently run with the library default instead.
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    # Checkpoint every 200 steps. Kept as a literal: the `save_steps` config value
    # is 0, which does not fit the step-based save strategy used below.
    save_steps=200,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    # Evaluate on the validation split every 100 steps.
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    # Keep at most two checkpoints on disk.
    save_total_limit=2,
)

# Supervised fine-tuning trainer: trains on the 'text' column of the train split,
# evaluates on the validation split, and applies the LoRA config to the model.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)



Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msaadsohail[0m ([33msaadsohail-fast-nuces[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
100,1.6249,1.783589
200,1.4794,1.724462
300,1.4968,1.702782
400,1.5433,1.681062
500,1.4942,1.684006
600,1.4695,1.673607
700,1.4677,1.667094
800,1.4689,1.662679
900,1.4218,1.663931
1000,1.4454,1.656218


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=3683, training_loss=1.5859971296324407, metrics={'train_runtime': 4084.7468, 'train_samples_per_second': 3.607, 'train_steps_per_second': 0.902, 'total_flos': 6.889090876096512e+16, 'train_loss': 1.5859971296324407, 'epoch': 1.0})

## Testing the Finetuned Model

In [None]:
# Switch to inference mode so dropout layers are inactive while generating.
model.eval()

# Text-generation pipeline around the fine-tuned model.
# max_new_tokens limits only the generated summary. A max_length limit would also
# count the prompt tokens, so long dialogues would leave little or no room for the
# summary and the output would be cut off mid-sentence.
pipe = pipeline(task="text-generation",
               model=model,
               tokenizer=tokenizer,
               max_new_tokens=128,
               temperature=0.7,
               top_p=0.9,
               repetition_penalty=1.2)

In [None]:
def generate_summary(dialogue):
    """Generate a summary for `dialogue` with the module-level `pipe`.

    The pipeline returns the prompt followed by the completion, so the text after
    the first "[/INST]" tag is taken as the completion. The model sometimes emits a
    literal "</s>" and then keeps writing; since "</s>" marks the end of the summary
    in the training format, the completion is cut at its first occurrence.

    Args:
        dialogue: The conversation text to summarize.

    Returns:
        The generated summary with surrounding whitespace removed.
    """
    prompt = format_samsum_prompt(dialogue)
    generated_text = pipe(prompt)[0]['generated_text']
    # maxsplit=1 keeps the whole completion even if it happens to contain the tag again.
    completion = generated_text.split("[/INST]", 1)[-1]
    return completion.split("</s>", 1)[0].strip()

In [None]:
def get_summary_by_index(index):
    """Print the dialogue, generated summary and reference summary of one test example.

    Args:
        index: Position of the example in `test_dataset`. Values outside
            0 .. len(test_dataset) - 1 only print an error message.

    Returns:
        None. All results are printed.
    """
    if not (0 <= index < len(test_dataset)):
        print(f"Invalid index. Please enter a number between 0 and {len(test_dataset)-1}")
        return

    test_dialogue = test_dataset[index]['dialogue']
    actual_summary = test_dataset[index]['summary']
    generated_summary = generate_summary(test_dialogue)

    # Assemble the report first, then emit it one entry per line.
    report_lines = [
        f"\nExample at index {index}:",
        "-" * 50,
        "Dialogue:",
        test_dialogue,
        "\nGenerated Summary:",
        generated_summary,
        "\nActual Summary:",
        actual_summary,
        "=" * 80,
    ]
    print(*report_lines, sep="\n")

In [None]:
# Spot-check: compare generated and reference summary for test example 604.
get_summary_by_index(604)


Example at index 604:
--------------------------------------------------
Dialogue:
Milena: How was the presentation?
Kate: very good
Regina: yes, we had a good feedback
Milena: how many people came to listen to you?
Regina: 3 but we new it will be very cosy, friendly thing
Milena: sometimes a small but interested audience is better than random bored people
Kate: exactly

Generated Summary:
The presentation went well. Only three people attended.</s>

Actual Summary:
Kate, Milena and Regina's presentation went well. Three people came to listen to them.


In [None]:
# Spot-check: compare generated and reference summary for test example 89.
get_summary_by_index(89)


Example at index 89:
--------------------------------------------------
Dialogue:
Tom: Ben. We've decided. 2pm in the Oval Room.
Ben: Ok, I'll be there
Tom: Take all your papers, it's going to be a fight! And remember: take no prisoners, shoot to kill!
Ben: hahaha, we have to win this battle.
Tom: We will, the justice is on our side.

Generated Summary:
Tom and Ben are meeting at 2 pm in the Oval Room.</s> They want to win their case.</s> Tom advises Ben not to take any prisoners or to shoot anyone.</s> He wants them to win the battle for justice.</s>

Actual Summary:
Tom will meet Ben in the Oval Room at 2pm and tells him to bring the papers. 


In [None]:
# Spot-check: compare generated and reference summary for test example 558.
get_summary_by_index(558)


Example at index 558:
--------------------------------------------------
Dialogue:
Blair: Remember we are seeing the wedding planner after work
Chuck: Sure, where are we meeting her?
Blair: At Nonna Rita's
Chuck: Can I order their seafood tagliatelle or are we just having coffee with her? I've been dreaming about it since we went there last month
Blair: Haha sure why not
Chuck: Well we both remmber the spaghetti pomodoro disaster from our last meeting with Diane
Blair: Omg hahaha it was all over her white blouse
Chuck: :D
Blair: :P

Generated Summary:
After work Blair and Chuck will meet a wedding planner at Nonna Rita's. They want to have some food as

Actual Summary:
Blair and Chuck are going to meet the wedding planner after work at Nonna Rita's. The tagliatelle served at Nonna Rita's are very good.


In [None]:
# Spot-check: compare generated and reference summary for test example 758.
get_summary_by_index(758)


Example at index 758:
--------------------------------------------------
Dialogue:
Tomas: Has anybody received the grant yet?
Sierra: no, not yet
Jeremy: I haven't checked even
Tomas: I'm completely broke
Tomas: checking my bank account every hour
Tomas: but nothing's happening there
Sierra: lol
Sierra: be patient. If you need money I can lend you some, don't worry
Tomas: Thanks, I hope they'll arrive any minute

Generated Summary:
Tomas is waiting for a grant and he has been checking his bank account every hour.</s> Sierra will lend Tomas some money if needed.</s> Jeremy hasn't checked anything either.</s>  </s>

Actual Summary:
Tomas, Sierra and Jeremy have still not received the grant. Tomas is broke and is checking his bank account every hour. Sierra offers to lend him some money.


In [None]:
# Switch to inference mode so dropout layers are inactive while generating.
model.eval()

# Pipeline used for the ROUGE run below (rebinds `pipe`).
# max_new_tokens limits only the generated summary, so the budget no longer depends
# on how long the dialogue is. A max_length limit also counts the prompt tokens and
# leaves no room for a summary once a dialogue approaches that limit.
pipe = pipeline(task="text-generation",
               model=model,
               tokenizer=tokenizer,
               max_new_tokens=128,
               temperature=0.7,
               top_p=0.9,
               repetition_penalty=1.2)

In [None]:
rouge = load('rouge')

# Upper bound on the number of test examples scored; only a prefix of the test
# split is evaluated.
MAX_ROUGE_SAMPLES = 50

predictions = []
references = []

# Index the dataset directly instead of materializing every row with
# list(test_dataset) just to slice off the first few.
num_rouge_samples = min(MAX_ROUGE_SAMPLES, len(test_dataset))

for sample_index in tqdm(range(num_rouge_samples), desc="Generating summaries"):
    item = test_dataset[sample_index]
    # generate_summary formats the prompt and runs the current module-level `pipe`.
    generated_summary = generate_summary(item['dialogue'])
    # Score only the summary itself: drop a literal "</s>" marker and any run-on
    # text after it, which would otherwise count against ROUGE.
    generated_summary = generated_summary.split("</s>", 1)[0].strip()
    predictions.append(generated_summary)
    references.append(item['summary'])

print(f"\nCalculating ROUGE scores on {len(predictions)} samples...")

rouge_scores = rouge.compute(
    predictions=predictions,
    references=references,
    use_stemmer=True
)

print("\nROUGE Scores:")
for metric, value in rouge_scores.items():
    print(f"{metric}: {value:.4f}")


Calculating ROUGE scores on 50 samples...

ROUGE Scores:
rouge1: 0.3115
rouge2: 0.1179
rougeL: 0.2296
rougeLsum: 0.2297


## Saving Model To HuggingFace Hub

In [None]:
# Local layout: <new_model>/adapter holds the adapter weights; the tokenizer and
# model config are written to <new_model> itself.
adapter_path = os.path.join(new_model, "adapter")
os.makedirs(adapter_path, exist_ok=True)

# Save the trainer's model to the adapter directory in safetensors format.
trainer.model.save_pretrained(
    adapter_path,
    safe_serialization=True
)

tokenizer.save_pretrained(new_model)

model.config.save_pretrained(new_model)

print(f"Model adapter saved to: {adapter_path}")
print(f"Tokenizer and config saved to: {new_model}")

# Hub repository that receives the trainer's model and the tokenizer.
repo_name = "saadsohail/llama2-7b-text-summarization"
# Best-effort upload: any failure is reported and the local copy above remains.
try:

    trainer.model.push_to_hub(
        repo_name,
        use_auth_token=True,
        safe_serialization=True
    )

    tokenizer.push_to_hub(
        repo_name,
        use_auth_token=True
    )
    print(f"Successfully pushed to hub: {repo_name}")
except Exception as e:
    print(f"Hub push failed: {e}")
    print("Model saved locally only")


Model adapter saved to: Llama-2-7b-TextSummarization-finetune/adapter
Tokenizer and config saved to: Llama-2-7b-TextSummarization-finetune


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


Successfully pushed to hub: saadsohail/llama2-7b-text-summarization


In [None]:
# ## Loading Saved Model for Testing

# The Hub token is optional here: read it if present instead of failing with a
# KeyError when the environment variable is not set.
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")

# Base checkpoint the adapter was trained on; the tokenizer below must come from
# the same checkpoint so prompts are tokenized the way they were during training.
base_model_id = "NousResearch/Llama-2-7b-chat-hf"

# Reload the base model in 4-bit with float16 compute.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    token=hf_token,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )
)

# Attach the fine-tuned adapter that was pushed to the Hub.
repo_id = "saadsohail/llama2-7b-text-summarization"
model = PeftModel.from_pretrained(
    model,
    repo_id,
    token=hf_token
)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    token=hf_token
)

# Text-generation pipeline around the reloaded model; generates up to 512 new
# tokens and returns a single sequence per prompt.
# NOTE(review): temperature/top_p presumably only take effect when sampling is
# enabled — confirm how do_sample is set for this model.
inference_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    num_return_sequences=1
)

# Smoke test of the reloaded model on a hand-written dialogue.
test_dialogue = "John: Hey, how are you? Mary: I'm good, just finished my exams! John: That's great, how did they go? Mary: Pretty well, I think I passed everything!"
test_prompt = format_samsum_prompt(test_dialogue)
result = inference_pipe(test_prompt)[0]['generated_text']
# The pipeline output contains the prompt followed by the completion: keep what
# follows the first "[/INST]" tag, then stop at the first literal "</s>", which
# marks the end of the summary in the training format.
summary = result.split("[/INST]", 1)[-1].split("</s>", 1)[0].strip()

print("\nTest Dialogue:")
print(test_dialogue)
print("\nGenerated Summary:")
print(summary)