<a href="https://colab.research.google.com/github/sha2992/LLM-Fine-Tune/blob/main/ModelFineTuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Library ###

In [1]:
!pip install -q bitsandbytes trl datasets accelerate rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.0/348.0 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m118.9 MB/s[0m eta 

In [2]:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, BitsAndBytesConfig, TrainingArguments, DataCollatorForLanguageModeling
from transformers import pipeline
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, PeftConfig
from trl import SFTTrainer
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import statistics

In [3]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

#### Pre Trained Model ####

In [4]:
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
#"NousResearch/Llama-2-7b-chat-hf"

##### Quantize #####

In [5]:
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

#### Model Load ####

In [6]:
device_map = {"": 0}

In [33]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

In [None]:
# pip install -U bitsandbytes

In [34]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#### Dealing Dataset ####

In [35]:
datasets = load_dataset("SAGI-1/reasoningData_200k")

In [10]:
# Taking a sample of data
dataset = datasets['train'].shuffle(seed=42).select(range(2468))

In [11]:
test_modelwithData = datasets['train'].shuffle(seed=29).select(range(100))

In [12]:
def clean_text(text):
    # Remove unwanted characters
    text = re.sub(r"[\n\r\-_(){}\[\]\\/]", " ", text)
    text = re.sub(r"\s+", " ", text)  # remove extra spaces
    return text.strip()

In [None]:
# def truncate_instruction(text):
#     # Keep only the part before the first stop punctuation
#     text = clean_text(text)
#     stop_match = re.split(r"[.,!?]", text, maxsplit=1)
#     return stop_match[0].strip()

In [36]:
def preprocess_example(example):
    return {
        "subject": clean_text(example["instruction"]),
        "email": clean_text(example["answer"])
    }

cleaned_dataset = dataset.map(preprocess_example, remove_columns=["instruction", "answer"])

In [37]:
def convert_to_instruction_format(example):
    return {
        "text": f"<s>[INST] {example['email']} [/INST] {example['subject']} </s>"
    }

formatted_dataset = cleaned_dataset.map(convert_to_instruction_format)
formatted_dataset = formatted_dataset.remove_columns(['subject', 'email'])

In [15]:
new_model = "llama-subjectline"

### LoRA config & Train ###

In [16]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

In [32]:
training_args = TrainingArguments(
    output_dir="./llama-subject-finetuned",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    save_steps = 0,
    optim="paged_adamw_32bit",
    logging_steps=20,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio = 0.03,
    group_by_length = True,
    lr_scheduler_type="cosine",
    report_to="tensorboard"
)

In [38]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    peft_config=lora_config,
    args=training_args,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [39]:
trainer.train()

Step,Training Loss
20,2.2897
40,2.6093
60,2.5029
80,2.2533
100,2.157
120,2.0954
140,2.0666
160,1.9823
180,2.1125
200,1.9173


TrainOutput(global_step=3090, training_loss=1.6900952552128765, metrics={'train_runtime': 1942.7809, 'train_samples_per_second': 12.703, 'train_steps_per_second': 1.591, 'total_flos': 4.008109805091226e+16, 'train_loss': 1.6900952552128765})

#### Load and Test New Model ####

In [41]:
trainer.model.save_pretrained(new_model)

In [42]:
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1000)

Device set to use cuda:0


In [43]:
def test_model(email):
    result = pipe(f"<s>[INST] {email} [/INST]")
    result_text = result[0]['generated_text']

    if "[INST]" in result_text and "[/INST]" in result_text:
        subject = result_text.split("[/INST]")[-1].strip()
        return f"Email:\n{email}\n\nSubject:\n{subject}"

    else:
      return f"Subject Can not be generated for given Email body: {email}"

In [46]:
email = 'The idiom "it’s raining cats and dogs" means that it is raining heavily'
print(test_model(email))

Email:
The idiom "it’s raining cats and dogs" means that it is raining heavily

Subject:
What is the idiom "it’s raining cats and dogs"?


#### Evaluation ####

In [47]:
def pred_sub(email):
    result = pipe(f"<s>[INST] {email} [/INST]")
    result_text = result[0]['generated_text']

    if "[INST]" in result_text and "[/INST]" in result_text:
        subject = result_text.split("[/INST]")[-1].strip()
        return subject

In [48]:
cleaned_dataset_test = test_modelwithData.map(preprocess_example, remove_columns=["instruction", "answer"])

In [49]:
bleu_scores = []
rouge_scores = []
meteor_scores = []

for i in range(len(cleaned_dataset_test)):
  reference = cleaned_dataset_test[i]['subject']
  candidate = pred_sub(cleaned_dataset_test[i]['email'])

  print(f"{i+1} \nReference: {reference}\nCandidate: {candidate}\n")
  smoothie = SmoothingFunction().method4
  bleu_score = sentence_bleu([reference], candidate, smoothing_function=smoothie)
  bleu_scores.append(bleu_score)

  scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
  rge_scores = scorer.score(reference, candidate)
  rouge_scores.append(rge_scores['rougeL'].fmeasure)

  mt_scores = meteor_score([word_tokenize(reference)], word_tokenize(candidate))
  meteor_scores.append(mt_scores)

  print(f"BLEU Score: {bleu_score:.4f}")
  print(f"ROUGE Score: {rge_scores['rougeL'].fmeasure:.4f}")
  print(f"METEOR Score: {mt_scores:.4f}\n\n")

1 
Reference: What is the difference between an exothermic and endothermic reaction?
Candidate: What is the difference between exothermic and endothermic reactions?

BLEU Score: 0.9416
ROUGE Score: 0.9474
METEOR Score: 0.9138


2 
Reference: How can you get more song birds to visit your yard? put out some hamburgers to attract them adopt some large dogs to stay in the yard call Here birdy birdy! out the window hang an open box containing tasty seeds from a tree Which is the correct answer?
Candidate: I am trying to find a solution to a problem. I have a garbage can in my backyard. I want to attract song birds to it. What should I do?

BLEU Score: 0.1982
ROUGE Score: 0.1013
METEOR Score: 0.1886


3 
Candidate: In this task, you are given a text from a social media post. Your task is to classify the given post into two categories: 1 yes if the given post contains sexually explicit content 2 no if the given post does not contain any sexually explicit content. Classify the given post into 

In [50]:
print(f"##### Final Scores #####")
print(f"Avg. BLEU Score: {statistics.mean(bleu_scores):.4f}")
print(f"Avg. ROUGE Score: {statistics.mean(rouge_scores):.4f}")
print(f"Avg. METEOR Score: {statistics.mean(meteor_scores):.4f}")

##### Final Scores #####
Avg. BLEU Score: 0.3305
Avg. ROUGE Score: 0.3031
Avg. METEOR Score: 0.3092
