In [5]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("medalpaca/medical_meadow_medical_flashcards")

dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
val_dataset = dataset["test"]


In [6]:
train_dataset,val_dataset

(Dataset({
     features: ['input', 'output', 'instruction'],
     num_rows: 30559
 }),
 Dataset({
     features: ['input', 'output', 'instruction'],
     num_rows: 3396
 }))

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 30559
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 3396
    })
})

In [10]:
# Function to format prompts as per chat-style fine-tuning
def format_prompt(example):
    """Format input as a ChatBot interaction (User ↔ Model)."""
    prompt = f"<|user|>\n{example['instruction']}\n\n"
    
    if example['input']:  # If context is available
        prompt += f"{example['input']}\n\n"

    prompt += "<|model|>\n" + example["output"]
    return prompt

# Select a few samples from the dataset and format them
formatted_prompts = [{"formatted_prompt": format_prompt(sample)} for sample in dataset["test"].select(range(10))]

# Convert to DataFrame for display
df_formatted_prompts = pd.DataFrame(formatted_prompts)


In [11]:
df_formatted_prompts.iloc[0]['formatted_prompt']

'<|user|>\nAnswer this question truthfully\n\nWhat type of injury to the arm/elbow most often leads to supracondylar fractures?\n\n<|model|>\nSupracondylar fractures most often occur after hyperextension injuries of the arm/elbow.'

In [45]:
# #  Tokenization function (Fixed for Batched Processing)
# def tokenize_function(examples):
#     """Tokenizes dataset examples in User ↔ Assistant format, ensuring correct type handling."""
    
#     #  Ensure 'instruction' is processed correctly in batched mode
#     instruction_texts = [ " ".join(inst) if isinstance(inst, list) else inst for inst in examples["instruction"] ]
    
#     #  Ensure 'input' (context) is processed correctly
#     input_texts = [ " ".join(inp) if isinstance(inp, list) else inp for inp in examples["input"] ]
    
#     #  Ensure 'output' (response) is processed correctly
#     output_texts = [ " ".join(out) if isinstance(out, list) else out for out in examples["output"] ]

#     #  Format the conversation as a chatbot exchange
#     prompts = []
#     for instr, inp, out in zip(instruction_texts, input_texts, output_texts):
#         prompt = f"<|user|>\n{instr.strip()}"
#         if inp.strip():
#             prompt += f"\n{inp.strip()}"
#         prompt += f"\n\n<|assistant|>\n{out.strip()}"
#         prompts.append(prompt)

#     #  Tokenize the formatted prompts
#     tokenized = tokenizer(
#         prompts,
#         padding="max_length",  # Ensures uniform batch size
#         truncation=True,
#         max_length=512
#     )

#     #  Convert tensors into lists to avoid TypeErrors
#     tokenized["input_ids"] = [ids.tolist() if isinstance(ids, torch.Tensor) else ids for ids in tokenized["input_ids"]]
#     tokenized["attention_mask"] = [mask.tolist() if isinstance(mask, torch.Tensor) else mask for mask in tokenized["attention_mask"]]

#     #  Ensure labels match input_ids for causal LM training
#     tokenized["labels"] = tokenized["input_ids"].copy()

#     return tokenized

# #  Tokenize dataset with the fixed function
# tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

# # Assign tokenized datasets
# train_dataset_tok = tokenized_datasets["train"]
# val_dataset_tok = tokenized_datasets["test"]

# # Take only the first 5 examples from the dataset for debugging
# sample_data = dataset["train"].select(range(5))

# # Tokenize the selected samples
# tokenized_samples = sample_data.map(tokenize_function, remove_columns=dataset["train"].column_names)

# # Print properly formatted prompts for verification
# for i, example in enumerate(sample_data):
#     print(f"🔹 **Sample {i+1} Prompt:**")
#     print(tokenize_function(example)["input_ids"])  # Show tokenized input IDs
#     print("=" * 100)


In [41]:
# val_dataset_tok[0],sample_data[0]

In [None]:
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset

#  Load Model and Tokenizer
model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Use EOS as PAD token (Fix for LLaMA padding issue)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Select Single Device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#  Load Model on Single Device with 4-bit QLoRA (Super Efficient)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,  #  More memory-efficient than 8-bit
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

#  Prepare model for QLoRA fine-tuning (Memory Efficient)
model = prepare_model_for_kbit_training(model)

#  Apply LoRA Configuration (Optimized)
peft_config = LoraConfig(
    r=8,  
    lora_alpha=16,  
    lora_dropout=0.05,  
    bias="none",
    task_type="CAUSAL_LM"
)

#  Apply LoRA to Model
model = get_peft_model(model, peft_config).to(device)

# Enable Gradient Checkpointing for Memory Savings
model.gradient_checkpointing_enable()

# Load and split the dataset into train/validation (90:10 split)
dataset = load_dataset("medalpaca/medical_meadow_medical_flashcards")
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

# Tokenization function (Optimized)
def tokenize_function(examples):
    """Tokenizes dataset in User ↔ Assistant format, ensuring correct type handling."""
    
    instruction_texts = [" ".join(inst) if isinstance(inst, list) else inst for inst in examples["instruction"]]
    input_texts = [" ".join(inp) if isinstance(inp, list) else inp for inp in examples["input"]]
    output_texts = [" ".join(out) if isinstance(out, list) else out for out in examples["output"]]

    prompts = []
    for instr, inp, out in zip(instruction_texts, input_texts, output_texts):
        prompt = f"<|user|>\n{instr.strip()}"
        if inp.strip():
            prompt += f"\n{inp.strip()}"
        prompt += f"\n\n<|assistant|>\n{out.strip()}"
        prompts.append(prompt)

    # Tokenize efficiently
    tokenized = tokenizer(
        prompts,
        padding="longest",  # Efficient batch processing
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    #  Ensure labels match input_ids
    tokenized["labels"] = tokenized["input_ids"].clone()

    return tokenized

#  Tokenize dataset (Faster processing)
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
train_dataset_tok = tokenized_datasets["train"]
val_dataset_tok = tokenized_datasets["test"]

#  Select First 1000 Samples for Training, 200 for Validation (Faster Debugging)
small_train = train_dataset_tok.select(range(3000))
small_val = val_dataset_tok.select(range(600))

#  Data Collator with Optimized Padding
data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model, 
    padding="longest"  #  More efficient padding
)

# Optimized Training Arguments
training_args = TrainingArguments(
    output_dir="./llama3-medical-chatbot",
    per_device_train_batch_size=4,  #  Increased batch size
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=50,  # Less frequent evaluation
    save_steps=50,  # Less frequent saving
    logging_steps=10,  # Log progress every 10 steps
    learning_rate=5e-4,
    num_train_epochs=1,  # 1 Epoch for fast debugging
    weight_decay=0.01,
    fp16=True,  # Mixed precision training for speedup
    push_to_hub=False
)

# Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_val,  
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start Training
print("🚀 Starting optimized training on 1000 samples...")
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./llama3-medical-chatbot")
tokenizer.save_pretrained("./llama3-medical-chatbot")

# Debugging: Check Training Progress
print("\n🔍 **Training Log History:**")
print(trainer.state.log_history)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


🚀 Starting optimized training on 1000 samples...




Step,Training Loss,Validation Loss
50,0.4353,0.41454
100,0.4551,0.402736
150,0.4632,0.396485
200,0.4696,0.392564
250,0.4192,0.390941
300,0.41,0.3871
350,0.4001,0.384891
400,0.3664,0.38352
450,0.4094,0.382544
500,0.4407,0.380411



🔍 **Training Log History:**
[{'loss': 3.3203, 'grad_norm': 1.6854817867279053, 'learning_rate': 0.0004946666666666667, 'epoch': 0.013333333333333334, 'step': 10}, {'loss': 0.5523, 'grad_norm': 0.4944961965084076, 'learning_rate': 0.000488, 'epoch': 0.02666666666666667, 'step': 20}, {'loss': 0.5386, 'grad_norm': 0.8257476687431335, 'learning_rate': 0.00048133333333333334, 'epoch': 0.04, 'step': 30}, {'loss': 0.4996, 'grad_norm': 0.4939257502555847, 'learning_rate': 0.0004746666666666667, 'epoch': 0.05333333333333334, 'step': 40}, {'loss': 0.4353, 'grad_norm': 0.4518594443798065, 'learning_rate': 0.00046800000000000005, 'epoch': 0.06666666666666667, 'step': 50}, {'eval_loss': 0.4145403802394867, 'eval_runtime': 37.3167, 'eval_samples_per_second': 16.079, 'eval_steps_per_second': 4.02, 'epoch': 0.06666666666666667, 'step': 50}, {'loss': 0.4914, 'grad_norm': 0.5553702712059021, 'learning_rate': 0.00046133333333333334, 'epoch': 0.08, 'step': 60}, {'loss': 0.4304, 'grad_norm': 0.66844624280

## Inference on LORA

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the fine-tuned model path
model_path = "./llama3-medical-chatbot"  # Path where fine-tuned model is saved
base_model_id = "meta-llama/Llama-3.2-1B"  # Original base model

# Load the tokenizer from the fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_path)

#  Ensure the tokenizer has a pad token to avoid warnings
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Avoids padding issues

# Select device (Prefer GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

# Load the fine-tuned LoRA adapter on top of the base model
model = PeftModel.from_pretrained(base_model, model_path)

# Ensure model embeddings match tokenizer
model.resize_token_embeddings(len(tokenizer))

# Move model to correct device
model.to(device)


def generate_response(user_query, max_length=256):
    """Generates a medical response using the fine-tuned chatbot."""
    
    # Format the query using Chat-based structure
    prompt = f"<|user|>\n{user_query}\n\n<|assistant|>\n"

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)  # Explicitly pass attention mask

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # Fixes warning
            max_length=max_length,
            do_sample=True,  # Enable randomness for diverse responses
            temperature=0.4,  # Lower temp = more factual response
            top_p=0.9,  # Controls response diversity
            pad_token_id=tokenizer.pad_token_id
        )
    
    return tokenizer.decode(output[0], skip_special_tokens=True)


# **Test the chatbot with a medical query**
query = "What are the symptoms and treatments for apnea?"
response = generate_response(query)

print("\n🔍 **Medical Chatbot Response:**")
print(response)



🔍 **Medical Chatbot Response:**
<|user|>
What are the symptoms and treatments for apnea?

<|assistant|>
Apnea is a condition where a person's breathing stops for a short period of time. There are several possible causes of apnea, including sleep apnea, which is a more serious condition that can lead to other health problems if not treated. Treatment for apnea may involve using a CPAP machine, which helps to keep the airways open during sleep, or using other devices to prevent apnea. In some cases, surgery may be necessary to correct the underlying cause of apnea.


## Inference on Base

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the base model
base_model_id = "meta-llama/Llama-3.2-1B"  # Original LLaMA-3.2 1B model
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Ensure tokenizer has a pad token to avoid warnings
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Avoid padding issues

#  Select device (Prefer GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the base model on the chosen device
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
).to(device)

def generate_response_base(user_query, max_length=256):
    """Generates a response using the base LLaMA-3 model (without fine-tuning)."""

    # Format the query in Chat-style prompt
    prompt = f"<|user|>\n{user_query}\n\n<|assistant|>\n"

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)  #  Explicitly pass attention mask

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # Fixes warning
            max_length=max_length,
            do_sample=True,  # Enable randomness for diverse responses
            temperature=0.4,  # Lower temp = more factual response
            top_p=0.9,  # Controls response diversity
            pad_token_id=tokenizer.pad_token_id
        )
    
    return tokenizer.decode(output[0], skip_special_tokens=True)


# **Test the base model with the same medical query**
query = "What are the symptoms and treatments for apnea?"
response_base = generate_response_base(query)

print("\n🔍 **Base Model Response:**")
print(response_base)



🔍 **Base Model Response:**
<|user|>
What are the symptoms and treatments for apnea?

<|assistant|>
The most common symptoms of apnea are:
• Unexplained sleepiness or drowsiness during the day
• Unexplained fatigue or weakness
• Unexplained headaches or migraines
• Unexplained weight gain or weight loss
• Unexplained changes in mood or behavior
• Unexplained changes in appetite or eating habits
• Unexplained changes in sleep habits or patterns
• Unexplained changes in sex drive or libido
• Unexplained changes in sexual performance or ability
• Unexplained changes in energy levels or stamina
• Unexplained changes in concentration or focus
• Unexplained changes in memory or cognitive function
• Unexplained changes in attention span or focus
• Unexplained changes in reaction time or reaction speed
• Unexplained changes in judgment or decision-making ability
• Unexplained changes in emotional stability or emotional outbursts
• Unexplained changes in personality or personality traits
• Unex

# Benchmarking

In [2]:
import nltk
nltk.download('punkt')  # Required for BLEU score tokenization


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
import torch
import time
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load Fine-tuned LoRA Model
fine_tuned_model_path = "./llama3-medical-chatbot"
base_model_id = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token 
# Load base model
device = "cuda" if torch.cuda.is_available() else "cpu"
base_model = AutoModelForCausalLM.from_pretrained(base_model_id).to(device)

# Load fine-tuned LoRA model
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_model_path).to(device)

# Load Benchmark Dataset
dataset = load_dataset("medalpaca/medical_meadow_medical_flashcards")["train"]
benchmark_dataset = dataset.select(range(3600, 4000))  # Data outside training

# Load Sentence Transformer for Semantic Similarity
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Benchmark Function
def benchmark_model(model, queries, tokenizer, model_name="LoRA"):
    results = []
    
    for query in queries:
        prompt = f"<|user|>\n{query}\n\n<|assistant|>\n"

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        # Measure inference time
        start_time = time.time()
        with torch.no_grad():
            output = model.generate(
                inputs.input_ids,
                max_length=256,
                do_sample=True,
                temperature=0.4,
                top_p=0.9,
                pad_token_id=tokenizer.pad_token_id
            )
        end_time = time.time()

        response = tokenizer.decode(output[0], skip_special_tokens=True)

        # Compute Inference Time & Token Speed
        inference_time = end_time - start_time
        token_count = output.shape[1]  # Number of tokens generated
        tokens_per_second = token_count / inference_time if inference_time > 0 else 0

        # Store results
        results.append({
            "Model": model_name,
            "Query": query,
            "Response": response,
            "Inference Time (s)": round(inference_time, 4),
            "Tokens Per Second": round(tokens_per_second, 2)
        })
    
    return results

# Run Benchmark for Both Models
benchmark_queries = [
    f"{instr} {inp}" if inp.strip() else instr 
    for instr, inp in zip(benchmark_dataset["instruction"], benchmark_dataset["input"])
][:100]  # Take first 100 samples

print("🚀 Benchmarking Base Model...")
base_results = benchmark_model(base_model, benchmark_queries, tokenizer, model_name="Base")

print("🚀 Benchmarking LoRA Model...")
lora_results = benchmark_model(fine_tuned_model, benchmark_queries, tokenizer, model_name="LoRA Fine-tuned")

# Convert Results to DataFrame
df_base = pd.DataFrame(base_results)
df_lora = pd.DataFrame(lora_results)

# Combine and Display Results
df_benchmark = pd.concat([df_base, df_lora], ignore_index=True)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


🚀 Benchmarking Base Model...
🚀 Benchmarking LoRA Model...


In [9]:
df_base.rename(columns={"Response": "Base Response", "Inference Time (s)": "Base Time", "Tokens Per Second": "Base Tokens/sec"}, inplace=True)
df_lora.rename(columns={"Response": "LoRA Response", "Inference Time (s)": "LoRA Time", "Tokens Per Second": "LoRA Tokens/sec"}, inplace=True)

df_benchmark = df_base.merge(df_lora, on="Query", suffixes=("_Base", "_LoRA"))


In [10]:
# import pandas as pd
# import os
# csv_file = "time_benchmark.csv"
# try:
#     df_benchmark  # Check if df_benchmark exists
# except NameError:
#     if os.path.exists(csv_file):
#         df_benchmark = pd.read_csv(csv_file)  # Load data from CSV if the file exists
#     else:
#         df_benchmark = pd.DataFrame()  # Initialize an empty DataFrame
# else:
#     df_benchmark.to_csv(csv_file, index=False)  # Save DataFrame if it already exists

# df_benchmark.drop(columns=["Model_Base","Model_LoRA"],inplace=True)

In [11]:
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics.pairwise import cosine_similarity

# Compute BLEU Scores
def compute_bleu(responses, references):
    scores = []
    for response, reference in zip(responses, references):
        scores.append(sentence_bleu([reference.split()], response.split()))
    return scores

#  Compute Embedding Similarity
def compute_embedding_similarity(responses, references, model):
    response_embeddings = model.encode(responses)
    reference_embeddings = model.encode(references)
    
    similarities = []
    for resp_emb, ref_emb in zip(response_embeddings, reference_embeddings):
        similarity = cosine_similarity([resp_emb], [ref_emb])[0][0]
        similarities.append(similarity)
    
    return similarities

#  Extract Ground Truth Answers
ground_truth = benchmark_dataset["output"]#[:100]  # Get ground truth answers

def extract_assistant_response(output_text):
    """Removes the prompt and extracts only the assistant's response."""
    if "<|assistant|>" in output_text:
        return output_text.split("<|assistant|>\n", 1)[1].strip()  # Keep only text after <|assistant|> and remove extra spaces
    return output_text.strip()  # If no <|assistant|> tag, just strip whitespace

#  Apply Cleaning on Existing DataFrame
df_benchmark["Base Response Clean"] = df_benchmark["Base Response"].apply(extract_assistant_response)
df_benchmark["LoRA Response Clean"] = df_benchmark["LoRA Response"].apply(extract_assistant_response)


#  Compute BLEU Scores for Base and LoRA Responses
df_benchmark["BLEU Score (Base)"] = compute_bleu(df_benchmark["Base Response"], ground_truth)
df_benchmark["BLEU Score (LoRA)"] = compute_bleu(df_benchmark["LoRA Response"], ground_truth)

# Compute Embedding Similarity for Base and LoRA Responses
df_benchmark["Embedding Similarity (Base)"] = compute_embedding_similarity(df_benchmark["Base Response"], ground_truth, embedding_model)
df_benchmark["Embedding Similarity (LoRA)"] = compute_embedding_similarity(df_benchmark["LoRA Response"], ground_truth, embedding_model)

from rouge_score import rouge_scorer
import torch.nn.functional as F

# Compute ROUGE-L Score
def compute_rouge_l(responses, references):
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    scores = [scorer.score(ref, resp)["rougeL"].fmeasure for resp, ref in zip(responses, references)]
    return scores

# Compute Perplexity
def compute_perplexity(model, tokenizer, texts):
    perplexities = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to("cuda" if torch.cuda.is_available() else "cpu")
        with torch.no_grad():
            outputs = model(**inputs)
            log_probs = F.log_softmax(outputs.logits, dim=-1)
            perplexity = torch.exp(-log_probs.mean())
        perplexities.append(perplexity.item())
    return perplexities

# Add Ground Truth to df_benchmark
df_benchmark["Ground Truth"] = benchmark_dataset["output"][:len(df_benchmark)]


# Compute ROUGE-L Scores for Base and LoRA
df_benchmark["ROUGE-L (Base)"] = compute_rouge_l(df_benchmark["Base Response Clean"], df_benchmark["Ground Truth"])
df_benchmark["ROUGE-L (LoRA)"] = compute_rouge_l(df_benchmark["LoRA Response Clean"], df_benchmark["Ground Truth"])

# Compute Perplexity for Base and LoRA
df_benchmark["Perplexity (Base)"] = compute_perplexity(base_model, tokenizer, df_benchmark["Base Response Clean"])
df_benchmark["Perplexity (LoRA)"] = compute_perplexity(fine_tuned_model, tokenizer, df_benchmark["LoRA Response Clean"])



The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

In [12]:
df_benchmark['Base Response Clean'].loc[0]

"Gestational hypertension is a condition that occurs during pregnancy that can lead to high blood pressure and other complications. It is defined as a blood pressure of 140/90 mmHg or higher during the second or third trimester of pregnancy. Gestational hypertension can be caused by a variety of factors, including hormonal changes, changes in the body's blood vessels, and changes in the placenta. It is important to monitor blood pressure during pregnancy, and to seek medical attention if it is not under control. Treatment may include lifestyle changes, medication, and in some cases, surgery."

In [18]:
df_benchmark.to_csv("benchmark_res.csv",index=False)

In [13]:
# Select 10 random samples from df_benchmark
df_random_samples = df_benchmark[["Query","Base Response Clean", "LoRA Response Clean", "Ground Truth"]].sample(n=1, random_state=1)

# Print full text for each selected sample
for index, row in df_random_samples.iterrows():
    print(f"\nSample {index+1}:")
    print("-" * 50)
    print(f"🔹 Query:\n{row['Query']}")
    print(f"🔹 Base Response:\n{row['Base Response Clean']}")
    print(f"🔹 LoRA Response:\n{row['LoRA Response Clean']}")
    print(f"🔹 Ground Truth:\n{row['Ground Truth']}")
    print("-" * 50)



Sample 81:
--------------------------------------------------
🔹 Query:
Answer this question truthfully What is fenoldopam, and how does it work to prevent kidney damage in patients with hypertensive emergency? What is the role of the D1 receptor in this process, and what are some potential benefits and risks associated with the use of fenoldopam?
🔹 Base Response:
Fenoldopam is a vasopressor drug that works by inhibiting the D1 receptor, which is involved in the regulation of blood pressure. In patients with hypertensive emergency, fenoldopam can help to prevent kidney damage by reducing blood pressure and improving blood flow to the kidneys. However, fenoldopam can also cause side effects such as hypotension, bradycardia, and flushing. Therefore, it is important to monitor patients closely for signs of hypotension and bradycardia, and to adjust the dosage as needed to avoid these side effects.
🔹 LoRA Response:
Fenoldopam is a medication that is used to prevent kidney damage in patient

In [14]:
df_random_samples

Unnamed: 0,Query,Base Response Clean,LoRA Response Clean,Ground Truth
80,Answer this question truthfully What is fenold...,Fenoldopam is a vasopressor drug that works by...,Fenoldopam is a medication that is used to pre...,Fenoldopam is a medication that acts as a D1 r...


In [15]:
df_benchmark.head(1)

Unnamed: 0,Model_Base,Query,Base Response,Base Time,Base Tokens/sec,Model_LoRA,LoRA Response,LoRA Time,LoRA Tokens/sec,Base Response Clean,LoRA Response Clean,BLEU Score (Base),BLEU Score (LoRA),Embedding Similarity (Base),Embedding Similarity (LoRA),Ground Truth,ROUGE-L (Base),ROUGE-L (LoRA),Perplexity (Base),Perplexity (LoRA)
0,Base,Answer this question truthfully What is gestat...,<|user|>\nAnswer this question truthfully What...,4.1923,34.83,LoRA Fine-tuned,<|user|>\nAnswer this question truthfully What...,3.8634,44.26,Gestational hypertension is a condition that o...,Gestational hypertension is a condition that o...,0.100785,0.106908,0.895864,0.905449,Gestational hypertension is a type of high blo...,0.301676,0.285714,52274240000.0,88546760000.0


In [16]:
# Compute statistics for all numeric columns in df_benchmark
df_stats = df_benchmark.describe().round(4)


In [17]:
df_stats

Unnamed: 0,Base Time,Base Tokens/sec,LoRA Time,LoRA Tokens/sec,BLEU Score (Base),BLEU Score (LoRA),Embedding Similarity (Base),Embedding Similarity (LoRA),ROUGE-L (Base),ROUGE-L (LoRA),Perplexity (Base),Perplexity (LoRA)
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,2.418,59.3243,2.308,66.5116,0.0925,0.0905,0.7665,0.7564,0.2913,0.2907,23247310000.0,22271800000.0
std,1.4872,20.7117,1.5684,22.9522,0.0705,0.0719,0.1115,0.1147,0.1149,0.1177,24734450000.0,22861610000.0
min,0.3812,29.78,0.3079,43.13,0.0,0.0,0.4421,0.4624,0.0882,0.0492,403442300.0,127236000.0
25%,0.9156,47.0725,0.6928,47.53,0.043,0.0401,0.7083,0.6953,0.2268,0.2222,2758798000.0,1973241000.0
50%,2.4708,51.445,2.3836,55.105,0.079,0.09,0.7883,0.7755,0.2831,0.2785,15736880000.0,17868440000.0
75%,3.3438,73.68,3.5354,84.5075,0.1222,0.1202,0.8577,0.8445,0.3204,0.3273,35778830000.0,36664670000.0
max,6.9843,111.75,5.936,139.68,0.3407,0.3407,0.9146,0.9278,0.7778,0.7778,120595600000.0,98922470000.0
