<a href="https://colab.research.google.com/github/saharshgaddam/InLawGpt/blob/main/lawGpt_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ================================
# 1️⃣ Install Dependencies
# ================================
!pip install transformers datasets accelerate peft bitsandbytes matplotlib scikit-learn -q
!pip install rouge_score sacrebleu evaluate -q # Added evaluate

# ================================
# 2️⃣ Mount Google Drive
# ================================
from google.colab import drive
drive.mount('/content/drive')

# Folder to save everything
SAVE_DIR = "/content/drive/MyDrive/qwen_legal_finetune"
import os
os.makedirs(SAVE_DIR, exist_ok=True)

# ================================
# 3️⃣ Import Libraries
# ================================
import json, random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset # Removed load_metric here
from peft import LoraConfig, get_peft_model
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import numpy as np
import evaluate # Imported evaluate

# ================================
# 4️⃣ Merge Datasets
# ================================
def _read_jsonl(path):
    """Return the JSON objects stored one per line in *path*, ignoring blank lines."""
    with open(path, "r", encoding="utf-8") as f:
        # A blank line is not a record; json.loads("") would raise on it.
        return [json.loads(line) for line in f if line.strip()]


data1 = _read_jsonl("/content/standard.jsonl")
data2 = _read_jsonl("/content/with_reasoning (2).jsonl")

# zip() below stops at the shorter list, so report a length difference
# instead of silently dropping the unmatched tail.
if len(data1) != len(data2):
    print(
        f"Warning: record counts differ ({len(data1)} vs {len(data2)}); "
        "unmatched trailing records are dropped."
    )

# Pair records by position: the merged target is the standard answer followed
# by the reasoning answer, separated by a single space.
merged_data = []
for d1, d2 in zip(data1, data2):
    if d1["input"].strip() != d2["input"].strip():
        print("Warning: Mismatch in inputs!")
    merged_entry = {
        "input": d1["input"].strip(),
        "output": d1["output"].strip() + " " + d2["output"].strip()
    }
    merged_data.append(merged_entry)

# Shuffle with a fixed seed so the train/val/test split (and therefore the
# saved files and the reported test metrics) is reproducible across runs.
random.Random(42).shuffle(merged_data)

# Split 80% / 10% / remainder
train_size = int(0.8 * len(merged_data))
val_size = int(0.1 * len(merged_data))

train_data = merged_data[:train_size]
val_data = merged_data[train_size:train_size + val_size]
test_data = merged_data[train_size + val_size:]

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

# Save merged datasets
train_dataset.to_json(os.path.join(SAVE_DIR, "train.json"))
val_dataset.to_json(os.path.join(SAVE_DIR, "val.json"))
test_dataset.to_json(os.path.join(SAVE_DIR, "test.json"))
print(f"Datasets saved in {SAVE_DIR}")

# ================================
# 5️⃣ Load Qwen1.5 Model and Tokenizer
# ================================
from transformers import BitsAndBytesConfig

model_name = "Qwen/Qwen1.5-0.5B"  # publicly available checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Request 4-bit loading through an explicit quantization config; passing
# load_in_4bit directly to from_pretrained is the deprecated spelling.
# Only load_in_4bit is set, so the remaining quantization options keep
# their library defaults.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
)

# ================================
# 6️⃣ Apply LoRA
# ================================
from peft import prepare_model_for_kbit_training

# The base model is loaded in 4-bit, so run PEFT's k-bit preparation step
# before attaching adapters. Gradient checkpointing is left off so the
# memory/speed profile of training stays as it was.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

# Rank-16 LoRA adapters on the attention query/value projections only.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# ================================
# 7️⃣ Tokenize Dataset
# ================================
def tokenize_fn(examples):
    """Tokenize a batch of records into fixed-length causal-LM training rows.

    Each row is the text ``input + EOS + output + EOS``, truncated/padded to
    512 tokens. The first EOS separates prompt from answer; the trailing EOS
    gives the model an explicit end-of-answer target.

    Args:
        examples: a batch mapping with parallel ``"input"`` and ``"output"``
            lists of strings (as passed by ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which every padding position is replaced by -100.
    """
    eos = tokenizer.eos_token
    texts = [i + eos + o + eos for i, o in zip(examples["input"], examples["output"])]
    tokenized = tokenizer(texts, truncation=True, padding="max_length", max_length=512)
    # Mask by attention_mask rather than by pad-token id: -100 is the label
    # value the loss ignores, and masking by id would also hide real EOS
    # tokens if the tokenizer reuses the same id for padding.
    tokenized["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Run the same batched tokenization over all three splits.
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize_fn, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# ================================
# 8️⃣ Training Arguments
# ================================
# Hyper-parameters collected in one mapping, then handed to TrainingArguments.
_training_config = {
    # where checkpoints are written
    "output_dir": SAVE_DIR,
    # optimisation: 1 sample per step, gradients accumulated over 4 steps
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "num_train_epochs": 3,
    "learning_rate": 3e-4,
    "fp16": True,
    # evaluation / logging / checkpoint cadence
    "eval_strategy": "steps",
    "eval_steps": 100,
    "logging_steps": 50,
    "save_strategy": "steps",
    "save_total_limit": 3,
    # no external experiment tracker (disables wandb)
    "report_to": "none",
}
training_args = TrainingArguments(**_training_config)

# ================================
# 9️⃣ Trainer
# ================================
# Wire the LoRA-wrapped model, the arguments and the tokenized splits together.
_trainer_inputs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer = Trainer(**_trainer_inputs)

# ================================
# 🔟 Fine-Tune
# ================================
trainer.train()

# Persist the trained model first, then the tokenizer, into one folder.
final_model_dir = os.path.join(SAVE_DIR, "final_model")
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_dir)
print(f"Final model saved to {SAVE_DIR}/final_model")

# ================================
# 1️⃣1️⃣ Evaluate on Test Set
# ================================
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("sacrebleu")

predictions = []
# Materialise as a plain list so it can be zipped/indexed by the metric code.
references = list(test_dataset["output"])

# Training rows are "input + EOS + output", so the prompt must end with the
# same EOS separator for the model to continue with the answer.
prompt_suffix = tokenizer.eos_token

# Switch to inference mode: the model is left in train mode after training,
# which would keep LoRA dropout active during generation.
model.eval()

for text in test_dataset["input"]:
    # Keep the whole encoding so generate() also receives the attention mask.
    inputs = tokenizer(text + prompt_suffix, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=128)
    # generate() returns prompt + continuation; score only the continuation,
    # otherwise every prediction would start with a copy of its prompt.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = output_ids[0][prompt_length:]
    pred_text = tokenizer.decode(new_token_ids, skip_special_tokens=True)
    predictions.append(pred_text)

# Compute ROUGE
# The `evaluate` ROUGE metric returns plain aggregated scores keyed by name
# (no `.mid.fmeasure` wrapper as in the old `datasets.load_metric` API).
rouge_results = rouge_metric.compute(predictions=predictions, references=references)
rouge_l = float(rouge_results["rougeL"])
print("ROUGE-L:", rouge_l)

# Compute BLEU (sacrebleu expects a list of reference lists per prediction)
bleu_results = bleu_metric.compute(predictions=predictions, references=[[r] for r in references])
bleu_score = float(bleu_results["score"])
print("BLEU:", bleu_score)

# Optional: simple accuracy (if output is exact match)
# Guard the empty case so metrics.json never contains NaN.
if predictions:
    accuracy = float(np.mean([pred.strip() == ref.strip() for pred, ref in zip(predictions, references)]))
else:
    accuracy = 0.0
print("Exact match accuracy:", accuracy)

# Save all metrics (plain Python floats so they serialise as valid JSON)
metrics = {
    "ROUGE-L": rouge_l,
    "BLEU": bleu_score,
    "ExactMatch": accuracy
}

with open(os.path.join(SAVE_DIR, "metrics.json"), "w") as f:
    json.dump(metrics, f, indent=2)

print(f"Metrics saved to {SAVE_DIR}/metrics.json")

# ================================
# 1️⃣2️⃣ Plot Training Loss
# ================================
# Only log entries that carry a "loss" key are plotted.
logs = trainer.state.log_history
loss_entries = [entry for entry in logs if "loss" in entry]
steps = [entry["step"] for entry in loss_entries]
losses = [entry["loss"] for entry in loss_entries]

# Draw the curve, label it, then write it to disk before showing it.
plt.plot(steps, losses)
plt.xlabel("Steps")
plt.ylabel("Training Loss")
plt.title("Training Loss vs Steps")
loss_plot_path = os.path.join(SAVE_DIR, "training_loss.png")
plt.savefig(loss_plot_path)
plt.show()
print(f"Training loss plot saved to {SAVE_DIR}/training_loss.png")



  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0m^C


KeyboardInterrupt: 

In [None]:
# =========================================
# 1. Install dependencies
# =========================================
!pip install -q transformers datasets peft accelerate evaluate tensorboard wandb

# =========================================
# 2. Hugging Face Login
# =========================================
from huggingface_hub import notebook_login
notebook_login()   # Enter your HF token here

# =========================================
# 3. Imports
# =========================================
import json
import torch
from datasets import Dataset, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import evaluate
import os

# =========================================
# 4. Mount Google Drive
# =========================================
from google.colab import drive
drive.mount('/content/drive')

SAVE_DIR = "/content/drive/MyDrive/qwen_finetune_colab"
os.makedirs(SAVE_DIR, exist_ok=True)

# =========================================
# 5. Merge datasets
# =========================================
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: location of a UTF-8 text file holding one JSON value per line.

    Returns:
        A list with one parsed value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines are not records; json.loads would
        # raise on them.
        return [json.loads(line) for line in f if line.strip()]

# Read both source files, in order.
file1, file2 = [
    load_jsonl(source_path)
    for source_path in (
        "/content/drive/MyDrive/data/file1.jsonl",
        "/content/drive/MyDrive/data/file2.jsonl",
    )
]

# Concatenate the records (file1 first), then hold out 10% with a fixed seed.
merged = [*file1, *file2]
dataset = Dataset.from_list(merged).train_test_split(test_size=0.1, seed=42)

# =========================================
# 6. Load tokenizer and model
# =========================================
MODEL_ID = "Qwen/Qwen2.5-1.5B"   # ✅ Correct model ID

# Fast tokenizer for the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# Weights are loaded in bfloat16 with automatic device placement.
_model_load_options = {
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_model_load_options)

# =========================================
# 7. Tokenization
# =========================================
def preprocess(example):
    """Build and tokenize ``instruction + " " + input + " " + output``.

    Works for both a single record (string fields) and a batch as passed by
    ``Dataset.map(batched=True)`` (each field is a list of strings); the
    batched form previously failed because a list cannot be concatenated
    with a string.

    Args:
        example: mapping with ``"instruction"``, ``"input"`` and ``"output"``
            entries, each either a string or a list of strings.

    Returns:
        The tokenizer output, truncated/padded to 512 tokens.
    """
    instructions = example["instruction"]
    inputs = example["input"]
    outputs = example["output"]
    if isinstance(instructions, str):
        text = instructions + " " + inputs + " " + outputs
    else:
        # Batched call: join the three fields row by row.
        text = [" ".join(parts) for parts in zip(instructions, inputs, outputs)]
    return tokenizer(text, truncation=True, padding="max_length", max_length=512)

# Drop the raw columns so only the tokenizer's output columns remain.
_raw_columns = dataset["train"].column_names
tokenized = dataset.map(
    preprocess,
    batched=True,
    remove_columns=_raw_columns,
)

# =========================================
# 8. Metrics
# =========================================
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute next-token accuracy over the positions that carry a label.

    Args:
        eval_pred: a ``(logits, labels)`` pair. Assumes logits are shaped
            (batch, seq, vocab) and labels (batch, seq), with -100 marking
            positions to ignore — TODO confirm against the collator in use.

    Returns:
        The dict produced by the accuracy metric, or ``{"accuracy": 0.0}``
        when no position carries a label.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]
    # A causal LM's logits at position i predict the token at i + 1, so drop
    # the last prediction and the first label to align them.
    preds = logits.argmax(-1)[:, :-1]
    targets = labels[:, 1:]
    # Boolean indexing both removes ignored positions and flattens to 1-D,
    # which is the shape the accuracy metric expects.
    keep = targets != -100
    if not keep.any():
        return {"accuracy": 0.0}
    return metric.compute(predictions=preds[keep], references=targets[keep])

# =========================================
# 9. Training Arguments
# =========================================
# NOTE(review): the model is loaded with torch_dtype=torch.bfloat16 while
# fp16=True below requests float16 mixed precision — confirm this combination
# is intended for the target GPU before a long run.
training_args = TrainingArguments(
    output_dir=SAVE_DIR,
    overwrite_output_dir=True,
    # `eval_strategy` is the current keyword name (the first cell of this
    # notebook already uses it); `evaluation_strategy` is the old spelling.
    eval_strategy="steps",
    save_strategy="steps",
    # load_best_model_at_end needs checkpoints to line up with evaluations,
    # so save_steps and eval_steps are kept equal.
    save_steps=500,
    eval_steps=500,
    logging_dir=f"{SAVE_DIR}/logs",
    logging_steps=100,
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    fp16=True,                       # ✅ Mixed precision
    gradient_checkpointing=True,     # ✅ Save memory
    push_to_hub=True,                # ✅ Auto upload to Hugging Face
    hub_model_id="your-username/qwen2.5-1.5B-finetuned",  # replace with your HF repo
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to=["wandb", "tensorboard"],  # ✅ Logs to WandB & TensorBoard
    logging_first_step=True
)

# =========================================
# 10. Data Collator
# =========================================
# Causal-LM collation (mlm=False, i.e. no masked-language-modelling objective).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# =========================================
# 11. Trainer
# =========================================
# The held-out "test" split produced by train_test_split doubles as the
# evaluation set during training.
_trainer_inputs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**_trainer_inputs)

# =========================================
# 12. Train
# =========================================
# Run the fine-tuning loop configured by training_args.
trainer.train()

# =========================================
# 13. Save & Push
# =========================================
# Write the model and tokenizer into SAVE_DIR (the Drive folder created
# earlier), then upload to the Hub repo named by hub_model_id.
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
# NOTE(review): hub_model_id is still the "your-username/..." placeholder —
# replace it before running, otherwise this upload presumably fails.
trainer.push_to_hub()
