In [15]:
import os
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset, concatenate_datasets
from trl import SFTTrainer, SFTConfig  # Import SFTTrainer from trl
from peft import LoraConfig, get_peft_model

In [16]:
# Notebook-wide configuration: dataset location, per-phase output folders, RNG seed.
SEED = 42
DATA_DIR = "../dataset/processed-IN-Ext"
RESULT_DIR = dict(
    full="../results_full",  # output_dir of the first training phase
    lora="../results_lora",  # output_dir of the LoRA phase
)

In [2]:
!pip install dataset trl peft

Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading SQLAlchemy-1.4.54-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting alembic>=0.6.2 (from dataset)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting datasets>=3.0.0 (from trl)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=3.0.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=3.0.0->trl)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=3.0.0->trl)
  Down

In [17]:

import os

from huggingface_hub import login

# SECURITY: never hardcode an access token in a notebook. The token that was
# previously embedded in this cell must be treated as leaked and revoked.
# The token is read from the HF_TOKEN environment variable; when it is unset,
# login(token=None) falls back to an interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Single source of truth: derive the directory from DATA_DIR instead of repeating
# the literal. Joining with "" appends the trailing separator, so on POSIX the
# value is identical to the previous hard-coded string.
preprocessed_data_dir = os.path.join(DATA_DIR, "")

In [6]:
import torch, os, subprocess, re, json, sys
print("torch sees CUDA", torch.version.cuda)
!nvcc --version | head -n 1

torch sees CUDA 12.4
nvcc: NVIDIA (R) Cuda compiler driver


In [7]:
# Replace whatever bitsandbytes build is installed with a pinned release.
# --no-cache-dir disables pip's cache for this install.
!pip uninstall -y bitsandbytes
!pip install --no-cache-dir --upgrade bitsandbytes==0.45.5

[0mCollecting bitsandbytes==0.45.5
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m185.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [9]:
# Run the bitsandbytes module as a script; it prints a report of the CUDA setup it detects.
!python -m bitsandbytes

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++ BUG REPORT INFORMATION ++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++ OTHER +++++++++++++++++++++++++++
CUDA specs: CUDASpecs(highest_compute_capability=(8, 0), cuda_version_string='124', cuda_version_tuple=(12, 4))
PyTorch settings found: CUDA_VERSION=124, Highest Compute Capability: (8, 0).
To manually override the PyTorch CUDA version please see: https://github.com/TimDettmers/bitsandbytes/blob/main/docs/source/nonpytorchcuda.mdx
The directory listed in your path is found to be non-existent: /usr/local/lib/python3.11/dist-packages/cv2/../../lib64
The directory listed in your path is found to be non-existent: /sys/fs/cgroup/memory.events /var/colab/cgroup/jupyter-children/memory.events
The directory listed in your path is found to be non-existent: //172.28.0.1
The directory listed in your path is found to be non-existent: 8013
The directory liste

In [13]:
import os

# Make the CUDA 12.4 runtime libraries discoverable for the rest of the session.
#
# The previous `!export LD_LIBRARY_PATH=...` ran in a throw-away subshell, so the
# variable was gone as soon as that line finished. Appending to ~/.bashrc added a
# duplicate line on every run and did not change the running kernel's environment.
# Updating os.environ changes the kernel process itself, and every later
# `!command` / subprocess inherits it.
#
# NOTE(review): the dynamic loader of an already-running process normally reads
# LD_LIBRARY_PATH only at start-up, so in-kernel dlopen() calls may still need an
# absolute path — confirm if a library fails to load from Python.
CUDA_LIB_DIR = "/usr/local/cuda-12.4/targets/x86_64-linux/lib"

_ld_paths = [p for p in os.environ.get("LD_LIBRARY_PATH", "").split(os.pathsep) if p]
if CUDA_LIB_DIR not in _ld_paths:  # idempotent: re-running the cell adds nothing
    _ld_paths.append(CUDA_LIB_DIR)
os.environ["LD_LIBRARY_PATH"] = os.pathsep.join(_ld_paths)


In [3]:
# Verify the quantisation stack in three steps: bitsandbytes imports, the CUDA
# runtime shared object can be loaded, and transformers reports bitsandbytes
# as available.
import ctypes
import os
import bitsandbytes as bnb

print("bitsandbytes version:", bnb.__version__)

# Raises OSError when the shared object cannot be located.
ctypes.CDLL("libcudart.so")
print("✓  libcudart loaded — CUDA runtime visible")

from transformers.quantizers.quantizer_bnb_8bit import is_bitsandbytes_available
print("transformers sees bitsandbytes:", is_bitsandbytes_available())


bitsandbytes version: 0.45.5
✓  libcudart loaded — CUDA runtime visible
transformers sees bitsandbytes: True


In [9]:
def load_dataset(
    jsonl_file,
    system_prompt="Summarize the following legal text.",
    max_input_chars=10000,
) -> "Dataset":
    """
    Load a preprocessed JSONL file and render each record as one prompt string.

    Every non-blank line of ``jsonl_file`` must be a JSON object with the keys
    ``"judgement"`` and ``"summary"``. Each record becomes::

        ### Instruction: <system_prompt>

        ### Input:
        <judgement, stripped, cut to max_input_chars characters>

        ### Response:
        <summary, stripped>

    Args:
        jsonl_file: Path to a UTF-8 encoded JSON Lines file.
        system_prompt: Instruction placed on the first line of every example.
        max_input_chars: Maximum number of judgement characters kept (counted
            after stripping). ``None`` disables truncation.

    Returns:
        A ``Dataset`` with a single ``"text"`` column, one row per record.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"judgement"`` or ``"summary"``.

    Note:
        The name is the same as ``datasets.load_dataset``; importing that
        function into this namespace would replace this helper (or vice versa).
    """
    texts = []
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. an extra newline at end of file), which
            # json.loads would otherwise reject.
            if not line.strip():
                continue
            item = json.loads(line)
            judgement = item["judgement"].strip()[:max_input_chars]
            summary = item["summary"].strip()
            # Same layout as before; the outer strip() drops the trailing newline.
            texts.append(
                (
                    f"### Instruction: {system_prompt}\n\n"
                    f"### Input:\n{judgement}\n\n"
                    f"### Response:\n{summary}\n"
                ).strip()
            )

    # Create a dataset with a single "text" column
    return Dataset.from_dict({"text": texts})

In [None]:
# Resolve both summary files inside the preprocessed directory, then load each
# of them with the notebook's load_dataset helper.
train_file_A1, train_file_A2 = (
    os.path.join(preprocessed_data_dir, file_name)
    for file_name in ("full_summaries_A1.jsonl", "full_summaries_A2.jsonl")
)
train_dataset_A1, train_dataset_A2 = map(load_dataset, (train_file_A1, train_file_A2))

In [None]:
# Hold out a validation split so the LoRA evaluation cell has an `eval_data` to
# use: it passes `eval_dataset=eval_data`, but nothing in the visible notebook
# ever defined that name.
# Each source is split separately with the same seed before concatenation.
# NOTE(review): if A1 and A2 hold summaries of the same judgements in the same
# order, an identical seed and size should send the same documents to both eval
# halves and so avoid train/eval leakage — confirm against the data.
EVAL_FRACTION = 0.1

_splits = [
    ds.train_test_split(test_size=EVAL_FRACTION, seed=SEED)
    for ds in (train_dataset_A1, train_dataset_A2)
]
train_data = concatenate_datasets([s["train"] for s in _splits])
eval_data = concatenate_datasets([s["test"] for s in _splits])

In [18]:
import torch, os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "meta-llama/Llama-2-7b-hf"

# 8-bit loading configuration for bitsandbytes.
bnb_cfg = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)

# The end-of-sequence token doubles as the padding token.
tok = AutoTokenizer.from_pretrained(model_name)
tok.pad_token = tok.eos_token

# NOTE(review): the training cell that consumes model_full calls itself a
# "full fine-tune", yet the weights are loaded 8-bit quantised here; the
# transformers Trainer is expected to refuse training a purely quantised model
# without adapters — confirm before relying on that phase.
model_full = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_cfg, device_map="auto"
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
# Phase 1 training configuration. Needs `train_data`, `model_full` and `tok`
# from the earlier cells to exist in the kernel.
full_cfg = SFTConfig(
    output_dir=RESULT_DIR["full"],
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=50,
    logging_steps=50,
    learning_rate=5e-3,  # NOTE(review): unusually high for fine-tuning a 7B model — confirm
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    group_by_length=True,
    # The plain "constant" schedule has no warm-up phase, so warmup_ratio above
    # was silently ignored; this variant honours it.
    lr_scheduler_type="constant_with_warmup",
    report_to="tensorboard",
    dataset_text_field="text",
    max_seq_length=4096,
    seed=SEED,  # use the notebook-wide seed instead of leaving SEED unused
)

trainer_full = SFTTrainer(
    model=model_full,
    args=full_cfg,
    train_dataset=train_data,
    # trl 0.17 (the version the install cell resolved to) takes the tokenizer
    # as `processing_class`; the former `tokenizer=` keyword is not accepted.
    processing_class=tok,
)

print("=== Phase 1: full fine-tune (2 epochs) ===")
trainer_full.train()

# Write the final weights and tokenizer to the root of the output directory:
# the LoRA phase reloads the model from RESULT_DIR["full"] with from_pretrained,
# and save_state() on its own only stores the trainer state.
trainer_full.save_model(RESULT_DIR["full"])
tok.save_pretrained(RESULT_DIR["full"])
trainer_full.save_state()


NameError: name 'train_data' is not defined

In [None]:
from peft import prepare_model_for_kbit_training

# Phase 2 starts from whatever phase 1 wrote to RESULT_DIR["full"], reloaded
# with the same 8-bit quantisation config.
model_lora = AutoModelForCausalLM.from_pretrained(
    RESULT_DIR["full"],
    quantization_config=bnb_cfg,
    device_map="auto",
)

# Quantised base models should go through PEFT's preparation step before
# adapters are attached and trained (see the PEFT quantisation guide).
model_lora = prepare_model_for_kbit_training(model_lora)

lora_cfg = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
# No target_modules are given, so the choice of adapted layers is left to PEFT.
model_lora = get_peft_model(model_lora, lora_cfg)
model_lora.print_trainable_parameters()

In [13]:
# Metric dependencies: `evaluate` is imported by the evaluation cell, which loads its "rouge" metric.
!pip install evaluate rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=f41d747ba4abfa9c682276992a7ced374d1a95ab1b43b0653f0d1e28ad80bbc2
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import evaluate
# Load the ROUGE metric once; compute_metrics_fn reads this module-level object.
rouge = evaluate.load("rouge")
def compute_metrics_fn(eval_preds):
    """
    Turn a trainer evaluation result into ROUGE-1/2/L F-measures on a 0-100 scale.

    Args:
        eval_preds: Pair ``(predictions, labels)``. ``predictions`` may be token
            ids of shape ``(batch, seq)``, raw logits of shape
            ``(batch, seq, vocab)``, or a tuple whose first element is one of
            those. ``labels`` are token ids in which ``-100`` marks positions
            to ignore.

    Returns:
        dict with the float entries ``"rouge1"``, ``"rouge2"`` and ``"rougeL"``.

    Notes:
        Reads the module-level ``tok`` (tokenizer) and ``rouge`` (metric).
        When logits are passed, ids are taken by argmax, so the decoded text is
        a teacher-forced prediction rather than free-running generation.
    """
    import numpy as np

    def _f1(score):
        # Depending on the metric implementation a score is either a plain
        # float or an aggregate object exposing `.mid.fmeasure`; accept both.
        mid = getattr(score, "mid", score)
        return float(getattr(mid, "fmeasure", mid))

    gen_ids, labels = eval_preds
    if isinstance(gen_ids, tuple):  # model returned extra outputs next to the logits
        gen_ids = gen_ids[0]
    gen_ids = np.asarray(gen_ids)
    labels = np.asarray(labels)
    if gen_ids.ndim == 3:  # (batch, seq, vocab) logits -> token ids
        gen_ids = gen_ids.argmax(axis=-1)

    # -100 is an ignore marker, not a vocabulary id: swap it for the pad token
    # so decoding works and skip_special_tokens can drop it.
    pad_id = tok.pad_token_id
    gen_ids = np.where(gen_ids != -100, gen_ids, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    preds = tok.batch_decode(gen_ids, skip_special_tokens=True)
    refs = tok.batch_decode(labels, skip_special_tokens=True)
    res = rouge.compute(predictions=preds, references=refs)
    return {
        "rouge1": _f1(res["rouge1"]) * 100,
        "rouge2": _f1(res["rouge2"]) * 100,
        "rougeL": _f1(res["rougeL"]) * 100,
    }

# Phase 2 (LoRA) training configuration. Needs `model_lora`, `train_data`,
# `eval_data`, `tok` and `compute_metrics_fn` to exist in the kernel.
lora_cfg_sft = SFTConfig(
    output_dir=RESULT_DIR["lora"],
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,  # keep evaluation as small as training at 4096 tokens
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=50,
    logging_steps=50,
    learning_rate=5e-3,  # NOTE(review): unusually high for LoRA on a 7B model — confirm
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    group_by_length=True,
    # The plain "constant" schedule has no warm-up phase, so warmup_ratio above
    # was silently ignored; this variant honours it.
    lr_scheduler_type="constant_with_warmup",
    report_to="tensorboard",
    dataset_text_field="text",
    max_seq_length=4096,
    seed=SEED,
    # Without an evaluation strategy the trainer never evaluates during train(),
    # so eval_dataset / compute_metrics would go unused.
    eval_strategy="epoch",
    # predict_with_generate / generation_max_length were dropped: they are
    # Seq2SeqTrainingArguments fields and SFTConfig rejects them as unknown
    # keyword arguments.
)


def _logits_to_token_ids(logits, labels):
    """Reduce logits to argmax token ids so evaluation does not accumulate vocab-sized tensors."""
    if isinstance(logits, tuple):  # some models return extra outputs next to the logits
        logits = logits[0]
    return logits.argmax(dim=-1)


trainer_lora = SFTTrainer(
    model=model_lora,  # already a PEFT model, so no peft_config is passed again
    args=lora_cfg_sft,
    train_dataset=train_data,
    eval_dataset=eval_data,
    # trl 0.17 (the version the install cell resolved to) takes the tokenizer
    # as `processing_class`; the former `tokenizer=` keyword is not accepted.
    processing_class=tok,
    compute_metrics=compute_metrics_fn,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)

print("=== Phase 2: LoRA fine-tune (3 epochs) w/ evaluation ===")
trainer_lora.train()
trainer_lora.save_state()

# ─── 5. Save final adapter + tokenizer ────────────────────────────────────────
print("Saving final LoRA adapters…")
model_lora.save_pretrained(RESULT_DIR["lora"])
tok.save_pretrained(RESULT_DIR["lora"])
print("Done.")