<a href="https://colab.research.google.com/github/ritusingh-29/Hybrid-Legal-Document-Summarization/blob/main/Hybrid_Sbert_Part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================================================================
# 1. SETUP: Install all necessary libraries
# ==============================================================================
# NOTE: the `!pip` line below is an IPython shell escape, so this cell must be
# run from a notebook/IPython session; it is not valid plain-Python syntax.
# pandas, matplotlib and seaborn are installed here but not imported in this cell.
print("Installing libraries...")
!pip install transformers datasets pandas matplotlib seaborn rouge-score sacrebleu bert-score sentencepiece torch tqdm evaluate --quiet

# `os` builds checkpoint paths; `datasets` reloads the saved splits from disk.
import os
import datasets
import torch
from torch.utils.data import Dataset, DataLoader
# BART tokenizer/model plus the warmup + linear-decay learning-rate scheduler.
from transformers import (
    BartTokenizer, BartForConditionalGeneration,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
# NOTE(review): `load_dataset` is not referenced in the cells visible here.
from datasets import load_dataset
# `evaluate` loads the ROUGE / SacreBLEU / BERTScore metrics used after training.
import evaluate
# Notebook-flavoured progress bars.
from tqdm.notebook import tqdm
# Used to mount Google Drive; presumably only importable inside the Colab runtime.
from google.colab import drive

print("--- All libraries installed and imported successfully! ---")

Installing libraries...
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
--- All libraries installed and imported successfully! ---


In [None]:
# ==============================================================================
# 2. CONNECT TO YOUR GOOGLE DRIVE
# ==============================================================================
print("\n[STEP 1] Connecting to Google Drive...")
try:
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive successfully mounted!")
except Exception as mount_error:
    # Every later step reads from or writes to Drive, so report the cause and halt.
    print(f"Error mounting drive: {mount_error}")
    raise SystemExit("Google Drive mount failed. Please fix connection issues and restart.")

# ==============================================================================
# 3. LOAD YOUR PRE-PROCESSED HYBRID DATA
# ==============================================================================
# Drive folder that holds the saved "train" and "test" splits of the condensed
# dataset. Adjust this path if the data was saved somewhere else.
hybrid_data_path = "/content/drive/MyDrive/Hybrid_Dataset_Saved_SBERT"

print(f"\n[STEP 2] Loading your condensed data from: {hybrid_data_path}")
try:
    # Each split lives in its own sub-directory under the dataset root.
    loaded_hybrid_train_dataset = datasets.load_from_disk(hybrid_data_path + "/train")
    loaded_hybrid_test_dataset = datasets.load_from_disk(hybrid_data_path + "/test")
    print("--- Condensed data loaded successfully! ---")
    # Show the features and row count of the training split as a sanity check.
    print(loaded_hybrid_train_dataset)
except Exception as load_error:
    # Training cannot proceed without both splits, so explain and stop here.
    print(f"Error loading data: {load_error}")
    print("Please check that the path is correct and the data was saved properly.")
    raise SystemExit("Failed to load hybrid data.")


[STEP 1] Connecting to Google Drive...
Mounted at /content/drive
Google Drive successfully mounted!

[STEP 2] Loading your condensed data from: /content/drive/MyDrive/Hybrid_Dataset_Saved_SBERT
--- Condensed data loaded successfully! ---
Dataset({
    features: ['text', 'summary'],
    num_rows: 18949
})


In [None]:
# ==============================================================================
# 4. DATA PREPARATION CLASS (CORRECTED)
# ==============================================================================
class BillSumDataset(Dataset):
    """Tokenized (document, summary) pairs for sequence-to-sequence fine-tuning.

    Items are tokenized lazily on access, truncated/padded to fixed lengths and
    returned as 1-D tensors so that the default ``DataLoader`` collation can
    stack them into a batch.

    Args:
        docs: Indexable collection of source documents.
        summaries: Indexable collection of reference summaries, aligned
            index-by-index with ``docs``.
        tokenizer: Callable tokenizer exposing ``pad_token_id``. It is invoked
            with ``max_length``, ``truncation``, ``padding`` and
            ``return_tensors="pt"`` and is assumed to return tensors with a
            leading batch dimension of size 1.
        max_input_len: Fixed token length of every encoded document.
        max_output_len: Fixed token length of every encoded summary.
    """

    def __init__(self, docs, summaries, tokenizer,
                 max_input_len=1024,
                 max_output_len=128):
        self.docs = docs
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len

    def __len__(self) -> int:
        """Return the number of documents in the dataset."""
        return len(self.docs)

    def __getitem__(self, idx) -> dict:
        """Tokenize and return the example at position ``idx``.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` of length
            ``max_input_len`` and ``labels`` of length ``max_output_len``.
            Padding positions in ``labels`` are replaced by -100, the value
            skipped by the loss (PyTorch's default ``ignore_index``).
        """
        doc = str(self.docs[idx])
        summary = str(self.summaries[idx])

        inputs = self.tokenizer(
            doc,
            max_length=self.max_input_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        targets = self.tokenizer(
            summary,
            max_length=self.max_output_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # squeeze(0) removes only the leading batch dimension. A bare squeeze()
        # would also collapse a length-1 sequence into a 0-dim tensor and break
        # the (batch, seq_len) shape after collation.
        labels = targets["input_ids"].squeeze(0)
        # Mask padding so it does not contribute to the training loss.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels
        }

In [None]:

# ==============================================================================
# 5. MODEL SETUP, TRAINING, EVALUATION AND FINAL SAVE
# ==============================================================================
print("\n[STEP 3] Loading a fresh BART model and preparing for training...")

model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Wrap the condensed splits; tokenization happens lazily inside the dataset.
train_dataset = BillSumDataset(loaded_hybrid_train_dataset['text'], loaded_hybrid_train_dataset['summary'], tokenizer)
test_dataset = BillSumDataset(loaded_hybrid_test_dataset['text'], loaded_hybrid_test_dataset['summary'], tokenizer)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# --- Hyperparameters ---
batch_size = 8
epochs = 4
lr = 5e-5

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
optimizer = AdamW(model.parameters(), lr=lr)

# The scheduler is stepped once per batch, so the schedule spans every
# optimizer step of the run; the first 10% of those steps are warmup.
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
)

# Single root for every artifact written by this run (per-epoch checkpoints
# and the final model).
base_save_dir = "/content/drive/MyDrive/My_ML_Project/"
print(f"--- Setup complete. Using batch size {batch_size}. Ready for training. ---")

print(f"\n[STEP 4] Starting SBERT-HYBRID model training for {epochs} epochs on {device}...")
model.train()
for epoch in range(epochs):
    print(f"\n--- Epoch {epoch+1}/{epochs} ---")
    epoch_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Training Hybrid Epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing `labels` makes the model return the training loss directly.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            return_dict=True
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    print(f"Average training loss for epoch {epoch+1}: {avg_loss:.4f}")

    # Checkpoint after every epoch so an interrupted session loses at most
    # one epoch of work.
    print(f"--- Saving checkpoint for Epoch {epoch+1} ---")
    epoch_save_dir = os.path.join(base_save_dir, f"sbert_hybrid_bart_model_epoch_{epoch+1}")
    os.makedirs(epoch_save_dir, exist_ok=True)

    model.save_pretrained(epoch_save_dir)
    tokenizer.save_pretrained(epoch_save_dir)
    print(f"Model saved to {epoch_save_dir}")

print("\n--- Training Complete! ---")

print("\n[STEP 5] Starting Final Evaluation on the SBERT-HYBRID model...")
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")
bertscore = evaluate.load("bertscore")

model.eval()
preds, refs = [], []
test_loader = DataLoader(test_dataset, batch_size=4)

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating Hybrid Model"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"]

        # Cap generation at the same length the reference labels were
        # truncated to, instead of repeating the literal value here.
        gen_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=test_dataset.max_output_len,
            num_beams=4,
            early_stopping=True
        )

        batch_preds = tokenizer.batch_decode(gen_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        # Restore pad ids in place of the -100 loss mask so the labels can be
        # decoded. NOTE: references are rebuilt from the truncated label ids,
        # so metrics compare against at most `max_output_len` tokens of each
        # reference summary.
        labels[labels == -100] = tokenizer.pad_token_id
        batch_refs = tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        preds.extend(batch_preds)
        refs.extend(batch_refs)

# Compute metrics
rouge_scores = rouge.compute(predictions=preds, references=refs)
# SacreBLEU takes a list of reference lists (one inner list per prediction).
bleu_score = bleu.compute(predictions=preds, references=[[r] for r in refs])
bert_scores = bertscore.compute(predictions=preds, references=refs, lang="en")

print("\n\n=== FINAL EVALUATION RESULTS (SBERT-HYBRID MODEL) ===")
print("ROUGE:", rouge_scores)
print("BLEU:", bleu_score)
print(f"BERTScore F1 mean: {sum(bert_scores['f1']) / len(bert_scores['f1']):.4f}")

print("\n[STEP 6] Saving the FINAL fine-tuned SBERT-HYBRID model...")
try:
    # Derive the final directory from `base_save_dir` so checkpoints and the
    # final model always stay under the same root.
    output_dir = os.path.join(base_save_dir, "sbert_hybrid_bart_model_FINAL")
    os.makedirs(output_dir, exist_ok=True)

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"\nFinal model and tokenizer permanently saved at {output_dir}")
except Exception as e:
    # Deliberate best-effort: the per-epoch checkpoints above already hold the
    # trained weights, so a failed final save is reported but not fatal.
    print(f"Error saving to Drive: {e}. Model was not saved permanently.")

print("\n--- Hybrid Model Experiment Complete! ---")


[STEP 3] Loading a fresh BART model and preparing for training...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

--- Setup complete. Using batch size 8. Ready for training. ---

[STEP 4] Starting SBERT-HYBRID model training for 4 epochs on cuda...

--- Epoch 1/4 ---


Training Hybrid Epoch 1:   0%|          | 0/2369 [00:00<?, ?it/s]

Average training loss for epoch 1: 2.1751
--- Saving checkpoint for Epoch 1 ---




Model saved to /content/drive/MyDrive/My_ML_Project/sbert_hybrid_bart_model_epoch_1

--- Epoch 2/4 ---


Training Hybrid Epoch 2:   0%|          | 0/2369 [00:00<?, ?it/s]

Average training loss for epoch 2: 1.7309
--- Saving checkpoint for Epoch 2 ---
Model saved to /content/drive/MyDrive/My_ML_Project/sbert_hybrid_bart_model_epoch_2

--- Epoch 3/4 ---


Training Hybrid Epoch 3:   0%|          | 0/2369 [00:00<?, ?it/s]

Average training loss for epoch 3: 1.5622
--- Saving checkpoint for Epoch 3 ---
Model saved to /content/drive/MyDrive/My_ML_Project/sbert_hybrid_bart_model_epoch_3

--- Epoch 4/4 ---


Training Hybrid Epoch 4:   0%|          | 0/2369 [00:00<?, ?it/s]

Average training loss for epoch 4: 1.4586
--- Saving checkpoint for Epoch 4 ---
Model saved to /content/drive/MyDrive/My_ML_Project/sbert_hybrid_bart_model_epoch_4

--- Training Complete! ---

[STEP 5] Starting Final Evaluation on the SBERT-HYBRID model...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluating Hybrid Model:   0%|          | 0/818 [00:00<?, ?it/s]