<a href="https://colab.research.google.com/github/ritusingh-29/Hybrid-Legal-Document-Summarization/blob/main/Hybrid_Sbert_Part1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Install the third-party packages this notebook relies on.
# The leading "!" is the IPython/Colab shell escape, so these lines only work
# inside a notebook cell; --quiet keeps pip's output out of the cell.
print("Installing libraries...")
!pip install transformers datasets pandas torch tqdm --quiet
# bert-extractive-summarizer provides the `summarizer` package imported below.
!pip install bert-extractive-summarizer --quiet
!pip install sentence-transformers --quiet

import os
import datasets
from datasets import load_dataset
from summarizer.sbert import SBertSummarizer
from tqdm.notebook import tqdm
from google.colab import drive

print("--- All libraries installed and imported successfully! ---")

# --- STEP 1: mount Google Drive so the condensed dataset outlives this session ---
print("\n[STEP 1] Connecting to your Google Drive...")
print("Please authorize this Colab session. You MUST sign in to YOUR Google account.")

try:
    # force_remount=True re-mounts even when an earlier mount is still present.
    drive.mount('/content/drive', force_remount=True)
except Exception as e:
    print(f"Error mounting drive: {e}")
    print("Please fix any authorization issues and run this cell again.")
    # Stop here. If execution continued, os.makedirs below would create the
    # "Drive" path on the VM's local disk and the (hours-long) condensation
    # results would be written to storage that is not actually Google Drive.
    raise
else:
    print("\n--- Google Drive successfully mounted! ---")

# Destination folder (inside the mounted Drive) for the condensed splits.
save_path = "/content/drive/MyDrive/Hybrid_Dataset_Saved_SBERT"
os.makedirs(save_path, exist_ok=True)
print(f"Data will be permanently saved to: {save_path}")

# --- STEP 2: download BillSum and keep handles to the two splits used below ---
print("\n[STEP 2] Loading the BillSum dataset...")
ds = load_dataset("FiscalNote/billsum")
train_ds, test_ds = ds['train'], ds['test']
print(f"Loaded {len(train_ds)} training and {len(test_ds)} test examples.")

# --- STEP 3: build the extractive summarizer used by create_hybrid_input ---
print("\n[STEP 3] Loading the SBERT 'Highlighter' model...")

# Name of the sentence-transformers checkpoint handed to the summarizer.
sbert_checkpoint = 'paraphrase-MiniLM-L6-v2'
extractive_model = SBertSummarizer(sbert_checkpoint)
print("SBERT Extractive model loaded successfully.")

def create_hybrid_input(long_document, num_sentences=20):
    """Condense a document with the extractive model, falling back to the input.

    Calls the module-level ``extractive_model`` on ``long_document`` and
    returns what it produces. The function never raises for an ordinary
    failure of the extractive step, so one bad document cannot abort a
    long-running batch loop.

    Args:
        long_document: Full text to condense.
        num_sentences: Number of sentences requested from the extractive model.

    Returns:
        The condensed text, or ``long_document`` unchanged when the input is
        empty, the extractive step raises, or it yields an empty result.
    """
    # Nothing to condense; skip the model call entirely.
    if not long_document:
        return long_document

    try:
        condensed = extractive_model(long_document, num_sentences=num_sentences)
    except Exception as exc:
        # Best-effort by design: report the problem and keep the full text so
        # the output stays aligned one-to-one with the input documents.
        print(f"Extractive step failed ({type(exc).__name__}: {exc}); keeping the original text.")
        return long_document

    # An empty result would produce a useless example; keep the original instead.
    return condensed or long_document


# --- STEP 4: condense every training document ---
# This progress message appears in the notebook's recorded output but was
# missing from the code, which jumped straight from STEP 3 to STEP 5.
print("\n[STEP 4] Starting condensation of TRAINING data. This will take several hours...")
original_train_texts = train_ds['text']
original_train_summaries = train_ds['summary']

# Append one result at a time so that an interrupted run still leaves the
# documents processed so far in hybrid_train_texts.
hybrid_train_texts = []
for doc in tqdm(original_train_texts, desc="Condensing train texts"):
    hybrid_train_texts.append(create_hybrid_input(doc))
print(f"Created {len(hybrid_train_texts)} hybrid training texts.")

# --- STEP 5: condense every test document ---
print("\n[STEP 5] Starting condensation of TEST data. This will take ~30-40 minutes...")
original_test_texts = test_ds['text']
original_test_summaries = test_ds['summary']

# extend() consumes the generator item by item, so results accumulate in the
# list incrementally, in the same order as the source documents.
hybrid_test_texts = []
hybrid_test_texts.extend(
    create_hybrid_input(doc)
    for doc in tqdm(original_test_texts, desc="Condensing test texts")
)
print(f"Created {len(hybrid_test_texts)} hybrid test texts.")

print("\n--- Condensation complete! ---")

# --- STEP 6: pair condensed texts with the original summaries and persist them ---
print(f"\n[STEP 6] Saving your new condensed SBERT dataset to {save_path}...")

# Column layout of each split: condensed text in, reference summary out.
train_columns = {
    'text': hybrid_train_texts,
    'summary': original_train_summaries,
}
test_columns = {
    'text': hybrid_test_texts,
    'summary': original_test_summaries,
}

# Build both Dataset objects first; nothing is written until both exist.
hybrid_train_dataset_obj = datasets.Dataset.from_dict(train_columns)
hybrid_test_dataset_obj = datasets.Dataset.from_dict(test_columns)

# Each split goes to its own sub-folder under save_path.
train_dir = f"{save_path}/train"
test_dir = f"{save_path}/test"
hybrid_train_dataset_obj.save_to_disk(train_dir)
hybrid_test_dataset_obj.save_to_disk(test_dir)

print("\n\n--- SUCCESS! ---")
print("Your condensed data (created with SBERT) is now permanently saved.")
print("You can safely close this notebook. You are ready for the training phase.")

Installing libraries...
--- All libraries installed and imported successfully! ---

[STEP 1] Connecting to your Google Drive...
Please authorize this Colab session. You MUST sign in to YOUR Google account.
Mounted at /content/drive

--- Google Drive successfully mounted! ---
Data will be permanently saved to: /content/drive/MyDrive/Hybrid_Dataset_Saved_SBERT

[STEP 2] Loading the BillSum dataset...
Loaded 18949 training and 3269 test examples.

[STEP 3] Loading the SBERT 'Highlighter' model...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SBERT Extractive model loaded successfully.

[STEP 4] Starting condensation of TRAINING data. This will take several hours...


Condensing train texts:   0%|          | 0/18949 [00:00<?, ?it/s]

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Created 18949 hybrid training texts.

[STEP 5] Starting condensation of TEST data. This will take ~30-40 minutes...


Condensing test texts:   0%|          | 0/3269 [00:00<?, ?it/s]

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Created 3269 hybrid test texts.

--- Condensation complete! ---

[STEP 6] Saving your new condensed SBERT dataset to /content/drive/MyDrive/Hybrid_Dataset_Saved_SBERT...


Saving the dataset (0/1 shards):   0%|          | 0/18949 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3269 [00:00<?, ? examples/s]



--- SUCCESS! ---
Your condensed data (created with SBERT) is now permanently saved.
You can safely close this notebook. You are ready for the training phase.
