In [None]:

print("Installing libraries for bert-extractive-summarizer...")
!pip install transformers datasets pandas torch tqdm --quiet
!pip install bert-extractive-summarizer spacy --quiet
!python -m spacy download en_core_web_sm --quiet

import os
import datasets
from datasets import load_dataset
from summarizer import Summarizer
from tqdm.notebook import tqdm
from google.colab import drive

print("--- All libraries installed and imported successfully! ---")

# ---------------------------------------------------------------------------
# STEP 1: mount Google Drive so results outlive the Colab session.
# ---------------------------------------------------------------------------
print("\n[STEP 1] Connecting to your Google Drive...")
print("Please authorize this Colab session. You MUST sign in to YOUR Google account.")

try:
    drive.mount('/content/drive', force_remount=True)
except Exception as e:
    print(f"Error mounting drive: {e}")
    print("Please fix any authorization issues and run this cell again.")
    # Abort here. If execution continued, the os.makedirs() call below would
    # create the "Drive" folder on the temporary Colab disk, the multi-hour
    # condensation would run, and its output would be lost with the session.
    # The stray local folder would also make later mount attempts fail.
    raise
print("\n--- Google Drive successfully mounted! ---")

# Destination folder inside the mounted Drive; created on first run.
save_path = "/content/drive/MyDrive/My_ML_Project/Hybrid_Dataset_Saved_BERT"
os.makedirs(save_path, exist_ok=True)
print(f"Data will be permanently saved to: {save_path}")

# ---------------------------------------------------------------------------
# STEP 2: load the BillSum dataset and keep the train / test splits.
# ---------------------------------------------------------------------------
print("\n[STEP 2] Loading the BillSum dataset...")
ds = load_dataset("FiscalNote/billsum")
train_ds = ds['train']
test_ds = ds['test']
print(f"Loaded {len(train_ds)} training and {len(test_ds)} test examples.")

# ---------------------------------------------------------------------------
# STEP 3: load the extractive summarizer with its default configuration.
# ---------------------------------------------------------------------------
print("\n[STEP 3] Loading the BERT 'Highlighter' model...")

extractive_model = Summarizer()
print("BERT Extractive model loaded successfully.")

def create_hybrid_input(long_document, num_sentences=20):
    """Condense a document with the extractive model, falling back to the input.

    Calls the module-level ``extractive_model`` on ``long_document`` and
    returns its output. The original document is returned unchanged when
    there is nothing usable to return.

    Args:
        long_document: Text to condense.
        num_sentences: Value forwarded to ``extractive_model`` as
            ``num_sentences`` (defaults to 20).

    Returns:
        The model's output, or ``long_document`` itself when the input is
        empty / whitespace-only / not a string, when the model raises an
        ordinary exception, or when the model returns an empty result.

    Note:
        Only ``Exception`` subclasses are swallowed. ``KeyboardInterrupt`` and
        ``SystemExit`` propagate so a long-running loop over many documents
        can still be interrupted.
    """
    # Nothing to summarise: skip the model call entirely.
    if not isinstance(long_document, str) or not long_document.strip():
        return long_document

    try:
        condensed = extractive_model(long_document, num_sentences=num_sentences)
    except Exception:
        # Best-effort fallback for documents the model cannot handle.
        return long_document

    # An empty result would become an empty example; keep the source text instead.
    return condensed if condensed else long_document

def _require_drive_mounted(mount_point="/content/drive"):
    """Raise RuntimeError unless ``mount_point`` is an active mount point."""
    if not os.path.ismount(mount_point):
        raise RuntimeError(
            f"Google Drive is not mounted at {mount_point}. Anything written "
            "under it would land on the temporary Colab disk and be lost when "
            "the session ends. Mount Drive, then save the in-memory lists "
            "from a separate cell instead of re-running the condensation."
        )


# Fail fast: do not start hours of work if the results cannot be persisted.
_require_drive_mounted()

print("\n[STEP 4] Starting condensation of TRAINING data. This will take several hours...")

original_train_texts = train_ds['text']
original_train_summaries = train_ds['summary']
hybrid_train_texts = [
    create_hybrid_input(doc)
    for doc in tqdm(original_train_texts, desc="Condensing train texts")
]
print(f"Created {len(hybrid_train_texts)} hybrid training texts.")

print("\n[STEP 5] Starting condensation of TEST data. This will take ~30-40 minutes...")
original_test_texts = test_ds['text']
original_test_summaries = test_ds['summary']
hybrid_test_texts = [
    create_hybrid_input(doc)
    for doc in tqdm(original_test_texts, desc="Condensing test texts")
]
print(f"Created {len(hybrid_test_texts)} hybrid test texts.")

print("\n--- Condensation complete! ---")

print(f"\n[STEP 6] Saving your new condensed BERT dataset to {save_path}...")

# Check again in case the mount was lost while the loops above were running.
# If this raises, the condensed lists are still in kernel memory and can be
# saved from another cell once Drive is mounted.
_require_drive_mounted()

# Pair each condensed text with its original reference summary.
hybrid_train_dataset_obj = datasets.Dataset.from_dict({
    'text': hybrid_train_texts,
    'summary': original_train_summaries
})

hybrid_test_dataset_obj = datasets.Dataset.from_dict({
    'text': hybrid_test_texts,
    'summary': original_test_summaries
})

hybrid_train_dataset_obj.save_to_disk(f"{save_path}/train")
hybrid_test_dataset_obj.save_to_disk(f"{save_path}/test")

print("\n\n--- SUCCESS! ---")
print("Your condensed data (created with BERT) is now permanently saved.")
print("You can safely close this notebook.")

Installing libraries for bert-extractive-summarizer...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m110.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
--- All libraries installed and imported successfully! ---

[STEP 1] Connecting to your Google Drive...
Please authorize this Colab session. You MUST sign in to YOUR Google account.
Error mounting drive: Error: credential propagation was unsuccessful
Please fix any authorization issues and run this cell again.
Data will be permanently saved to: /content/drive/MyDrive/My_ML_Project/Hybrid_Dataset_Saved_BERT

[STEP 2] Loading the BillSum dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

data/ca_test-00000-of-00001.parquet:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

Loaded 18949 training and 3269 test examples.

[STEP 3] Loading the BERT 'Highlighter' model...


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BERT Extractive model loaded successfully.

[STEP 4] Starting condensation of TRAINING data. This will take several hours...


Condensing train texts:   0%|          | 0/18949 [00:00<?, ?it/s]

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Created 18949 hybrid training texts.

[STEP 5] Starting condensation of TEST data. This will take ~30-40 minutes...


Condensing test texts:   0%|          | 0/3269 [00:00<?, ?it/s]

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Created 3269 hybrid test texts.

--- Condensation complete! ---

[STEP 6] Saving your new condensed BERT dataset to /content/drive/MyDrive/My_ML_Project/Hybrid_Dataset_Saved_BERT...


Saving the dataset (0/1 shards):   0%|          | 0/18949 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3269 [00:00<?, ? examples/s]



--- SUCCESS! ---
Your condensed data (created with BERT) is now permanently saved.
You can safely close this notebook.


In [None]:
# ==============================================================================
# STEP 1: Connect to your CORRECT Google Drive
# ==============================================================================
from google.colab import drive
import datasets
import os

print("Connecting to Google Drive...")
print("Please sign in to the CORRECT drive where you want to save the data.")

try:
    # This 'force_remount=True' will force it to ask you to sign in again
    drive.mount('/content/drive', force_remount=True)
except Exception as e:
    print(f"Error mounting drive: {e}")
    print("If the error says the mountpoint already contains files, an earlier run wrote")
    print("to /content/drive while Drive was NOT mounted. Move that local folder aside")
    print("(for example: !mv /content/drive /content/drive_local) and run this cell again.")
    # Stop here: saving now would write to the temporary Colab disk, not to
    # Google Drive, and the success message below would be false.
    raise
print("\n--- Google Drive successfully mounted! ---")

# ==============================================================================
# STEP 2: Define your save path and save the data
# ==============================================================================
# This is the path where the data will be saved (in the drive you just connected)
save_path = "/content/drive/MyDrive"
os.makedirs(save_path, exist_ok=True)
print(f"Data will be permanently saved to: {save_path}")

try:
    # 1. Create new Dataset objects from the lists still in memory
    hybrid_train_dataset_obj = datasets.Dataset.from_dict({
        'text': hybrid_train_texts,
        'summary': original_train_summaries
    })

    hybrid_test_dataset_obj = datasets.Dataset.from_dict({
        'text': hybrid_test_texts,
        'summary': original_test_summaries
    })

    # 2. Save the datasets to your Drive
    hybrid_train_dataset_obj.save_to_disk(f"{save_path}/train")
    hybrid_test_dataset_obj.save_to_disk(f"{save_path}/test")

    print("\n\n--- SUCCESS! ---")
    print("Your condensed data is now permanently saved to your correct Google Drive.")

except NameError:
    # The lists only exist if the condensation cell ran in this same kernel session.
    print("\nError: The lists 'hybrid_train_texts' or 'hybrid_test_texts' were not found in memory.")
    print("This means your Colab session may have already restarted. You might need to re-run the full script.")

Connecting to Google Drive...
Please sign in to the CORRECT drive where you want to save the data.
Error mounting drive: Mountpoint must not already contain files
Data will be permanently saved to: /content/drive/MyDrive


Saving the dataset (0/1 shards):   0%|          | 0/18949 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3269 [00:00<?, ? examples/s]



--- SUCCESS! ---
Your condensed data is now permanently saved to your correct Google Drive.
