## 1. Setup and Dependencies

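First, we install the libraries this notebook depends on: `transformers` (BART model and trainer), `datasets` (MeetingBank loading), `evaluate` and `rouge_score` (ROUGE scoring), and `accelerate`. Note that the install cell below does not clone a utility repository or install the SummerTime library used by `ResultsEval.py`; add those steps here if that script is needed.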

In [1]:
# ==============================================================================
## 1. Setup, Dependencies, and Directory Change
# ==============================================================================
# Setup Dependencies
!pip install transformers
!pip install datasets
!pip install rouge_score
!pip install evaluate
!pip install accelerate

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=291195a988054a9603a1b291ef0cad2223f4c3f175c4b4c2e94df1ca7a0f7994
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
import torch
import numpy as np
import evaluate
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import os

# --- Configuration ---
# Hub id of the checkpoint used for the tokenizer, the baseline model, and as
# the starting point for fine-tuning.
MODEL_CHECKPOINT = "facebook/bart-base"
# Transcripts are truncated/padded to this many tokens during preprocessing.
MAX_INPUT_LENGTH = 1024
# Token budget for tokenized reference summaries; also passed as `max_length`
# when generating summaries at prediction time.
MAX_TARGET_LENGTH = 128
# Output directory handed to the training arguments. The path is relative, so
# it resolves against whatever working directory is active when training runs.
OUTPUT_DIR = "./models/meetingbank"
# Per-device batch size for evaluation and prediction.
EVAL_BATCH_SIZE = 8
NUM_TRAIN_EPOCHS = 1 # Reduced for quick run. Increase for better results.

In [3]:
# Mount Google Drive (Colab-only: this import is unavailable outside Colab).
from google.colab import drive
drive.mount('/content/drive')

# Project folder on Drive. Every relative path used later in the notebook
# (e.g. ./models/meetingbank, ./data/meetingbank/, ./evaluation_output)
# resolves under this directory once the %cd below has run.
GDRIVE_ROOT =  "/content/drive/MyDrive/CS_685/youtube-video-summarization/"

# IPython magic: `$GDRIVE_ROOT` is expanded from the Python variable above.
%cd $GDRIVE_ROOT

Mounted at /content/drive
/content/drive/MyDrive/CS_685/youtube-video-summarization


## 2. Data Loading and Preprocessing for BART

We use the Hugging Face `datasets` library to load the MeetingBank data and the `transformers` tokenizer to preprocess it; tokenization is the core preprocessing step for models like BART.

In [4]:
# ==============================================================================
## 2. Data Loading and Preprocessing
# ==============================================================================
print("\n--- Section 2: Data Loading and Preprocessing ---")

# 1. Load Data and Tokenizer
# The dataset is fetched from the Hugging Face Hub; the recorded output of this
# cell shows train / validation / test splits of 5169 / 861 / 862 examples.
print("Loading dataset and tokenizer...")
raw_datasets = load_dataset("huuuyeah/meetingbank")
# The tokenizer comes from the same checkpoint that is later fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# 2. Define Preprocessing Function (Tokenization)
def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs and training labels.

    Args:
        examples: A batch mapping as passed by ``Dataset.map(batched=True)``.
            Only the ``"transcript"`` and ``"summary"`` columns are read.

    Returns:
        The tokenizer output for the transcripts, with an added ``"labels"``
        entry containing the summary token ids. Padding positions in the
        labels are replaced by -100 so the loss ignores them.
    """
    # Encoder side: transcripts, truncated/padded to MAX_INPUT_LENGTH.
    model_inputs = tokenizer(
        examples["transcript"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # Decoder side: summaries. `text_target=` is the supported replacement for
    # the deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # Replace padding token id with -100 so padded positions do not contribute
    # to the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in label]
        for label in labels["input_ids"]
    ]
    return model_inputs

# 3. Apply Preprocessing and Data Type Fix
print("Applying tokenization and data type fixes...")
# Tokenize every split in batches. All original columns are dropped (the column
# list is taken from the train split), so only the tokenizer outputs and the
# "labels" column remain in `tokenized_datasets`; `raw_datasets` is untouched
# and is still used later for transcripts, summaries and uids.
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names
)

# === Ensure all token ID arrays are standard integer types (int32) ===
# This prevents the OverflowError during prediction/decoding.
def ensure_int32(example):
    """Re-store the token columns of one example as int32 NumPy arrays.

    The ``"input_ids"``, ``"labels"`` and ``"attention_mask"`` entries are
    overwritten in place; the same mapping is returned so the function can be
    used with ``Dataset.map``.
    """
    for column in ("input_ids", "labels", "attention_mask"):
        example[column] = np.array(example[column], dtype=np.int32)
    return example

# Run the int32 conversion example by example (not batched), then have the
# datasets return torch tensors when indexed.
# NOTE(review): the recorded training output shows labels being rebuilt as
# torch.int64 in a later step, so confirm this int32 pass is still required.
tokenized_datasets = tokenized_datasets.map(ensure_int32)
tokenized_datasets.set_format("torch")
print("Preprocessing complete. Data types secured.")


--- Section 2: Data Loading and Preprocessing ---
Loading dataset and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.json:   0%|          | 0.00/88.4M [00:00<?, ?B/s]

validation.json:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/13.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5169 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/861 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/862 [00:00<?, ? examples/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Applying tokenization and data type fixes...


Map:   0%|          | 0/5169 [00:00<?, ? examples/s]



Map:   0%|          | 0/861 [00:00<?, ? examples/s]

Map:   0%|          | 0/862 [00:00<?, ? examples/s]

Map:   0%|          | 0/5169 [00:00<?, ? examples/s]

Map:   0%|          | 0/861 [00:00<?, ? examples/s]

Map:   0%|          | 0/862 [00:00<?, ? examples/s]

Preprocessing complete. Data types secured.


## 3. Training Setup and Fine-Tuning (Required to get a Fine-Tuned Model)

In [5]:
# ==============================================================================
## 3. Training Setup and Fine-Tuning
# ==============================================================================
print("\n--- Section 3: BART Fine-Tuning Setup ---")

# 1. Load the BART Model and Data Collator
# The collator receives the model as well as the tokenizer; it is reused
# unchanged by the evaluation cells further down.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 2. Define Training Arguments (wandb disabled)
# Evaluation and checkpointing both happen once per epoch, which is required
# for `load_best_model_at_end=True` to have a checkpoint to reload.
# NOTE(review): no `metric_for_best_model` is set, so the "best" checkpoint is
# presumably chosen by validation loss — confirm against the TrainingArguments
# documentation.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    learning_rate=2e-5,
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    predict_with_generate=True,
    load_best_model_at_end=True,
    report_to="none", # FIX: Disable WANDB logging
)

# 3. Initialize Trainer (FutureWarning note: using processing_class for future compatibility)
# No `compute_metrics` is passed here, so the per-epoch evaluation reports the
# validation loss only (see the recorded "Epoch,Training Loss,Validation Loss").
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer, # Preferred over passing 'tokenizer=tokenizer'
)

# 4. Start Fine-Tuning
print(f"Starting BART fine-tuning ({NUM_TRAIN_EPOCHS} epoch)...")
trainer.train()

# Find the path to the best saved model checkpoint; falls back to OUTPUT_DIR
# when the trainer did not record one.
# NOTE(review): nothing in this cell saves a model directly into OUTPUT_DIR, so
# the fallback path may not be loadable with from_pretrained — verify.
best_checkpoint_path = trainer.state.best_model_checkpoint if trainer.state.best_model_checkpoint else OUTPUT_DIR

print(f"\nBest/Final checkpoint path: {best_checkpoint_path}")


--- Section 3: BART Fine-Tuning Setup ---


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Starting BART fine-tuning (1 epoch)...


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,1.86,1.476869


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



Best/Final checkpoint path: ./models/meetingbank/checkpoint-1293


## 4. Evaluation Function

In [6]:
# Load the ROUGE metric once; the same object is reused by compute_metrics for
# segment-level scoring and later for the meeting-level evaluation.
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element is used.

    Returns:
        dict mapping ``"rouge_<name>"`` to the score as a percentage rounded
        to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # Negative ids (e.g. -100 padding) are not real tokens and cannot be
    # decoded. Map them to the pad token so `skip_special_tokens` drops them,
    # instead of clipping them to id 0, which is only harmless when id 0
    # happens to be a special token for the tokenizer in use.
    preds = np.asarray(preds)
    preds = np.where(preds < 0, pad_id, preds)
    # Keep the original upper guard against ids beyond the vocabulary.
    preds = np.minimum(preds, tokenizer.vocab_size - 1).astype(np.int32)

    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id).astype(np.int32)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE
    result = rouge_metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    # Report scores as percentages.
    return {f"rouge_{k}": round(v * 100, 4) for k, v in result.items()}

Downloading builder script: 0.00B [00:00, ?B/s]

## 5. Running Both Evaluations

In [7]:
# ==============================================================================
## 5. Running Both Evaluations
# ==============================================================================
print("\n--- Section 5: Running Evaluations ---")

# Define common evaluation arguments (wandb disabled). These are shared by the
# baseline and the fine-tuned prediction runs.
eval_args = Seq2SeqTrainingArguments(
    output_dir="./evaluation_output",
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    predict_with_generate=True,
    disable_tqdm=False,
    report_to="none",
)

# --- A. Evaluation on the Pre-trained Model (Baseline) ---
print("\n" + "="*50)
print("Evaluating Pre-trained BART Model (Baseline)")
print("="*50)
# A fresh copy of the original checkpoint, untouched by the fine-tuning above.
pre_trained_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

pre_trained_trainer = Seq2SeqTrainer(
    model=pre_trained_model,
    args=eval_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

pre_trained_results = pre_trained_trainer.predict(
    test_dataset=tokenized_datasets["test"],
    # Generation kwargs are passed straight through to generate().
    num_beams=4,
    max_length=MAX_TARGET_LENGTH
)
print("Pre-trained Model ROUGE Results:")
# predict() has already run compute_metrics and stored the scores in `.metrics`
# under the default "test_" prefix, so reuse them instead of decoding and
# scoring the whole test set a second time.
pretrained_results = {
    key[len("test_"):]: value
    for key, value in pre_trained_results.metrics.items()
    if key.startswith("test_rouge_")
}
if not pretrained_results:
    # Fallback: recompute if the trainer did not report ROUGE metrics.
    pretrained_results = compute_metrics((pre_trained_results.predictions, pre_trained_results.label_ids))

# Turn pretrained_results into a one-row dataframe
pretrained_results_df = pd.DataFrame(pretrained_results, index=[0])
print(pretrained_results_df)


--- Section 5: Running Evaluations ---

Evaluating Pre-trained BART Model (Baseline)


Pre-trained Model ROUGE Results:
   rouge_rouge1  rouge_rouge2  rouge_rougeL  rouge_rougeLsum
0       35.8944       24.4663       31.0136          30.9908


In [8]:
# --- B. Evaluation on the Fine-Tuned Model ---
print("\n" + "="*50)
print("Evaluating Fine-Tuned BART Model")
print("="*50)
# Load the best model found during training
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(best_checkpoint_path)

fine_tuned_trainer = Seq2SeqTrainer(
    model=fine_tuned_model,
    args=eval_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

fine_tuned_results = fine_tuned_trainer.predict(
    test_dataset=tokenized_datasets["test"],
    # Generation kwargs are passed straight through to generate().
    num_beams=4,
    max_length=MAX_TARGET_LENGTH
)
print("Fine-Tuned Model ROUGE Results:")
# predict() has already run compute_metrics and stored the scores in `.metrics`
# under the default "test_" prefix, so reuse them instead of decoding and
# scoring the whole test set a second time.
finetuned_results = {
    key[len("test_"):]: value
    for key, value in fine_tuned_results.metrics.items()
    if key.startswith("test_rouge_")
}
if not finetuned_results:
    # Fallback: recompute if the trainer did not report ROUGE metrics.
    finetuned_results = compute_metrics((fine_tuned_results.predictions, fine_tuned_results.label_ids))

# Turn fine-tuned results into a one-row dataframe
finetuned_results_df = pd.DataFrame(finetuned_results, index=[0])
print(finetuned_results_df)


Evaluating Fine-Tuned BART Model


Fine-Tuned Model ROUGE Results:
   rouge_rouge1  rouge_rouge2  rouge_rougeL  rouge_rougeLsum
0       64.0744       54.1751       61.5078          61.4724


## 6. Save the Evaluation Results

In [21]:
# ==============================================================================
## 6. Save Both Evaluation Results to CSV
# ==============================================================================
# This section writes two kinds of files per model: the decoded predictions and
# the ROUGE score table.
print("\n--- Section 6: Saving Predictions to CSV ---")

def save_predictions_to_csv(prediction_output, model_name, raw_data, tokenizer,
                            base_dir='./data/meetingbank/'):
    """Decode predictions, align them with inputs/references, and save to CSV.

    Args:
        prediction_output: Object exposing ``predictions`` and ``label_ids``
            token-id arrays (``predictions`` may be a tuple whose first element
            holds the ids).
        model_name: Human-readable model name; lower-cased with spaces replaced
            by underscores to build the file name.
        raw_data: Mapping whose ``["test"]["transcript"]`` holds the original
            input texts, in the same order as the predictions.
        tokenizer: Tokenizer providing ``vocab_size``, ``pad_token_id`` and
            ``batch_decode``.
        base_dir: Directory the CSV is written to; created if missing.

    Returns:
        The path of the CSV file that was written.
    """
    print(f"Saving predictions for {model_name}...")

    pad_id = tokenizer.pad_token_id
    max_vocab_id = tokenizer.vocab_size - 1

    # 1. Decode Predictions (Generated Summaries)
    pred_ids = prediction_output.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # Negative ids (e.g. -100 padding) cannot be decoded. Map them to the pad
    # token so they are skipped, rather than clipping them to id 0, which is a
    # regular token for some tokenizers. The upper guard is kept as before.
    pred_ids = np.asarray(pred_ids)
    pred_ids = np.where(pred_ids < 0, pad_id, pred_ids)
    pred_ids = np.minimum(pred_ids, max_vocab_id).astype(np.int32)
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    # 2. Decode Labels (Reference Summaries); -100 marks ignored positions.
    label_ids = np.asarray(prediction_output.label_ids)
    label_ids = np.where(label_ids != -100, label_ids, pad_id).astype(np.int32)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # 3. Get Original Inputs from the dataset that was passed in (previously
    # the `raw_data` argument was ignored in favour of a notebook global).
    original_inputs = list(raw_data["test"]["transcript"])

    # 4. Create DataFrame and Save
    results_df = pd.DataFrame({
        'Input_Transcript': original_inputs,
        'Reference_Summary': decoded_labels,
        'Predicted_Summary': decoded_preds
    })

    # Make sure the target directory exists before writing.
    os.makedirs(base_dir, exist_ok=True)
    # Note: hyphens are intentionally kept here (unlike the ROUGE-score file
    # names) so existing file names do not change.
    file_name = f"{model_name.lower().replace(' ', '_')}_predictions.csv"
    save_path = os.path.join(base_dir, file_name)
    results_df.to_csv(save_path, index=False)

    print(f"Successfully saved to: {save_path}")
    return save_path

def save_evaluations_to_csv(results_df, model_name, base_dir='./data/meetingbank/'):
    """Write a ROUGE score table to ``<base_dir>/<model>_rouge_scores.csv``.

    The model name is lower-cased and has spaces and hyphens turned into
    underscores to form the file name. ``base_dir`` is created when missing.
    Nothing is returned; the destination path is printed.
    """
    os.makedirs(base_dir, exist_ok=True)

    # Spaces and hyphens both become underscores in the file name.
    file_stem = model_name.lower().translate(str.maketrans({' ': '_', '-': '_'}))
    eval_path = os.path.join(base_dir, file_stem + "_rouge_scores.csv")

    results_df.to_csv(eval_path, index=False)
    print(f"Successfully saved {model_name} ROUGE scores to: {eval_path}")

# --- Execute Saving ---

# The prediction CSVs are written first, so make sure the output directory
# exists before anything is saved into it.
os.makedirs('./data/meetingbank/', exist_ok=True)

# 1. Save Pre-trained Model Results
save_predictions_to_csv(
    pre_trained_results,
    "Pre-trained BART",
    raw_datasets,
    tokenizer
)

# 2. Save Fine-Tuned Model Results
save_predictions_to_csv(
    fine_tuned_results,
    "Fine-Tuned BART",
    raw_datasets,
    tokenizer
)

# 3. Save Pre-trained Evaluation Results (ROUGE scores)
save_evaluations_to_csv(pretrained_results_df, "Pre-trained BART")

# 4. Save Fine-tuned Evaluation Results (ROUGE scores)
save_evaluations_to_csv(finetuned_results_df, "Fine-Tuned BART")

# The files live under ./data/meetingbank/, not directly in the working directory.
print("\nAll prediction and evaluation files have been saved under ./data/meetingbank/.")


--- Section 6: Saving Predictions to CSV ---
Saving predictions for Pre-trained BART...
Successfully saved to: ./data/meetingbank/pre-trained_bart_predictions.csv
Saving predictions for Fine-Tuned BART...
Successfully saved to: ./data/meetingbank/fine-tuned_bart_predictions.csv
Successfully saved Pre-trained BART ROUGE scores to: ./data/meetingbank/pre_trained_bart_rouge_scores.csv
Successfully saved Fine-Tuned BART ROUGE scores to: ./data/meetingbank/fine_tuned_bart_rouge_scores.csv

All prediction and evaluation files have been generated in the current directory.


## 7. Combine Segment Summaries (The "Combine" Step)
This step decodes the model's predictions, sorts them by `uid`, and groups them by the meeting ID extracted from `uid` (everything before the last underscore) to create the full predicted summary for each meeting.

In [26]:
import pandas as pd
import numpy as np

# Assuming the following objects are defined and available in your Notebook:
# fine_tuned_results: The prediction output from trainer.predict()
# raw_datasets: The loaded segmented dataset (must contain 'uid', 'summary', 'transcript')
# tokenizer: The BART Tokenizer

def combine_segments_to_full_summaries(prediction_output, raw_datasets, tokenizer):
    """Decode segment predictions and join them into one summary per meeting.

    Args:
        prediction_output: Object exposing a ``predictions`` token-id array
            (or a tuple whose first element is that array).
        raw_datasets: Mapping whose ``"test"`` entry can be turned into a
            DataFrame with a ``'uid'`` column, row-aligned with the predictions.
        tokenizer: Tokenizer providing ``vocab_size``, ``pad_token_id`` and
            ``batch_decode``.

    Returns:
        DataFrame with columns ``meeting_id`` and ``concatenated_full_summary``,
        one row per meeting.

    Raises:
        ValueError: If the test data has no ``'uid'`` column.
    """
    # 1. Decode Predictions
    pred_ids = prediction_output.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # Negative ids (e.g. -100 padding) cannot be decoded. Map them to the pad
    # token so they are skipped, rather than clipping them to id 0, which is a
    # regular token for some tokenizers. The upper guard is kept as before.
    pred_ids = np.asarray(pred_ids)
    pred_ids = np.where(pred_ids < 0, tokenizer.pad_token_id, pred_ids)
    pred_ids = np.minimum(pred_ids, tokenizer.vocab_size - 1).astype(np.int32)
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    # 2. Prepare Metadata DataFrame
    test_metadata = pd.DataFrame(raw_datasets["test"])
    if 'uid' not in test_metadata.columns:
        raise ValueError("DataFrame must contain the 'uid' column.")
    test_metadata['generated_summary_segment'] = decoded_preds

    # uids are handled as strings throughout.
    test_metadata['uid'] = test_metadata['uid'].astype(str)

    # 3. Extract Meeting ID: everything before the last underscore of 'uid'.
    # `n` must be passed by keyword to `.str.rsplit`; passing it positionally
    # is presumably what raised the TypeError noted in the earlier version.
    test_metadata['meeting_id'] = test_metadata['uid'].str.rsplit('_', n=1).str[0]

    # 4. Sort by 'uid' so segments are joined in a deterministic order. The
    # sort is lexicographic (string comparison); a stable sort keeps the input
    # order for any duplicated uids.
    test_metadata = test_metadata.sort_values(by=['uid'], kind='stable')

    # 5. Group by 'meeting_id' and concatenate the predicted summaries
    grouped_summaries = (
        test_metadata.groupby('meeting_id')['generated_summary_segment']
        .agg(' '.join)
        .reset_index(name='concatenated_full_summary')
    )

    return grouped_summaries

# Execution Example: build one combined summary per meeting from the
# fine-tuned model's test-set predictions.
full_generated_summaries_df = combine_segments_to_full_summaries(
    fine_tuned_results,
    raw_datasets,
    tokenizer
)

# Preview only the first rows; the summary column is long.
print("\nFull Generated Meeting Summaries (Combined):")
print(full_generated_summaries_df.head())


Full Generated Meeting Summaries (Combined):
           meeting_id                          concatenated_full_summary
0  AlamedaCC_01052021  A bill for an ordinance changing the zoning cl...
1  AlamedaCC_01062015  Recommendation to authorize City Manager or hi...
2  AlamedaCC_01072020  Recommendation to receive CDBG funding for the...
3  AlamedaCC_01162018  A MOTION approving the parcel map on 1700 Park...
4  AlamedaCC_01192016  Recommendation to adopt resolutions for the Al...


## 8. Evaluate Full Meeting Summaries (The "Evaluate" Step)
This step consists of two parts: building the Golden Reference Summary by concatenating the segment summaries, and then performing the final ROUGE comparison.

### 8A: Function to Construct Golden Reference Summary
This function builds the reference for each meeting: it uses the meeting ID extracted from `uid` (everything before the last underscore) as the key and concatenates the `summary` fields, sorted by `uid`.

In [29]:
def load_full_reference_summaries(meeting_ids, raw_datasets):
    """Build the full reference summary text for each requested meeting.

    Segment-level ``summary`` fields from the test split are ordered by
    ``uid`` and joined with single spaces, keyed by the meeting ID obtained
    from ``uid`` by dropping everything after its last underscore.

    Returns a dict ``{meeting_id: full_reference_text}`` limited to the IDs
    found in ``meeting_ids``.
    """
    segments = pd.DataFrame(raw_datasets["test"])

    # Work with string uids so the split below is always valid.
    segments['uid'] = segments['uid'].astype(str)
    segments['meeting_id'] = segments['uid'].map(lambda uid: uid.rsplit('_', 1)[0])

    # Keep only the requested meetings, ordered by uid.
    requested = segments.loc[segments['meeting_id'].isin(meeting_ids)].copy()
    requested = requested.sort_values(by=['uid'])

    # One joined reference text per meeting.
    full_references_dict = {
        meeting_id: ' '.join(rows['summary'].tolist())
        for meeting_id, rows in requested.groupby('meeting_id')
    }

    print(f"\nSuccessfully constructed full reference summaries for {len(full_references_dict)} meetings.")

    return full_references_dict

### 8B: Execute Meeting-Level ROUGE Evaluation

In [31]:
# Assumes full_generated_summaries_df (from Section 7) and rouge_metric are available

def evaluate_full_meeting_summaries(generated_df, rouge_metric, raw_datasets):
    """Score combined meeting summaries against concatenated references.

    Args:
        generated_df: DataFrame with ``meeting_id`` and
            ``concatenated_full_summary`` columns. It is not modified.
        rouge_metric: Metric object whose ``compute(predictions=...,
            references=..., use_stemmer=True)`` returns a mapping of scores.
        raw_datasets: Dataset mapping forwarded to
            ``load_full_reference_summaries`` to build the references.

    Returns:
        dict mapping ``"rouge_<name>"`` to the score as a percentage rounded
        to 4 decimal places.
    """
    # 1. Get list of Meeting IDs to evaluate
    meeting_ids = generated_df['meeting_id'].tolist()

    # 2. Load/Construct Golden Reference Summaries
    full_references_dict = load_full_reference_summaries(meeting_ids, raw_datasets)

    # 3. Attach the references on a new frame so the caller's DataFrame is left
    # untouched, and drop meetings for which no reference was found.
    scored_df = generated_df.assign(
        full_reference_summary=generated_df['meeting_id'].map(full_references_dict)
    ).dropna(subset=['full_reference_summary'])

    # 4. Extract lists for ROUGE calculation
    predictions = scored_df['concatenated_full_summary'].tolist()
    references = scored_df['full_reference_summary'].tolist()

    # 5. Compute ROUGE
    result = rouge_metric.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True
    )

    # 6. Convert scores to percentages rounded to 4 decimal places. Only the
    # header is printed here; the caller prints the scores themselves.
    result = {f"rouge_{k}": round(v * 100, 4) for k, v in result.items()}

    print("\n" + "="*50)
    print("FINAL MEETING-LEVEL ROUGE SCORES (Divide-and-Conquer)")
    print("="*50)

    return result

# Execution Example:
final_meeting_rouge_scores = evaluate_full_meeting_summaries(
    full_generated_summaries_df.copy(),
    rouge_metric,
    raw_datasets
)

final_meeting_rouge_scores_df = pd.DataFrame(final_meeting_rouge_scores, index=[0])
print(final_meeting_rouge_scores_df)

# Save the meeting-level scores under their own name. Saving them as plain
# 'Fine-Tuned BART' would overwrite the segment-level
# fine_tuned_bart_rouge_scores.csv written in Section 6.
save_evaluations_to_csv(final_meeting_rouge_scores_df, 'Fine-Tuned BART Meeting-Level')


Successfully constructed full reference summaries for 559 meetings.

FINAL MEETING-LEVEL ROUGE SCORES (Divide-and-Conquer)
   rouge_rouge1  rouge_rouge2  rouge_rougeL  rouge_rougeLsum
0       62.9654       52.3095       59.8264          59.7857
Successfully saved Fine-Tuned BART ROUGE scores to: ./data/meetingbank/fine_tuned_bart_rouge_scores.csv
