# Baseline Implementation: BART

- Load data: `01_data_exploration`

- Data Preprocessing

- Baseline Model: BART

## Load Data

The setup steps below are copied from `01_data_exploration.ipynb`:

- Mount the Google Drive

- Github Setup

- Packages Setup

- Load Data

In [2]:
### Mount the Google Drive

import os
from google.colab import drive

# Attach the user's Google Drive to this Colab session at /content/drive.
drive.mount('/content/drive')

# Project folder on Drive. This only defines the path (the working directory is
# not changed here); later cells join their data/model locations onto it.
GDRIVE_PATH = '/content/drive/MyDrive/CS_685/youtube-video-summarization'

Mounted at /content/drive


### Github Setup

In [3]:
username = 'shigenogoro'
repo_name = 'YouTube-Video-Summarization'
branch = 'kyle'

# Check if the destination directory already exists
import os
if os.path.exists(repo_name):
    print(f"The directory '{repo_name}' already exists. Skipping clone.")

    # Pull the latest changes
    %cd {repo_name}
    !git pull origin {branch}
else:
    # Clone the repository
    !git clone https://github.com/{username}/{repo_name}.git -b {branch}

    # Change directory to the cloned repository
    %cd {repo_name}

Cloning into 'YouTube-Video-Summarization'...
remote: Enumerating objects: 115, done.[K
remote: Counting objects: 100% (115/115), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 115 (delta 38), reused 101 (delta 25), pack-reused 0 (from 0)[K
Receiving objects: 100% (115/115), 312.30 KiB | 1.87 MiB/s, done.
Resolving deltas: 100% (38/38), done.
/content/YouTube-Video-Summarization


### Packages Setup

In [4]:
# Install the dependencies listed in requirements.txt; the path is relative, so
# this relies on the working directory being the repository root (set by the
# `%cd` in the GitHub setup cell).
!pip install -r requirements.txt

Collecting evaluate (from -r requirements.txt (line 11))
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score (from -r requirements.txt (line 12))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score (from -r requirements.txt (line 13))
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting jiwer (from -r requirements.txt (line 15))
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer->-r requirements.txt (line 15))
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/6

In [5]:
# Download the spaCy English model and the NLTK resources

# spaCy small English pipeline, installed as a package via the spaCy CLI
!python -m spacy download en_core_web_sm

import nltk
# NLTK data packages: the 'punkt' tokenizer and the averaged-perceptron POS tagger
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m145.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

### Load Data

In [6]:
from datasets import load_dataset

# Fetch MeetingBank from the Hugging Face Hub (served from the local cache on re-runs).
meetingbank = load_dataset("huuuyeah/meetingbank")

# Bind each split to its own module-level name; later cells refer to these.
train_data, val_data, test_data = (
    meetingbank[split_name] for split_name in ('train', 'validation', 'test')
)

# Report the shape of every split
print(f"Train dataset shape: {train_data.shape}")
print(f"Validation dataset shape: {val_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.json:   0%|          | 0.00/88.4M [00:00<?, ?B/s]

validation.json:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/13.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5169 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/861 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/862 [00:00<?, ? examples/s]

Train dataset shape: (5169, 4)
Validation dataset shape: (861, 4)
Test dataset shape: (862, 4)


## Data Preprocessing

In [7]:
import torch

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [8]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
import os
from src.preprocess import preprocess_dataset

# Tokenizer of the pre-trained BART summarization checkpoint
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn')
print("BART tokenizer loaded.")

# Matching seq2seq model, placed on the device chosen in the previous cell
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn')
model = model.to(device)
print("BART model loaded and moved to device.")

# Root folder on Drive for the preprocessed MeetingBank data
preprocessed_data_path = os.path.join(GDRIVE_PATH, 'data/preprocessed/meetingBank')

# The mere existence of the root folder is treated as "preprocessing already done".
if os.path.exists(preprocessed_data_path):
    print("Preprocessed data has already existed.")
else:
    os.makedirs(preprocessed_data_path, exist_ok=True)
    # Each split is handed to preprocess_dataset together with <root>/<split name>.
    for split_data, split_name in (
        (train_data, 'train'),
        (val_data, 'validation'),
        (test_data, 'test'),
    ):
        preprocess_dataset(split_data, os.path.join(preprocessed_data_path, split_name))
    print("Data preprocessing completed and saved.")

SpaCy model 'en_core_web_sm' loaded successfully for preprocessing.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

BART tokenizer loaded.


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

BART model loaded and moved to device.
Preprocessed data has already existed.


#### Tokenization & Formatting

- Tokenize the preprocessed transcript chunks (inputs) and summaries (labels) using the loaded BART tokenizer.

- Convert the tokenized data into Hugging Face Dataset objects, structured for the Transformer architecture

  - Create PyTorch DataLoaders to efficiently handle batching during training and inference.

In [11]:
import pandas as pd
import os
import glob
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset
from tqdm.auto import tqdm

# --- Setup Paths ---
device = "cuda" if torch.cuda.is_available() else "cpu"
PREPROCESSED_TEST_DIR = os.path.join(GDRIVE_PATH, 'data/preprocessed/meetingBank/test')

# --- 1. Load Model and Tokenizer (Same as before) ---
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(device)

# Define generation parameters
MAX_INPUT_LENGTH = 1024
MAX_SUMMARY_LENGTH = 150
BATCH_SIZE = 16

# --- 2. Load ALL Preprocessed Test Data Files ---
# glob yields files in arbitrary filesystem order; sorting keeps the row order of
# the combined dataset (and of every results file derived from it) reproducible.
all_files = sorted(glob.glob(os.path.join(PREPROCESSED_TEST_DIR, "*.csv")))
list_of_dfs = []

if not all_files:
    print(f"Error: No CSV files found in {PREPROCESSED_TEST_DIR}")
    raise FileNotFoundError(f"No preprocessed CSV files found in {PREPROCESSED_TEST_DIR}")

# Best-effort read: an unreadable file is reported and skipped, not fatal.
for filename in all_files:
    try:
        df = pd.read_csv(filename, index_col=None, header=0)
        list_of_dfs.append(df)
    except Exception as e:
        print(f"Error reading {filename}: {e}")

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so fail with an explicit message when every single read failed.
if not list_of_dfs:
    raise RuntimeError(f"None of the {len(all_files)} CSV files in {PREPROCESSED_TEST_DIR} could be read")
if len(list_of_dfs) < len(all_files):
    print(f"Warning: skipped {len(all_files) - len(list_of_dfs)} unreadable file(s).")

# Concatenate all DataFrames into one master DataFrame
test_df = pd.concat(list_of_dfs, axis=0, ignore_index=True)

# Remove any index columns that might have been created during saving
test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed|^__index_level_0__')]


test_dataset = Dataset.from_pandas(test_df)
print(f"Successfully loaded and combined ALL data. Total {len(test_df)} test samples for baseline.")

Successfully loaded and combined ALL data. Total 17898 test samples for baseline.


In [12]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader

# --- Prerequisites (defined by earlier cells) ---
# 'tokenizer'    : AutoTokenizer.from_pretrained(...)
# 'model'        : AutoModelForSeq2SeqLM.from_pretrained(...)
# 'test_dataset' : Hugging Face Dataset built from the combined test CSVs

MAX_INPUT_LENGTH = 1024
MAX_SUMMARY_LENGTH = 150
BATCH_SIZE = 16

# --- 1. Define and Apply Tokenization Function ---
def tokenize_function(examples):
    """Tokenize a batch of examples for seq2seq evaluation.

    Args:
        examples: batch dict (as passed by `Dataset.map(batched=True)`) with a
            "transcript" column (model input) and a "summary" column (reference).

    Returns:
        The tokenizer output for the transcripts (truncated/padded to
        MAX_INPUT_LENGTH) with an added "labels" entry holding the summary token
        ids (truncated/padded to MAX_SUMMARY_LENGTH).
    """
    # Tokenize the transcript chunks (inputs)
    model_inputs = tokenizer(
        examples["transcript"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # Tokenize the summaries (labels). `text_target=` is the supported
    # replacement for the deprecated `tokenizer.as_target_tokenizer()` context.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=MAX_SUMMARY_LENGTH,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization
tokenized_test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    # Drop the raw text columns; keep the tokenized fields and 'id'
    remove_columns=[col for col in test_dataset.column_names if col not in ['input_ids', 'attention_mask', 'labels', 'id']]
)

# Set the format to PyTorch tensors
tokenized_test_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

print("\nTokenization Complete.")
print(f"Tokenized Test Dataset Features: {tokenized_test_dataset.features}")
print(f"Total number of tokenized samples: {len(tokenized_test_dataset)}")

# --- 2. Create PyTorch DataLoader ---
# The examples are already padded to fixed lengths above, so the collator only
# has to stack them. `padding=True` (pad to the longest item in the batch)
# replaces `padding='max_length'`, which was given no `max_length` and therefore
# fell back to "no padding" with a warning on every batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

test_dataloader = DataLoader(
    tokenized_test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
    shuffle=False
)

print(f"test_dataloader successfully created with {len(test_dataloader)} batches.")

Map:   0%|          | 0/17898 [00:00<?, ? examples/s]




Tokenization Complete.
Tokenized Test Dataset Features: {'id': Value('int64'), 'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': List(Value('int64'))}
Total number of tokenized samples: 17898
test_dataloader successfully created with 1119 batches.


In [13]:
# Save tokenized test dataset
# The previous version only created the parent 'tokenized' folder and never wrote
# the dataset. Because other cells use that folder's existence as a "tokenization
# already done" marker, an empty folder made them skip real work. Write the test
# split to its own sub-folder and key the check on that sub-folder instead.
TOKENIZED_TEST_PATH = os.path.join(GDRIVE_PATH, 'data/preprocessed/meetingBank/tokenized/test')
if not os.path.exists(TOKENIZED_TEST_PATH):
    os.makedirs(os.path.dirname(TOKENIZED_TEST_PATH), exist_ok=True)
    tokenized_test_dataset.save_to_disk(TOKENIZED_TEST_PATH)
    print(f"Tokenized test data saved to: {TOKENIZED_TEST_PATH}")
else:
    print("Tokenized test dataset already exists.")


Tokenized test dataset already exists.


## Baseline

- Baseline Inference:

  - Run the pre-trained BART model on the test set chunks to generate summaries.

  - Calculate ROUGE scores for these summaries to establish a performance baseline against which your fine-tuned model will be compared.

- Model Fine-Tuning and Saving

  - Define Training Configuration:

    - Set up Training Arguments (e.g., learning rate, batch size, number of epochs) using the Hugging Face TrainingArguments class.

    - Configure the Trainer object, passing in the model, training arguments, training dataset, and validation dataset.

      - Training Dataset (for learning)

      - Validation Dataset (for monitoring/early stopping)

- Fine-Tuning Execution:

    - Execute the `trainer.train()` method, which starts the fine-tuning process on the train set.

    - The model learns the specific patterns of meeting transcripts, with performance monitored against the validation set.

- Save Fine-Tuned Model:

  - Use `.save_pretrained()` to save the fine-tuned model weights and configuration together with the tokenizer files.

  - Move this directory from the temporary Colab storage to a permanent location (e.g., Google Drive) for later use.

### Baseline Inference

In [14]:
import torch
import pandas as pd
from tqdm.auto import tqdm

# Constants used in the generation call (must match your intended summary lengths)
MAX_SUMMARY_LENGTH = 150
MIN_SUMMARY_LENGTH = 30
NUM_BEAMS = 4 # Standard setting for summarization quality

# Prepare to collect results
bart_baseline_results = []
print("Starting BART Baseline Inference on Test Set...")

# Ensure the model is in evaluation mode
model.eval()

# Cache check. The guard is on the results *file*, not its folder: the folder is
# shared with other result files (the fine-tuned inference cell creates it too),
# so a folder-only check could skip inference and then fail to read a CSV that
# was never written.
BASELINE_PATH = os.path.join(GDRIVE_PATH, 'data/results/baseline')
BASELINE_RESULTS_FILE = os.path.join(BASELINE_PATH, 'bart_baseline_results.csv')
if not os.path.exists(BASELINE_RESULTS_FILE):
    # Create the output folder up front: pandas does not create missing parent
    # folders, and failing at the final to_csv would discard the whole run.
    os.makedirs(BASELINE_PATH, exist_ok=True)

    # Disable gradient calculation for faster inference and lower memory usage
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Generating Baseline Summaries"):
            # Extract input IDs and attention masks and move them to the correct device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # --- Model Generation ---
            summary_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                num_beams=NUM_BEAMS,
                max_length=MAX_SUMMARY_LENGTH,
                min_length=MIN_SUMMARY_LENGTH,
                early_stopping=True
            )

            # --- Decode Summaries ---
            # Decode the generated token IDs back into human-readable text
            generated_summaries = tokenizer.batch_decode(
                summary_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )

            # Decode the original reference labels for comparison
            # Any -100 in the labels must be replaced with the pad token ID for decoding
            labels = batch['labels'].cpu().numpy()
            labels[labels == -100] = tokenizer.pad_token_id
            original_summaries = tokenizer.batch_decode(
                labels,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )

            # --- Store Results ---
            for gen_sum, ref_sum in zip(generated_summaries, original_summaries):
                bart_baseline_results.append({
                    'original_summary': ref_sum,
                    'generated_summary': gen_sum,
                })

    print("\nBaseline Summary Generation Complete.")

    # Convert results to a DataFrame for ROUGE evaluation
    bart_baseline_df = pd.DataFrame(bart_baseline_results)
    print(f"BART baseline results stored: {len(bart_baseline_df)} entries.")

    # Persist the results so later sessions can skip inference
    bart_baseline_df.to_csv(BASELINE_RESULTS_FILE, index=False)
else:
    print("BART baseline results already exist.")
    bart_baseline_df = pd.read_csv(BASELINE_RESULTS_FILE)
    print(f"BART baseline results loaded: {len(bart_baseline_df)} entries.")

Starting BART Baseline Inference on Test Set...
BART baseline results already exist.
BART baseline results loaded: 17898 entries.


In [15]:
import evaluate
import pandas as pd

# Load the ROUGE and BERTScore metrics
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Both metrics score the same two columns of bart_baseline_df; extract them once.
baseline_predictions = bart_baseline_df['generated_summary'].tolist()
baseline_references = bart_baseline_df['original_summary'].tolist()

# --- 1. ROUGE (with stemming) ---
rouge_results = rouge.compute(
    predictions=baseline_predictions,
    references=baseline_references,
    use_stemmer=True
)

# --- 2. BERTScore ---
# Scored with a DistilBERT checkpoint as the embedding model.
bert_results = bertscore.compute(
    predictions=baseline_predictions,
    references=baseline_references,
    lang="en",
    model_type="distilbert-base-uncased"
)

# BERTScore returns one F1 value per example; reduce them to their mean.
bert_f1_values = bert_results['f1']
mean_bert_f1 = sum(bert_f1_values) / len(bert_f1_values)


print("\n--- BART BASELINE EVALUATION SCORES ---")
# ROUGE Scores
print(f"ROUGE-1 (F1): {rouge_results['rouge1'] * 100:.2f}")
print(f"ROUGE-2 (F1): {rouge_results['rouge2'] * 100:.2f}")
print(f"ROUGE-L (F1): {rouge_results['rougeL'] * 100:.2f}")
# BERTScore
print(f"BERTScore (Mean F1): {mean_bert_f1 * 100:.2f}")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]


--- BART BASELINE EVALUATION SCORES ---
ROUGE-1 (F1): 16.36
ROUGE-2 (F1): 2.53
ROUGE-L (F1): 11.62
BERTScore (Mean F1): 69.72


##### Save the Evaluation Results

In [16]:
# Collect the baseline scores (fractions in [0, 1], not percentages) in a one-row DataFrame
eval_score = pd.DataFrame({
    'ROUGE-1 (F1)': [rouge_results['rouge1']],
    'ROUGE-2 (F1)': [rouge_results['rouge2']],
    'ROUGE-L (F1)': [rouge_results['rougeL']],
    'BERTScore (Mean F1)': [mean_bert_f1]
})

# Save the evaluation results.
# The guard is on the CSV file itself. Checking only the folder meant the scores
# were never written whenever 'data/eval/baseline' already existed without the
# file (the fine-tuned evaluation writes into the same folder).
BASELINE_EVAL_FILE = os.path.join(GDRIVE_PATH, 'data/eval/baseline/eval_score.csv')
if not os.path.exists(BASELINE_EVAL_FILE):
    os.makedirs(os.path.dirname(BASELINE_EVAL_FILE), exist_ok=True)
    eval_score.to_csv(BASELINE_EVAL_FILE, index=False)
    print(f"Evaluation results saved to: {BASELINE_EVAL_FILE}")
else:
    print("Evluation has already done.")

Evluation has already done.


### Model Fine-Tuning and Saving

#### Tokenize Train/Validation Data

In [17]:
import pandas as pd
import os
import glob
from datasets import Dataset
from transformers import AutoTokenizer

# --- Setup ---
MAX_INPUT_LENGTH = 1024
MAX_SUMMARY_LENGTH = 150
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# Define the paths to your preprocessed data
TRAIN_DIR = os.path.join(GDRIVE_PATH, 'data/preprocessed/meetingBank/train')
VAL_DIR = os.path.join(GDRIVE_PATH, 'data/preprocessed/meetingBank/validation')
TOKENIZED_DIR = os.path.join(GDRIVE_PATH, 'data/preprocessed/meetingBank/tokenized')

# --- Helper Function to Load & Combine All CSVs ---
def load_and_combine_data(directory):
    """Read every CSV in `directory` and return them as one Hugging Face Dataset.

    Args:
        directory: folder containing the preprocessed `*.csv` files.

    Returns:
        A `datasets.Dataset` with the rows of all files concatenated, with
        leftover index columns ('Unnamed*', '__index_level_0__') removed.

    Raises:
        FileNotFoundError: if the folder contains no CSV files.
    """
    # Sorted so the row order does not depend on filesystem listing order.
    all_files = sorted(glob.glob(os.path.join(directory, "*.csv")))
    if not all_files:
        raise FileNotFoundError(f"No CSV files found in {directory}")

    df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
    # Clean up index columns if they exist
    df = df.loc[:, ~df.columns.str.contains('^Unnamed|__index_level_0__')]
    return Dataset.from_pandas(df)

train_dataset = load_and_combine_data(TRAIN_DIR)
val_dataset = load_and_combine_data(VAL_DIR)

print(f"Train samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")


# --- Tokenization Function ---
def tokenize_function(examples):
    """Tokenize a batch of transcript/summary pairs for seq2seq fine-tuning.

    Args:
        examples: batch dict with "transcript" and "summary" columns.

    Returns:
        Tokenizer output for the transcripts (padded/truncated to
        MAX_INPUT_LENGTH) plus "labels": summary token ids padded/truncated to
        MAX_SUMMARY_LENGTH, with padding positions set to -100.
    """
    model_inputs = tokenizer(
        examples["transcript"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=MAX_SUMMARY_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # Labels are padded to a fixed length here, so the padding must be masked
    # with -100 (the index ignored by the loss); otherwise the loss is largely
    # spent on predicting pad tokens.
    # NOTE: datasets already cached on Drive keep their old labels until the
    # cached folders are removed and this cell is re-run.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Apply tokenization.
# Each split is guarded by its *own* saved folder. Checking the shared parent
# 'tokenized' folder skipped tokenization as soon as that folder existed (even
# empty), leaving tokenized_train / tokenized_val undefined for the save cell.
if not os.path.exists(os.path.join(TOKENIZED_DIR, 'train')):
    tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
    print("Tokenization for Train set is completed.")
else:
    print("Tokenization for Train set has already done.")

if not os.path.exists(os.path.join(TOKENIZED_DIR, 'validation')):
    tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=val_dataset.column_names)
    print("Tokenization for Validation set is completed")
else:
    print("Tokenization for Validation sets has already done.")

Train samples: 117319, Validation samples: 17597
Tokenization for Train set has already done.
Tokenization for Validation sets has already done.


#### Save the tokenized dataset

In [18]:
import os
from datasets import Dataset

# Define the base save directory and one sub-folder per split
BASE_SAVE_PATH = os.path.join(GDRIVE_PATH, 'data/preprocessed/meetingBank/tokenized')
train_path = os.path.join(BASE_SAVE_PATH, 'train')
val_path = os.path.join(BASE_SAVE_PATH, 'validation')
test_path = os.path.join(BASE_SAVE_PATH, 'test')

# Create the directory if it doesn't exist
if not os.path.exists(BASE_SAVE_PATH):
    os.makedirs(BASE_SAVE_PATH)
    print(f"Created directory: {BASE_SAVE_PATH}")

# Each split is written only when its folder is missing. The tokenized_* names
# are referenced only inside the branches, because they are undefined in
# sessions where tokenization was skipped in favour of the cached copy.

# --- 1. Save Tokenized Train Data ---
if not os.path.exists(train_path):
    tokenized_train.save_to_disk(train_path)
    print(f"Tokenized train data saved to: {train_path}")
else:
    print("Tokenized train data already exists.")

# --- 2. Save Tokenized Validation Data ---
if not os.path.exists(val_path):
    # save_to_disk returns None; do not assign its result back to val_path
    # (doing so replaced the path with None and printed "saved to: None").
    tokenized_val.save_to_disk(val_path)
    print(f"Tokenized validation data saved to: {val_path}")
else:
    print("Tokenized validation data already exists.")

# --- 3. Save Tokenized Test Data ---
if not os.path.exists(test_path):
    tokenized_test_dataset.save_to_disk(test_path)
    print(f"Tokenized test data saved to: {test_path}")
else:
    print("Tokenized test data already exists.")

Tokenized train data already exists.
Tokenized validation data already exists.
Tokenized test data already exists.


#### Define Training Arguments and Model

In [21]:
# Load tokenized_train, and tokenized_val from the drive
# Reload the tokenized train and validation splits from their folders on Drive
tokenized_train, tokenized_val = (
    Dataset.load_from_disk(os.path.join(BASE_SAVE_PATH, split_name))
    for split_name in ('train', 'validation')
)

# Report how many examples each split holds
print(f"Train samples: {len(tokenized_train)}, Validation samples: {len(tokenized_val)}")

Train samples: 117319, Validation samples: 17597


In [22]:
import os
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    DataCollatorForSeq2Seq
)

# Project root on Drive and the folder that receives checkpoints and logs
GDRIVE_PATH = '/content/drive/MyDrive/CS_685/youtube-video-summarization/'
OUTPUT_DIR = os.path.join(GDRIVE_PATH, 'model/training_output')

# The settings are grouped by concern and merged into a single TrainingArguments call.

# Core hyperparameters
core_hparams = dict(
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=3e-5,
)

# GPU/performance: mixed precision, and gradients accumulated over 4 batches
# (4 x 4 = 16 examples per optimizer step on each device)
performance_opts = dict(
    fp16=True,
    gradient_accumulation_steps=4,
)

# Logging
logging_opts = dict(
    logging_dir=os.path.join(OUTPUT_DIR, 'logs'),
    logging_steps=100,
)

# Checkpointing and evaluation: save and evaluate once per epoch, then reload the
# checkpoint with the best eval_loss when training ends.
# `eval_strategy` is the current name of the former `evaluation_strategy` argument.
checkpoint_opts = dict(
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    **core_hparams,
    **performance_opts,
    **logging_opts,
    **checkpoint_opts,
)

print("Training arguments defined successfully.")

Training arguments defined successfully.


#### Create and run the trainer

In [23]:
from transformers import Trainer

# Where the final fine-tuned model lives; its existence marks training as done.
FINAL_MODEL_PATH = os.path.join(GDRIVE_PATH, 'model/final_bart_finetuned_model')

# --- Instantiate the Trainer ---
# `processing_class=` supersedes the deprecated `tokenizer=` keyword in recent
# Transformers releases (this notebook already relies on the newer
# `eval_strategy` argument), which avoids the FutureWarning the old keyword emits.
trainer = Trainer(
    model=model,                         # The loaded BART model
    args=training_args,                  # The training arguments defined above
    train_dataset=tokenized_train,       # The tokenized training data
    eval_dataset=tokenized_val,          # The tokenized validation data
    processing_class=tokenizer,          # The tokenizer
    data_collator=data_collator,         # The data collator
)

# --- Start Training ---
# Training is skipped when a final model has already been saved to Drive.
if not os.path.exists(FINAL_MODEL_PATH):
    print("Starting Fine-Tuning...")
    trainer.train()
else:
    print(f"Final model already exists at: {FINAL_MODEL_PATH}")

  trainer = Trainer(


Final model already exists at: /content/drive/MyDrive/CS_685/youtube-video-summarization/model/final_bart_finetuned_model


### Save Fine-Tuned Model

In [24]:
# --- Save Final/Best Model ---
# The training arguments set load_best_model_at_end=True, so after training the
# trainer is expected to hold the best checkpoint; this cell writes that model
# and the tokenizer to FINAL_MODEL_PATH unless the folder already exists.

if os.path.exists(FINAL_MODEL_PATH):
    print(f"Final model already exists at: {FINAL_MODEL_PATH}")
else:
    os.makedirs(FINAL_MODEL_PATH)

    # Model weights/config first, then the tokenizer files, into the same folder
    trainer.save_model(FINAL_MODEL_PATH)
    tokenizer.save_pretrained(FINAL_MODEL_PATH)

    print(f"\nFine-Tuning complete. Best model saved to: {FINAL_MODEL_PATH}")

Final model already exists at: /content/drive/MyDrive/CS_685/youtube-video-summarization/model/final_bart_finetuned_model


### Load Fine-Tuned Model

## Final Evaluation and Analysis

- Final Evaluation:

    - Load the fine-tuned model and run it against the final, unseen test set.

    - Calculate ROUGE-1, ROUGE-2, ROUGE-L scores, and BERTScore to objectively measure the model's summarization quality.

- Result Analysis:

    - Compare the final evaluation scores of the fine-tuned model against the baseline model to quantify your model's improvement.

    - Analyze example generated summaries to identify strengths and weaknesses.

### Fine-Tuned Model Evaluation

##### Finetuned Model Inference

In [25]:
import torch
import pandas as pd
import os
from tqdm.auto import tqdm
from transformers import AutoModelForSeq2SeqLM # Ensure this is imported

# Load the saved fine-tuned model and place it on the inference device.
# Relies on FINAL_MODEL_PATH and device from earlier cells.
best_model = AutoModelForSeq2SeqLM.from_pretrained(FINAL_MODEL_PATH)
best_model = best_model.to(device)

# Constants used in the generation call
MAX_SUMMARY_LENGTH = 150
MIN_SUMMARY_LENGTH = 30
NUM_BEAMS = 4

# Full path of the CSV that caches the fine-tuned model's summaries
FINETUNED_RESULTS_FILE = os.path.join(GDRIVE_PATH, 'data/results/baseline/bart_finetuned_results.csv')

# Accumulates one {'original_summary', 'generated_summary'} record per test example
bart_finetuned_results = []
print("Starting BART Finetuned Model Inference on Test Set...")

# Ensure the model is in evaluation mode
best_model.eval()

if os.path.exists(FINETUNED_RESULTS_FILE):
    # Cached results are available: load them instead of regenerating
    print("Finetuned BART results already exist. Loading from file...")
    bart_finetuned_df = pd.read_csv(FINETUNED_RESULTS_FILE)
    print(f"Finetuned BART results loaded: {len(bart_finetuned_df)} entries.")
else:
    # Ensure the directory for results exists before writing
    os.makedirs(os.path.dirname(FINETUNED_RESULTS_FILE), exist_ok=True)

    # Gradients are not needed for generation
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Generating Finetuned Summaries"):
            # Move the encoder inputs to the model's device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # --- Model Generation ---
            # Beam search; unlike the baseline cell, the length penalty is passed explicitly.
            summary_ids = best_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                num_beams=NUM_BEAMS,
                max_length=MAX_SUMMARY_LENGTH,
                min_length=MIN_SUMMARY_LENGTH,
                early_stopping=True,
                length_penalty=1.0
            )

            # --- Decode Summaries ---
            generated_summaries = tokenizer.batch_decode(
                summary_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )

            # Reference summaries: map any -100 back to the pad id so they decode
            labels = batch['labels'].cpu().numpy()
            labels[labels == -100] = tokenizer.pad_token_id
            original_summaries = tokenizer.batch_decode(
                labels,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )

            # --- Store Results ---
            bart_finetuned_results.extend(
                {'original_summary': ref_sum, 'generated_summary': gen_sum}
                for gen_sum, ref_sum in zip(generated_summaries, original_summaries)
            )

    print("\nFinetuned Summary Generation Complete.")

    # Convert results to a DataFrame for ROUGE evaluation
    bart_finetuned_df = pd.DataFrame(bart_finetuned_results)
    print(f"Finetuned BART results stored: {len(bart_finetuned_df)} entries.")

    # Save the results to the results directory
    bart_finetuned_df.to_csv(FINETUNED_RESULTS_FILE, index=False)



Starting BART Finetuned Model Inference on Test Set...


Generating Finetuned Summaries:   0%|          | 0/1119 [00:00<?, ?it/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)



Finetuned Summary Generation Complete.
Finetuned BART results stored: 17898 entries.


##### Evaluate the ROUGE and BERT Score

In [26]:
import evaluate
import pandas as pd
import os
import torch

# --- 1. Load the Evaluation Metrics ---
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# --- 2. Define the path (must match the path used by the inference cell) ---
GDRIVE_PATH = '/content/drive/MyDrive/CS_685/youtube-video-summarization/'
FINETUNED_RESULTS_FILE = os.path.join(GDRIVE_PATH, 'data/results/baseline/bart_finetuned_results.csv')

# --- 3. Load the Fine-tuned Results ---
# Raise instead of calling exit(): in a notebook exit() shuts the kernel down
# (losing all session state) rather than just stopping this cell.
try:
    bart_finetuned_df = pd.read_csv(FINETUNED_RESULTS_FILE)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Error: Fine-tuned results file not found at {FINETUNED_RESULTS_FILE}. Please run the previous cell first."
    ) from err

# Extract predictions and references
predictions = bart_finetuned_df['generated_summary'].tolist()
references = bart_finetuned_df['original_summary'].tolist()


# --- 4. Calculate ROUGE scores ---
# NOTE: rouge_results / bert_results / mean_bert_f1 overwrite the baseline values
# of the same names computed earlier in the notebook.
print("Calculating ROUGE scores...")
rouge_results = rouge.compute(
    predictions=predictions,
    references=references,
    use_stemmer=True
)

# --- 5. Calculate BERTScore ---
# Scored with a DistilBERT checkpoint, the same one used for the baseline
print("Calculating BERTScore (using distilbert-base-uncased)... This may take a moment.")
bert_results = bertscore.compute(
    predictions=predictions,
    references=references,
    lang="en",
    model_type="distilbert-base-uncased"
)

# BERTScore returns one F1 per example; report their mean
mean_bert_f1 = sum(bert_results['f1']) / len(bert_results['f1'])

print("\n" + "="*40)
print("FINETUNED MODEL EVALUATION SCORES")
print("="*40)
# ROUGE Scores
print(f"ROUGE-1 (F1): {rouge_results['rouge1'] * 100:.2f}")
print(f"ROUGE-2 (F1): {rouge_results['rouge2'] * 100:.2f}")
print(f"ROUGE-L (F1): {rouge_results['rougeL'] * 100:.2f}")
# BERTScore
print(f"BERTScore (Mean F1): {mean_bert_f1 * 100:.2f}")
print("="*40)

Calculating ROUGE scores...
Calculating BERTScore (using distilbert-base-uncased)... This may take a moment.

FINETUNED MODEL EVALUATION SCORES
ROUGE-1 (F1): 7.03
ROUGE-2 (F1): 1.97
ROUGE-L (F1): 5.50
BERTScore (Mean F1): 62.84


##### Save the Evaluation Result

In [29]:
# Gather the fine-tuned scores (fractions, not percentages) as a single record
finetuned_scores = {
    'ROUGE-1 (F1)': rouge_results['rouge1'],
    'ROUGE-2 (F1)': rouge_results['rouge2'],
    'ROUGE-L (F1)': rouge_results['rougeL'],
    'BERTScore (Mean F1)': mean_bert_f1,
}
eval_score = pd.DataFrame([finetuned_scores])

EVAL_FILE_PATH = os.path.join(GDRIVE_PATH, 'data/eval/baseline/eval_finetuned_score.csv')

# Write the scores once; an existing file is left untouched
if os.path.exists(EVAL_FILE_PATH):
    print("Evluation has already done.")
else:
    eval_score.to_csv(EVAL_FILE_PATH, index=False)
    print(f"Evaluation results saved to: {EVAL_FILE_PATH}")

Evluation has already done.


## Test 1st Epoch Model

In [9]:
import os
from transformers import AutoModelForSeq2SeqLM

# 1. Define the base paths
GDRIVE_PATH = '/content/drive/MyDrive/CS_685/youtube-video-summarization/'
OUTPUT_DIR = os.path.join(GDRIVE_PATH, 'model/training_output')

# 2. Define the exact checkpoint folder name
# This number must match the total steps completed at the end of the first epoch.
# Check the training logs for the precise step count if it is not 7333.
FIRST_EPOCH_CHECKPOINT = "checkpoint-7333"
FIRST_EPOCH_PATH = os.path.join(OUTPUT_DIR, FIRST_EPOCH_CHECKPOINT)

# Fail early with an actionable message when the hard-coded step count is wrong.
# Without this check the mistake only surfaces as an indirect error raised from
# inside from_pretrained (presumably while resolving the string as a Hub model id).
if not os.path.isdir(FIRST_EPOCH_PATH):
    found_checkpoints = (
        sorted(name for name in os.listdir(OUTPUT_DIR) if name.startswith("checkpoint-"))
        if os.path.isdir(OUTPUT_DIR)
        else []
    )
    raise FileNotFoundError(
        f"Checkpoint folder not found: {FIRST_EPOCH_PATH}. "
        f"Checkpoints available in {OUTPUT_DIR}: {found_checkpoints or 'none'}"
    )

# 3. Load the model
# NOTE: `device` is not defined in this cell; it must already exist in the
# kernel (e.g. 'cuda' or 'cpu') before this cell is run.
model_epoch_1 = AutoModelForSeq2SeqLM.from_pretrained(FIRST_EPOCH_PATH).to(device)

print(f"Model from first epoch loaded successfully from: {FIRST_EPOCH_PATH}")

# model_epoch_1 is the model used by the epoch-1 inference cell that follows.



Model from first epoch loaded successfully from: /content/drive/MyDrive/CS_685/youtube-video-summarization/model/training_output/checkpoint-7333


### Run the inference script

In [None]:
import torch
import pandas as pd
import os
from tqdm.auto import tqdm
from transformers import AutoModelForSeq2SeqLM # Ensure this is imported

# Test-set inference with the first-epoch checkpoint.
# Relies on names created by earlier cells: GDRIVE_PATH, model_epoch_1,
# tokenizer, test_dataloader and device.

# Constants used in the generation call
MAX_SUMMARY_LENGTH = 150
MIN_SUMMARY_LENGTH = 30
NUM_BEAMS = 4

# Define the full path for the epoch-1 results file
EPOCH_1_RESULTS_FILE = os.path.join(GDRIVE_PATH, 'data/results/baseline/bart_epoch1_results.csv')

# Prepare to collect results
bart_epoch_1_results = []

# The results CSV acts as a cache: generation only runs when it is missing.
if not os.path.exists(EPOCH_1_RESULTS_FILE):
    # Announce and prepare the model only when generation actually happens, so
    # the cached path neither prints a misleading banner nor needs the model.
    print("Starting BART Epoch1 Model Inference on Test Set...")

    # Ensure the model is in evaluation mode
    model_epoch_1.eval()

    # Ensure the directory for results exists before writing
    os.makedirs(os.path.dirname(EPOCH_1_RESULTS_FILE), exist_ok=True)

    # Disable gradient calculation for faster inference and lower memory usage
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Generating Epoch 1 Summaries"):
            # Extract input IDs and attention masks and move them to the correct device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # --- Model Generation ---
            summary_ids = model_epoch_1.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                num_beams=NUM_BEAMS,
                max_length=MAX_SUMMARY_LENGTH,
                min_length=MIN_SUMMARY_LENGTH,
                early_stopping=True,
                length_penalty=1.0  # set explicitly rather than relying on the model config
            )

            # --- Decode Summaries ---
            generated_summaries = tokenizer.batch_decode(
                summary_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )

            # Decode the original reference labels for comparison.
            # -100 is not a valid token id, so it is swapped for the pad id
            # before decoding.
            labels = batch['labels'].cpu().numpy()
            labels[labels == -100] = tokenizer.pad_token_id
            original_summaries = tokenizer.batch_decode(
                labels,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )

            # --- Store Results ---
            for gen_sum, ref_sum in zip(generated_summaries, original_summaries):
                bart_epoch_1_results.append({
                    'original_summary': ref_sum,
                    'generated_summary': gen_sum,
                })

    print("\nEpoch 1 Summary Generation Complete.")

    # Convert results to a DataFrame for ROUGE evaluation
    bart_epoch_1_df = pd.DataFrame(bart_epoch_1_results)
    print(f"Epoch 1 BART results stored: {len(bart_epoch_1_df)} entries.")

    # Save the results to the results directory
    bart_epoch_1_df.to_csv(EPOCH_1_RESULTS_FILE, index=False)
else:
    print("Epoch 1 BART results already exist. Loading from file...")
    # Read both columns as plain text: default NA parsing would turn empty (or
    # "NA"-like) summaries into NaN, unlike the freshly generated frame.
    bart_epoch_1_df = pd.read_csv(EPOCH_1_RESULTS_FILE, dtype=str, keep_default_na=False)
    print(f"Epoch 1 BART results loaded: {len(bart_epoch_1_df)} entries.")

#### Evaluate ROUGE and BERTScore

In [16]:
import evaluate
import pandas as pd
import os
import torch

# Scores the epoch-1 summaries against the reference summaries with ROUGE and
# BERTScore, reading the results CSV written by the inference cell.

# --- 1. Load the Evaluation Metrics ---
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# --- 2. Define the path (must match the path used by the inference cell) ---
GDRIVE_PATH = '/content/drive/MyDrive/CS_685/youtube-video-summarization/'
EPOCH_1_RESULTS_FILE = os.path.join(GDRIVE_PATH, 'data/results/baseline/bart_epoch1_results.csv')

# --- 3. Load the Epoch-1 Results ---
# Both columns are read as plain text; default NA parsing would turn empty (or
# "NA"-like) summaries into NaN floats, which the metrics cannot score as text.
try:
    bart_epoch_1_df = pd.read_csv(EPOCH_1_RESULTS_FILE, dtype=str, keep_default_na=False)
except FileNotFoundError as err:
    # Raise instead of calling exit(): in a notebook, exit() shuts the kernel
    # down (losing all state) rather than just stopping this cell.
    raise FileNotFoundError(
        f"Error: epoch 1 results file not found at {EPOCH_1_RESULTS_FILE}. Please run the previous cell first."
    ) from err

# An empty file would otherwise end in a ZeroDivisionError when averaging below.
if bart_epoch_1_df.empty:
    raise ValueError(f"No rows found in {EPOCH_1_RESULTS_FILE}; nothing to evaluate.")

# Extract predictions and references
predictions = bart_epoch_1_df['generated_summary'].tolist()
references = bart_epoch_1_df['original_summary'].tolist()


# --- 4. Calculate ROUGE scores ---
print("Calculating ROUGE scores...")
rouge_results = rouge.compute(
    predictions=predictions,
    references=references,
    use_stemmer=True
)

# --- 5. Calculate BERTScore ---
# Using DistilBERT for faster scoring
print("Calculating BERTScore (using distilbert-base-uncased)... This may take a moment.")
bert_results = bertscore.compute(
    predictions=predictions,
    references=references,
    lang="en",
    model_type="distilbert-base-uncased"
)

# Calculate the mean F1 score for BERTScore
mean_bert_f1 = sum(bert_results['f1']) / len(bert_results['f1'])

# Report (scores shown as percentages). The banner names the epoch-1 model so
# this output is not mistaken for the fine-tuned model's report.
print("\n" + "="*40)
print("EPOCH 1 MODEL EVALUATION SCORES")
print("="*40)
# ROUGE Scores
print(f"ROUGE-1 (F1): {rouge_results['rouge1'] * 100:.2f}")
print(f"ROUGE-2 (F1): {rouge_results['rouge2'] * 100:.2f}")
print(f"ROUGE-L (F1): {rouge_results['rougeL'] * 100:.2f}")
# BERTScore
print(f"BERTScore (Mean F1): {mean_bert_f1 * 100:.2f}")
print("="*40)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Calculating ROUGE scores...
Calculating BERTScore (using distilbert-base-uncased)... This may take a moment.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]


FINETUNED MODEL EVALUATION SCORES
ROUGE-1 (F1): 7.03
ROUGE-2 (F1): 1.97
ROUGE-L (F1): 5.50
BERTScore (Mean F1): 62.84


#### Save the result

In [17]:
# Collect the epoch-1 aggregate metrics into a one-row DataFrame.
# Values are stored exactly as returned by the metric computations
# (not multiplied by 100).
eval_score = pd.DataFrame({
    'ROUGE-1 (F1)': [rouge_results['rouge1']],
    'ROUGE-2 (F1)': [rouge_results['rouge2']],
    'ROUGE-L (F1)': [rouge_results['rougeL']],
    'BERTScore (Mean F1)': [mean_bert_f1]
})

EVAL_FILE_PATH = os.path.join(GDRIVE_PATH, 'data/eval/baseline/eval_epoch1_score.csv')

# Save the evaluation results.
# The file is written only once: an existing CSV is left untouched, so it has
# to be deleted manually if the scores should be refreshed.
if not os.path.exists(EVAL_FILE_PATH):
    # to_csv does not create missing parent folders, so make sure they exist.
    os.makedirs(os.path.dirname(EVAL_FILE_PATH), exist_ok=True)
    eval_score.to_csv(EVAL_FILE_PATH, index=False)
    print(f"Evaluation results saved to: {EVAL_FILE_PATH}")
else:
    print("Evluation has already done.")

Evaluation results saved to: /content/drive/MyDrive/CS_685/youtube-video-summarization/data/eval/baseline/eval_epoch1_score.csv


### Result Analysis