In [None]:
# One-click: clone repo into Colab runtime and install dependencies.
# Safe to re-run: the checkout is located relative to the current directory
# (including when the kernel is already inside it) before any clone is attempted.
import os, subprocess, sys
from pathlib import Path

repo_dir = 'YouTube-Video-Summarization'
repo_url = 'https://github.com/shigenogoro/YouTube-Video-Summarization.git'

# After a first run the kernel's working directory is inside the checkout, so a
# plain relative existence test would miss it and clone a nested copy.
cwd = Path.cwd()
repo_path = next((p for p in (cwd, *cwd.parents) if p.name == repo_dir), None)
if repo_path is not None:
    print('Already inside repository:', repo_path)
elif (cwd / repo_dir).exists():
    repo_path = cwd / repo_dir
    print('Repository already present:', repo_dir)
else:
    print('Cloning repository...')
    clone = subprocess.run(['git', 'clone', repo_url, repo_dir], check=False)
    if clone.returncode != 0:
        # Stop here with a clear message instead of a confusing chdir error below.
        raise RuntimeError(f'git clone failed with exit code {clone.returncode}')
    repo_path = cwd / repo_dir

# Change into repo directory (requirements.txt is resolved relative to it)
os.chdir(repo_path)
print('Working directory:', os.getcwd())

# Install requirements (quiet). If you prefer to edit this, remove --quiet.
# Installation stays best-effort, but non-zero pip exit codes are reported.
print('Installing python dependencies (this may take a minute)...')
pip_install = [sys.executable, '-m', 'pip', 'install', '--quiet']
install_steps = [
    ('requirements.txt', pip_install + ['-r', 'requirements.txt']),
    ('transformers/datasets/evaluate/accelerate',
     pip_install + ['transformers', 'datasets', 'evaluate', 'accelerate']),
]
failed_steps = [
    label for label, cmd in install_steps
    if subprocess.run(cmd, check=False).returncode != 0
]
if failed_steps:
    print('WARNING: pip install failed for:', ', '.join(failed_steps))
print('Dependencies installation step finished.')


In [None]:
# Ensure repository root is on sys.path by locating the `src/` marker directory
import sys, os
from pathlib import Path

def find_repo_root(marker='src'):
    """Return the nearest directory, from the cwd upward, that contains ``marker``.

    The resolved current working directory is checked first, then each of its
    parents in order. If none of them contains an entry named ``marker``, the
    resolved current working directory itself is returned.
    """
    start = Path.cwd().resolve()
    ancestors = (start, *start.parents)
    return next((folder for folder in ancestors if (folder / marker).exists()), start)

repo_root = find_repo_root('src')
if not (repo_root / 'src').exists():
    # find_repo_root falls back to the current directory when no ancestor holds
    # `src/`; say so instead of reporting a repo root that was never found.
    print('WARNING: no `src/` directory found at or above', repo_root,
          '- imports from `src` will fail until the repository is cloned.')
elif (repo_root / '.git').exists():
    print('Found repo root:', repo_root)
# Put the root first on sys.path so `src.*` and `notebooks.*` imports resolve to it.
repo_root_str = str(repo_root)
if repo_root_str not in sys.path:
    sys.path.insert(0, repo_root_str)
print('Repo root added to sys.path ->', repo_root)


In [None]:
# Mount Google Drive (Colab) and choose the dataset path.
# The chosen path is kept in DRIVE_DATA_PATH. It is written to DATA_CFG['path']
# only when DATA_CFG already exists, i.e. after the config-loading cell has run;
# otherwise a NameError would be misreported as a Drive/Colab failure.
DRIVE_DATA_PATH = None
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print('Drive mounted at /content/drive')
    # Suggested default path on Drive - adjust as needed
    default = '/content/drive/MyDrive/YouTube-Video-Summarization/data'
    use_drive = input(f"Use Google Drive dataset path? (leave blank to use suggested default) [default: {default}]: ")
    # Blank answer selects the suggested default.
    DRIVE_DATA_PATH = use_drive.strip() or default
except Exception as e:
    print('Not running in Colab or Drive mount failed:', e)
    print('If you mounted Drive manually, set DATA_CFG["path"] accordingly before dataset loading.')

if DRIVE_DATA_PATH is not None:
    if 'DATA_CFG' in globals():
        DATA_CFG['path'] = DRIVE_DATA_PATH
        print('DATA_CFG["path"] set to', DATA_CFG['path'])
    else:
        print('Drive dataset path chosen:', DRIVE_DATA_PATH)
        print('DATA_CFG is not defined yet (it is created by the config-loading cell).')
        print('Re-run this cell after loading the config, or set DATA_CFG["path"] = DRIVE_DATA_PATH before dataset loading.')


# Training Notebook (Colab-ready)

This notebook contains environment setup (for Colab), repository fetch/checkout, dependency installation, and the training flow that imports from `src/`.
It mirrors the modularized `00_setup.ipynb` steps but includes installation and runtime checks appropriate for a fresh Colab runtime.

In [None]:
# OPTIONAL: Clone this repository into the Colab runtime if you haven't uploaded it.
# If you already uploaded the repo or mounted Drive, skip this cell.
import os
import subprocess
from pathlib import Path

repo_name = 'YouTube-Video-Summarization'
here = Path.cwd()
# The kernel may already be inside the checkout (earlier setup cell, or a previous
# run of this one). A relative existence test would then miss it and clone a
# nested copy, so look at the current directory and its parents first.
checkout = next((p for p in (here, *here.parents) if p.name == repo_name), None)
if checkout is None:
    checkout = here / repo_name
    if not checkout.exists():
        result = subprocess.run(['git', 'clone', 'https://github.com/shigenogoro/YouTube-Video-Summarization.git', repo_name])
        if result.returncode != 0:
            # Surface the clone failure directly rather than as a chdir error.
            raise RuntimeError(f'git clone failed with exit code {result.returncode}')
os.chdir(checkout)
print('Working directory:', os.getcwd())

In [None]:
# Install Python dependencies. This will use the repo's requirements.txt and add common ML libs.
# You can edit this to add GPU-specific packages (bitsandbytes, deepspeed) if needed.
!pip install -r requirements.txt --quiet || true
!pip install -q transformers datasets evaluate accelerate || true
print('Dependencies installed (or already present).')

In [None]:
# Basic runtime checks (GPU, torch): report the torch version and CUDA visibility
import torch

print('torch version:', torch.__version__)
cuda_ok = torch.cuda.is_available()
print('CUDA available:', cuda_ok)
if cuda_ok:
    # Querying the device name is informational only; report failures and move on.
    try:
        gpu_name = torch.cuda.get_device_name(0)
        print('Device name:', gpu_name)
    except Exception as e:
        print('Could not query device name:', e)

In [None]:
# Load config and small helpers
from src.utils.io import load_yaml

# The config file is addressed relative to repo_root (set by the sys.path cell).
cfg_path = repo_root.joinpath('configs', 'model_bart_base.yaml')
cfg = load_yaml(str(cfg_path))
print('Loaded config keys:', list(cfg.keys()))

# 'model' and 'data' are indexed directly (KeyError if absent);
# 'preprocess' and 'training_args' default to empty dicts.
MODEL_NAME = cfg['model']['model_name']
DATA_CFG = cfg['data']
PREPROCESS_CFG, TRAINING_CFG = cfg.get('preprocess', {}), cfg.get('training_args', {})
print('Model:', MODEL_NAME)
print('Data config sample:', DATA_CFG)

In [None]:
# Set seed for reproducibility
from src.utils.seed import set_seed

# Use the 'seed' entry of the training config, or 42 when it is absent.
seed = TRAINING_CFG.get('seed', 42)
set_seed(seed)
print(f'Seed set to {seed}')

In [None]:
# Load dataset (try local/HF via existing loader, else fallback to small sample)
from src.data.loaders import load_json_dataset
from notebooks.helpers import sample_dataset

try:
    ds = load_json_dataset(DATA_CFG)
    split_names = list(ds.keys())
    print('Dataset loaded with splits:', split_names)
except Exception as load_err:
    # Any loader failure switches to a sample dataset for both 'train' and
    # 'validation', each built by its own sample_dataset() call.
    print('Could not load dataset from disk/HF (falling back to sample):', load_err)
    ds = {split: sample_dataset() for split in ('train', 'validation')}
    print('Sample dataset created with columns:', ds['train'].column_names)

In [None]:
# Build model + tokenizer (downloads model weights on first run)
from src.models.build_model import build_model_and_tokenizer

print('Model name to load:', MODEL_NAME)
model_cfg = cfg.get('model', {})
model, tokenizer = build_model_and_tokenizer(MODEL_NAME, model_cfg)
# Not every tokenizer object is assumed to expose vocab_size; show 'n/a' if missing.
vocab_size = getattr(tokenizer, 'vocab_size', 'n/a')
print('Model and tokenizer ready — tokenizer vocab size =', vocab_size)

In [None]:
# Tokenize datasets using helpers.make_tokenize_fn and HF dataset.map (batched)
from notebooks.helpers import make_tokenize_fn

# Column names come from the data config, length limits from the preprocess
# config; each falls back to the literal default given here.
input_col = DATA_CFG.get('text_column', 'dialogue')
summary_col = DATA_CFG.get('summary_column', 'summary')
max_input = PREPROCESS_CFG.get('max_input_length', 1024)
max_target = PREPROCESS_CFG.get('max_target_length', 256)
tokenize_fn = make_tokenize_fn(
    tokenizer,
    input_col=input_col,
    target_col=summary_col,
    max_input_length=max_input,
    max_target_length=max_target,
)

def maybe_map(split):
    """Tokenize a dataset split with the notebook-level ``tokenize_fn``.

    Calls ``split.map(tokenize_fn, batched=True, remove_columns=split.column_names)``.
    This is best-effort: if the call raises, the failure is printed and the
    original ``split`` is returned unchanged so the notebook keeps running.

    Args:
        split: Object exposing ``map`` and ``column_names``.

    Returns:
        The result of ``split.map(...)``, or ``split`` itself if mapping failed.
    """
    try:
        return split.map(tokenize_fn, batched=True, remove_columns=split.column_names)
    except Exception as exc:
        # Report the failure: an untokenized split otherwise only shows up as an
        # unrelated-looking error in a later cell.
        print(f'WARNING: tokenization failed ({type(exc).__name__}: {exc}); returning the split untokenized.')
        return split

train_tok = maybe_map(ds['train'])

# Evaluation split preference: 'validation', then 'valid', then 'test', and
# finally the training split. Resolved from the lowest priority upward so a
# higher-priority key overrides whatever was picked before it.
eval_source = ds['train']
for split_key in ('test', 'valid', 'validation'):
    eval_source = ds.get(split_key, eval_source)
eval_tok = maybe_map(eval_source)

train_keys = list(train_tok.features.keys()) if hasattr(train_tok, 'features') else 'n/a'
print('Tokenization finished — sample keys for train tokenized:', train_keys)

In [None]:
# Build trainer (does not start training yet)
from src.training.trainer import build_trainer

trainer = build_trainer(model, tokenizer, train_tok, eval_tok, cfg)
output_dir = trainer.args.output_dir
print('Trainer built. Output dir =', output_dir)

## Start training
The next cell will prompt you before starting training. Training in Colab may require a GPU runtime and will download model weights and datasets. Make sure you have enough runtime quota and have selected a GPU runtime (Runtime > Change runtime type > GPU).

In [None]:
# Prompt before training to avoid accidental runs in Colab
start = input('Start training now? (y/N): ')
# Only an answer of 'y' (any case, surrounding whitespace ignored) starts training.
confirmed = start.strip().lower() == 'y'
if not confirmed:
    print('Training skipped. To run, re-run this cell and enter y.')
else:
    print('Starting training...')
    trainer.train()