In [None]:
# clone the repo branch kyle (idempotent if you remove older clone)
!git clone --branch kyle --single-branch https://github.com/shigenogoro/YouTube-Video-Summarization.git
%cd YouTube-Video-Summarization

In [None]:
# Ensure repository root is on sys.path by locating the `src/` marker directory
import sys, os
from pathlib import Path

def find_repo_root(marker='src'):
    """Locate the nearest directory at or above the CWD that contains ``marker``.

    The resolved current working directory is checked first, then each of its
    ancestors from closest to farthest.

    Args:
        marker: Name of the entry (relative to each candidate directory) whose
            existence identifies the repository root.

    Returns:
        The first candidate ``Path`` containing ``marker``; the resolved current
        working directory if no candidate contains it.
    """
    start = Path('.').resolve()
    # Lazily walk start -> filesystem root and stop at the first match.
    hits = (folder for folder in (start, *start.parents) if (folder / marker).exists())
    return next(hits, start)

repo_root = find_repo_root('src')
# Prepend only when absent so re-running the cell does not pile up duplicate entries.
if sys.path.count(str(repo_root)) == 0:
    sys.path.insert(0, str(repo_root))
print('Repo root added to sys.path ->', repo_root)


# Project Setup (modular)
This notebook demonstrates a modular setup that imports from the `src/` package in this repository.
It mirrors common Colab setup steps (config loading, seeding, dataset loading, model & tokenizer construction, tokenization, and trainer construction) while delegating the logic to `src/` modules.

In [None]:
# Load config and small helpers
from src.utils.io import load_yaml

# Build the config path from the detected repo root rather than the current working directory.
cfg_path = repo_root / 'configs' / 'model_bart_base.yaml'
cfg = load_yaml(str(cfg_path))
print('Loaded config keys:', list(cfg.keys()))

# Expose commonly used pieces.
# `model.model_name` and `data` are mandatory: a missing key raises KeyError right here.
MODEL_NAME = cfg['model']['model_name']
DATA_CFG = cfg['data']
# Optional sections. `or {}` also covers a section that is present but empty
# (YAML loaders typically return None for it), which would otherwise break the
# `.get(...)` calls made on these dicts in later cells.
PREPROCESS_CFG = cfg.get('preprocess') or {}
# NOTE(review): only `training_args` is consulted; if the config uses a different
# section name for trainer settings, add that fallback here.
TRAINING_CFG = cfg.get('training_args') or {}
print('Model:', MODEL_NAME)
print('Data config sample:', DATA_CFG)

In [None]:
# Seed the run through the project's `set_seed` helper; use 42 when the config has no `seed`.
from src.utils.seed import set_seed

seed = TRAINING_CFG['seed'] if 'seed' in TRAINING_CFG else 42
set_seed(seed)
print('Seed set to', seed)

In [None]:
# Try to load dataset via src.data.loaders; fall back to a small sample if files are missing
from src.data.loaders import load_json_dataset
from notebooks.helpers import sample_dataset

try:
    ds = load_json_dataset(DATA_CFG)
    print('Dataset loaded with splits:', list(ds.keys()))
except Exception as load_error:
    # Deliberate best-effort: any loading failure is reported, then replaced by a
    # sample dataset (a plain dict of splits) so the remaining cells can still run.
    print('Could not load dataset from disk/HF (falling back to sample):', load_error)
    ds = {split_name: sample_dataset() for split_name in ('train', 'validation')}
    print('Sample dataset created with columns:', ds['train'].column_names)

In [None]:
# Construct the model/tokenizer pair through the project helper
# (this will download weights the first time).
from src.models.build_model import build_model_and_tokenizer

print('Model name to load:', MODEL_NAME)
model_section = cfg.get('model', {})
model, tokenizer = build_model_and_tokenizer(MODEL_NAME, model_section)
print('Model and tokenizer ready — tokenizer vocab size =', tokenizer.vocab_size)

In [None]:
# Tokenize datasets using helpers.make_tokenize_fn and HF dataset.map (batched)
from notebooks.helpers import make_tokenize_fn
# Column names come from the data config, length limits from the preprocess config;
# each falls back to the default shown when its key is absent.
input_col, summary_col = (
    DATA_CFG.get('text_column', 'dialogue'),
    DATA_CFG.get('summary_column', 'summary'),
)
max_input, max_target = (
    PREPROCESS_CFG.get('max_input_length', 1024),
    PREPROCESS_CFG.get('max_target_length', 256),
)
tokenize_fn = make_tokenize_fn(
    tokenizer,
    input_col=input_col,
    target_col=summary_col,
    max_input_length=max_input,
    max_target_length=max_target,
)

# `ds` may be whatever the loader returned or the plain-dict fallback built above;
# each split is tokenized the same way through this helper.
def maybe_map(split, map_fn=None):
    """Tokenize one split via ``split.map`` when the split supports it.

    Args:
        split: A dataset split. Objects exposing both a callable ``map`` and a
            ``column_names`` attribute (the two members this function uses) are
            tokenized; anything else is returned unchanged.
        map_fn: Batched tokenization callable passed to ``split.map``. Defaults
            to the notebook-level ``tokenize_fn``.

    Returns:
        The result of ``split.map(map_fn, batched=True, remove_columns=...)``,
        or ``split`` itself when it cannot be mapped.

    Raises:
        Whatever ``split.map`` raises. Tokenization errors are intentionally not
        swallowed: returning an untokenized split would only defer the failure
        to a later cell with a less useful message.
    """
    if map_fn is None:
        map_fn = tokenize_fn  # resolved at call time from the notebook namespace

    mappable = callable(getattr(split, 'map', None)) and hasattr(split, 'column_names')
    if not mappable:
        # Best-effort path for objects that are not dataset-like: pass them through, loudly.
        print('maybe_map: split of type', type(split).__name__, 'has no .map/.column_names; returning it untokenized')
        return split

    # Drop the original columns so only the tokenizer outputs remain.
    return split.map(map_fn, batched=True, remove_columns=split.column_names)

train_tok = maybe_map(ds['train'])
# Evaluation split preference: 'validation', then 'valid', then 'test'; reuse 'train' if none exists.
eval_split = next(
    (ds[split_name] for split_name in ('validation', 'valid', 'test') if split_name in ds),
    ds['train'],
)
eval_tok = maybe_map(eval_split)
if hasattr(train_tok, 'features'):
    train_keys = list(train_tok.features.keys())
else:
    train_keys = 'n/a'
print('Tokenization finished — sample keys for train tokenized:', train_keys)

In [None]:
# Assemble the trainer from the pieces built above; nothing is trained in this cell.
from src.training.trainer import build_trainer

trainer = build_trainer(
    model,
    tokenizer,
    train_tok,
    eval_tok,
    cfg,
)
trainer_args = trainer.args
print('Trainer built. Trainer args output_dir =', trainer_args.output_dir)

## Next steps
- Run `trainer.train()` to start training (may require GPU and large disk/network downloads).
- Move heavy dataset downloads and preprocessing to a separate notebook (`01_train.ipynb`).
- Adjust `configs/model_bart_base.yaml` to point `data.path` to your local dataset.