## Step 1: Clone Repository (Kaggle Only)

Run this cell on Kaggle to get the latest code from GitHub:

In [1]:
g_inference_only = False

#delete directory if exists
import shutil
import os
if os.path.exists('prj-2-opencv-dl-pytorch'):
    shutil.rmtree('prj-2-opencv-dl-pytorch')
# Clone repository from GitHub
!git clone https://github.com/ramabyg/prj-2-opencv-dl-pytorch.git

# Add to Python path
import sys
sys.path.insert(0, '/kaggle/working/prj-2-opencv-dl-pytorch')

print("[OK] Repository cloned and added to path")

Cloning into 'prj-2-opencv-dl-pytorch'...
remote: Enumerating objects: 182, done.[K
remote: Counting objects: 100% (182/182), done.[K
remote: Compressing objects: 100% (123/123), done.[K
remote: Total 182 (delta 110), reused 126 (delta 58), pack-reused 0 (from 0)[K
Receiving objects: 100% (182/182), 2.87 MiB | 18.59 MiB/s, done.
Resolving deltas: 100% (110/110), done.
remote: Enumerating objects: 182, done.[K
remote: Counting objects: 100% (182/182), done.[K
remote: Compressing objects: 100% (123/123), done.[K
remote: Total 182 (delta 110), reused 126 (delta 58), pack-reused 0 (from 0)[K
Receiving objects: 100% (182/182), 2.87 MiB | 18.59 MiB/s, done.
Resolving deltas: 100% (110/110), done.
[OK] Repository cloned and added to path
[OK] Repository cloned and added to path


## Step 2: Import Modules

In [2]:
# Project modules from the `src` package of the repository cloned in Step 1.
from src.config import get_config
from src.datamodule import KenyanFood13DataModule
from src.model import KenyanFood13Classifier
from src.trainer import train_model
# Only referenced by the commented-out "Option 2" in Step 4.
from src.utils import calculate_dataset_mean_std

print("[OK] All modules imported successfully")

[OK] All modules imported successfully


## Step 3: Configure Training

In [None]:
# Build the three config objects (get_config auto-detects the Kaggle environment).
# Phase 1 + Phase 2 (RandAugment) settings; only the values that are passed
# explicitly are collected here.
config_overrides = dict(
    num_epochs=10,           # Recommended for Phase 2 (with RandAugment)
    batch_size=16,           # Reduced for memory efficiency with freeze_pct=0.0
    use_scheduler=True,
    scheduler="cosine",
)
# All other Phase 1+2 settings are now defaults:
# - freeze_pct=0.0 (train all layers)
# - learning_rate=0.0001 (optimal for freeze_pct=0.0)
# - optimizer="adamw"
# - scheduler="cosine"
# - label_smoothing=0.1 (in model)
# - RandAugment (in datamodule)
# - input_size=384 (memory-efficient)
train_config, data_config, system_config = get_config(**config_overrides)

# Optional: Customize early stopping
# train_config.early_stop_patience = 10  # More patient
# train_config.use_early_stopping = False  # Disable completely

# Echo the effective settings so the run is self-documenting
print(f"Training for {train_config.num_epochs} epochs")
print(f"Model: {train_config.model_name}")
print(f"Learning rate: {train_config.learning_rate}")
print(f"Batch size: {train_config.batch_size}")
print(f"Freeze percentage: {train_config.freeze_pct} (0.0 = all layers trainable)")
print(f"Optimizer: {train_config.optimizer}")
# The scheduler name is only meaningful when scheduling is switched on
scheduler_label = train_config.scheduler if train_config.use_scheduler else 'none'
print(f"Scheduler: {scheduler_label}")
print(f"Early stopping: patience={train_config.early_stop_patience}")
print(f"Device: {system_config.device}")
print(f"Output directory: {system_config.output_dir}")

Training for 10 epochs
Model: efficientnetv2
Learning rate: 0.0001
Batch size: 32
Early stopping: patience=7
Device: cuda
Output directory: /kaggle/working/output


## Step 4: Calculate Dataset Statistics (Optional)

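By default nothing is calculated here: the notebook uses the model-specific (ImageNet) preprocessing statistics for a faster startup. The commented-out options in the cell show how to calculate the statistics from the dataset or hard-code pre-computed values instead.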

In [4]:
# Option 1: Use model-specific preprocessing (RECOMMENDED for pre-trained models)
# This uses the exact same preprocessing (mean, std, resolution) as the original pre-training.
# Nothing is computed in this cell; it only reports which model's preprocessing is selected.
print(f"Using model-specific preprocessing for: {train_config.model_name}")

# NOTE(review): Options 2 and 3 below only assign `mean` / `std`. The data module in
# Step 5 is constructed without them, so enabling either option presumably also
# requires passing the values on - confirm against KenyanFood13DataModule.

# Option 2 (Alternative): Calculate mean/std from the dataset itself
# mean, std = calculate_dataset_mean_std(
#     annotations_file=data_config.annotations_file,
#     img_dir=data_config.img_dir,
#     sample_size=None  # None means use all images; pass an int to use a subset
# )

# Option 3 (Alternative): Use pre-computed values (fastest)
# mean = [0.5672, 0.4663, 0.3659]
# std = [0.2484, 0.2561, 0.2600]

Using model-specific preprocessing for: efficientnetv2


## Step 5: Create Data Module and Model

In [5]:
# Data module: the model name is forwarded so the module can select the
# preprocessing that belongs to that model.
data_module = KenyanFood13DataModule(data_config=data_config, model_name=train_config.model_name)
data_module.setup()
print(f"[OK] Data module created with {data_module.num_classes} classes")

# Classifier: sized to the number of classes found by the data module.
model = KenyanFood13Classifier(
    train_config,
    data_module.num_classes,
)
print(f"[OK] Model created: {train_config.model_name}")

[INFO] Using model-specific preprocessing: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
[INFO] Using column 'class' as label column
[INFO] CSV columns: ['id', 'class']
Using 5228 samples for training.
Class to index mapping: {'bhaji': 0, 'chapati': 1, 'githeri': 2, 'kachumbari': 3, 'kukuchoma': 4, 'mandazi': 5, 'masalachips': 6, 'matoke': 7, 'mukimo': 8, 'nyamachoma': 9, 'pilau': 10, 'sukumawiki': 11, 'ugali': 12}
Dataset initialized with 5228 samples belonging to 13 classes.
[INFO] Using column 'class' as label column
[INFO] CSV columns: ['id', 'class']
Using 1308 samples for validation.
Class to index mapping: {'bhaji': 0, 'chapati': 1, 'githeri': 2, 'kachumbari': 3, 'kukuchoma': 4, 'mandazi': 5, 'masalachips': 6, 'matoke': 7, 'mukimo': 8, 'nyamachoma': 9, 'pilau': 10, 'sukumawiki': 11, 'ugali': 12}
Dataset initialized with 1308 samples belonging to 13 classes.
[OK] Data module created with 13 classes
Loading pre-trained EfficientNetV2-S weights...


Downloading: "https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_v2_s-dd5fe13b.pth
100%|██████████| 82.7M/82.7M [00:00<00:00, 199MB/s]



[OK] Pre-trained EfficientNetV2-S weights loaded successfully
Freezing early layers, unfreezing later layers for fine-tuning...
  Layers frozen: 0 / 3
  Trainable: 21,458,488 / 21,458,488 parameters (100.0%)
[OK] Model created: efficientnetv2


In [6]:
# Diagnostics: check labels, mapping, a sample batch and a model forward pass.
# This cell is read-only with respect to training: the model's train/eval mode
# is restored and the model is moved back to the CPU afterwards.
import pandas as pd
import torch
import numpy as np

print("--- Diagnostics Start ---")
# Check CSV columns and a small preview
df = None  # stays None when the annotations file cannot be read
try:
    df = pd.read_csv(data_config.annotations_file)
    print("CSV columns:", list(df.columns))
    print(df.head())
except Exception as e:
    print("Could not read annotations file:", e)

# Show detected label column and class mapping
label_col = getattr(data_module.train_dataset, 'label_col', None)
print("Detected label column:", label_col)
print("class_to_idx mapping (sample):", dict(list(data_module.train_dataset.class_to_idx.items())[:10]))

# Show label distribution (if available). Still best-effort, but a failure is
# now reported instead of being swallowed silently.
try:
    if df is not None and label_col and label_col in df.columns:
        print('\nLabel distribution (top counts):')
        print(df[label_col].value_counts().head(20))
    else:
        print('\nLabel distribution unavailable (annotations not loaded or label column not found)')
except Exception as e:
    print('Could not compute label distribution:', e)

# Inspect a single batch from train loader
train_loader = data_module.train_dataloader()
images, labels = next(iter(train_loader))
print('\nTrain batch images shape:', images.shape)
print('Train batch labels shape:', labels.shape)
print('Sample label indices:', labels[:10].tolist())

# Reverse mapping idx -> class name
idx_to_class = {v: k for k, v in data_module.train_dataset.class_to_idx.items()}
print('Sample label names:', [idx_to_class.get(int(x), '?') for x in labels[:10]])

# Basic image stats (after preprocessing)
print('Image min/max:', float(images.min()), float(images.max()))

# Quick forward pass through model to inspect outputs
device = torch.device('cuda' if torch.cuda.is_available() and system_config.device == 'cuda' else 'cpu')
print('Using device for diagnostics:', device)

# torch.no_grad() only disables gradient tracking; Dropout stays active and
# BatchNorm keeps updating its running statistics unless the module is in
# eval mode. Switch to eval for the check and restore the previous mode after.
# (assumes `model` is a torch.nn.Module - it is already used via .to() and __call__)
was_training = model.training
model = model.to(device)
model.eval()
images = images.to(device)
try:
    with torch.no_grad():
        logits = model(images).cpu()
        probs = torch.softmax(logits, dim=1)
        # max over the class dimension yields both the confidence and the index
        top_conf_vals, top1_idx = probs.max(dim=1)
        top1 = top1_idx.tolist()
        top_conf = top_conf_vals.tolist()
finally:
    # Leave the model exactly as training expects it and release the GPU
    # memory used by this check before the training step starts.
    model.train(was_training)
    model = model.cpu()
    images = images.cpu()
    if device.type == 'cuda':
        torch.cuda.empty_cache()

print('\nModel predictions (top1 indices):', top1)
print('Model top confidences:', [round(float(x), 4) for x in top_conf])

print('\nData module mean/std:', data_module.mean, data_module.std)
print('--- Diagnostics End ---')

--- Diagnostics Start ---
CSV columns: ['id', 'class']
                     id       class
0  14278962987112149800     githeri
1  13190220095752321996       ugali
2  10431803432626641638  kachumbari
3   4222441716327528413     githeri
4   2547906925836120627      matoke
Detected label column: class
class_to_idx mapping (sample): {'bhaji': 0, 'chapati': 1, 'githeri': 2, 'kachumbari': 3, 'kukuchoma': 4, 'mandazi': 5, 'masalachips': 6, 'matoke': 7, 'mukimo': 8, 'nyamachoma': 9}

Label distribution (top counts):
class
chapati        862
nyamachoma     784
bhaji          632
ugali          628
mandazi        620
kachumbari     494
matoke         483
githeri        479
masalachips    438
sukumawiki     402
pilau          329
mukimo         212
kukuchoma      173
Name: count, dtype: int64

Train batch images shape: torch.Size([32, 3, 480, 480])
Train batch labels shape: torch.Size([32])
Sample label indices: [6, 9, 3, 5, 9, 5, 0, 6, 11, 9]
Sample label names: ['masalachips', 'nyamachoma', 'ka

## Step 6: Train!

In [7]:
if not g_inference_only:
    # Train the model. The second return value is not used in this notebook.
    trained_model, _, checkpoint_callback = train_model(
        training_config=train_config,
        data_config=data_config,
        system_config=system_config,
        model=model,
        data_module=data_module
    )

    print(f"\n[OK] Training complete!")
    print(f"Best model: {checkpoint_callback.best_model_path}")

    # best_model_score is None when no checkpoint was recorded; formatting
    # None with ':.4f' would raise after an otherwise successful training run.
    best_score = checkpoint_callback.best_model_score
    if best_score is None:
        print("[WARN] Best accuracy: n/a (no checkpoint score was recorded)")
    else:
        print(f"Best accuracy: {float(best_score):.4f}")

Early stopping enabled: monitor=valid/acc, patience=7
📊 Detected 2 GPUs - Using DDP Notebook strategy
⚠️  Note: If still using 1 GPU, Kaggle may need kernel restart

Starting Training
[INFO] Using column 'class' as label column
[INFO] CSV columns: ['id', 'class']
Using 5228 samples for training.
Class to index mapping: {'bhaji': 0, 'chapati': 1, 'githeri': 2, 'kachumbari': 3, 'kukuchoma': 4, 'mandazi': 5, 'masalachips': 6, 'matoke': 7, 'mukimo': 8, 'nyamachoma': 9, 'pilau': 10, 'sukumawiki': 11, 'ugali': 12}
Dataset initialized with 5228 samples belonging to 13 classes.
[INFO] Using column 'class' as label column
[INFO] CSV columns: ['id', 'class']
Using 1308 samples for validation.
Class to index mapping: {'bhaji': 0, 'chapati': 1, 'githeri': 2, 'kachumbari': 3, 'kukuchoma': 4, 'mandazi': 5, 'masalachips': 6, 'matoke': 7, 'mukimo': 8, 'nyamachoma': 9, 'pilau': 10, 'sukumawiki': 11, 'ugali': 12}
Dataset initialized with 1308 samples belonging to 13 classes.
[INFO] Using column '

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 106.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 72.19 MiB is free. Process 5005 has 14.67 GiB memory in use. Of the allocated memory 14.35 GiB is allocated by PyTorch, and 193.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Step 7: Save Outputs to Kaggle Dataset

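This copies the best checkpoint, a training summary and the TensorBoard logs into `/kaggle/working/kenyan_food_model_output/` and zips that folder for download. You can then upload it as a Kaggle Dataset for easy access later.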

In [None]:
if not g_inference_only:
    import json
    import shutil
    from pathlib import Path

    # Create a clean output directory for the dataset
    dataset_dir = Path("/kaggle/working/kenyan_food_model_output")
    if dataset_dir.exists():
        shutil.rmtree(dataset_dir)
    dataset_dir.mkdir(exist_ok=True)

    # Copy the best checkpoint.
    # is_file() is used instead of exists(): an empty best_model_path becomes
    # Path("."), which "exists" as a directory and would make shutil.copy fail.
    best_checkpoint = Path(checkpoint_callback.best_model_path)
    if best_checkpoint.is_file():
        shutil.copy(best_checkpoint, dataset_dir / "best_model.ckpt")
        print(f"[OK] Saved best checkpoint: {best_checkpoint.name}")
    else:
        print(f"[WARN] Best checkpoint not found at '{checkpoint_callback.best_model_path}' - best_model.ckpt was not saved")

    # Save training summary as JSON.
    # The best score may be None when no checkpoint was recorded; store null
    # in that case instead of crashing on float(None).
    best_score = checkpoint_callback.best_model_score
    summary = {
        "best_val_accuracy": float(best_score) if best_score is not None else None,
        "num_epochs": train_config.num_epochs,
        "batch_size": train_config.batch_size,
        "learning_rate": train_config.learning_rate,
        "model": train_config.model_name,
        "optimizer": train_config.optimizer,
        "scheduler": train_config.scheduler if train_config.use_scheduler else "none",
        "dataset_mean": data_module.mean,
        "dataset_std": data_module.std,
        "num_classes": data_module.num_classes,
        "checkpoint_path": str(best_checkpoint.name)
    }

    with open(dataset_dir / "training_summary.json", "w") as f:
        json.dump(summary, f, indent=2)
    print(f"[OK] Saved training summary")

    # Copy ALL TensorBoard logs (full logs for offline review)
    tb_logs_src = Path(system_config.output_dir) / "kenyan_food_logs"
    if tb_logs_src.exists():
        tb_logs_dest = dataset_dir / "tensorboard_logs"

        print(f"Copying TensorBoard logs from {tb_logs_src}...")
        shutil.copytree(tb_logs_src, tb_logs_dest, dirs_exist_ok=True)

        # Count total size of logs for user info
        total_size_bytes = sum(f.stat().st_size for f in tb_logs_dest.rglob('*') if f.is_file())
        total_size_mb = total_size_bytes / (1024 * 1024)
        print(f"[OK] Saved TensorBoard logs ({total_size_mb:.1f} MB)")
    else:
        print(f"[WARN] TensorBoard logs not found at {tb_logs_src}")

    # Create a ZIP file for easy download.
    # make_archive appends ".zip" to the base name, so the archive is written
    # next to (not inside) the directory being archived.
    print("\nCreating ZIP archive for download...")
    zip_path = Path("/kaggle/working/kenyan_food_model_output")
    shutil.make_archive(str(zip_path), 'zip', dataset_dir)
    zip_size_mb = Path(f"{zip_path}.zip").stat().st_size / (1024 * 1024)
    print(f"[OK] Created kenyan_food_model_output.zip ({zip_size_mb:.1f} MB)")

    print("\n" + "="*60)
    print("[OK] OUTPUT READY FOR DOWNLOAD!")
    print("="*60)
    print("\nFiles saved to: /kaggle/working/kenyan_food_model_output/")
    print("ZIP file: /kaggle/working/kenyan_food_model_output.zip")
    print("\nContents:")
    print("  - best_model.ckpt           : Best model checkpoint")
    print("  - training_summary.json     : Training metrics and config")
    print("  - tensorboard_logs/         : Full TensorBoard event files")
    print("\nTo download:")
    print("1. Click 'Output' tab in the right sidebar")
    print("2. Find 'kenyan_food_model_output.zip'")
    print("3. Click the download button")
    print("\nOr use Kaggle API to create a dataset for reuse in other notebooks!")
    print("="*60)

## Step 8: Generate Test Predictions

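After training completes, generate predictions on the test data and create `submission.csv`. In inference-only mode (`g_inference_only = True`) the checkpoint is read from `/kaggle/working/kenyan_food_model_output/best_model.ckpt` instead, so that file must already exist.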

In [None]:
from pathlib import Path

from src.inference import create_submission

# Pick the checkpoint: the copy saved by Step 7 in inference-only mode,
# otherwise the best checkpoint of the training run that just finished.
if g_inference_only:
    checkpoint_path = "/kaggle/working/kenyan_food_model_output/best_model.ckpt"
else:
    checkpoint_path = checkpoint_callback.best_model_path

# Fail early with an actionable message: in inference-only mode the file only
# exists if it was produced or uploaded beforehand, and after training the
# best-model path can be empty when no checkpoint was saved.
if not checkpoint_path or not Path(checkpoint_path).is_file():
    raise FileNotFoundError(
        f"Checkpoint not found: '{checkpoint_path}'. "
        "Run training (Steps 6-7) first or place a checkpoint at this path."
    )

submission_csv_path = "/kaggle/working/submission.csv"

# Generate predictions and create submission.csv
# Automatically detects Kaggle environment and uses correct paths
submission_df = create_submission(
    checkpoint_path=checkpoint_path,
    output_csv_path=submission_csv_path,
    model_config=train_config,
    batch_size=64
)

print(f"[OK] Submission created: {submission_csv_path}")
print(f"[INFO] Total predictions: {len(submission_df)}")
print(f"\n[INFO] Prediction distribution:")
print(submission_df['label'].value_counts())
print("\n[OK] Ready to download and submit to Kaggle!")