## Step 1: Clone Repository (Kaggle Only)

Run this cell on Kaggle to get the latest code from GitHub:

In [1]:
#delete directory if exists
import shutil
import os
if os.path.exists('prj-2-opencv-dl-pytorch'):
    shutil.rmtree('prj-2-opencv-dl-pytorch')
# Clone repository from GitHub
!git clone https://github.com/ramabyg/prj-2-opencv-dl-pytorch.git

# Add to Python path
import sys
sys.path.insert(0, '/kaggle/working/prj-2-opencv-dl-pytorch')

print("[OK] Repository cloned and added to path")

Cloning into 'prj-2-opencv-dl-pytorch'...
remote: Enumerating objects: 145, done.
remote: Counting objects: 100% (145/145), done.
remote: Compressing objects: 100% (97/97), done.
remote: Total 145 (delta 85), reused 103 (delta 47), pack-reused 0 (from 0)
Receiving objects: 100% (145/145), 2.35 MiB | 8.61 MiB/s, done.
Resolving deltas: 100% (85/85), done.
[OK] Repository cloned and added to path


## Step 2: Import Modules

In [2]:
# Project modules from the `src` package of the cloned repository. These
# imports only resolve after the clone cell has put the repository directory
# on sys.path.
from src.config import get_config  # builds the train/data/system configs (Step 3)
from src.datamodule import KenyanFood13DataModule  # data module created in Step 5
from src.model import KenyanFood13Classifier  # classifier created in Step 5
from src.trainer import train_model  # training entry point called in Step 6
from src.utils import calculate_dataset_mean_std  # only referenced by the commented-out Option 2 in Step 4

# Reached only if every import above succeeded.
print("[OK] All modules imported successfully")

[OK] All modules imported successfully


## Step 3: Configure Training

In [3]:
# Build the three config objects. Only the values listed here are overridden;
# everything else (model, learning rate, early-stopping patience, device,
# output directory) keeps whatever get_config chooses by default. The printout
# at the end of this cell shows the values actually in effect -- in the
# recorded run: model efficientnetv2, learning rate 0.0001.
config_overrides = dict(
    num_epochs=100,      # Adjust as needed
    batch_size=32,       # For single GPU
    use_scheduler=True,
    scheduler="cosine",
)
train_config, data_config, system_config = get_config(**config_overrides)

# Optional: Customize early stopping
# train_config.early_stop_patience = 10  # More patient
# train_config.use_early_stopping = False  # Disable completely

# Echo the effective settings, one per line, so the run is self-documenting.
for template, source, field in (
    ("Training for {} epochs", train_config, "num_epochs"),
    ("Model: {}", train_config, "model_name"),
    ("Learning rate: {}", train_config, "learning_rate"),
    ("Batch size: {}", train_config, "batch_size"),
    ("Early stopping: patience={}", train_config, "early_stop_patience"),
    ("Device: {}", system_config, "device"),
    ("Output directory: {}", system_config, "output_dir"),
):
    print(template.format(getattr(source, field)))

Training for 100 epochs
Model: efficientnetv2
Learning rate: 0.0001
Batch size: 32
Early stopping: patience=7
Device: cuda
Output directory: /kaggle/working/output


## Step 4: Calculate Dataset Statistics (Optional)

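This step is optional. The cell below keeps the model-specific preprocessing (Option 1), which in the recorded run uses the ImageNet mean/std, so no dataset statistics need to be computed. Options 2 and 3 are left commented out as alternatives.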

In [4]:
# Option 1 (active, RECOMMENDED for pre-trained models): rely on the
# model-specific preprocessing. Nothing is computed here; this cell only
# reports which model's preprocessing will be used.
print("Using model-specific preprocessing for: {}".format(train_config.model_name))

# NOTE(review): Options 2 and 3 below only assign `mean` / `std`. The data
# module in Step 5 is constructed from `data_config` and `model_name` alone, so
# enabling either option has no visible effect unless those values are also
# passed on -- confirm how KenyanFood13DataModule accepts custom statistics.

# Option 2 (Alternative): Calculate from dataset (more accurate for custom stats)
# mean, std = calculate_dataset_mean_std(
#     annotations_file=data_config.annotations_file,
#     img_dir=data_config.img_dir,
#     sample_size=None  # Use more samples on Kaggle, None means use all images
# )

# Option 3 (Alternative): Use pre-computed values (fastest)
# mean = [0.5672, 0.4663, 0.3659]
# std = [0.2484, 0.2561, 0.2600]

Using model-specific preprocessing for: efficientnetv2


## Step 5: Create Data Module and Model

In [5]:
# Data pipeline: the data module receives the data config plus the model name,
# and setup() is called right away so that `num_classes` can be read below.
datamodule_kwargs = {
    "data_config": data_config,
    "model_name": train_config.model_name,
}
data_module = KenyanFood13DataModule(**datamodule_kwargs)
data_module.setup()

print("[OK] Data module created with {} classes".format(data_module.num_classes))

# Classifier: built from the training config and the number of classes
# reported by the data module.
model = KenyanFood13Classifier(train_config, data_module.num_classes)

print("[OK] Model created: {}".format(train_config.model_name))

[INFO] Using model-specific input size: 384 (was 224)
[INFO] Using model-specific preprocessing: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
[INFO] Using column 'class' as label column
[INFO] CSV columns: ['id', 'class']
Using 5228 samples for training.
Class to index mapping: {'bhaji': 0, 'chapati': 1, 'githeri': 2, 'kachumbari': 3, 'kukuchoma': 4, 'mandazi': 5, 'masalachips': 6, 'matoke': 7, 'mukimo': 8, 'nyamachoma': 9, 'pilau': 10, 'sukumawiki': 11, 'ugali': 12}
Dataset initialized with 5228 samples belonging to 13 classes.
[INFO] Using column 'class' as label column
[INFO] CSV columns: ['id', 'class']
Using 1308 samples for validation.
Class to index mapping: {'bhaji': 0, 'chapati': 1, 'githeri': 2, 'kachumbari': 3, 'kukuchoma': 4, 'mandazi': 5, 'masalachips': 6, 'matoke': 7, 'mukimo': 8, 'nyamachoma': 9, 'pilau': 10, 'sukumawiki': 11, 'ugali': 12}
Dataset initialized with 1308 samples belonging to 13 classes.
[OK] Data module created with 13 classes
Loading pre-trained 

Downloading: "https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_v2_s-dd5fe13b.pth
100%|██████████| 82.7M/82.7M [00:00<00:00, 192MB/s]


[OK] Pre-trained EfficientNetV2-S weights loaded successfully
Freezing early layers, unfreezing later layers for fine-tuning...
  Layers frozen: 1 / 3
  Trainable: 15,269,736 / 21,458,488 parameters (71.2%)
[OK] Model created: efficientnetv2


In [None]:
# Diagnostics: check labels, mapping, a sample batch and a model forward pass.
# Purely informational -- everything is printed and the model is left in the
# same train/eval mode it had before this cell ran.
import pandas as pd
import torch
import numpy as np

print("--- Diagnostics Start ---")
# Check CSV columns and a small preview
df = None  # stays None when the annotations file cannot be read
try:
    df = pd.read_csv(data_config.annotations_file)
    print("CSV columns:", list(df.columns))
    print(df.head())
except Exception as e:
    print("Could not read annotations file:", e)

# Show detected label column and class mapping
label_col = getattr(data_module.train_dataset, 'label_col', None)
print("Detected label column:", label_col)
print("class_to_idx mapping (sample):", dict(list(data_module.train_dataset.class_to_idx.items())[:10]))

# Show label distribution (if available). Still best-effort, but a failure is
# now reported instead of being silently swallowed.
try:
    if df is not None and label_col and label_col in df.columns:
        print('\nLabel distribution (top counts):')
        print(df[label_col].value_counts().head(20))
except Exception as e:
    print("Could not compute label distribution:", e)

# Inspect a single batch from train loader
train_loader = data_module.train_dataloader()
images, labels = next(iter(train_loader))
print('\nTrain batch images shape:', images.shape)
print('Train batch labels shape:', labels.shape)
print('Sample label indices:', labels[:10].tolist())

# Reverse mapping idx -> class name
idx_to_class = {v: k for k, v in data_module.train_dataset.class_to_idx.items()}
print('Sample label names:', [idx_to_class.get(int(x), '?') for x in labels[:10]])

# Basic image stats (after preprocessing)
print('Image min/max:', float(images.min()), float(images.max()))

# Quick forward pass through model to inspect outputs
# NOTE(review): moving the model to CUDA here initialises CUDA in the notebook
# process. The recorded training output mentions a "DDP Notebook strategy" on
# 2 GPUs; fork-based launchers generally require CUDA to be untouched before
# training starts -- confirm before running this cell ahead of Step 6.
device = torch.device('cuda' if torch.cuda.is_available() and system_config.device == 'cuda' else 'cpu')
print('Using device for diagnostics:', device)
model = model.to(device)
images = images.to(device)

# Run the forward pass in eval mode: in train mode dropout makes the printed
# confidences noisy and BatchNorm layers update their running statistics even
# under no_grad, so a diagnostic batch would silently alter the model.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        logits = model(images)
        probs = torch.softmax(logits, dim=1)
        top1 = probs.argmax(dim=1).cpu().tolist()
        # max(dim=...) returns a (values, indices) namedtuple, which has no
        # .cpu(); take .values to get the per-sample top probability.
        top_conf = probs.max(dim=1).values.cpu().tolist()
finally:
    # Restore whatever mode the model was in before the diagnostics.
    model.train(was_training)

print('\nModel predictions (top1 indices):', top1)
print('Model top confidences:', [round(float(x), 4) for x in top_conf])

print('\nData module mean/std:', data_module.mean, data_module.std)
print('--- Diagnostics End ---')

## Step 6: Train!

In [6]:
# Train the model.
# train_model returns three values; this notebook keeps the first (the trained
# model) and the third (the checkpoint callback) and discards the second.
trained_model, _, checkpoint_callback = train_model(
    training_config=train_config,
    data_config=data_config,
    system_config=system_config,
    model=model,
    data_module=data_module
)

print("\n[OK] Training complete!")
print(f"Best model: {checkpoint_callback.best_model_path}")

# best_model_score is None when no checkpoint was recorded (presumably a
# Lightning ModelCheckpoint -- TODO confirm); formatting None with ':.4f'
# would raise TypeError, so report that case explicitly.
best_score = checkpoint_callback.best_model_score
if best_score is None:
    print("Best accuracy: n/a (no checkpoint was saved)")
else:
    print(f"Best accuracy: {float(best_score):.4f}")

Early stopping enabled: monitor=valid/acc, patience=7
📊 Detected 2 GPUs - Using DDP Notebook strategy
⚠️  Note: If still using 1 GPU, Kaggle may need kernel restart

Starting Training
[INFO] Using column 'class' as label column
[INFO] CSV columns: ['id', 'class']
Using 5228 samples for training.
Class to index mapping: {'bhaji': 0, 'chapati': 1, 'githeri': 2, 'kachumbari': 3, 'kukuchoma': 4, 'mandazi': 5, 'masalachips': 6, 'matoke': 7, 'mukimo': 8, 'nyamachoma': 9, 'pilau': 10, 'sukumawiki': 11, 'ugali': 12}
Dataset initialized with 5228 samples belonging to 13 classes.
[INFO] Using column 'class' as label column
[INFO] CSV columns: ['id', 'class']
Using 1308 samples for validation.
Class to index mapping: {'bhaji': 0, 'chapati': 1, 'githeri': 2, 'kachumbari': 3, 'kukuchoma': 4, 'mandazi': 5, 'masalachips': 6, 'matoke': 7, 'mukimo': 8, 'nyamachoma': 9, 'pilau': 10, 'sukumawiki': 11, 'ugali': 12}
Dataset initialized with 1308 samples belonging to 13 classes.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

NameError: name 'exit' is not defined

## Step 7: Save Outputs to Kaggle Dataset

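This copies the best checkpoint, a JSON training summary, and the TensorBoard logs into `/kaggle/working/kenyan_food_model_output/` and zips that folder for download. You can then upload the archive as a Kaggle Dataset to reuse it in other notebooks.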

In [None]:
# Package the training artefacts (best checkpoint, JSON summary, TensorBoard
# logs) into one folder under /kaggle/working and zip it for download.
import json
import shutil
from pathlib import Path

# Create a clean output directory for the dataset
dataset_dir = Path("/kaggle/working/kenyan_food_model_output")
if dataset_dir.exists():
    shutil.rmtree(dataset_dir)
dataset_dir.mkdir(parents=True, exist_ok=True)

# Copy the best checkpoint.
# is_file() is used rather than exists(): an empty best_model_path turns into
# Path("") == Path("."), which "exists" (it is the current directory) and would
# make shutil.copy fail on a directory.
best_checkpoint = Path(checkpoint_callback.best_model_path or "")
if best_checkpoint.is_file():
    shutil.copy(best_checkpoint, dataset_dir / "best_model.ckpt")
    print(f"✓ Saved best checkpoint: {best_checkpoint.name}")
else:
    print(f"[WARN] Best checkpoint not found: '{checkpoint_callback.best_model_path}'")

# Save training summary as JSON.
# The best score may be None when no checkpoint was recorded; float(None)
# would raise, so it is stored as null in that case.
best_score = checkpoint_callback.best_model_score
summary = {
    "best_val_accuracy": float(best_score) if best_score is not None else None,
    "num_epochs": train_config.num_epochs,
    "batch_size": train_config.batch_size,
    "learning_rate": train_config.learning_rate,
    "model": train_config.model_name,
    "optimizer": train_config.optimizer,
    "scheduler": train_config.scheduler if train_config.use_scheduler else "none",
    "dataset_mean": data_module.mean,
    "dataset_std": data_module.std,
    "num_classes": data_module.num_classes,
    "checkpoint_path": str(best_checkpoint.name)
}

with open(dataset_dir / "training_summary.json", "w") as f:
    json.dump(summary, f, indent=2)
print("✓ Saved training summary")

# Copy ALL TensorBoard logs (full logs for offline review)
tb_logs_src = Path(system_config.output_dir) / "kenyan_food_logs"
if tb_logs_src.exists():
    tb_logs_dest = dataset_dir / "tensorboard_logs"

    print(f"Copying TensorBoard logs from {tb_logs_src}...")
    shutil.copytree(tb_logs_src, tb_logs_dest, dirs_exist_ok=True)

    # Count total size of logs for user info
    total_size_bytes = sum(f.stat().st_size for f in tb_logs_dest.rglob('*') if f.is_file())
    total_size_mb = total_size_bytes / (1024 * 1024)
    print(f"✓ Saved TensorBoard logs ({total_size_mb:.1f} MB)")
else:
    print(f"[WARN] TensorBoard logs not found at {tb_logs_src}")

# Create a ZIP file for easy download.
# The archive base name is the dataset folder path itself; make_archive appends
# ".zip", so the archive is written next to the folder, not inside it.
print("\nCreating ZIP archive for download...")
zip_path = dataset_dir
shutil.make_archive(str(zip_path), 'zip', dataset_dir)
zip_size_mb = Path(f"{zip_path}.zip").stat().st_size / (1024 * 1024)
print(f"✓ Created kenyan_food_model_output.zip ({zip_size_mb:.1f} MB)")

print("\n" + "="*60)
print("[OK] OUTPUT READY FOR DOWNLOAD!")
print("="*60)
print("\nFiles saved to: /kaggle/working/kenyan_food_model_output/")
print("ZIP file: /kaggle/working/kenyan_food_model_output.zip")
print("\nContents:")
print("  - best_model.ckpt           : Best model checkpoint")
print("  - training_summary.json     : Training metrics and config")
print("  - tensorboard_logs/         : Full TensorBoard event files")
print("\nTo download:")
print("1. Click 'Output' tab in the right sidebar")
print("2. Find 'kenyan_food_model_output.zip'")
print("3. Click the download button")
print("\nOr use Kaggle API to create a dataset for reuse in other notebooks!")
print("="*60)

## Step 8: Generate Predictions (TODO)

Inference is not implemented yet. Add your inference code in the cell below to generate `submission.csv`.

In [None]:
# Placeholder cell: inference has not been implemented yet, so running it does
# nothing.
# TODO: Load test data and generate predictions
# TODO: Create submission.csv
# No-op statement; the cell has no other executable code.
pass