# Kaggle Training Pipeline for IndianBatsModel

This notebook trains the Bat Species Classifier using code from the [IndianBatsModel repository](https://github.com/Quarkisinproton/IndianBatsModel).

**Steps:**
1.  Clone the repository.
2.  Install dependencies.
3.  Import functions directly from the codebase.
4.  Run the data preparation and training pipeline.


In [None]:
# 1. Setup Environment
# Clone the repository
!git clone https://github.com/Quarkisinproton/IndianBatsModel.git

# Install dependencies
!pip install librosa pyyaml pandas matplotlib torchaudio


In [None]:
# 2. Import Modules
import sys
import os
from pathlib import Path

# Add paths to sys.path to allow importing modules
# When cloning into /kaggle/working, the repo root is /kaggle/working/IndianBatsModel
REPO_DIR = '/kaggle/working/IndianBatsModel'

# Guarded so that re-running the cell does not append the same entry twice.
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

# Import project modules directly
try:
    from MainShitz.data_prep.generate_annotations import generate_annotations
    from MainShitz.data_prep.wombat_to_spectrograms import process_all as generate_spectrograms
    from MainShitz.data_prep.extract_end_frequency import process_all_and_write_csv as extract_features
    from MainShitz.data_prep.whombat_project_to_wombat import convert_whombat_project_to_wombat_jsons
    from MainShitz.train import train_model
    print("Imports successful!")
except ImportError as e:
    print(f"Import Error: {e}")
    print("Please ensure the repository is cloned correctly and REPO_DIR is in sys.path")
    # Re-raise so execution stops here. Swallowing the error would let the
    # notebook continue and fail later with a NameError on the first pipeline
    # function, far away from the real cause.
    raise


In [None]:
# 3. Configuration
# Base directory under which every generated artifact is written
WORK_DIR = '/kaggle/working'

# Folders holding the raw recordings (edit these to mirror your Kaggle Dataset layout)
RAW_AUDIO_DIRS = [
    '/kaggle/input/pip-ceylonicusbat-species',
    '/kaggle/input/pip-tenuisbat-species',
]

# How annotations are obtained:
#   'auto'     -> dummy full-file annotations derived from folder names
#                 (NOT RECOMMENDED for call classification)
#   'provided' -> Whombat project JSON exports converted into per-audio Wombat JSONs
#                 (RECOMMENDED)
ANNOTATION_MODE = 'provided'  # 'auto' or 'provided'

# Whombat project export files (e.g. exported from Whombat).
# Consulted only when ANNOTATION_MODE == 'provided'.
WHOMBAT_PROJECT_JSONS = [
    '/kaggle/input/pip-tenuisbat-species/tenuis annotations.json',
    '/kaggle/input/pip-ceylonicusbat-species/Pip ceylonicus.json',
]

# Where each pipeline stage writes its results
JSON_DIR = os.path.join(WORK_DIR, 'data/annotations_json_folder')
SPECT_OUT = os.path.join(WORK_DIR, 'data/processed/spectrograms')
FEATURES_OUT = os.path.join(WORK_DIR, 'data/processed/features')
FEATURES_CSV = os.path.join(FEATURES_OUT, 'end_frequencies.csv')
MODEL_SAVE_PATH = os.path.join(WORK_DIR, 'models', 'bat_fused_best.pth')

# Create the annotation, feature and model folders up front.
# SPECT_OUT is intentionally not part of this list (it is not created here).
for _needed_dir in (JSON_DIR, FEATURES_OUT, os.path.dirname(MODEL_SAVE_PATH)):
    Path(_needed_dir).mkdir(parents=True, exist_ok=True)

print("Configuration set.")

In [None]:
# 4. Generate/Convert Annotations
# Populates JSON_DIR either with dummy full-file annotations ('auto') or with
# JSONs converted from the Whombat project exports ('provided').
print("Preparing Annotations...")

# Reject anything other than the two supported modes before doing any work.
if ANNOTATION_MODE not in ('auto', 'provided'):
    raise ValueError(f"Unknown ANNOTATION_MODE: {ANNOTATION_MODE}")

if ANNOTATION_MODE == 'auto':
    print("Mode: auto (generate dummy full-file annotations)")
    generate_annotations(raw_audio_dirs=RAW_AUDIO_DIRS, output_dir=JSON_DIR, label_strategy='folder')
else:
    print("Mode: provided (convert Whombat project JSON exports)")
    if not WHOMBAT_PROJECT_JSONS:
        raise ValueError("WHOMBAT_PROJECT_JSONS is empty. Add your Whombat project export JSON paths.")
    # Convert each export in turn and report the counters from its summary.
    for project_json in WHOMBAT_PROJECT_JSONS:
        result = convert_whombat_project_to_wombat_jsons(
            project_json_path=project_json,
            output_dir=JSON_DIR,
            tag_key='Species',
            skip_unlabeled=True,
        )
        print(
            f"Converted {project_json}: "
            f"jsons_written={result.jsons_written}, "
            f"sound_events_written={result.sound_events_written}, "
            f"skipped_unlabeled={result.sound_events_skipped_unlabeled}"
        )

print("Done.")

In [None]:
# 5. Generate Spectrograms
# Passes the raw audio folders and the annotation JSON folder to the
# spectrogram generator, with SPECT_OUT as the output location.
print("Generating Spectrograms...")
spectrogram_args = {
    'raw_audio_dirs': RAW_AUDIO_DIRS,
    'json_dir': JSON_DIR,
    'out_dir': SPECT_OUT,
    'species_key': 'label',
}
generate_spectrograms(**spectrogram_args)
print("Done.")

In [None]:
# 6. Extract Features
# Passes the raw audio folders and the annotation JSON folder to the feature
# extractor, with FEATURES_CSV as the CSV destination.
print("Extracting Features...")
feature_args = {
    'raw_audio_dirs': RAW_AUDIO_DIRS,
    'json_dir': JSON_DIR,
    'out_csv': FEATURES_CSV,
    'species_key': 'label',
}
extract_features(**feature_args)
print(f"Features saved to {FEATURES_CSV}")

In [None]:
# 7. Run Training
# Reports the available GPUs, infers the number of classes from the spectrogram
# output folder, builds the training config and hands it to train_model.
import torch

# Check GPU availability
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

# Infer num_classes from generated spectrogram folders
# (every immediate sub-directory of SPECT_OUT is counted as one class).
spect_root = Path(SPECT_OUT)
if not spect_root.is_dir():
    # Fail with an actionable message instead of a bare iterdir() error.
    raise FileNotFoundError(
        f"Spectrogram directory not found: {SPECT_OUT}. "
        "Run the spectrogram generation cell before training."
    )
num_classes = len([p for p in spect_root.iterdir() if p.is_dir()])
print(f"Detected num_classes={num_classes} from {SPECT_OUT}")
if num_classes == 0:
    # Without this guard, num_classes=0 would be passed straight into train_model.
    raise ValueError(
        f"No class sub-folders found in {SPECT_OUT}. "
        "Check that annotations were produced and spectrogram generation wrote output."
    )

# Define Training Configuration Dictionary
config = {
    'data': {
        'train_spectrograms': SPECT_OUT,
        'features_csv': FEATURES_CSV,
        'num_classes': num_classes,
    },
    'train': {  # Updated key from 'training' to 'train' to match new config structure
        'batch_size': 16,
        'learning_rate': 1e-4,
        'num_epochs': 10,
        'model_save_path': MODEL_SAVE_PATH,
        'num_workers': 2,
    },
}

print("Starting Training...")
train_model(config)
print(f"Training Complete! Model saved to {MODEL_SAVE_PATH}")
