In [1]:
import numpy as np
import pandas as pd
import json
import os
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score


# Input locations, all resolved relative to the current working directory.
ROOT = Path('.')
# CSV files read with pandas further down (train labels / test candidate ids).
TRAIN_CSV = ROOT.joinpath('train_air_respiratory.csv')
TEST_CSV = ROOT.joinpath('test_air_respiratory.csv')
# One sub-folder per candidateID holding the emb_*.json embedding files.
SOUNDS_DIR = ROOT.joinpath('sounds')

def load_mean_embedding(json_path: Path, expected_dim: int = 512):
    """Load an embedding JSON file and reduce it to one fixed-size vector.

    A 2-D payload (list of vectors) is averaged over its first axis; a 1-D
    payload is used as is. Anything that cannot be turned into a vector of
    length ``expected_dim`` is treated as "no embedding available".

    Args:
        json_path: Location of the JSON file to read.
        expected_dim: Required length of the resulting vector.

    Returns:
        A float32 NumPy array of shape ``(expected_dim,)``, or ``None`` when
        the file is missing, is not valid JSON, is not a rectangular numeric
        array of rank 1 or 2, or has the wrong length.
    """
    # EAFP: open directly instead of exists()+open, and treat an unreadable
    # or malformed file like a missing one so a single corrupt file does not
    # abort the whole run.
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        return None

    # Ragged nested lists raise ValueError, JSON objects raise TypeError.
    try:
        arr = np.asarray(data, dtype=np.float32)
    except (TypeError, ValueError):
        return None

    if arr.ndim == 2:
        emb = arr.mean(axis=0)
    elif arr.ndim == 1:
        emb = arr
    else:
        return None
    return emb if emb.shape[0] == expected_dim else None

# Read the training labels, keeping a single row per candidateID.
train_df = pd.read_csv(TRAIN_CSV).drop_duplicates(subset=['candidateID'])
label_map = dict(zip(train_df['candidateID'], train_df['disease']))

# One (features, labels) accumulator per recording type; a candidate only
# contributes to a specialist when that recording's embedding is usable.
collected = {'cough': ([], []), 'vowel': ([], [])}
missing = {'cough': 0, 'vowel': 0}

for cid, label in label_map.items():
    base = SOUNDS_DIR / cid
    # Dict order keeps the cough file being read before the vowel file.
    for kind, (features, targets) in collected.items():
        emb = load_mean_embedding(base / f'emb_{kind}.json')
        if emb is None:
            missing[kind] += 1
            continue
        features.append(emb)
        targets.append(label)

(cough_X, cough_y), (vowel_X, vowel_y) = collected['cough'], collected['vowel']
missing_cough, missing_vowel = missing['cough'], missing['vowel']

# Stack into 2-D matrices; fall back to an empty (0, 512) matrix when nothing
# was collected.
cough_X = np.vstack(cough_X) if cough_X else np.empty((0, 512))
vowel_X = np.vstack(vowel_X) if vowel_X else np.empty((0, 512))
cough_y = np.array(cough_y)
vowel_y = np.array(vowel_y)

print(f"Cough specialist samples: {len(cough_y)} (missing: {missing_cough})")
print(f"Vowel specialist samples: {len(vowel_y)} (missing: {missing_vowel})")

# Hyperparameters shared by both audio-only specialists; they are unpacked
# as keyword arguments into CatBoostClassifier when a model is created.
base_params = {
    'depth': 6,
    'learning_rate': 0.1,
    'iterations': 400,
    'subsample': 0.9,
    'bootstrap_type': 'Bernoulli',
    'rsm': 0.9,
    'loss_function': 'MultiClass',
    'classes_count': 3,
    'thread_count': 4,
    'random_seed': 42,
    'verbose': 0,
    'allow_writing_files': False,
}


def train_with_eval(X, y, name: str):
    """Fit a CatBoost specialist, printing holdout metrics first when possible.

    With at least 10 samples and 2 distinct classes, a stratified 80/20 split
    is used to train a throwaway model and print macro F1, accuracy, macro
    precision and macro recall. The holdout step is optional: if the
    stratified split cannot be made, the metrics are skipped with a message
    instead of aborting. The returned model is always refit on all of
    ``X`` / ``y``.

    Args:
        X: Feature matrix, one row per sample.
        y: Labels aligned with the rows of ``X``.
        name: Prefix used in the printed messages.

    Returns:
        The CatBoostClassifier fitted on the full data, or ``None`` when
        ``y`` is empty.
    """
    if len(y) == 0:
        print(f"{name}: no data, training skipped")
        return None

    unique_classes = np.unique(y)
    can_eval = len(y) >= 10 and len(unique_classes) >= 2

    split = None
    if can_eval:
        try:
            split = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y
            )
        except ValueError as exc:
            # Stratification is impossible when a class has a single member
            # or the holdout is smaller than the number of classes.
            print(f"{name}: skipped holdout metrics (stratified split failed: {exc})")
    else:
        print(f"{name}: skipped holdout metrics (samples={len(y)}, classes={len(unique_classes)})")

    if split is not None:
        X_tr, X_val, y_tr, y_val = split
        model = CatBoostClassifier(**base_params)
        model.fit(X_tr, y_tr)
        val_pred = model.predict(X_val)
        # zero_division=0 on every macro metric: same values as the default,
        # but no warnings when a class is never predicted.
        metrics = {
            'f1_macro': f1_score(y_val, val_pred, average='macro', zero_division=0),
            'accuracy': accuracy_score(y_val, val_pred),
            'precision_macro': precision_score(y_val, val_pred, average='macro', zero_division=0),
            'recall_macro': recall_score(y_val, val_pred, average='macro', zero_division=0),
        }
        print(f"\n{name} validation metrics (holdout 20%):")
        for k, v in metrics.items():
            print(f"  {k}: {v:.4f}")

    # Refit on full data for final model
    final_model = CatBoostClassifier(**base_params)
    final_model.fit(X, y)
    return final_model


# Fit one specialist per recording type (cough first, then vowel); either
# entry is None when no training samples were collected for it.
cough_model, vowel_model = (
    train_with_eval(cough_X, cough_y, "Cough specialist"),
    train_with_eval(vowel_X, vowel_y, "Vowel specialist"),
)

# Inference judge: for each test candidate, ask every available specialist
# for class probabilities and keep the answer of the most confident one.

test_df = pd.read_csv(TEST_CSV)
preds, sources = [], []

# Order matters: max() keeps the first of equally confident options, so
# cough wins ties.
specialists = (('cough', cough_model), ('vowel', vowel_model))

for cid in test_df['candidateID']:
    base = SOUNDS_DIR / cid
    options = []

    for src, model in specialists:
        emb = load_mean_embedding(base / f'emb_{src}.json')
        if emb is None or model is None:
            continue
        prob = model.predict_proba(emb.reshape(1, -1))[0]
        options.append((prob, src))

    if not options:
        # No usable embedding/model pair for this candidate.
        best_src = 'fallback'
        pred_class = 2  # hardcoded fallback
    else:
        best_prob, best_src = max(options, key=lambda item: item[0].max())
        # The index of the highest probability is used as the class label.
        pred_class = int(best_prob.argmax())

    preds.append(pred_class)
    sources.append(best_src)

submission_path = ROOT / 'submission_CatBoost_dual_audio.csv'
submission = pd.DataFrame({'candidateID': test_df['candidateID'], 'disease': preds})
submission.to_csv(submission_path, index=False)

print(f"Saved submission to {submission_path} with {len(submission)} rows")
print("Prediction source counts:\n", pd.Series(sources).value_counts())

Cough specialist samples: 515 (missing: 29)
Vowel specialist samples: 540 (missing: 4)

Cough specialist validation metrics (holdout 20%):
  f1_macro: 0.3731
  accuracy: 0.4660
  precision_macro: 0.3854
  recall_macro: 0.3953

Vowel specialist validation metrics (holdout 20%):
  f1_macro: 0.4850
  accuracy: 0.5556
  precision_macro: 0.6329
  recall_macro: 0.4977
Saved submission to submission_CatBoost_dual_audio.csv with 338 rows
Prediction source counts:
 vowel    194
cough    144
Name: count, dtype: int64
