## Installation

In [None]:
import subprocess
import sys

# Install the third-party packages this notebook needs using the pip that
# belongs to the running interpreter. check_call raises CalledProcessError
# if pip exits with a non-zero status.
# NOTE(review): the numpy<2.0.0 pin is presumably for compatibility with the
# other packages in the environment — confirm before relaxing it.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", 
                      "numpy<2.0.0", "timm", "albumentations", "optuna"])

## Imports

In [None]:
import os
import gc
import random
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
import cv2

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

import timm
from timm.loss import LabelSmoothingCrossEntropy
import albumentations as A
from albumentations.pytorch import ToTensorV2
import optuna

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

warnings.filterwarnings('ignore')

def set_seed(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and configures cuDNN for repeatable behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick a different convolution algorithm on each
    # run, so it has to be switched off for runs to be repeatable.
    torch.backends.cudnn.benchmark = False

## Configuration

In [None]:
class Config:
    """Paths and hyperparameters shared by every cell of the notebook."""

    # Root of the input dataset, with one sub-tree for the spectrogram
    # images and one for the CSV tables.
    BASE_DIR = Path("/kaggle/input/logmel-cough-vowel")
    DATA_PATH = BASE_DIR / "dataclean_1"
    CSV_PATH = BASE_DIR / "logMelDataset"
    
    # Image roots for the two recording types; each is scanned for one
    # sub-directory per sample id.
    COUGH_PATH = DATA_PATH / "dataclean_cough_log_mel_1"
    VOWEL_PATH = DATA_PATH / "dataclean_vowel_log_mel_1"
    TRAIN_CSV = CSV_PATH / "train.csv"
    TEST_CSV = CSV_PATH / "test.csv"
    # Checkpoints, figures and the submission file are written here.
    OUTPUT_PATH = Path("/kaggle/working")
    
    # Identifier handed to timm.create_model.
    MODEL_NAME = "convnext_tiny.fb_in22k_ft_in1k"
    # Images are resized to IMG_SIZE x IMG_SIZE before entering the network.
    IMG_SIZE = 224
    # Width of the classifier's output layer.
    NUM_CLASSES = 3
    # Class written to the submission when neither model produced a
    # prediction for a test sample.
    FALLBACK_CLASS = 0
    
    # Number of epochs of the final training run (also the cosine schedule length).
    EPOCHS = 20
    # Worker processes per DataLoader.
    NUM_WORKERS = 2
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    SEED = 42
    
    # Optuna search budget: number of trials and epochs trained per trial.
    N_TRIALS = 10
    OPTUNA_EPOCHS = 5

    @staticmethod
    def create_output_dir():
        """Create OUTPUT_PATH (and any missing parents) if it does not exist yet."""
        Config.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Seed the random number generators and make sure the output directory
# exists before any later cell runs.
set_seed(Config.SEED)
Config.create_output_dir()

## Image Transform

In [None]:
def get_transforms(data='train'):
    """Build the albumentations preprocessing pipeline for a dataset split.

    Args:
        data: Split name. 'train' and every other value currently produce a
            pipeline with the same steps; no split-specific augmentation is
            applied.

    Returns:
        An ``A.Compose`` that resizes to ``Config.IMG_SIZE`` squared,
        normalises each channel with the fixed mean/std below and converts
        the image to a tensor.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    side = Config.IMG_SIZE

    pipeline_steps = [
        A.Resize(side, side),
        A.Normalize(mean=channel_mean, std=channel_std),
        ToTensorV2(),
    ]
    return A.Compose(pipeline_steps)

## Dataset

In [None]:
class SpectrogramDataset(Dataset):
    """Dataset of log-mel spectrogram PNGs, one image per dataframe row.

    The image for a row is read from
    ``<img_dir>/<id>/log_mel_spectrogram.png``, where the id comes from the
    row's ``id`` column, or from ``candidateID`` when there is no ``id``.

    A sample that cannot be loaded yields ``None`` instead of raising, so the
    DataLoader must use a collate function that drops ``None`` entries.
    """

    def __init__(self, df, img_dir, transform=None, is_test=False):
        """
        Args:
            df: Table with one row per sample; its index is reset so rows can
                be addressed positionally.
            img_dir: Directory holding one sub-directory per sample id.
            transform: Optional callable invoked as ``transform(image=img)``
                that returns a mapping with an ``'image'`` entry.
            is_test: When true, samples carry no label.
        """
        self.df = df.reset_index(drop=True)
        self.img_dir = Path(img_dir)
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows in the table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(img, img_id)`` when ``is_test`` is set, otherwise
            ``(img, label, img_id)``; ``None`` when the image is missing,
            unreadable, or any other error occurs while building the sample.
        """
        try:
            row = self.df.iloc[idx]
            img_id = row['id'] if 'id' in row else row['candidateID']
            # str() keeps the path join working for numeric ids; previously a
            # non-string id raised TypeError here and the sample was dropped.
            img_path = self.img_dir / str(img_id) / "log_mel_spectrogram.png"

            if not img_path.exists():
                return None

            img = cv2.imread(str(img_path))
            if img is None:
                return None
            # OpenCV decodes to BGR channel order; convert to RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            if self.transform:
                img = self.transform(image=img)['image']

            if self.is_test:
                return img, img_id
            return img, int(row['label']), img_id
        except Exception:
            # Deliberate best-effort: a broken sample is skipped rather than
            # aborting the epoch. Only Exception is caught so that
            # KeyboardInterrupt and SystemExit still propagate.
            return None

def collate_fn(batch):
    """Collate a batch while discarding samples the dataset failed to load.

    Args:
        batch: List of samples, where a failed sample is ``None``.

    Returns:
        The default PyTorch collation of the remaining samples, or ``None``
        when no sample in the batch is usable.
    """
    usable = list(filter(lambda sample: sample is not None, batch))
    if not usable:
        return None
    return torch.utils.data.dataloader.default_collate(usable)

## Model Architecture

In [None]:
class ConvNeXtTinyClassifier(nn.Module):
    """timm backbone followed by a LayerNorm -> Dropout -> Linear head."""

    def __init__(self, model_name=Config.MODEL_NAME, num_classes=Config.NUM_CLASSES,
                 pretrained=True, dropout=0.1):
        """
        Args:
            model_name: Identifier passed to ``timm.create_model``.
            num_classes: Number of logits produced by the head.
            pretrained: Whether timm should load pretrained backbone weights.
                Pass ``False`` when a checkpoint is loaded right afterwards.
            dropout: Dropout probability applied before the final linear layer.
        """
        super().__init__()
        # num_classes=0 requests the backbone without timm's own classifier,
        # so the custom head below can be attached to its features.
        self.model = timm.create_model(model_name, pretrained=pretrained, num_classes=0)
        self.num_features = self.model.num_features
        # The layer order is kept fixed so existing checkpoints (keys
        # classifier.0 / classifier.2) remain loadable.
        self.classifier = nn.Sequential(
            nn.LayerNorm(self.num_features),
            nn.Dropout(dropout),
            nn.Linear(self.num_features, num_classes)
        )

    def forward(self, x):
        """Return ``(logits, features)`` for a batch of images.

        ``features`` is the backbone output that the head was applied to.
        """
        features = self.model(x)
        logits = self.classifier(features)
        return logits, features

## Training Functions

In [None]:
def train_epoch(model, loader, criterion, optimizer, scaler):
    """Train ``model`` for one pass over ``loader`` with mixed precision.

    Args:
        model: Network whose forward returns ``(logits, features)``.
        loader: Iterable of ``(imgs, labels, ids)`` batches; a batch may be
            ``None`` when none of its samples could be loaded.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer stepped once per processed batch.
        scaler: ``GradScaler`` used to scale the loss for the backward pass.

    Returns:
        ``(mean_loss, macro_f1)`` over the batches that were actually
        processed, or ``(0, 0)`` when every batch was skipped.
    """
    model.train()
    total_loss = 0.0
    processed_batches = 0
    all_preds, all_labels = [], []

    for batch in loader:
        if batch is None:
            continue
        imgs, labels, _ = batch
        imgs, labels = imgs.to(Config.DEVICE), labels.to(Config.DEVICE)

        optimizer.zero_grad()
        with autocast():
            logits, _ = model(imgs)
            loss = criterion(logits, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        processed_batches += 1
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    if processed_batches == 0:
        return 0, 0

    # Average over processed batches only: len(loader) also counts batches
    # skipped as None, which would under-report the loss.
    mean_loss = total_loss / processed_batches
    return mean_loss, f1_score(all_labels, all_preds, average='macro', zero_division=0)

def validate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Args:
        model: Network whose forward returns ``(logits, features)``.
        loader: Iterable of ``(imgs, labels, ids)`` batches; a batch may be
            ``None`` when none of its samples could be loaded.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        ``(mean_loss, macro_f1, predictions, labels)`` over the batches that
        were actually processed, or ``(0, 0, [], [])`` when every batch was
        skipped.
    """
    model.eval()
    total_loss = 0.0
    processed_batches = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in loader:
            if batch is None:
                continue
            imgs, labels, _ = batch
            imgs, labels = imgs.to(Config.DEVICE), labels.to(Config.DEVICE)

            with autocast():
                logits, _ = model(imgs)
                loss = criterion(logits, labels)

            total_loss += loss.item()
            processed_batches += 1
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if processed_batches == 0:
        return 0, 0, [], []

    # Average over processed batches only: len(loader) also counts batches
    # skipped as None, which would under-report the loss.
    mean_loss = total_loss / processed_batches
    macro_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    return mean_loss, macro_f1, all_preds, all_labels

In [None]:
def train_pipeline(model, train_loader, val_loader, params, save_name):
    """Run the full training schedule and checkpoint the best epoch.

    Trains for ``Config.EPOCHS`` epochs with AdamW, cosine learning-rate
    annealing and label-smoothed cross entropy. The weights of the epoch
    with the highest validation macro-F1 are written to
    ``Config.OUTPUT_PATH / f"{save_name}_best.pth"``.

    Args:
        model: Network to train (already on ``Config.DEVICE``).
        train_loader: Loader of training batches.
        val_loader: Loader of validation batches.
        params: Mapping that provides the learning rate under ``'lr'``.
        save_name: Prefix of the checkpoint file and of the log lines.

    Returns:
        ``(history, best_f1)`` where ``history`` holds the per-epoch
        ``'train_loss'`` and ``'val_f1'`` lists.
    """
    criterion = LabelSmoothingCrossEntropy(smoothing=0.1)
    optimizer = AdamW(model.parameters(), lr=params['lr'], weight_decay=0.05)
    scheduler = CosineAnnealingLR(optimizer, T_max=Config.EPOCHS)
    scaler = GradScaler()

    checkpoint_path = Config.OUTPUT_PATH / f"{save_name}_best.pth"
    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint. With a start value of 0 and a strict comparison, a run
    # whose validation F1 stayed at 0 never saved anything and the later
    # torch.load of the checkpoint failed.
    best_f1 = -1.0
    history = {'train_loss': [], 'val_f1': []}

    for epoch in range(Config.EPOCHS):
        t_loss, _ = train_epoch(model, train_loader, criterion, optimizer, scaler)
        _, v_f1, _, _ = validate(model, val_loader, criterion)

        history['train_loss'].append(t_loss)
        history['val_f1'].append(v_f1)

        if (epoch + 1) % 5 == 0:
            print(f"[{save_name.upper()}] Epoch {epoch+1}/{Config.EPOCHS} | Loss: {t_loss:.4f} | Val F1: {v_f1:.4f}")

        if v_f1 > best_f1:
            best_f1 = v_f1
            torch.save(model.state_dict(), checkpoint_path)

        scheduler.step()

    # Never expose the negative sentinel (only possible when EPOCHS is 0).
    return history, max(best_f1, 0.0)

## Optuna Hyperparameter Tuning

In [None]:
def objective(trial, train_df, img_dir):
    """Optuna objective: short training run scored by validation macro-F1.

    Samples a learning rate and a batch size, trains a fresh model for
    ``Config.OPTUNA_EPOCHS`` epochs on a stratified 80/20 split and reports
    the validation F1 after every epoch so the pruner can stop weak trials.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.
        train_df: Labelled table; must contain a ``'label'`` column.
        img_dir: Directory with one sub-directory of images per sample id.

    Returns:
        The best validation macro-F1 observed during the trial.

    Raises:
        optuna.TrialPruned: When the pruner decides to stop the trial early.
    """
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32])

    tr_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=Config.SEED)

    tr_loader = DataLoader(
        SpectrogramDataset(tr_df, img_dir, get_transforms('train')),
        batch_size=batch_size, shuffle=True, num_workers=Config.NUM_WORKERS,
        collate_fn=collate_fn, pin_memory=True
    )
    val_loader = DataLoader(
        SpectrogramDataset(val_df, img_dir, get_transforms('valid')),
        batch_size=batch_size, shuffle=False, num_workers=Config.NUM_WORKERS,
        collate_fn=collate_fn, pin_memory=True
    )

    model = ConvNeXtTinyClassifier().to(Config.DEVICE)
    criterion = LabelSmoothingCrossEntropy(smoothing=0.1)
    # Same weight decay as the final training run, so the learning rate is
    # tuned for the optimizer configuration that is actually used afterwards.
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.05)
    scaler = GradScaler()

    best_temp_f1 = 0
    try:
        for epoch in range(Config.OPTUNA_EPOCHS):
            train_epoch(model, tr_loader, criterion, optimizer, scaler)
            _, f1, _, _ = validate(model, val_loader, criterion)
            best_temp_f1 = max(best_temp_f1, f1)
            trial.report(f1, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()
    finally:
        # Runs for pruned and failed trials too; previously the cleanup was
        # skipped whenever the loop exited through an exception.
        del model, optimizer, scaler
        gc.collect()
        torch.cuda.empty_cache()

    return best_temp_f1

## Load Data

In [None]:
# Read the training table (with its columns renamed to the names the rest of
# the notebook uses) and the test table.
train_df = pd.read_csv(Config.TRAIN_CSV).rename(columns={'disease': 'label', 'candidateID': 'id'})
test_df = pd.read_csv(Config.TEST_CSV)

# Names of the per-sample sub-directories that exist for each recording type.
cough_files = {entry.name for entry in Config.COUGH_PATH.glob("*") if entry.is_dir()}
vowel_files = {entry.name for entry in Config.VOWEL_PATH.glob("*") if entry.is_dir()}

# Flag, per training row, which recording types are available on disk.
for flag_column, available_ids in (('has_cough', cough_files), ('has_vowel', vowel_files)):
    train_df[flag_column] = train_df['id'].isin(available_ids)

print(f"Train: {len(train_df)} | Cough: {train_df['has_cough'].sum()} | Vowel: {train_df['has_vowel'].sum()}")
print(f"Test: {len(test_df)}")

## Train Cough Model

In [None]:
# Only rows that actually have a cough spectrogram on disk are usable.
df_c = train_df[train_df['has_cough']].copy()

# The sampler is seeded explicitly: Optuna's sampler has its own random
# state, so without a seed the search suggests different hyperparameters
# on every run even though the rest of the notebook is seeded.
study_c = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=Config.SEED),
    pruner=optuna.pruners.MedianPruner(),
)
study_c.optimize(lambda t: objective(t, df_c, Config.COUGH_PATH), n_trials=Config.N_TRIALS, show_progress_bar=True)
params_c = study_c.best_params

print(f"Best Params: {params_c} | F1: {study_c.best_value:.4f}")

In [None]:
# Same stratified split (same seed) as the one used inside the Optuna objective.
tr_c, val_c = train_test_split(df_c, test_size=0.2, stratify=df_c['label'], random_state=Config.SEED)
model_c = ConvNeXtTinyClassifier().to(Config.DEVICE)

# num_workers is set explicitly so the long final run loads data in worker
# processes like every other loader in the notebook; it was previously left
# at the DataLoader default for these two loaders only.
train_loader_c = DataLoader(
    SpectrogramDataset(tr_c, Config.COUGH_PATH, get_transforms('train')),
    batch_size=params_c['batch_size'], shuffle=True, num_workers=Config.NUM_WORKERS,
    collate_fn=collate_fn, pin_memory=True
)
val_loader_c = DataLoader(
    SpectrogramDataset(val_c, Config.COUGH_PATH, get_transforms('valid')),
    batch_size=params_c['batch_size'], num_workers=Config.NUM_WORKERS,
    collate_fn=collate_fn, pin_memory=True
)

hist_c, best_f1_c = train_pipeline(model_c, train_loader_c, val_loader_c, params_c, "cough")

print(f"Cough Model Best F1: {best_f1_c:.4f}")

### Cough Model Evaluation

In [None]:
# Evaluate the best cough checkpoint on the held-out validation split.
model_c.load_state_dict(torch.load(Config.OUTPUT_PATH / "cough_best.pth"))
model_c.eval()

val_c_loader = DataLoader(
    SpectrogramDataset(val_c, Config.COUGH_PATH, get_transforms('valid')),
    batch_size=32, shuffle=False, num_workers=Config.NUM_WORKERS, collate_fn=collate_fn
)

criterion = LabelSmoothingCrossEntropy(smoothing=0.1)
val_loss, val_f1, val_preds, val_labels = validate(model_c, val_c_loader, criterion)

# Pass the full label set to every metric. Without it scikit-learn infers
# the classes from the data, so a class missing from both labels and
# predictions makes classification_report reject the three target names and
# shrinks the confusion matrix below the three tick labels.
class_ids = list(range(Config.NUM_CLASSES))
class_names = [f'Class {i}' for i in class_ids]

val_acc = accuracy_score(val_labels, val_preds)
class_f1 = f1_score(val_labels, val_preds, labels=class_ids, average=None, zero_division=0)

print(f"Accuracy: {val_acc:.4f} | F1 Macro: {val_f1:.4f}")
print(f"Per-Class F1: {class_f1}")
print(f"\n{classification_report(val_labels, val_preds, labels=class_ids, target_names=class_names, zero_division=0)}")

cm_c = confusion_matrix(val_labels, val_preds, labels=class_ids)
fig_cm_c, ax_cm_c = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_c, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax_cm_c, cbar_kws={'label': 'Count'})
ax_cm_c.set_xlabel('Predicted', fontsize=12, fontweight='bold')
ax_cm_c.set_ylabel('True', fontsize=12, fontweight='bold')
ax_cm_c.set_title('Cough Model - Confusion Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(Config.OUTPUT_PATH / "cough_confusion_matrix.png", dpi=150, bbox_inches='tight')
plt.show()

## Train Vowel Model

In [None]:
# Only rows that actually have a vowel spectrogram on disk are usable.
df_v = train_df[train_df['has_vowel']].copy()

# The sampler is seeded explicitly: Optuna's sampler has its own random
# state, so without a seed the search suggests different hyperparameters
# on every run even though the rest of the notebook is seeded.
study_v = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=Config.SEED),
    pruner=optuna.pruners.MedianPruner(),
)
study_v.optimize(lambda t: objective(t, df_v, Config.VOWEL_PATH), n_trials=Config.N_TRIALS, show_progress_bar=True)
params_v = study_v.best_params

print(f"Best Params: {params_v} | F1: {study_v.best_value:.4f}")

In [None]:
# Same stratified split (same seed) as the one used inside the Optuna objective.
tr_v, val_v = train_test_split(df_v, test_size=0.2, stratify=df_v['label'], random_state=Config.SEED)
model_v = ConvNeXtTinyClassifier().to(Config.DEVICE)

# num_workers is set explicitly so the long final run loads data in worker
# processes like every other loader in the notebook; it was previously left
# at the DataLoader default for these two loaders only.
train_loader_v = DataLoader(
    SpectrogramDataset(tr_v, Config.VOWEL_PATH, get_transforms('train')),
    batch_size=params_v['batch_size'], shuffle=True, num_workers=Config.NUM_WORKERS,
    collate_fn=collate_fn, pin_memory=True
)
val_loader_v = DataLoader(
    SpectrogramDataset(val_v, Config.VOWEL_PATH, get_transforms('valid')),
    batch_size=params_v['batch_size'], num_workers=Config.NUM_WORKERS,
    collate_fn=collate_fn, pin_memory=True
)

hist_v, best_f1_v = train_pipeline(model_v, train_loader_v, val_loader_v, params_v, "vowel")

print(f"Vowel Model Best F1: {best_f1_v:.4f}")

### Vowel Model Evaluation

In [None]:
# Evaluate the best vowel checkpoint on the held-out validation split.
model_v.load_state_dict(torch.load(Config.OUTPUT_PATH / "vowel_best.pth"))
model_v.eval()

val_v_loader = DataLoader(
    SpectrogramDataset(val_v, Config.VOWEL_PATH, get_transforms('valid')),
    batch_size=32, shuffle=False, num_workers=Config.NUM_WORKERS, collate_fn=collate_fn
)

# Built here so this cell no longer depends on the cough evaluation cell
# having been run first.
criterion = LabelSmoothingCrossEntropy(smoothing=0.1)
val_loss_v, val_f1_v, val_preds_v, val_labels_v = validate(model_v, val_v_loader, criterion)

# Pass the full label set to every metric. Without it scikit-learn infers
# the classes from the data, so a class missing from both labels and
# predictions makes classification_report reject the three target names and
# shrinks the confusion matrix below the three tick labels.
class_ids = list(range(Config.NUM_CLASSES))
class_names = [f'Class {i}' for i in class_ids]

val_acc_v = accuracy_score(val_labels_v, val_preds_v)
class_f1_v = f1_score(val_labels_v, val_preds_v, labels=class_ids, average=None, zero_division=0)

print(f"Accuracy: {val_acc_v:.4f} | F1 Macro: {val_f1_v:.4f}")
print(f"Per-Class F1: {class_f1_v}")
print(f"\n{classification_report(val_labels_v, val_preds_v, labels=class_ids, target_names=class_names, zero_division=0)}")

cm_v = confusion_matrix(val_labels_v, val_preds_v, labels=class_ids)
fig_cm_v, ax_cm_v = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_v, annot=True, fmt='d', cmap='Greens',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax_cm_v, cbar_kws={'label': 'Count'})
ax_cm_v.set_xlabel('Predicted', fontsize=12, fontweight='bold')
ax_cm_v.set_ylabel('True', fontsize=12, fontweight='bold')
ax_cm_v.set_title('Vowel Model - Confusion Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(Config.OUTPUT_PATH / "vowel_confusion_matrix.png", dpi=150, bbox_inches='tight')
plt.show()

## Training Metrics Visualization

In [None]:
# Two side-by-side panels: validation F1 per epoch on the left, training
# loss per epoch on the right, each with one curve per model.
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
f1_panel, loss_panel = ax

f1_panel.plot(hist_c['val_f1'], marker='o', label=f'Cough (Best: {best_f1_c:.3f})', linewidth=2)
f1_panel.plot(hist_v['val_f1'], marker='s', label=f'Vowel (Best: {best_f1_v:.3f})', linewidth=2)

loss_panel.plot(hist_c['train_loss'], marker='o', label='Cough', linewidth=2)
loss_panel.plot(hist_v['train_loss'], marker='s', label='Vowel', linewidth=2)

# Axis labels, title, legend and grid are identical apart from the text.
panel_text = (
    (f1_panel, 'Validation F1 Score', 'Model Performance'),
    (loss_panel, 'Training Loss', 'Training Loss'),
)
for panel, y_label, panel_title in panel_text:
    panel.set_xlabel('Epoch', fontsize=12)
    panel.set_ylabel(y_label, fontsize=12)
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.legend(fontsize=10)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(Config.OUTPUT_PATH / "training_metrics.png", dpi=150, bbox_inches='tight')
plt.show()

## Generate Submission

In [None]:
# Restore the best checkpoint of each model, then switch both to eval mode.
for net, checkpoint_name in ((model_c, "cough_best.pth"), (model_v, "vowel_best.pth")):
    net.load_state_dict(torch.load(Config.OUTPUT_PATH / checkpoint_name))
for net in (model_c, model_v):
    net.eval()

def get_probs(model, df, img_path):
    """Compute class probabilities for every loadable sample in ``df``.

    Args:
        model: Network whose forward returns ``(logits, features)``. It is
            switched to eval mode by this function.
        df: Table with the sample ids in a ``candidateID`` (or ``id``) column.
        img_path: Directory with one sub-directory of images per sample id.

    Returns:
        Dict mapping sample id to that sample's softmax probability vector.
        Samples whose image could not be loaded have no entry.
    """
    df_temp = df.rename(columns={'candidateID': 'id'})
    ds = SpectrogramDataset(df_temp, img_path, transform=get_transforms('valid'), is_test=True)
    loader = DataLoader(ds, batch_size=32, shuffle=False, num_workers=Config.NUM_WORKERS, collate_fn=collate_fn)

    # Do not rely on the caller: with dropout left active the probabilities
    # would be noisy.
    model.eval()

    probs_map = {}
    with torch.no_grad():
        for batch in loader:
            if batch is None:
                continue
            imgs, ids = batch
            imgs = imgs.to(Config.DEVICE)
            with autocast():
                logits, _ = model(imgs)
                probs = F.softmax(logits, dim=1).cpu().numpy()

            for uid, sample_probs in zip(ids, probs):
                # Numeric ids are collated into a tensor; tensors hash by
                # identity, so convert them to plain Python values to keep
                # the dict usable for lookups by id.
                key = uid.item() if torch.is_tensor(uid) else uid
                probs_map[key] = sample_probs
    return probs_map

# Run each model over the test table. Each result maps a sample id to that
# model's probability vector; ids whose image could not be loaded are absent,
# so the two maps may differ in size.
probs_c = get_probs(model_c, test_df, Config.COUGH_PATH)
probs_v = get_probs(model_v, test_df, Config.VOWEL_PATH)

print(f"Cough predictions: {len(probs_c)} | Vowel predictions: {len(probs_v)}")

In [None]:
# Combine the two models per test sample and write the submission file.
final_results = []
missing_count = 0

# Iterate the id column directly. iterrows() builds one Series per row,
# which can change the id's dtype when the table has mixed column types,
# and the changed id would then miss the lookups below.
for uid in test_df['candidateID']:
    p_c = probs_c.get(uid)
    p_v = probs_v.get(uid)

    if p_c is not None and p_v is not None:
        # Both recordings available: average the two probability vectors.
        final_prob = (p_c + p_v) / 2
        pred = int(np.argmax(final_prob))
    elif p_c is not None:
        pred = int(np.argmax(p_c))
    elif p_v is not None:
        pred = int(np.argmax(p_v))
    else:
        # Neither model produced an output for this sample.
        pred = Config.FALLBACK_CLASS
        missing_count += 1

    final_results.append({'candidateID': uid, 'disease': pred})

# Naming the columns up front keeps both columns present (in this order)
# even when the test table is empty.
submission = pd.DataFrame(final_results, columns=['candidateID', 'disease'])

print(f"Total: {len(submission)} | Missing: {missing_count}")
print(f"\nPrediction distribution:\n{submission['disease'].value_counts().sort_index()}")

submission.to_csv(Config.OUTPUT_PATH / "submission.csv", index=False)
submission.head(10)