## Installation


In [None]:
import subprocess
import sys

# Packages installed for the rest of the notebook; the version specifiers are
# kept exactly as pinned.
_pip_requirements = [
    "protobuf<=3.20.3",
    "numpy<2.0.0",
    "librosa",
    "optuna",
    "tensorflow-hub",
]

# Run pip through the interpreter executing this notebook so the packages land
# in the same environment; check_call raises CalledProcessError if pip fails.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-q", *_pip_requirements]
)


## Imports


In [None]:
import os
import gc
import random
import warnings
import numpy as np
import pandas as pd
from pathlib import Path

warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import librosa
import tensorflow as tf
import tensorflow_hub as hub

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

import seaborn as sns

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With the autotuner enabled cuDNN may pick a different convolution
    # algorithm from run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False


## Configuration


In [None]:
class Config:
    """Static configuration shared by every cell of the notebook.

    All values are class attributes and are read as ``Config.NAME``; the class
    is never instantiated in the visible code.
    """

    # Root of the input dataset and its two sub-trees (audio and CSV splits).
    BASE_DIR = Path("/kaggle/input/logmel-cough-vowel")
    DATA_PATH = BASE_DIR / "dataclean_COUGH_VOWEL"
    CSV_PATH = BASE_DIR / "logMelDataset"

    # Each audio directory holds one sub-directory per candidate id
    # (see extract_all_embeddings: <dir>/<id>/<audio_type>.wav).
    COUGH_PATH = DATA_PATH / "dataclean_cough_1"
    VOWEL_PATH = DATA_PATH / "dataclean_vowel_1"
    TRAIN_CSV = CSV_PATH / "train.csv"
    TEST_CSV = CSV_PATH / "test.csv"
    # Checkpoints, plots and the submission file are written here.
    OUTPUT_PATH = Path("/kaggle/working")

    # TF Hub handle of the YAMNet model, the sample rate audio is resampled to
    # before embedding, and the embedding size used as classifier input width.
    YAMNET_URL = "https://tfhub.dev/google/yamnet/1"
    YAMNET_SAMPLE_RATE = 16000
    YAMNET_EMBEDDING_DIM = 1024

    NUM_CLASSES = 3
    # Class written to the submission when a test candidate has neither a
    # cough nor a vowel prediction.
    FALLBACK_CLASS = 1

    # train_pipeline: epochs with the first linear layer frozen, epochs with
    # everything trainable, and the divisor applied to the learning rate when
    # switching from the first phase to the second.
    WARMUP_EPOCHS = 5
    FINETUNE_EPOCHS = 20
    LR_REDUCTION_FACTOR = 5

    # NOTE(review): NUM_WORKERS is not referenced by the visible code; the
    # DataLoaders either hard-code num_workers=0 or use the default.
    NUM_WORKERS = 0
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    SEED = 42

    # Optuna search: number of trials and epochs trained per trial.
    # NOTE(review): OPTUNA_WARMUP_EPOCHS is not referenced by the visible code.
    N_TRIALS = 10
    OPTUNA_WARMUP_EPOCHS = 3
    OPTUNA_FINETUNE_EPOCHS = 5

    @staticmethod
    def create_output_dir():
        """Create ``OUTPUT_PATH`` (and missing parents); no error if it exists."""
        Config.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Seed the RNGs before any split or model is created, and make sure the output
# directory exists before checkpoints and plots are written to it.
set_seed(Config.SEED)
Config.create_output_dir()


## YAMNet Model & Embedding Extraction


In [None]:
# Hide every GPU from TensorFlow so YAMNet inference runs on CPU
# (presumably to keep GPU memory free for the PyTorch models -- confirm).
tf.config.set_visible_devices([], 'GPU')
# Load the YAMNet module from TF Hub; it is called directly on a waveform in
# extract_yamnet_embeddings.
yamnet_model = hub.load(Config.YAMNET_URL)

def load_wav_16k_mono(filepath):
    """Load an audio file as a peak-normalised mono float32 waveform.

    The file is decoded at its native sample rate and resampled to
    ``Config.YAMNET_SAMPLE_RATE`` only when the two rates differ.

    Args:
        filepath: Path of the audio file to read.

    Returns:
        A float32 NumPy array scaled so that its largest absolute sample is 1
        (all-zero audio is returned unscaled), or ``None`` when the file
        cannot be loaded or contains no samples. Loading stays best-effort:
        failures are reported on stdout instead of being raised.
    """
    try:
        waveform, sr = librosa.load(filepath, sr=None, mono=True)
        if sr != Config.YAMNET_SAMPLE_RATE:
            waveform = librosa.resample(waveform, orig_sr=sr, target_sr=Config.YAMNET_SAMPLE_RATE)

        # An empty clip has no peak to normalise by and nothing to embed.
        if waveform.size == 0:
            print(f"[load_wav_16k_mono] Skipping {filepath}: no audio samples")
            return None

        waveform = waveform.astype(np.float32)
        peak = np.abs(waveform).max()
        if peak > 0:
            waveform = waveform / peak
        return waveform
    except Exception as exc:
        # `Exception` rather than a bare except so Ctrl-C / SystemExit still
        # propagate; the failure is reported instead of vanishing silently.
        print(f"[load_wav_16k_mono] Skipping {filepath}: {exc!r}")
        return None

def extract_yamnet_embeddings(waveform):
    """Turn a waveform into a single clip-level YAMNet embedding.

    The waveform is passed through ``yamnet_model`` and the model's second
    output is averaged over axis 0, giving one vector per clip.

    Args:
        waveform: Array of audio samples, or ``None``.

    Returns:
        The averaged embedding as a NumPy array, or ``None`` when ``waveform``
        is ``None`` or the model call fails. Failures are reported on stdout
        instead of being raised.
    """
    if waveform is None:
        return None
    try:
        waveform_tf = tf.constant(waveform, dtype=tf.float32)
        _, frame_embeddings, _ = yamnet_model(waveform_tf)
        return tf.reduce_mean(frame_embeddings, axis=0).numpy()
    except Exception as exc:
        # `Exception` rather than a bare except so Ctrl-C / SystemExit still
        # propagate; the failure is reported instead of vanishing silently.
        print(f"[extract_yamnet_embeddings] Embedding failed: {exc!r}")
        return None

def extract_all_embeddings(df, audio_dir, audio_type='cough'):
    """Compute one embedding per candidate listed in ``df``.

    For every candidate id the file ``audio_dir / <id> / <audio_type>.wav`` is
    loaded with ``load_wav_16k_mono`` and embedded with
    ``extract_yamnet_embeddings``. Candidates whose file is missing, cannot be
    loaded, or cannot be embedded are skipped; a one-line summary is printed
    when that happens.

    Args:
        df: DataFrame with an ``id`` column, or a ``candidateID`` column when
            ``id`` is absent.
        audio_dir: ``pathlib.Path`` of the directory containing one
            sub-directory per candidate id.
        audio_type: Base name (without ``.wav``) of the file inside each
            candidate directory.

    Returns:
        Dict mapping candidate id to its embedding.
    """
    audio_filename = f'{audio_type}.wav'
    embeddings_dict = {}

    # Nothing to iterate; also tolerates an empty frame without an id column.
    if len(df) == 0:
        return embeddings_dict

    # The id column is the same for every row, so resolve it once and iterate
    # the column directly instead of building a Series per row via iterrows().
    id_column = 'id' if 'id' in df.columns else 'candidateID'
    skipped = 0

    for uid in df[id_column]:
        audio_path = audio_dir / uid / audio_filename

        if not audio_path.exists():
            skipped += 1
            continue

        waveform = load_wav_16k_mono(str(audio_path))
        if waveform is None:
            skipped += 1
            continue

        embedding = extract_yamnet_embeddings(waveform)
        if embedding is None:
            skipped += 1
            continue

        embeddings_dict[uid] = embedding

    if skipped:
        print(f"[{audio_type}] Embedded {len(embeddings_dict)} of {len(df)} rows ({skipped} skipped)")

    return embeddings_dict


## Dataset


In [None]:
class EmbeddingDataset(Dataset):
    """Torch dataset serving pre-computed embeddings keyed by candidate id.

    Rows of ``df`` whose id has no entry in ``embeddings_dict`` are left out,
    so ``len(dataset)`` can be smaller than ``len(df)``.

    Args:
        df: DataFrame with an ``id`` column (or ``candidateID`` when ``id`` is
            absent) and, unless ``is_test`` is true, a ``label`` column.
        embeddings_dict: Mapping from candidate id to a NumPy embedding.
        is_test: When true, items carry no label.
    """

    def __init__(self, df, embeddings_dict, is_test=False):
        self.df = df.reset_index(drop=True)
        self.embeddings_dict = embeddings_dict
        self.is_test = is_test

        # Resolve the id column once rather than on every row and every item.
        id_column = 'id' if 'id' in self.df.columns else 'candidateID'
        # An empty frame is allowed to lack the id column altogether.
        self._ids = self.df[id_column].tolist() if len(self.df) else []

        # Positions (in the re-indexed frame) of rows that have an embedding.
        self.valid_indices = [
            position for position, uid in enumerate(self._ids)
            if uid in self.embeddings_dict
        ]

    def __len__(self):
        """Return the number of rows that have an embedding."""
        return len(self.valid_indices)

    def __getitem__(self, idx):
        """Return ``(embedding, uid)`` for test data, else ``(embedding, label, uid)``.

        The embedding is a float32 tensor and the label a Python ``int``.
        """
        real_idx = self.valid_indices[idx]
        uid = self._ids[real_idx]

        embedding = torch.from_numpy(self.embeddings_dict[uid]).float()

        if self.is_test:
            return embedding, uid

        # Read just the one cell instead of materialising the whole row.
        label = int(self.df['label'].iat[real_idx])
        return embedding, label, uid


## Model Architecture


In [None]:
class YAMNetClassifier(nn.Module):
    """Small MLP head that maps an embedding to class logits.

    Layout: Linear(embedding_dim, 128) -> LayerNorm -> ReLU -> Dropout(0.4)
    -> Linear(128, num_classes).
    """

    def __init__(self, embedding_dim=Config.YAMNET_EMBEDDING_DIM, num_classes=Config.NUM_CLASSES):
        super().__init__()

        hidden_dim = 128
        layers = [
            nn.Linear(embedding_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(hidden_dim, num_classes),
        ]
        # Index-addressed Sequential: parameter names stay
        # "classifier.<i>.*", and index 0 is the layer frozen during warm-up.
        self.classifier = nn.Sequential(*layers)

    def forward(self, x):
        """Return ``(logits, x)``: the class logits and the unchanged input."""
        return self.classifier(x), x

    def freeze_early_layers(self):
        """Stop gradient updates for the first linear layer."""
        self.classifier[0].requires_grad_(False)

    def unfreeze_all(self):
        """Make every parameter of the module trainable again."""
        self.requires_grad_(True)

    def get_trainable_params(self):
        """Return the parameters that currently require gradients."""
        return list(filter(lambda param: param.requires_grad, self.parameters()))


## Training Functions


In [None]:
def train_epoch(model, loader, criterion, optimizer, scaler):
    """Train ``model`` for one pass over ``loader``.

    The forward pass and loss run under ``autocast``; the backward pass and
    optimizer step go through the supplied gradient ``scaler``.

    Args:
        model: Module whose forward returns ``(logits, features)``.
        loader: Iterable of ``(embeddings, labels, ids)`` batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        scaler: Gradient scaler used for backward/step/update.

    Returns:
        ``(mean_batch_loss, macro_f1)``, or ``(0, 0)`` when the loader yields
        no samples.
    """
    model.train()
    running_loss = 0
    predicted, expected = [], []

    for embeddings, labels, _ in loader:
        embeddings = embeddings.to(Config.DEVICE)
        labels = labels.to(Config.DEVICE)

        optimizer.zero_grad()
        with autocast():
            logits, _ = model(embeddings)
            loss = criterion(logits, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        predicted.extend(logits.argmax(dim=1).cpu().numpy())
        expected.extend(labels.cpu().numpy())

    if not predicted:
        return 0, 0

    macro_f1 = f1_score(expected, predicted, average='macro', zero_division=0)
    return running_loss / len(loader), macro_f1

def validate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Module whose forward returns ``(logits, features)``.
        loader: Iterable of ``(embeddings, labels, ids)`` batches.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        ``(mean_batch_loss, macro_f1, predictions, labels)``, or
        ``(0, 0, [], [])`` when the loader yields no samples.
    """
    model.eval()
    running_loss = 0
    predicted, expected = [], []

    with torch.no_grad():
        for embeddings, labels, _ in loader:
            embeddings = embeddings.to(Config.DEVICE)
            labels = labels.to(Config.DEVICE)

            with autocast():
                logits, _ = model(embeddings)
                batch_loss = criterion(logits, labels)

            running_loss += batch_loss.item()
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    if not predicted:
        return 0, 0, [], []

    macro_f1 = f1_score(expected, predicted, average='macro', zero_division=0)
    return running_loss / len(loader), macro_f1, predicted, expected


In [None]:
def train_pipeline(model, train_loader, val_loader, params, save_name, class_weights_tensor,
                   warmup_epochs=Config.WARMUP_EPOCHS, finetune_epochs=Config.FINETUNE_EPOCHS):
    """Train ``model`` in two phases and checkpoint the best validation F1.

    Phase 1 ("Warmup") freezes the first linear layer and trains the rest at
    ``params['lr']``. Phase 2 ("Finetune") unfreezes everything and trains at
    ``params['lr'] / Config.LR_REDUCTION_FACTOR``. Each phase uses a fresh
    AdamW optimizer with cosine learning-rate annealing over its own epochs.

    Args:
        model: Classifier exposing ``freeze_early_layers``, ``unfreeze_all``
            and ``get_trainable_params``.
        train_loader: Loader of training batches.
        val_loader: Loader of validation batches.
        params: Dict holding at least ``'lr'``.
        save_name: Prefix of the checkpoint file ``<save_name>_best.pth``
            written to ``Config.OUTPUT_PATH``.
        class_weights_tensor: Per-class weights for the cross-entropy loss.
        warmup_epochs: Number of phase-1 epochs.
        finetune_epochs: Number of phase-2 epochs.

    Returns:
        ``(history, best_f1)`` where ``history`` has per-epoch ``train_loss``
        and ``val_f1`` lists covering both phases, and ``best_f1`` is the
        highest validation macro-F1 seen (0 when no epoch ran).
    """
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor, label_smoothing=0.1)
    scaler = GradScaler()
    checkpoint_path = Config.OUTPUT_PATH / f"{save_name}_best.pth"

    # None until the first epoch finishes, so that epoch always writes a
    # checkpoint. Starting at 0 meant a run whose validation F1 never rose
    # above 0 saved nothing, and the evaluation cells then loaded a missing
    # or stale file.
    best_f1 = None
    history = {'train_loss': [], 'val_f1': []}
    base_lr = params['lr']

    def run_phase(phase_name, num_epochs, optimizer):
        """Train for ``num_epochs`` with cosine annealing, tracking the best F1."""
        nonlocal best_f1
        scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)

        for epoch in range(num_epochs):
            t_loss, _ = train_epoch(model, train_loader, criterion, optimizer, scaler)
            _, v_f1, _, _ = validate(model, val_loader, criterion)

            history['train_loss'].append(t_loss)
            history['val_f1'].append(v_f1)

            if (epoch + 1) % 5 == 0:
                print(f"[{save_name.upper()}] {phase_name} Epoch {epoch+1}/{num_epochs} | Loss: {t_loss:.4f} | Val F1: {v_f1:.4f}")

            if best_f1 is None or v_f1 > best_f1:
                best_f1 = v_f1
                torch.save(model.state_dict(), checkpoint_path)

            scheduler.step()

    # Phase 1: only parameters that are still trainable go to the optimizer.
    model.freeze_early_layers()
    run_phase("Warmup", warmup_epochs,
              AdamW(model.get_trainable_params(), lr=base_lr, weight_decay=1e-2))

    # Phase 2: everything trainable, at a reduced learning rate.
    model.unfreeze_all()
    ft_lr = base_lr / Config.LR_REDUCTION_FACTOR
    run_phase("Finetune", finetune_epochs,
              AdamW(model.parameters(), lr=ft_lr, weight_decay=1e-2))

    return history, (0 if best_f1 is None else best_f1)


## Optuna Hyperparameter Tuning


In [None]:
def objective(trial, train_df, embeddings_dict, class_weights_tensor):
    """Optuna objective: best validation macro-F1 of a short training run.

    Samples a learning rate and a batch size, trains a fresh
    ``YAMNetClassifier`` for ``Config.OPTUNA_FINETUNE_EPOCHS`` epochs on a
    stratified 80/20 split of ``train_df``, and reports the validation F1
    after every epoch so the study's pruner can stop the trial early.

    Args:
        trial: Optuna trial used for sampling, reporting and pruning.
        train_df: Labelled DataFrame to split.
        embeddings_dict: Mapping from candidate id to embedding.
        class_weights_tensor: Per-class weights for the cross-entropy loss.

    Returns:
        The highest validation macro-F1 reached during the trial.

    Raises:
        optuna.TrialPruned: When the pruner asks to stop the trial.
    """
    lr = trial.suggest_float('lr', 1e-4, 5e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32])

    fit_df, holdout_df = train_test_split(
        train_df,
        test_size=0.2,
        stratify=train_df['label'],
        random_state=Config.SEED,
    )

    def make_loader(frame, shuffle):
        # Both loaders differ only in the frame they wrap and in shuffling.
        dataset = EmbeddingDataset(frame, embeddings_dict)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=0)

    fit_loader = make_loader(fit_df, True)
    holdout_loader = make_loader(holdout_df, False)

    model = YAMNetClassifier().to(Config.DEVICE)
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor, label_smoothing=0.1)
    scaler = GradScaler()
    optimizer = AdamW(model.parameters(), lr=lr)

    best_f1 = 0
    for epoch in range(Config.OPTUNA_FINETUNE_EPOCHS):
        train_epoch(model, fit_loader, criterion, optimizer, scaler)
        epoch_f1 = validate(model, holdout_loader, criterion)[1]

        if epoch_f1 > best_f1:
            best_f1 = epoch_f1

        trial.report(epoch_f1, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return best_f1


## Load Data


In [None]:
# Read both splits and normalise the column names used by the rest of the
# notebook: 'candidateID' -> 'id' and (train only) 'disease' -> 'label'.
train_df = pd.read_csv(Config.TRAIN_CSV).rename(columns={'disease': 'label', 'candidateID': 'id'})
test_df = pd.read_csv(Config.TEST_CSV).rename(columns={'candidateID': 'id'})

# Balanced class weights computed from the full training label column.
y_train = train_df['label'].values
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(Config.DEVICE)

# Flag training rows whose id appears as an entry name in each audio directory.
train_df['has_cough'] = train_df['id'].isin({entry.name for entry in Config.COUGH_PATH.glob("*")})
train_df['has_vowel'] = train_df['id'].isin({entry.name for entry in Config.VOWEL_PATH.glob("*")})

print(f"Train: {len(train_df)} | Cough: {train_df['has_cough'].sum()} | Vowel: {train_df['has_vowel'].sum()}")
print(f"Test: {len(test_df)}")


## Extract Embeddings


In [None]:
# Cough embeddings: training rows flagged as having a cough folder, then every
# test row. Test audio is looked up in the same directory as training audio.
df_c_train = train_df[train_df['has_cough']].copy()
cough_train_emb = extract_all_embeddings(df_c_train, Config.COUGH_PATH, 'cough')
cough_test_emb = extract_all_embeddings(test_df, Config.COUGH_PATH, 'cough')

# Vowel embeddings, built the same way from the vowel directory.
df_v_train = train_df[train_df['has_vowel']].copy()
vowel_train_emb = extract_all_embeddings(df_v_train, Config.VOWEL_PATH, 'vowel')
vowel_test_emb = extract_all_embeddings(test_df, Config.VOWEL_PATH, 'vowel')

# YAMNet is not needed after this point; drop the reference and collect.
# extract_yamnet_embeddings can no longer be used once this has run.
del yamnet_model
gc.collect()


## Train Cough Model


In [None]:
# Keep only the training rows for which a cough embedding was actually produced.
df_c_filtered = df_c_train[df_c_train['id'].isin(cough_train_emb.keys())].copy()

# Search learning rate and batch size, maximising validation macro-F1; the
# median pruner may stop a trial early based on the per-epoch reports.
study_c = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study_c.optimize(lambda t: objective(t, df_c_filtered, cough_train_emb, class_weights_tensor),
                 n_trials=Config.N_TRIALS, show_progress_bar=True)
params_c = study_c.best_params

print(f"Best Params: {params_c} | F1: {study_c.best_value:.4f}")


In [None]:
# Same frame, test_size, stratification and seed as the split inside
# objective(), i.e. the same train/validation partition used during the search.
tr_c, val_c = train_test_split(df_c_filtered, test_size=0.2, stratify=df_c_filtered['label'], random_state=Config.SEED)
model_c = YAMNetClassifier().to(Config.DEVICE)

# Full two-phase training with the tuned learning rate and batch size; the
# best checkpoint is written as "cough_best.pth" in Config.OUTPUT_PATH.
hist_c, best_f1_c = train_pipeline(
    model_c,
    DataLoader(EmbeddingDataset(tr_c, cough_train_emb), batch_size=params_c['batch_size'], shuffle=True),
    DataLoader(EmbeddingDataset(val_c, cough_train_emb), batch_size=params_c['batch_size']),
    params_c, "cough", class_weights_tensor
)

print(f"Cough Model Best F1: {best_f1_c:.4f}")


### Cough Model Evaluation


In [None]:
# Restore the best checkpoint written by train_pipeline. map_location keeps the
# load working when the file was saved from a different device than
# Config.DEVICE (e.g. a GPU checkpoint read in a CPU-only session).
model_c.load_state_dict(torch.load(Config.OUTPUT_PATH / "cough_best.pth", map_location=Config.DEVICE))
model_c.eval()

val_c_loader = DataLoader(
    EmbeddingDataset(val_c, cough_train_emb),
    batch_size=32, shuffle=False, num_workers=0
)

# Same loss configuration as used during training.
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor, label_smoothing=0.1)
val_loss, val_f1, val_preds, val_labels = validate(model_c, val_c_loader, criterion)

val_acc = accuracy_score(val_labels, val_preds)
class_f1 = f1_score(val_labels, val_preds, average=None, zero_division=0)

print(f"Accuracy: {val_acc:.4f} | F1 Macro: {val_f1:.4f}")
print(f"Per-Class F1: {class_f1}")
print(f"\n{classification_report(val_labels, val_preds, target_names=['Class 0', 'Class 1', 'Class 2'], zero_division=0)}")

# Confusion matrix heatmap, saved next to the checkpoints.
cm_c = confusion_matrix(val_labels, val_preds)
fig_cm_c, ax_cm_c = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_c, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Class 0', 'Class 1', 'Class 2'],
            yticklabels=['Class 0', 'Class 1', 'Class 2'],
            ax=ax_cm_c, cbar_kws={'label': 'Count'})
ax_cm_c.set_xlabel('Predicted', fontsize=12, fontweight='bold')
ax_cm_c.set_ylabel('True', fontsize=12, fontweight='bold')
ax_cm_c.set_title('Cough Model - Confusion Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(Config.OUTPUT_PATH / "cough_confusion_matrix.png", dpi=150, bbox_inches='tight')
plt.show()


## Train Vowel Model


In [None]:
# Keep only the training rows for which a vowel embedding was actually produced.
df_v_filtered = df_v_train[df_v_train['id'].isin(vowel_train_emb.keys())].copy()

# Search learning rate and batch size, maximising validation macro-F1; the
# median pruner may stop a trial early based on the per-epoch reports.
study_v = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study_v.optimize(lambda t: objective(t, df_v_filtered, vowel_train_emb, class_weights_tensor),
                 n_trials=Config.N_TRIALS, show_progress_bar=True)
params_v = study_v.best_params

print(f"Best Params: {params_v} | F1: {study_v.best_value:.4f}")


In [None]:
# Same frame, test_size, stratification and seed as the split inside
# objective(), i.e. the same train/validation partition used during the search.
tr_v, val_v = train_test_split(df_v_filtered, test_size=0.2, stratify=df_v_filtered['label'], random_state=Config.SEED)
model_v = YAMNetClassifier().to(Config.DEVICE)

# Full two-phase training with the tuned learning rate and batch size; the
# best checkpoint is written as "vowel_best.pth" in Config.OUTPUT_PATH.
hist_v, best_f1_v = train_pipeline(
    model_v,
    DataLoader(EmbeddingDataset(tr_v, vowel_train_emb), batch_size=params_v['batch_size'], shuffle=True),
    DataLoader(EmbeddingDataset(val_v, vowel_train_emb), batch_size=params_v['batch_size']),
    params_v, "vowel", class_weights_tensor
)

print(f"Vowel Model Best F1: {best_f1_v:.4f}")


### Vowel Model Evaluation


In [None]:
# Restore the best checkpoint written by train_pipeline. map_location keeps the
# load working when the file was saved from a different device than
# Config.DEVICE (e.g. a GPU checkpoint read in a CPU-only session).
model_v.load_state_dict(torch.load(Config.OUTPUT_PATH / "vowel_best.pth", map_location=Config.DEVICE))
model_v.eval()

val_v_loader = DataLoader(
    EmbeddingDataset(val_v, vowel_train_emb),
    batch_size=32, shuffle=False, num_workers=0
)

# Build the loss here rather than relying on a `criterion` left behind by
# another cell, so this cell runs on its own. Same configuration as training.
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor, label_smoothing=0.1)
val_loss_v, val_f1_v, val_preds_v, val_labels_v = validate(model_v, val_v_loader, criterion)

val_acc_v = accuracy_score(val_labels_v, val_preds_v)
class_f1_v = f1_score(val_labels_v, val_preds_v, average=None, zero_division=0)

print(f"Accuracy: {val_acc_v:.4f} | F1 Macro: {val_f1_v:.4f}")
print(f"Per-Class F1: {class_f1_v}")
print(f"\n{classification_report(val_labels_v, val_preds_v, target_names=['Class 0', 'Class 1', 'Class 2'], zero_division=0)}")

# Confusion matrix heatmap, saved next to the checkpoints.
cm_v = confusion_matrix(val_labels_v, val_preds_v)
fig_cm_v, ax_cm_v = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_v, annot=True, fmt='d', cmap='Greens',
            xticklabels=['Class 0', 'Class 1', 'Class 2'],
            yticklabels=['Class 0', 'Class 1', 'Class 2'],
            ax=ax_cm_v, cbar_kws={'label': 'Count'})
ax_cm_v.set_xlabel('Predicted', fontsize=12, fontweight='bold')
ax_cm_v.set_ylabel('True', fontsize=12, fontweight='bold')
ax_cm_v.set_title('Vowel Model - Confusion Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(Config.OUTPUT_PATH / "vowel_confusion_matrix.png", dpi=150, bbox_inches='tight')
plt.show()


## Training Metrics Visualization


In [None]:
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel:
# (axes, history key, y-axis label, title, cough legend, vowel legend).
_panels = [
    (ax[0], 'val_f1', 'Validation F1 Score', 'Model Performance',
     f'Cough (Best: {best_f1_c:.3f})', f'Vowel (Best: {best_f1_v:.3f})'),
    (ax[1], 'train_loss', 'Training Loss', 'Training Loss',
     'Cough', 'Vowel'),
]

for _axes, _key, _ylabel, _title, _cough_label, _vowel_label in _panels:
    # Cough and vowel histories share each panel so they can be compared.
    _axes.plot(hist_c[_key], marker='o', label=_cough_label, linewidth=2)
    _axes.plot(hist_v[_key], marker='s', label=_vowel_label, linewidth=2)
    _axes.set_xlabel('Epoch', fontsize=12)
    _axes.set_ylabel(_ylabel, fontsize=12)
    _axes.set_title(_title, fontsize=14, fontweight='bold')
    _axes.legend(fontsize=10)
    _axes.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(Config.OUTPUT_PATH / "training_metrics.png", dpi=150, bbox_inches='tight')
plt.show()


## Generate Submission


In [None]:
# Reload the best checkpoints of both models before test inference.
# map_location keeps the load working when a file was saved from a different
# device than Config.DEVICE (e.g. a GPU checkpoint read in a CPU-only session).
model_c.load_state_dict(torch.load(Config.OUTPUT_PATH / "cough_best.pth", map_location=Config.DEVICE))
model_v.load_state_dict(torch.load(Config.OUTPUT_PATH / "vowel_best.pth", map_location=Config.DEVICE))
model_c.eval()
model_v.eval()

def get_probs(model, embeddings_dict):
    """Compute class probabilities for every embedding, one sample at a time.

    Args:
        model: Module whose forward returns ``(logits, features)``.
        embeddings_dict: Mapping from candidate id to a NumPy embedding.

    Returns:
        Dict mapping each candidate id to a NumPy vector of softmax
        probabilities (empty when ``embeddings_dict`` is empty).
    """
    def predict_one(embedding):
        # Add a leading batch axis of size 1, then strip it from the result.
        batch = torch.from_numpy(embedding).float().unsqueeze(0).to(Config.DEVICE)
        with autocast():
            logits, _ = model(batch)
            return torch.softmax(logits, dim=1).cpu().numpy()[0]

    with torch.no_grad():
        return {uid: predict_one(emb) for uid, emb in embeddings_dict.items()}

# Per-candidate class probabilities from each model; a candidate appears in a
# map only if an embedding of that audio type was extracted for it.
probs_c = get_probs(model_c, cough_test_emb)
probs_v = get_probs(model_v, vowel_test_emb)

print(f"Cough predictions: {len(probs_c)} | Vowel predictions: {len(probs_v)}")


In [None]:
final_results = []
missing_count = 0

for uid in test_df['id']:
    p_c, p_v = probs_c.get(uid), probs_v.get(uid)
    has_cough_probs = p_c is not None
    has_vowel_probs = p_v is not None

    if has_cough_probs and has_vowel_probs:
        # Both models available: average their probabilities.
        pred = int(np.argmax((p_c + p_v) / 2))
    elif has_cough_probs or has_vowel_probs:
        # Only one model available: use it on its own.
        pred = int(np.argmax(p_c if has_cough_probs else p_v))
    else:
        # No audio could be embedded for this candidate.
        pred = Config.FALLBACK_CLASS
        missing_count += 1

    final_results.append({'candidateID': uid, 'disease': pred})

# Column order of the submission file is fixed explicitly.
submission = pd.DataFrame(final_results)[['candidateID', 'disease']]

print(f"Total: {len(submission)} | Missing: {missing_count}")
print(f"\nPrediction distribution:\n{submission['disease'].value_counts().sort_index()}")

submission.to_csv(Config.OUTPUT_PATH / "submission.csv", index=False)
submission.head(10)
