In [1]:


import numpy as np
import pandas as pd
import json
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from catboost import CatBoostClassifier

ROOT = Path('.')
TRAIN_CSV = ROOT / 'train_air_respiratory.csv'
TEST_CSV = ROOT / 'test_air_respiratory.csv'
COUGH_DIR = ROOT / 'dataclean_cough_mfcc'
VOWEL_DIR = ROOT / 'dataclean_vowel_mfcc'

def load_mean_embedding_mfcc(folder: Path):
    mfcc_path = folder / 'mfcc_features.json'
    if not mfcc_path.exists():
        return None
    try:
        with open(mfcc_path, 'r') as f:
            data = json.load(f)
        if isinstance(data, dict) and 'features' in data and 'mfcc' in data['features']:
            arr = np.asarray(data['features']['mfcc'], dtype=np.float32)
        elif isinstance(data, dict) and 'mfcc' in data:
            arr = np.asarray(data['mfcc'], dtype=np.float32)
        elif isinstance(data, list):
            arr = np.asarray(data, dtype=np.float32)
        else:
            return None
        if arr.ndim == 2:
            emb = arr.mean(axis=0)
        elif arr.ndim == 1:
            emb = arr
        else:
            return None
        return emb
    except Exception as e:
        return None

# Load and deduplicate train labels (one row per candidateID)
train_df = pd.read_csv(TRAIN_CSV).drop_duplicates(subset=['candidateID'])
label_map = dict(zip(train_df['candidateID'], train_df['disease']))



# Build cough-only dataset
cough_only_X, cough_only_y, cough_only_ids = [], [], []

for cid, label in label_map.items():
    cough_folder = COUGH_DIR / cid
    cough_emb = load_mean_embedding_mfcc(cough_folder)
    if cough_emb is not None:
        cough_only_X.append(cough_emb)
        cough_only_y.append(label)
        cough_only_ids.append(cid)

if cough_only_X:
    cough_only_X = np.vstack(cough_only_X)
else:
    cough_only_X = np.empty((0, 0))
cough_only_y = np.array(cough_only_y)

print(f"Cough-only training samples: {len(cough_only_y)}")

# Train cough-only model with evaluation
CatBoost_CoughOnly = dict(
    depth=6,                    
    learning_rate=0.1,
    iterations=400,             
    subsample=0.9,
    bootstrap_type='Bernoulli',
    rsm=0.9,                    
    loss_function='MultiClass', 
    classes_count=3,           
    thread_count=4,             
    random_seed=42,
    verbose=0,                 
    allow_writing_files=False  
)

if len(cough_only_y) > 0:
    unique_classes = np.unique(cough_only_y)
    can_eval = len(cough_only_y) >= 10 and len(unique_classes) >= 2
    if can_eval:
        X_tr, X_val, y_tr, y_val = train_test_split(
            cough_only_X, cough_only_y, test_size=0.2, random_state=42, 
            stratify=cough_only_y if len(unique_classes) > 1 else None
        )
        cough_only_model = CatBoostClassifier(**CatBoost_CoughOnly)
        cough_only_model.fit(X_tr, y_tr)
        val_pred = cough_only_model.predict(X_val)
        print("\nCough-only validation metrics (holdout 20%):")
        print(f"  f1_macro: {f1_score(y_val, val_pred, average='macro'):.4f}")
        print(f"  accuracy: {accuracy_score(y_val, val_pred):.4f}")
        print(f"  precision_macro: {precision_score(y_val, val_pred, average='macro', zero_division=0):.4f}")
        print(f"  recall_macro: {recall_score(y_val, val_pred, average='macro'):.4f}")
    else:
        print("Cough-only: skipped holdout metrics (insufficient samples/classes)")

    # Refit on full data
    cough_only_model = CatBoostClassifier(**CatBoost_CoughOnly)
    cough_only_model.fit(cough_only_X, cough_only_y)
else:
    cough_only_model = None
    print("Cough-only: no data available")

# Inference for cough-only
test_df = pd.read_csv(TEST_CSV)
cough_preds = []

for cid in test_df['candidateID']:
    cough_folder = COUGH_DIR / cid
    cough_emb = load_mean_embedding_mfcc(cough_folder)
    if cough_emb is not None and cough_only_model is not None:
        pred_class = cough_only_model.predict(cough_emb.reshape(1, -1))[0]
    else:
        pred_class = 2  # fallback
    cough_preds.append(pred_class)

cough_submission = pd.DataFrame({'candidateID': test_df['candidateID'], 'disease': cough_preds})
cough_submission_path = ROOT / 'submission_CatBoostJ_scough_only.csv'
cough_submission.to_csv(cough_submission_path, index=False)

print(f"\nSaved cough-only submission to {cough_submission_path}")
print(f"Total rows: {len(cough_submission)}")
print(f"Prediction distribution:\n{pd.Series(cough_preds).value_counts().sort_index()}")

PIPELINE A: COUGH-ONLY
Cough-only training samples: 538

Cough-only validation metrics (holdout 20%):
  f1_macro: 0.2610
  accuracy: 0.3519
  precision_macro: 0.2628
  recall_macro: 0.2877

Saved cough-only submission to submission_CatBoostJ_scough_only.csv
Total rows: 338
Prediction distribution:
[0]     57
[1]    209
[2]     72
Name: count, dtype: int64


In [2]:
# === PIPELINE B: Vowel-Only ===

print("\n" + "=" * 60)
print("PIPELINE B: VOWEL-ONLY")
print("=" * 60)

# Build vowel-only dataset
vowel_only_X, vowel_only_y, vowel_only_ids = [], [], []

for cid, label in label_map.items():
    vowel_folder = VOWEL_DIR / cid
    vowel_emb = load_mean_embedding_mfcc(vowel_folder)
    if vowel_emb is not None:
        vowel_only_X.append(vowel_emb)
        vowel_only_y.append(label)
        vowel_only_ids.append(cid)

if vowel_only_X:
    vowel_only_X = np.vstack(vowel_only_X)
else:
    vowel_only_X = np.empty((0, 0))
vowel_only_y = np.array(vowel_only_y)

print(f"Vowel-only training samples: {len(vowel_only_y)}")

# Train vowel-only model with evaluation
CatBoost_VowelOnly = dict(
   depth=6,                    
    learning_rate=0.1,
    iterations=400,             
    subsample=0.9,
    bootstrap_type='Bernoulli',
    rsm=0.9,                    
    loss_function='MultiClass', 
    classes_count=3,           
    thread_count=4,             
    random_seed=42,
    verbose=0,                 
    allow_writing_files=False 
)

if len(vowel_only_y) > 0:
    unique_classes = np.unique(vowel_only_y)
    can_eval = len(vowel_only_y) >= 10 and len(unique_classes) >= 2
    if can_eval:
        X_tr, X_val, y_tr, y_val = train_test_split(
            vowel_only_X, vowel_only_y, test_size=0.2, random_state=42, 
            stratify=vowel_only_y if len(unique_classes) > 1 else None
        )
        vowel_only_model = CatBoostClassifier(**CatBoost_VowelOnly)
        vowel_only_model.fit(X_tr, y_tr)
        val_pred = vowel_only_model.predict(X_val)
        print("\nVowel-only validation metrics (holdout 20%):")
        print(f"  f1_macro: {f1_score(y_val, val_pred, average='macro'):.4f}")
        print(f"  accuracy: {accuracy_score(y_val, val_pred):.4f}")
        print(f"  precision_macro: {precision_score(y_val, val_pred, average='macro', zero_division=0):.4f}")
        print(f"  recall_macro: {recall_score(y_val, val_pred, average='macro'):.4f}")
    else:
        print("Vowel-only: skipped holdout metrics (insufficient samples/classes)")

    # Refit on full data
    vowel_only_model = CatBoostClassifier(**CatBoost_VowelOnly)
    vowel_only_model.fit(vowel_only_X, vowel_only_y)
else:
    vowel_only_model = None
    print("Vowel-only: no data available")

# Inference for vowel-only
vowel_preds = []

for cid in test_df['candidateID']:
    vowel_folder = VOWEL_DIR / cid
    vowel_emb = load_mean_embedding_mfcc(vowel_folder)
    if vowel_emb is not None and vowel_only_model is not None:
        pred_class = vowel_only_model.predict(vowel_emb.reshape(1, -1))[0]
    else:
        pred_class = 2  # fallback
    vowel_preds.append(pred_class)

vowel_submission = pd.DataFrame({'candidateID': test_df['candidateID'], 'disease': vowel_preds})
vowel_submission_path = ROOT / 'submission_CatBoostJ_vowel_only.csv'
vowel_submission.to_csv(vowel_submission_path, index=False)

print(f"\nSaved vowel-only submission to {vowel_submission_path}")
print(f"Total rows: {len(vowel_submission)}")
print(f"Prediction distribution:\n{pd.Series(vowel_preds).value_counts().sort_index()}")

print("\n" + "=" * 60)
print("BOTH PIPELINES COMPLETED")
print("=" * 60)


PIPELINE B: VOWEL-ONLY
Vowel-only training samples: 533

Vowel-only validation metrics (holdout 20%):
  f1_macro: 0.3139
  accuracy: 0.3738
  precision_macro: 0.3173
  recall_macro: 0.3241

Saved vowel-only submission to submission_CatBoostJ_vowel_only.csv
Total rows: 338
Prediction distribution:
[0]     69
[1]    177
[2]     90
2        2
Name: count, dtype: int64

BOTH PIPELINES COMPLETED


In [3]:
import pandas as pd

# Load the cough submission dataframe
df_cough = pd.read_csv('submission_CatBoostJ_scough_only.csv')

# Inspect the data
print("Cough submission data:")
print(df_cough.head())
print(df_cough.info())
print(f"Total rows: {len(df_cough)}")


Cough submission data:
     candidateID disease
0  136bac9a3e081     [1]
1  b121e45942a46     [0]
2  6b6853c07e4fb     [1]
3  71de185eac888     [2]
4  25deed742f133     [0]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   candidateID  338 non-null    object
 1   disease      338 non-null    object
dtypes: object(2)
memory usage: 5.4+ KB
None
Total rows: 338


In [4]:
df_cough['disease'] = df_cough['disease'].astype(str).str.replace(r'[\[\]]', '', regex=True)


print(df_cough.head())

# Save the modified dataframe
df_cough.to_csv('submission_CatBoostJ_scough_only_cleaned.csv', index=False)
print("Saved to submission_CatBoostJ_scough_only_cleaned.csv")


Cough submission cleaned:
     candidateID disease
0  136bac9a3e081       1
1  b121e45942a46       0
2  6b6853c07e4fb       1
3  71de185eac888       2
4  25deed742f133       0
Saved to submission_CatBoostJ_scough_only_cleaned.csv


In [5]:
# Load the vowel submission dataframe
df_vowel = pd.read_csv('submission_CatBoostJ_vowel_only.csv')

# Inspect the data
print("Vowel submission data:")
print(df_vowel.head())
print(df_vowel.info())
print(f"Total rows: {len(df_vowel)}")


Vowel submission data:
     candidateID disease
0  136bac9a3e081     [1]
1  b121e45942a46     [1]
2  6b6853c07e4fb     [0]
3  71de185eac888     [2]
4  25deed742f133     [2]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   candidateID  338 non-null    object
 1   disease      338 non-null    object
dtypes: object(2)
memory usage: 5.4+ KB
None
Total rows: 338


In [6]:
# Convert the column to string and remove brackets from vowel submission
df_vowel['disease'] = df_vowel['disease'].astype(str).str.replace(r'[\[\]]', '', regex=True)

# Verify the changes
print("Vowel submission cleaned:")
print(df_vowel.head())

# Save the modified dataframe
df_vowel.to_csv('submission_CatBoostJ_vowel_only_cleaned.csv', index=False)
print("Saved to submission_CatBoostJ_vowel_only_cleaned.csv")


Vowel submission cleaned:
     candidateID disease
0  136bac9a3e081       1
1  b121e45942a46       1
2  6b6853c07e4fb       0
3  71de185eac888       2
4  25deed742f133       2
Saved to submission_CatBoostJ_vowel_only_cleaned.csv
