## Standalone Tabular-Only Pipeline
Train an XGBoost classifier using only the demographic and medical features from the CSV files. No audio embeddings are used.

In [None]:
# === STANDALONE TABULAR-ONLY PIPELINE ===

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import xgboost as xgb

# Input locations, expressed relative to the directory the script is run from
ROOT = Path('.')
TRAIN_CSV = ROOT.joinpath('train_air_respiratory.csv')
TEST_CSV = ROOT.joinpath('test_air_respiratory.csv')

# Opening banner: rule, title, rule (one line each)
print("=" * 60, "TABULAR-ONLY XGBOOST PIPELINE", "=" * 60, sep="\n")

# === LOAD AND PREPARE TRAINING DATA ===

train_df = pd.read_csv(TRAIN_CSV)
print(f"\nTrain CSV shape: {train_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")

# Keep only the first row seen for each candidateID; later repeats are dropped.
is_repeat = train_df.duplicated(subset=['candidateID'], keep='first')
train_df = train_df.loc[~is_repeat]
print(f"After deduplication: {train_df.shape[0]} unique candidates")

# The label column, pulled out as a plain array
y_train = train_df['disease'].values
target_counts = pd.Series(y_train).value_counts().sort_index()
print(f"Target distribution:\n{target_counts}")

# Everything that is neither the identifier nor the label is a feature.
non_feature_cols = {'candidateID', 'disease'}
feature_cols = [col for col in train_df.columns if col not in non_feature_cols]
print(f"\nFeature columns ({len(feature_cols)}): {feature_cols}")

# Work on a copy so preprocessing never touches train_df itself.
X_train = train_df[feature_cols].copy()
print(f"\nX_train shape before preprocessing: {X_train.shape}")
print(f"X_train data types:\n{X_train.dtypes}")
missing_per_column = X_train.isnull().sum()
print(f"\nMissing values before imputation:\n{missing_per_column}")

# === PREPROCESSING ===

# Fill missing values with each column's most frequent value. The same
# strategy is applied to every feature column, numeric ones included.
# The fitted imputer is kept so the test set can be transformed identically.
# NOTE(review): the imputer is fitted on all training rows, before the holdout
# split made later in the script - confirm that is acceptable.
imputer = SimpleImputer(strategy='most_frequent')
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=feature_cols,
)

remaining_missing = X_train_imputed.isnull().sum().sum()
print(f"\nMissing values after imputation:\n{remaining_missing} total")

# Cast every column to float32 before handing the frame to XGBoost.
X_train_imputed = X_train_imputed.astype(np.float32)

print(f"X_train shape after preprocessing: {X_train_imputed.shape}")

# === TRAINING WITH VALIDATION ===

# Hyperparameters are defined before the branch below because BOTH the
# optional holdout fit and the final full-data refit need them. Defining them
# only inside `if can_eval:` made the refit fail with NameError whenever the
# holdout evaluation was skipped.
xgb_params = dict(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=400,
    subsample=0.9,
    colsample_bytree=0.9,
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=3,  # NOTE(review): hard-coded class count - confirm it matches the labels in `disease`
    n_jobs=4,
    random_state=42,
    verbosity=0
)

unique_classes = np.unique(y_train)
# A holdout evaluation is only attempted with at least 10 rows and 2 classes.
can_eval = len(y_train) >= 10 and len(unique_classes) >= 2

if can_eval:
    # can_eval already guarantees two or more classes, so the split is
    # always stratified on the label.
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train_imputed, y_train, test_size=0.2, random_state=42,
        stratify=y_train
    )

    tabular_model = xgb.XGBClassifier(**xgb_params)
    tabular_model.fit(X_tr, y_tr)
    val_pred = tabular_model.predict(X_val)

    print("\nValidation metrics (holdout 20%):")
    print(f"  f1_macro: {f1_score(y_val, val_pred, average='macro'):.4f}")
    print(f"  accuracy: {accuracy_score(y_val, val_pred):.4f}")
    print(f"  precision_macro: {precision_score(y_val, val_pred, average='macro', zero_division=0):.4f}")
    # zero_division=0 mirrors precision_score above: a class with no true
    # samples in the holdout scores 0 without emitting a warning.
    print(f"  recall_macro: {recall_score(y_val, val_pred, average='macro', zero_division=0):.4f}")
else:
    print("Tabular: skipped holdout metrics (insufficient samples/classes)")

# Refit on full training data (runs whether or not the holdout was evaluated)
tabular_model = xgb.XGBClassifier(**xgb_params)
tabular_model.fit(X_train_imputed, y_train)

print("\nTabular model trained on full dataset.")

# === INFERENCE ON TEST SET ===

test_df = pd.read_csv(TEST_CSV)
print(f"\nTest CSV shape: {test_df.shape}")

# Reuse the training-time feature list and the already-fitted imputer so the
# test matrix goes through exactly the same steps as the training matrix.
X_test = test_df[feature_cols].copy()
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test),
    columns=feature_cols,
)
X_test_imputed = X_test_imputed.astype(np.float32)

print(f"X_test shape after preprocessing: {X_test_imputed.shape}")

# Predicted class label for every test row
test_preds = tabular_model.predict(X_test_imputed)

# Submission frame: the candidate identifier paired with its predicted label
submission = test_df[['candidateID']].assign(disease=test_preds)

submission_path = ROOT / 'submission_Xgb_tabular_only.csv'
submission.to_csv(submission_path, index=False)

print(f"\nSaved tabular-only submission to {submission_path}")
print(f"Total rows: {len(submission)}")
print(f"Prediction distribution:\n{pd.Series(test_preds).value_counts().sort_index()}")

# Closing banner: blank line, rule, title, rule
print("\n" + "=" * 60, "TABULAR-ONLY PIPELINE COMPLETED", "=" * 60, sep="\n")

TABULAR-ONLY XGBOOST PIPELINE

Train CSV shape: (546, 11)
Columns: ['candidateID', 'age', 'gender', 'tbContactHistory', 'wheezingHistory', 'phlegmCough', 'familyAsthmaHistory', 'feverHistory', 'coldPresent', 'packYears', 'disease']
After deduplication: 544 unique candidates
Target distribution:
0    139
1    237
2    168
Name: count, dtype: int64

Feature columns (9): ['age', 'gender', 'tbContactHistory', 'wheezingHistory', 'phlegmCough', 'familyAsthmaHistory', 'feverHistory', 'coldPresent', 'packYears']

X_train shape before preprocessing: (544, 9)
X_train data types:
age                      int64
gender                   int64
tbContactHistory       float64
wheezingHistory        float64
phlegmCough            float64
familyAsthmaHistory    float64
feverHistory             int64
coldPresent            float64
packYears                int64
dtype: object

Missing values before imputation:
age                      0
gender                   0
tbContactHistory         0
wheezingHistory

: 