In [1]:
# train = pd.read_csv("/kaggle/input/nhanes-data/Train_Data.csv")
# test = pd.read_csv("/kaggle/input/nhanes-data/Test_Data.csv")
# sample_sub = pd.read_csv("/kaggle/input/nhanes-data/Sample_Submission.csv")

In [2]:
# 🧠 Final Submission: Age Group Classification using XGBoost + Engineered Biometrics

# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

# ---------------------------------------------
# 📁 Step 1: Load Data
# ---------------------------------------------
train = pd.read_csv("/kaggle/input/nhanes-data/Train_Data.csv")
test = pd.read_csv("/kaggle/input/nhanes-data/Test_Data.csv")
submission = pd.read_csv("/kaggle/input/nhanes-data/Sample_Submission.csv")
# ---------------------------------------------
# 🧹 Step 2: Clean Target and Concatenate
# ---------------------------------------------
train = train.dropna(subset=['age_group'])
train['age_group'] = train['age_group'].map({'Adult': 0, 'Senior': 1})
test['age_group'] = -1  # placeholder
data = pd.concat([train, test], axis=0).reset_index(drop=True)

# Drop ID column if exists
if 'SEQN' in data.columns:
    data.drop(columns=['SEQN'], inplace=True)

# ---------------------------------------------
# 🔧 Step 3: Feature Engineering (Effective Only)
# ---------------------------------------------
# Add engineered features

data['is_obese'] = (data['BMXBMI'] >= 30).astype(int)
data['glucose_tolerance_ratio'] = data['LBXGLT'] / (data['LBXGLU'] + 1e-5)
data['insulin_sensitivity'] = data['LBXGLU'] / (data['LBXIN'] + 1e-5)

# Clean infinities and impute
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Impute NaNs using median
num_cols = data.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='median')
data[num_cols] = imputer.fit_transform(data[num_cols])

# ---------------------------------------------
# 🔁 Step 4: Split Data Back
# ---------------------------------------------
train_data = data[data['age_group'] != -1].copy()
test_data = data[data['age_group'] == -1].drop(columns=['age_group'])
X = train_data.drop(columns=['age_group'])
y = train_data['age_group']

# ---------------------------------------------
# 🧠 Step 5: Train XGBoost with CV + Threshold Tuning
# ---------------------------------------------
FOLDS = 5
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
y_preds = np.zeros(len(test_data))
oof_val_preds = np.zeros(len(X))
best_thresholds = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pos_weight = sum(y_train == 0) / sum(y_train == 1)
    model = XGBClassifier(
        n_estimators=500,
        max_depth=4,
        learning_rate=0.03,
        subsample=0.85,
        colsample_bytree=0.8,
        scale_pos_weight=pos_weight,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=fold
    )
    model.fit(X_train, y_train)

    val_probs = model.predict_proba(X_val)[:, 1]
    thresholds = np.arange(0.4, 0.7, 0.01)
    scores = [f1_score(y_val, (val_probs > t).astype(int)) for t in thresholds]
    best_t = thresholds[np.argmax(scores)]
    best_thresholds.append(best_t)

    val_preds = (val_probs > best_t).astype(int)
    oof_val_preds[val_idx] = val_preds

    test_probs = model.predict_proba(test_data)[:, 1]
    y_preds += (test_probs > best_t).astype(int) / FOLDS

# ---------------------------------------------
# 📈 Step 6: Evaluation
# ---------------------------------------------
print("\n📌 Cross-Validation Results")
print(f"Mean F1 Score (OOF): {f1_score(y, oof_val_preds):.4f}")
print(f"Average Optimal Threshold: {np.mean(best_thresholds):.4f}")
print("Classification Report (OOF):")
print(classification_report(y, oof_val_preds))

# ---------------------------------------------
# 📤 Step 7: Final Submission
# ---------------------------------------------
final_preds = (y_preds >= 0.5).astype(int)
submission['age_group'] = final_preds
submission.to_csv('final_submission.csv', index=False)
print("\n✅ Submission file saved as final_submission.csv")

  return op(a, b)



📌 Cross-Validation Results
Mean F1 Score (OOF): 0.4089
Average Optimal Threshold: 0.4920
Classification Report (OOF):
              precision    recall  f1-score   support

         0.0       0.90      0.80      0.85      1638
         1.0       0.33      0.53      0.41       314

    accuracy                           0.76      1952
   macro avg       0.62      0.66      0.63      1952
weighted avg       0.81      0.76      0.78      1952


✅ Submission file saved as final_submission.csv


---------------------------------------------
📝 Summary Markdown Cell (Include in Notebook)
---------------------------------------------

## 📊 Summary: Age Group Prediction – NHANES Dataset (Summer Analytics 2025)

This notebook demonstrates a full pipeline to classify individuals as "Adult" or "Senior" based on health metrics using XGBoost.

### ✅ Highlights:
- Custom engineered features (BMI category, insulin sensitivity)
- Proper CV with per-fold threshold tuning
- XGBoost with scale_pos_weight for imbalance
- Median imputation and safe preprocessing
- Final F1 Score: **46.15 (Public Leaderboard)**

The solution prioritizes generalization and ethical modeling without leakage or test-data tuning.
