In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
#!pip install --force-reinstall -U scikit-learn imbalanced-learn --quiet



import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, make_scorer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/aiplanet/Train_Data.csv
/kaggle/input/aiplanet/Test_Data.csv
/kaggle/input/okokok/Sample_Submission.csv


In [2]:
# Load data
train = pd.read_csv("/kaggle/input/aiplanet/Train_Data.csv")
test = pd.read_csv("/kaggle/input/aiplanet/Test_Data.csv")

# Filter target rows
train = train[train['age_group'].isin(['Adult', 'Senior'])]
y = train['age_group'].map({'Adult': 0, 'Senior': 1})
X = train.drop(columns=['SEQN', 'age_group'])

# Save SEQN for submission
test_seqn = test['SEQN']
X_test = test.drop(columns=['SEQN'])

# Impute missing values using median
imputer = SimpleImputer(strategy='median')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [3]:
# Create glucose-insulin ratio
X['glucose_insulin_ratio'] = X['LBXGLU'] / (X['LBXIN'] + 1)
X_test['glucose_insulin_ratio'] = X_test['LBXGLU'] / (X_test['LBXIN'] + 1)

# BMI class bins
X['bmi_class'] = pd.cut(X['BMXBMI'], bins=[0, 18.5, 25, 30, 100], labels=[0, 1, 2, 3])
X_test['bmi_class'] = pd.cut(X_test['BMXBMI'], bins=[0, 18.5, 25, 30, 100], labels=[0, 1, 2, 3])

# Fill new categorical bmi_class
X['bmi_class'] = X['bmi_class'].astype(float)
X_test['bmi_class'] = X_test['bmi_class'].astype(float)

ratio = (y == 0).sum() / (y == 1).sum()


In [4]:
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=ratio, random_state=42),
    "LightGBM": LGBMClassifier(class_weight='balanced', random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42, scale_pos_weight=ratio)
}

# Evaluate with Stratified K-Fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1 = make_scorer(f1_score)

print("\n📊 Model Evaluation:")
results = {}
for name, model in models.items():
    score = cross_val_score(model, X, y, scoring=f1, cv=cv)
    print(f"{name}: Mean F1 = {score.mean():.4f}")
    results[name] = score.mean()




📊 Model Evaluation:
XGBoost: Mean F1 = 0.2844
[LightGBM] [Info] Number of positive: 251, number of negative: 1310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003255 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1002
[LightGBM] [Info] Number of data points in the train set: 1561, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 251, number of negative: 1310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000100 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1006
[LightGBM] [Info] Number of data points in the train set: 1561, number o

In [5]:
# Train best model on full data
best_model_name = max(results, key=results.get)
final_model = models[best_model_name]
final_model.fit(X, y)

# Predict on test
preds = final_model.predict(X_test)

# Save submission
submission = pd.DataFrame({'age_group': preds})
submission.to_csv("submission.csv", index=False)
print(f"\n✅ submission.csv created using {best_model_name}.")



✅ submission.csv created using CatBoost.
