In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [2]:
# Load data
# Both CSVs are read from the working directory, training file first.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('Train_Data.csv', 'Test_Data.csv')
)

In [3]:
# Set the SEQN column of the test set aside, then remove SEQN from both frames
# so it is not used as a feature.
test_ids = test['SEQN']
# Rebind instead of mutating with inplace=True: the frames produced by the load
# cell are left untouched and each statement yields a fresh, clearly derived frame.
train = train.drop(columns='SEQN')
test = test.drop(columns='SEQN')

In [4]:
# Encode categorical variables
def encode_data(df):
    """Return a recoded copy of ``df``; the input frame is not modified.

    Three columns are replaced by their mapped values:

    * ``RIAGENDR``: 1 -> 0, 2 -> 1
    * ``PAQ605``:   1 -> 1, 2 -> 0, 9 -> NaN
    * ``DIQ010``:   1 -> 1, 2 -> 0, 3 -> NaN, 9 -> NaN

    Any value that is not a key of the corresponding mapping also becomes NaN,
    because ``Series.map`` with a dict yields NaN for unmatched keys.
    All other columns are passed through unchanged.

    Raises:
        KeyError: if one of the three columns is missing from ``df``.
    """
    recodings = {
        'RIAGENDR': {1: 0, 2: 1},
        'PAQ605': {1: 1, 2: 0, 9: np.nan},
        'DIQ010': {1: 1, 2: 0, 3: np.nan, 9: np.nan},
    }
    encoded = df.copy()
    for column, codes in recodings.items():
        encoded[column] = encoded[column].map(codes)
    return encoded

In [5]:
# Predictors: every column except the target.
X = train.drop(columns=['age_group'])
# Target converted to numeric: 'Adult' -> 0, 'Senior' -> 1.
# Any other value (including a missing label) comes out of .map() as NaN.
y = train['age_group'].map({'Adult': 0, 'Senior': 1})

In [6]:
# Apply the same recoding to the training predictors and to the test set.
X_encoded, test_encoded = (encode_data(frame) for frame in (X, test))

In [7]:
# Impute missing values
# Column medians are learned from the training predictors only and then reused
# unchanged for the test set.
# NOTE(review): the fit uses every training row, including rows that later land in
# the validation split - confirm this is acceptable for the validation estimate.
imputer = SimpleImputer(strategy='median').fit(X_encoded)
# transform() returns a bare array, so the column labels are re-attached here;
# both resulting frames get a fresh default index.
X_imputed, test_imputed = (
    pd.DataFrame(imputer.transform(frame), columns=X_encoded.columns)
    for frame in (X_encoded, test_encoded)
)

In [8]:
# Drop rows with NaN in y
mask = ~y.isnull()
# X_imputed was rebuilt from a bare array, so its rows correspond to y by position,
# not by index label. Applying the mask as a NumPy array keeps the two in step
# whatever index y carries, instead of relying on both having the same labels.
keep_rows = mask.to_numpy()
X_clean = X_imputed.loc[keep_rows].reset_index(drop=True)
y_clean = y.loc[keep_rows].reset_index(drop=True)

In [9]:
# Feature engineering
# Add the LBXGLU / LBXGLT ratio to both the training and the test frame.
# The 1e-5 added to the denominator avoids a division by exactly zero.
for frame in (X_clean, test_imputed):
    frame['GLU_to_GLT'] = frame['LBXGLU'] / (frame['LBXGLT'] + 1e-5)

In [10]:
# Train-validation split
# 80/20 split, stratified on the target so both parts keep the same class
# proportions; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_clean,
    y_clean,
    test_size=0.2,
    stratify=y_clean,
    random_state=42,
)

In [11]:
# SMOTE oversampling
# Only the training part is resampled; X_val / y_val are left as they are.
# NOTE(review): SMOTE interpolates every column, including the 0/1-recoded ones -
# confirm that fractional values in those columns are acceptable.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X=X_train, y=y_train)

In [18]:
# XGBoost classifier
# NOTE(review): the model is fitted on SMOTE-resampled data further down, so
# scale_pos_weight is applied on top of the oversampling - confirm that is intended.
model = XGBClassifier(
    # Ensemble size and shape
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    # Extra weight on the positive class (label 1)
    scale_pos_weight=1.5,  # try tuning based on class imbalance ratio
    # Evaluation metric and reproducibility
    eval_metric='logloss',
    random_state=42,
)

In [19]:
# Fit on the oversampled training part; the fitted estimator is the cell's output.
model.fit(X=X_res, y=y_res)

In [20]:
# Evaluate
# Predictions are made on the untouched validation part and compared with its labels.
y_pred = model.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.87      0.83      0.85       328
         1.0       0.30      0.38      0.33        63

    accuracy                           0.75       391
   macro avg       0.59      0.60      0.59       391
weighted avg       0.78      0.75      0.77       391



In [21]:
# Train on full data + oversample
# The same sampler and model objects are reused: all labelled rows (training and
# validation parts together) are oversampled and the model is fitted again on them.
X_final, y_final = sm.fit_resample(X=X_clean, y=y_clean)
model.fit(X=X_final, y=y_final)

In [22]:
# Predict test
# Class predictions for the test set, cast to int.
final_preds = model.predict(test_imputed).astype(int)
# Single-column frame named 'age_group', written without the index column.
submission = pd.Series(final_preds, name='age_group').to_frame()
submission.to_csv('submission.csv', index=False)