In [38]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Diabetes Prediction Challenge

## Business Understanding

Predict the probability of `diagnosed_diabetes`.

## Measurable Metrics

The ROC-AUC metric will be used, as required by the challenge guidelines.

## Data Preparation

In [70]:
import pandas as pd
import numpy as np
import seaborn as sns

import src.utils as utils

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [40]:
dtypes = {
    'id': np.int64,
    'age': np.int16,
    'alcohol_consumption_per_week': np.int64,
    'physical_activity_minutes_per_week': np.int64,
    'diet_score': np.float64,
    'sleep_hours_per_day': np.float64,
    'screen_time_hours_per_day': np.float64,
    'bmi': np.float64,
    'waist_to_hip_ratio': np.float64,
    'systolic_bp': np.int64,
    'diastolic_bp': np.int64,
    'heart_rate': np.int64,
    'cholesterol_total': np.int64,
    'hdl_cholesterol': np.int64,
    'ldl_cholesterol': np.int64,
    'triglycerides': np.int64,
    'gender': pd.CategoricalDtype(),
    'ethnicity': pd.CategoricalDtype(),
    'education_level': pd.CategoricalDtype(),
    'income_level': pd.CategoricalDtype(),
    'smoking_status': pd.CategoricalDtype(),
    'employment_status': pd.CategoricalDtype(),
    'family_history_diabetes': np.int64,
    'hypertension_history': np.int64,
    'cardiovascular_history': np.int64,
    'diagnosed_diabetes': np.float64,
}

df_train = pd.read_csv('input/train.csv', dtype=dtypes) # type: ignore
df_test = pd.read_csv('input/test.csv', dtype=dtypes) # type: ignore

In [41]:
df_train.sample(5, random_state=3)

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
543938,543938,45,1,72,5.3,7.4,4.0,27.2,0.87,100,83,77,198,41,133,91,Male,Black,Graduate,Lower-Middle,Never,Employed,0,0,0,1.0
115011,115011,89,3,165,7.3,8.7,6.7,26.9,0.82,132,80,48,214,57,126,135,Female,White,Graduate,Low,Never,Employed,0,1,0,1.0
620215,620215,59,3,90,4.9,6.5,6.7,26.7,0.89,106,88,61,203,58,123,121,Female,Hispanic,Graduate,Middle,Never,Employed,0,0,0,1.0
447572,447572,30,1,86,6.2,6.7,2.5,24.8,0.89,94,80,61,166,46,87,123,Female,Hispanic,Graduate,Upper-Middle,Current,Employed,0,0,0,0.0
330778,330778,72,3,65,5.6,7.9,6.8,24.9,0.86,116,78,83,215,58,123,114,Female,White,Graduate,Middle,Current,Employed,0,0,0,1.0


In [42]:
# skims data for more informations

utils.skim_data(df_train)

Total duplicate rows: 0
DF shape: (700000, 26)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,id,int64,0.0,0.0,0.0,700000,100.0,"[0, 1, 2, 3, 4]"
1,age,int16,0.0,0.0,0.0,71,0.01,"[31, 50, 32, 54, 42]"
2,alcohol_consumption_per_week,int64,0.0,0.0,0.0,9,0.0,"[1, 2, 3, 4, 5]"
3,physical_activity_minutes_per_week,int64,0.0,0.0,0.0,565,0.08,"[45, 73, 158, 77, 55]"
4,diet_score,float64,0.0,0.0,0.0,99,0.01,"[7.7, 5.7, 8.5, 4.6, 4.4]"
5,sleep_hours_per_day,float64,0.0,0.0,0.0,69,0.01,"[6.8, 6.5, 7.4, 7.0, 6.2]"
6,screen_time_hours_per_day,float64,0.0,0.0,0.0,151,0.02,"[6.1, 5.8, 9.1, 9.2, 5.1]"
7,bmi,float64,0.0,0.0,0.0,231,0.03,"[33.4, 23.8, 24.1, 26.6, 28.8]"
8,waist_to_hip_ratio,float64,0.0,0.0,0.0,36,0.01,"[0.93, 0.83, 0.9, 0.84, 0.89]"
9,systolic_bp,int64,0.0,0.0,0.0,71,0.01,"[112, 120, 95, 121, 108]"


Key takeaways:

- `id` is not useful since its variance is too high.
- data imbalance: there are more people with diabetes than without.
- no null values.
- no features with negative values.
- no features with 100% zero values.

In [43]:
utils.skim_data(df_test)

Total duplicate rows: 0
DF shape: (300000, 25)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,id,int64,0.0,0.0,0.0,300000,100.0,"[700000, 700001, 700002, 700003, 700004]"
1,age,int16,0.0,0.0,0.0,71,0.02,"[45, 35, 55, 77, 32]"
2,alcohol_consumption_per_week,int64,0.0,0.0,0.0,9,0.0,"[4, 1, 2, 3, 5]"
3,physical_activity_minutes_per_week,int64,0.0,0.0,0.0,544,0.18,"[100, 87, 61, 81, 29]"
4,diet_score,float64,0.0,0.0,0.0,99,0.03,"[4.3, 3.5, 7.6, 7.3, 4.8]"
5,sleep_hours_per_day,float64,0.0,0.0,0.0,68,0.02,"[6.8, 4.6, 7.3, 7.6, 7.0]"
6,screen_time_hours_per_day,float64,0.0,0.0,0.0,147,0.05,"[6.2, 9.0, 7.0, 5.0, 8.5]"
7,bmi,float64,0.0,0.0,0.0,230,0.08,"[25.5, 28.6, 28.5, 26.9, 22.0]"
8,waist_to_hip_ratio,float64,0.0,0.0,0.0,37,0.01,"[0.84, 0.88, 0.94, 0.91, 0.83]"
9,systolic_bp,int64,0.0,0.0,0.0,77,0.03,"[123, 120, 112, 114, 131]"


In [44]:
X = df_train.drop(columns=['diagnosed_diabetes'])
y = df_train['diagnosed_diabetes']

# separate to train and validation datasets

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=29, stratify=y)

## Fitting Model

### Fitting SGD Model

In [57]:
# find the best model

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV

import category_encoders as ce

from skopt import BayesSearchCV
from skopt.space import Real, Categorical

def sgd_proba(X_train, y_train, X_valid):
    # preprocessor
    numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
    numeric_features = [col for col in numeric_features if col != 'id']
    ohe_features = ['gender', 'smoking_status']
    target_features = ['ethnicity', 'education_level', 'employment_status']
    ordinal_feature = ['income_level']

    numeric_pipeline = Pipeline(
        steps=[('step', StandardScaler())]
    )
    ohe_pipeline = Pipeline(
        steps=[('step', OneHotEncoder(handle_unknown='ignore'))]
    )
    target_pipeline = Pipeline(
        steps=[('step', ce.TargetEncoder())]
    )
    ordinal_pipeline = Pipeline(
        steps=[
            (
                'step',
                OrdinalEncoder(
                    categories=[['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']]
                )
            )
        ]
    )
    preprocessing = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numeric_features),
            ('ohe', ohe_pipeline, ohe_features),
            ('tar', target_pipeline, target_features),
            ('ord', ordinal_pipeline, ordinal_feature),
        ],
        remainder='drop'
    )

    # main pipeline
    pipeline = Pipeline(
        steps=[
            ('preprocess', preprocessing),
            ('select', SelectFromModel(estimator=RandomForestClassifier(random_state=29))),
            ('rbf', RBFSampler(random_state=29)),
            (
                'classifier',
                CalibratedClassifierCV(
                    estimator=SGDClassifier(random_state=29, loss='log_loss'),
                    cv=5
                )
            )
        ]
    )

    # randomized search
    param_grid = {
        'select__threshold': Categorical(['0.5*median', '0.75*median']),
        'rbf__gamma': Real(1e-3, 1e+2, 'log-uniform'),
        'classifier__estimator__alpha': Real(1e-6, 1e-1, 'log-uniform'),
        'classifier__method': Categorical(['sigmoid', 'isotonic'])
    }
    custom_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=29)
    bayes_search = BayesSearchCV(
        estimator=pipeline,
        search_spaces=param_grid,
        scoring='roc_auc',
        n_jobs=4,
        cv=custom_cv,
        random_state=29,
        n_iter=30
    )
    start_time = utils.get_time()
    print(f'Fitting RandomizedSearch at {start_time}')
    bayes_search.fit(X_train, y_train)
    print(f'Best parameters: {bayes_search.best_params_}')
    best_model = bayes_search.best_estimator_
    timestamp = utils.get_time()
    utils.save_model(best_model, f'sgd_{timestamp}.joblib')
    print('Predict on valid dataset')
    df_valid_result = bayes_search.predict_proba(X_valid)
    df_submission = pd.DataFrame(
        {
            'id': X_valid['id'],
            'diagnosed_diabetes': df_valid_result[:, 1]
        }
    )
    timestamp = utils.get_time()
    df_submission.to_csv(f'input/sgd_valid_{timestamp}.csv', index=False)

sgd_proba(X_train, y_train, X_valid)

Fitting RandomizedSearch at 2025_12_03_21_15_09
Best parameters: OrderedDict({'classifier__estimator__alpha': 1e-06, 'classifier__method': 'isotonic', 'rbf__gamma': 0.001, 'select__threshold': '0.5*median'})
Model object successfully saved at: models/sgd_2025_12_04_00_30_44.joblib
Predict on valid dataset


### Fitting LightGBM Model

In [48]:
from lightgbm import LGBMClassifier

def lgbm_proba(X_train, y_train, X_valid):
    # preprocessor
    numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
    numeric_features = [col for col in numeric_features if col != 'id']
    ohe_features = ['gender', 'smoking_status']
    target_features = ['ethnicity', 'education_level', 'employment_status']
    ordinal_feature = ['income_level']

    numeric_pipeline = Pipeline(
        steps=[('step', StandardScaler())]
    )
    ohe_pipeline = Pipeline(
        steps=[('step', OneHotEncoder(handle_unknown='ignore'))]
    )
    target_pipeline = Pipeline(
        steps=[('step', ce.TargetEncoder())]
    )
    ordinal_pipeline = Pipeline(
        steps=[
            (
                'step',
                OrdinalEncoder(
                    categories=[['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']]
                )
            )
        ]
    )
    preprocessing = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numeric_features),
            ('ohe', ohe_pipeline, ohe_features),
            ('tar', target_pipeline, target_features),
            ('ord', ordinal_pipeline, ordinal_feature),
        ],
        remainder='drop'
    )

    # main pipeline
    pipeline = Pipeline(
        steps=[
            ('preprocess', preprocessing),
            ('classifier', LGBMClassifier(random_state=29))
        ]
    )

    # randomized search
    param_grid = {
        'classifier__n_estimators': [100, 200, 500],
        'classifier__learning_rate': [0.01, 0.05, 0.1],
        'classifier__num_leaves': [20, 31, 40],
        'classifier__reg_alpha': [0.1, 0.5, 1.0],
        'classifier__reg_lambda': [0.1, 0.5, 1.0]
    }
    custom_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=29)
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_grid,
        scoring='roc_auc',
        n_jobs=5,
        cv=custom_cv,
        random_state=29,
    )
    start_time = utils.get_time()
    print(f'Fitting RandomizedSearch at {start_time}')
    random_search.fit(X_train, y_train)
    print(f'Best parameters: {random_search.best_params_}')
    best_model = random_search.best_estimator_
    timestamp = utils.get_time()
    utils.save_model(best_model, f'lgbm_{timestamp}.joblib')
    print('Predict on valid dataset')
    df_valid_result = random_search.predict_proba(X_valid)
    df_submission = pd.DataFrame(
        {
            'id': X_valid['id'],
            'diagnosed_diabetes': df_valid_result[:, 1]
        }
    )
    timestamp = utils.get_time()
    df_submission.to_csv(f'input/lgbm_valid_{timestamp}.csv', index=False)

lgbm_proba(X_train, y_train, X_valid)

Fitting RandomizedSearch at 2025_12_03_20_23_53




[LightGBM] [Info] Number of positive: 349046, number of negative: 210954
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042807 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1654
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623296 -> initscore=0.503564
[LightGBM] [Info] Start training from score 0.503564
Best parameters: {'classifier__reg_lambda': 1.0, 'classifier__reg_alpha': 0.1, 'classifier__num_leaves': 20, 'classifier__n_estimators': 500, 'classifier__learning_rate': 0.05}
Model object successfully saved at: models/lgbm_2025_12_03_20_28_58.joblib
Predict on valid dataset




### Generate Best Weight

In [58]:
from sklearn.metrics import roc_auc_score

valid_preds_sgd = pd.read_csv('input/sgd_valid_2025_12_04_00_30_47.csv')['diagnosed_diabetes']
valid_preds_lgbm = pd.read_csv('input/lgbm_valid_2025_12_03_20_29_00.csv')['diagnosed_diabetes']
best_score = 0
best_weight = 0

for weight in np.arange(0.0, 1.01, 0.01):
    blended_score = (weight * valid_preds_sgd) + ((1 - weight) * valid_preds_lgbm)
    score = roc_auc_score(y_valid, blended_score)

    if score > best_score:
        best_score = score
        best_weight = weight

print(f"Local validation score for SGD only: {roc_auc_score(y_valid, valid_preds_sgd):.5f}")
print(f"Local validation score for LGBM only: {roc_auc_score(y_valid, valid_preds_lgbm):.5f}")
print("-" * 20)
print(f"Best blending weight for SGD model: {best_weight:.2f}")
print(f"Best blending weight for LGBM model: {1 - best_weight:.2f}")
print(f"Best blended ROC AUC score on holdout set: {best_score:.5f}")
print("-" * 50)

Local validation score for SGD only: 0.69574
Local validation score for LGBM only: 0.72581
--------------------
Best blending weight for SGD model: 0.00
Best blending weight for LGBM model: 1.00
Best blended ROC AUC score on holdout set: 0.72581
--------------------------------------------------


### Generate Best Result

In [59]:
sgd_model = utils.load_model('models/sgd_2025_12_03_20_09_37.joblib')
lgbm_model = utils.load_model('models/lgbm_2025_12_03_20_28_58.joblib')

df_test_sgd_result = None
df_test_lgbm_result = None

if sgd_model is not None:
    print('Predict on test dataset using SGD')
    df_test_sgd_result = sgd_model.predict_proba(df_test)

if lgbm_model is not None:
    print('Predict on test dataset using LGBM')
    df_test_lgbm_result = lgbm_model.predict_proba(df_test)

final_blended_preds = (best_weight * df_test_sgd_result) + ((1 - best_weight) * df_test_lgbm_result)
df_submission = pd.DataFrame(
    {
        'id': df_test['id'],
        'diagnosed_diabetes': final_blended_preds[:, 1]
    }
)
timestamp = utils.get_time()
df_submission.to_csv(f'input/submission_blended_{timestamp}.csv', index=False)

Model loaded from: models/sgd_2025_12_03_20_09_37.joblib
Model loaded from: models/lgbm_2025_12_03_20_28_58.joblib
Predict on test dataset using SGD
Predict on test dataset using LGBM




### Fitting CatBoost Model

In [76]:
from skopt.space import Integer
from catboost import CatBoostClassifier

def catboost_proba(X_train, y_train, X_test):
    numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
    numeric_features = [col for col in numeric_features if col != 'id']
    ohe_features = ['gender', 'smoking_status']
    target_features = ['ethnicity', 'education_level', 'employment_status']
    ordinal_feature = ['income_level']
    categorical_features = ohe_features + target_features + ordinal_feature

    search_spaces = {
        'iterations': Integer(50, 200),
        'learning_rate': Real(1e-3, 0.1, 'log-uniform'),
        'depth': Integer(4, 10),
        'l2_leaf_reg': Real(1e-2, 1e1, 'log-uniform'),
        'border_count': Integer(32, 128),
        'bootstrap_type': Categorical(['Bayesian', 'MVS']),
        'random_strength': Real(1e-2, 1e0, 'log-uniform'),
    }

    cb_model = CatBoostClassifier(random_state=23,
                                  logging_level='Silent',
                                  cat_features=categorical_features)
    opt = BayesSearchCV(
        estimator=cb_model,
        search_spaces=search_spaces,
        n_iter=50,
        cv=3,
        scoring='roc_auc',
        verbose=0,
        n_jobs=4,
        random_state=29
    )

    start_time = utils.get_time()
    print(f'Fitting BayesSearch at {start_time}')
    opt.fit(X_train, y_train)
    print(f'Best parameters: {opt.best_params_}')
    best_model = opt.best_estimator_
    end_time = utils.get_time()
    utils.save_model(best_model, f'cbt_{end_time}.joblib')
    print('Predict on test dataset')
    df_test_result = opt.predict_proba(X_test)
    df_submission = pd.DataFrame(
        {
            'id': X_test['id'],
            'diagnosed_diabetes': df_test_result[:, 1]
        }
    )
    df_submission.to_csv(f'input/cbt_test_{end_time}.csv', index=False)

catboost_proba(X_train, y_train, df_test)

Fitting BayesSearch at 2025_12_04_23_00_02
Best parameters: OrderedDict({'bootstrap_type': 'MVS', 'border_count': 128, 'depth': 9, 'iterations': 200, 'l2_leaf_reg': 10.0, 'learning_rate': 0.1, 'random_strength': 0.01})
Model object successfully saved at: models/cbt_2025_12_05_00_00_06.joblib
Predict on test dataset


### Stacking Implementation

In [81]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

def stacking_proba(X_train, y_train, X_test):
    # preprocessor
    numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
    numeric_features = [col for col in numeric_features if col != 'id']
    ohe_features = ['gender', 'smoking_status']
    target_features = ['ethnicity', 'education_level', 'employment_status']
    ordinal_feature = ['income_level']
    numeric_pipeline = Pipeline(
        steps=[('step', StandardScaler())]
    )
    ohe_pipeline = Pipeline(
        steps=[('step', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]
    )
    target_pipeline = Pipeline(
        steps=[('step', ce.TargetEncoder())]
    )
    ordinal_pipeline = Pipeline(
        steps=[
            (
                'step',
                OrdinalEncoder(
                    categories=[['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']]
                )
            )
        ]
    )
    preprocessing = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numeric_features),
            ('ohe', ohe_pipeline, ohe_features),
            ('tar', target_pipeline, target_features),
            ('ord', ordinal_pipeline, ordinal_feature),
        ],
        remainder='drop'
    )

    # sgd pipeline
    sgd_pipeline = Pipeline(
        steps=[
            ('select', SelectFromModel(estimator=RandomForestClassifier(random_state=29), threshold='0.5*median')),
            ('rbf', RBFSampler(random_state=29, gamma=0.001)),
            (
                'classifier',
                SGDClassifier(random_state=29, loss='log_loss', alpha=1e-06)
            )
        ]
    )

    # lgbm pipeline
    lgbm_pipeline = LGBMClassifier(reg_lambda=1.0, reg_alpha=0.1, num_leaves=20, n_estimators=500, learning_rate=0.05, random_state=29)

    # catboost pipeline
    cbt_pipeline = CatBoostClassifier(random_state=23,
                                      logging_level='Silent',
                                      bootstrap_type='MVS',
                                      border_count=128,
                                      depth=9,
                                      iterations=200,
                                      l2_leaf_reg=10.0,
                                      learning_rate=0.1,
                                      random_strength=0.01)

    # main pipeline
    estimator_list = [
        ('lgbm', lgbm_pipeline),
        ('cbt', cbt_pipeline)
    ]
    meta_model = LogisticRegression(C=1.331359140628303, random_state=29)
    stacking_model = StackingClassifier(
        estimators=estimator_list,
        final_estimator=meta_model,
        cv=5,
        passthrough=True,
        n_jobs=4
    )
    pipeline = Pipeline(
        steps=[
            ('preprocess', preprocessing),
            ('classifier', stacking_model)
        ]
    )

    start_time = utils.get_time()
    print(f'Fitting Stacking Model at {start_time}')
    pipeline.fit(X_train, y_train)
    stacking_test_result = pipeline.predict_proba(X_test)
    df_submission = pd.DataFrame(
        {
            'id': X_test['id'],
            'diagnosed_diabetes': stacking_test_result[:, 1]
        }
    )
    timestamp = utils.get_time()
    df_submission.to_csv(f'input/submission_stacking_{timestamp}.csv', index=False)

X_train_whole = df_train.drop(columns=['diagnosed_diabetes'])
y_train_whole = df_train['diagnosed_diabetes']
stacking_proba(X_train_whole, y_train_whole, df_test)

Fitting Stacking Model at 2025_12_05_18_48_35
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Number of positive: 349046, number of negative: 210954
[LightGBM] [Info] Number of positive: 349046, number of negative: 210954
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.109792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1650
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 28
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.102130 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1656
[LightGBM] [Info] Number of data points in the t

Latest result: CatBoost is still dominating, even against stacking. We will develop new feature engineering using combinatorial feature generation and asymmetry features.

## Feature Engineering

Thanks to [Yanzhe Zhou](https://www.kaggle.com/code/yuanzhezhou/baseline-lgb-xgb-and-catboost) for the idea.

In [86]:
utils.skim_data(df_train)

Total duplicate rows: 0
DF shape: (700000, 26)


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,id,int64,0.0,0.0,0.0,700000,100.0,"[0, 1, 2, 3, 4]"
1,age,int16,0.0,0.0,0.0,71,0.01,"[31, 50, 32, 54, 42]"
2,alcohol_consumption_per_week,int64,0.0,0.0,0.0,9,0.0,"[1, 2, 3, 4, 5]"
3,physical_activity_minutes_per_week,int64,0.0,0.0,0.0,565,0.08,"[45, 73, 158, 77, 55]"
4,diet_score,float64,0.0,0.0,0.0,99,0.01,"[7.7, 5.7, 8.5, 4.6, 4.4]"
5,sleep_hours_per_day,float64,0.0,0.0,0.0,69,0.01,"[6.8, 6.5, 7.4, 7.0, 6.2]"
6,screen_time_hours_per_day,float64,0.0,0.0,0.0,151,0.02,"[6.1, 5.8, 9.1, 9.2, 5.1]"
7,bmi,float64,0.0,0.0,0.0,231,0.03,"[33.4, 23.8, 24.1, 26.6, 28.8]"
8,waist_to_hip_ratio,float64,0.0,0.0,0.0,36,0.01,"[0.93, 0.83, 0.9, 0.84, 0.89]"
9,systolic_bp,int64,0.0,0.0,0.0,71,0.01,"[112, 120, 95, 121, 108]"


In [87]:
def transform_features(df):
    # creating families of features
    lifestyle_A = ['alcohol_consumption_per_week', 'physical_activity_minutes_per_week']
    lifestyle_B = ['diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day']
    heart_related = ['systolic_bp', 'diastolic_bp', 'heart_rate']
    cholesterol_level = ['hdl_cholesterol', 'ldl_cholesterol']

    df_copy = df.copy()

    # combinatorial feature generation (cfg)
    cfg_features = [lifestyle_A, lifestyle_B, heart_related, cholesterol_level]

    for feature in cfg_features:
        for i, a in enumerate(feature):
            for j, b in enumerate(feature):
                if i > j:
                    df_copy[f'{a}_{b}_imb'] = df.eval(f'({a}-{b})/({a}+{b})')

    # asymmetry features
    af_features = [lifestyle_B, heart_related]

    for feature in af_features:
        for i, a in enumerate(feature):
            for j, b in enumerate(feature):
                for k, c in enumerate(feature):
                    if i > j and j > k:
                        max_ = df_copy[[a, b, c]].max(axis=1)
                        min_ = df_copy[[a, b, c]].min(axis=1)
                        mid_ = df_copy[[a, b, c]].sum(axis=1)-min_-max_
                        df_copy[f'{a}_{b}_{c}_imb2'] = (max_-mid_)/(mid_-min_)

    return df_copy

transform_features(df_train.sample(3))

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes,physical_activity_minutes_per_week_alcohol_consumption_per_week_imb,sleep_hours_per_day_diet_score_imb,screen_time_hours_per_day_diet_score_imb,screen_time_hours_per_day_sleep_hours_per_day_imb,diastolic_bp_systolic_bp_imb,heart_rate_systolic_bp_imb,heart_rate_diastolic_bp_imb,ldl_cholesterol_hdl_cholesterol_imb,screen_time_hours_per_day_sleep_hours_per_day_diet_score_imb2,heart_rate_diastolic_bp_systolic_bp_imb2
81159,81159,55,2,69,8.5,6.6,3.9,31.0,0.86,125,83,69,172,54,91,144,Male,White,Graduate,Middle,Never,Employed,0,0,0,1.0,0.943662,-0.125828,-0.370968,-0.257143,-0.201923,-0.28866,-0.092105,0.255172,0.703704,3.0
167180,167180,46,1,104,6.4,5.8,5.1,26.1,0.84,108,75,66,194,34,139,109,Female,Hispanic,Postgraduate,High,Former,Employed,0,0,0,1.0,0.980952,-0.04918,-0.113043,-0.06422,-0.180328,-0.241379,-0.06383,0.606936,0.857143,3.666667
622567,622567,74,2,44,7.2,6.9,2.5,29.7,0.91,130,81,70,192,56,115,150,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0,1.0,0.913043,-0.021277,-0.484536,-0.468085,-0.232227,-0.3,-0.072848,0.345029,0.068182,4.454545


In [88]:
def catboost_proba2(X_train, y_train, X_test):
    numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
    numeric_features = [col for col in numeric_features if col != 'id']
    ohe_features = ['gender', 'smoking_status']
    target_features = ['ethnicity', 'education_level', 'employment_status']
    ordinal_feature = ['income_level']
    categorical_features = ohe_features + target_features + ordinal_feature

    X_train_engineered = transform_features(X_train)
    X_test_engineered = transform_features(X_test)

    search_spaces = {
        'iterations': Integer(50, 200),
        'learning_rate': Real(1e-3, 0.1, 'log-uniform'),
        'depth': Integer(4, 10),
        'l2_leaf_reg': Real(1e-2, 1e1, 'log-uniform'),
        'border_count': Integer(32, 128),
        'bootstrap_type': Categorical(['Bayesian', 'MVS']),
        'random_strength': Real(1e-2, 1e0, 'log-uniform'),
    }

    cb_model = CatBoostClassifier(random_state=23,
                                  logging_level='Silent',
                                  cat_features=categorical_features)
    opt = BayesSearchCV(
        estimator=cb_model,
        search_spaces=search_spaces,
        n_iter=50,
        cv=3,
        scoring='roc_auc',
        verbose=0,
        n_jobs=4,
        random_state=29
    )

    start_time = utils.get_time()
    print(f'Fitting BayesSearch at {start_time}')
    opt.fit(X_train_engineered, y_train)
    print(f'Best parameters: {opt.best_params_}')
    best_model = opt.best_estimator_
    end_time = utils.get_time()
    utils.save_model(best_model, f'cbt_{end_time}.joblib')
    print('Predict on test dataset')
    df_test_result = opt.predict_proba(X_test_engineered)
    df_submission = pd.DataFrame(
        {
            'id': X_test['id'],
            'diagnosed_diabetes': df_test_result[:, 1]
        }
    )
    df_submission.to_csv(f'input/cbt_test_{end_time}.csv', index=False)

catboost_proba(X_train, y_train, df_test)

Fitting BayesSearch at 2025_12_05_20_05_54
Best parameters: OrderedDict({'bootstrap_type': 'MVS', 'border_count': 128, 'depth': 9, 'iterations': 200, 'l2_leaf_reg': 10.0, 'learning_rate': 0.1, 'random_strength': 0.01})
Model object successfully saved at: models/cbt_2025_12_05_21_23_22.joblib
Predict on test dataset


Latest score: 0.69831. The very same score apparently can be achieved without using the feature engineering.