# Setup

In [1]:
import os
import gc
gc.enable()

import numpy as np
import scipy.stats as st
import pandas as pd
pd.set_option('precision', 4)
pd.set_option('display.max_columns', None)

import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

import catboost as cb
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, f1_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from typing import Tuple, Any

SEED = 23
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [2]:
#remove cell to use future versions
assert optuna.__version__ == '2.10.0', f'Change in Optuna version. Original notebook version: 2.10.0'
assert cb.__version__ == '1.0.6', f'Change in CatBoost version. Original notebook version: 1.0.6'

In [3]:
train = pd.read_csv('../input/autismdiagnosis/Autism_Prediction/train.csv')
test = pd.read_csv('../input/autismdiagnosis/Autism_Prediction/test.csv')

# Data preparation

In [4]:
features = ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 
            'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 
            'AQ10_sum', 'AQ10_6_or_above', 'result', 'autism_family', 
            'is_White_European', 'jaundice', 'relation', 'age', 'age_group']

num_features = ['result', 'age', 'AQ10_sum'] #continuous features
cat_features = [f for f in features if f not in num_features] #categorical features

best_features = [f for f in features if f not in ('age', 'age_group', 'jaundice', 'relation')]

In [5]:
def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare the data according to the process described in
    (https://www.kaggle.com/code/stiwarids/autismprediction-eda),
    and return the modified dataframe.
    """
    #creating features - AQ10_sum, AQ10_6_or_above
    aq_scores = [f'A{i}_Score' for i in range(1, 11)]
    df['AQ10_sum'] = df[aq_scores].sum(axis=1)
    df['AQ10_6_or_above'] = (df['AQ10_sum'] > 5).astype('int')
    
    #creating feature - age_group
    def create_age_group(age):
        """Determine age group and return an integer indicating the category."""
        if age < 13: return 0 #child
        elif age < 21: return 1 #adolescent
        elif age < 40: return 2 #adult
        elif age < 60: return 3 #middle-aged
        else: return 4 #elderly
        
    df['age_group'] = df['age'].apply(create_age_group)
    
    #creating feature - is_White_European
    df['is_White_European'] = (df['ethnicity'] == 'White-European').astype(int)
    
    #reclassifying column - relation
    relation_mapping = {
        'Self': 2, 
        'Parent': 1, 'Relative': 1, 
        'Others': 0, '?': 0, 'Health care professional': 0
    }
    df['relation'] = df['relation'].replace(to_replace=relation_mapping)
    
    #correcting column name - austim -> autism_family
    df.rename({'austim': 'autism_family'}, axis=1, inplace=True)
    
    #integer encoding for 'yes'/'no' categorical columns - jaundice, autism_family
    df[['jaundice', 'autism_family']] = df[['jaundice', 'autism_family']].replace(to_replace={'no': 0, 'yes': 1})
    
    df = df.loc[:, features] #retaining only the selected features
    df[cat_features] = df[cat_features].astype('category') #marking the categorical features
    return df

In [6]:
test_index = test.ID #for generating submission file
target = train['Class/ASD']

train = preprocess_dataset(train)
test = preprocess_dataset(test)

gc.collect()

0

# Hyperparameter tuning (using Optuna)

**objective** function

In [7]:
def objective(
        trial: optuna.trial.Trial, 
        data: Tuple[pd.DataFrame, pd.Series],
        base_params: dict,
        resampler: Any = None) -> float:
    """Define hyperparameter search space for trial, evaluate the model
    with chosen hyperparameters on data provided, and return the metric
    score (ROC-AUC).
    """
    
    scores = []
    X, y = data
    cat_features = list(X.select_dtypes(include='category').columns)
    
    #Defining hyperparameter search space
    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3, step=0.05),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 2, 10, step=0.1),
        'depth': trial.suggest_int('depth', 3, 15),
        'rsm': trial.suggest_float('rsm', 0.5, 1.0, step=0.05), #colsample_bylevel
        'max_ctr_complexity': trial.suggest_int('max_ctr_complexity', 4, 10),
        'random_strength': trial.suggest_float('random_strength', 0.2, 5),
        'bootstrap_type': trial.suggest_categorical(
            'bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
    }
    #conditional hyperparameters
    if param_grid['bootstrap_type'] == 'Bayesian':
        param_grid['bagging_temperature'] = trial.suggest_int('bagging_temperature', 0, 10)
    else:
        param_grid['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
    
    #cross-validating for each trial
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        xtrain, ytrain = X.loc[train_idx], y.iloc[train_idx]
        xval, yval = X.loc[val_idx], y.iloc[val_idx]
        if resampler:
            #only training folds are resampled to ensure 
            #realistic evaluation on imbalanced validation fold
            xtrain, ytrain = resampler.fit_resample(xtrain, ytrain)
        
        class_ratio = sum(ytrain == 0) / sum(ytrain == 1)
        model = cb.CatBoostClassifier(
            **base_params, 
            **param_grid,
            scale_pos_weight=class_ratio) #setting this only after resampling
        model.fit(
            xtrain, ytrain,
            eval_set=[(xval, yval)],
            cat_features=cat_features,
            early_stopping_rounds=25,
            verbose=False)
        predicted_probs = model.predict_proba(xval)[:, 1]
        scores.append(roc_auc_score(yval, predicted_probs))
    
    return np.mean(scores)

**tune_hyperparameters** function

In [8]:
def tune_hyperparameters(
        data: Tuple[pd.DataFrame, pd.Series],
        base_params: dict,
        resampler: Any = None,
        n_trials: int = 5,
        direction: str = 'maximize') -> optuna.study.Study:
    """Optimize CatBoost objective function and return dict of best hyperparameters."""
    
    study = optuna.create_study(
        sampler=optuna.samplers.TPESampler(seed=SEED),
        pruner=optuna.pruners.HyperbandPruner(),
        direction=direction)
    
    study.optimize(
        func=lambda trial: objective(trial, data, base_params),
        n_trials=n_trials,
        gc_after_trial=True)
    
    return study

# Cross-validation

**evaluate_model** function

In [9]:
def evaluate_model(
        data: Tuple[pd.DataFrame, pd.DataFrame, pd.Series],
        model_params: dict, 
        resampler: Any = None,
        verbose: bool = True) -> list:
    """Cross-validate the model and return the predicted probabilities for test-set."""
    
    #scores/predictions for all folds
    probs_test = [] #test set predicted probabilities
    preds_test = [] #test set predicted classes
    scores_auc = [] #validation set AUC scores
    scores_f1 = [] #validation set weighted-f1 scores
    scores_balacc = [] #validation set balanced accuracy scores
    
    X, X_test, y = data
    cat_features = list(X.select_dtypes(include='category').columns)
    
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        xtrain, ytrain = X.loc[train_idx], y.iloc[train_idx]
        xval, yval = X.loc[val_idx], y.iloc[val_idx]
        
        if resampler:
            #only training folds are resampled to ensure 
            #realistic evaluation on imbalanced validation fold
            xtrain, ytrain = resampler.fit_resample(xtrain, ytrain)
        
        class_ratio = sum(ytrain == 0) / sum(ytrain == 1)
        model = cb.CatBoostClassifier(
            **model_params,
            scale_pos_weight=class_ratio) #setting this only after resampling
        
        model.fit(
            xtrain, ytrain,
            eval_set=[(xval, yval)],
            cat_features=cat_features,
            verbose=False)
        
        probs_val = model.predict_proba(xval)[:, 1]
        preds_val = model.predict(xval)
        probs_test.append(model.predict_proba(X_test)[:, 1])
        preds_test.append(model.predict(X_test))
        
        scores_auc.append(roc_auc_score(yval, probs_val))
        scores_f1.append(f1_score(yval, preds_val, average='weighted'))
        scores_balacc.append(balanced_accuracy_score(yval, preds_val))
    
    if verbose:
        scores_df = pd.DataFrame.from_dict({
            'ROC-AUC': scores_auc,
            'Weighted-f1': scores_f1,
            'Balanced-accuracy':scores_balacc
        })
        scores_df.index.name = 'Fold'
        display(scores_df.T)
    
    print(f'Average ROC-AUC = {np.mean(scores_auc):.4f} (with std = {np.std(scores_auc):.4f})')
    print(f'Average Weighted-f1 = {np.mean(scores_f1):.4f} (with std = {np.std(scores_f1):.4f})')
    print(f'Average Balanced-accuracy = {np.mean(scores_balacc):.4f} (with std = {np.std(scores_balacc):.4f})')
    
    preds_test = st.mode(np.column_stack(preds_test), axis=1).mode
    probs_test = np.mean(np.column_stack(probs_test), axis=1)
    
    return probs_test #only probabilities needed for competition metric

# Experiment = Hyperparameter-tuning + Cross-validation

In [10]:
def run_experiment(
        feature_set: list, 
        resampler: Any = None, 
        n_trials: int = 5) -> np.ndarray:
    """Tune the hyperparameters, train and evaluate the model, and return test predictions."""
    
    X = train[feature_set].copy()
    X_test = test[feature_set].copy()
    y = target.copy()
    
    base_params = {
        'n_estimators': 1000,
        'boosting_type': 'Ordered', #since our dataset is small
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'early_stopping_rounds': 25,
        'use_best_model': True,
        'one_hot_max_size': 2, #one-hot-encoding binary features only
        'verbose': False,
        'random_seed': SEED
    }
    
    print(f'---------------Hyperparameter tuning---------------')
    study = tune_hyperparameters(
        data=(X, y), 
        base_params=base_params,
        resampler=resampler,
        n_trials=n_trials,
        direction='maximize')
    print(f'Best trial: {study.best_trial.number} -> Best value (AUC): {study.best_value:.5f}')
    print(f'Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:20} - {v}')
    
    model_params = {**base_params, **study.best_params}
    print(f'-----------------Cross-validation------------------')
    predicted_probs = evaluate_model(
        data=(X, X_test, y), 
        model_params=model_params, 
        resampler=resampler)
    
    return predicted_probs

In [11]:
#test-set predicted probabilities for all experiments
predictions_dict = {}

### Experiment 1: All features, no resampling

In [12]:
%%time
predictions_dict[1] = run_experiment(feature_set=features, n_trials=50)

---------------Hyperparameter tuning---------------
Best trial: 10 -> Best value (AUC): 0.92259
Best hyperparameters:
learning_rate        - 0.15000000000000002
l2_leaf_reg          - 3.1
depth                - 7
rsm                  - 0.5
max_ctr_complexity   - 7
random_strength      - 2.0455983252981476
bootstrap_type       - Bernoulli
subsample            - 0.9020836064760344
-----------------Cross-validation------------------


Fold,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
ROC-AUC,0.8682,0.9541,0.9363,0.8988,0.9556,0.916,0.8889,0.9448,0.9329,0.932,0.9507,0.8774,0.9161,0.9197,0.9145
Weighted-f1,0.8197,0.8771,0.8127,0.8047,0.8501,0.8176,0.8055,0.8392,0.8492,0.8392,0.8506,0.8153,0.8269,0.8554,0.8119
Balanced-accuracy,0.7969,0.8828,0.8477,0.7969,0.8752,0.8398,0.8086,0.8555,0.8516,0.8673,0.875,0.8047,0.8242,0.8672,0.8476


Average ROC-AUC = 0.9204 (with std = 0.0265)
Average Weighted-f1 = 0.8317 (with std = 0.0209)
Average Balanced-accuracy = 0.8427 (with std = 0.0287)
CPU times: user 9min 32s, sys: 1min 35s, total: 11min 7s
Wall time: 4min 40s


### Experiment 2: Best features, no resampling

In [13]:
%%time
predictions_dict[2] = run_experiment(feature_set=best_features, n_trials=50)

---------------Hyperparameter tuning---------------
Best trial: 10 -> Best value (AUC): 0.92043
Best hyperparameters:
learning_rate        - 0.15000000000000002
l2_leaf_reg          - 3.1
depth                - 7
rsm                  - 0.5
max_ctr_complexity   - 7
random_strength      - 2.0455983252981476
bootstrap_type       - Bernoulli
subsample            - 0.9020836064760344
-----------------Cross-validation------------------


Fold,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
ROC-AUC,0.8766,0.9457,0.9218,0.9104,0.9477,0.9015,0.8931,0.9375,0.9204,0.9221,0.9412,0.8839,0.91,0.9261,0.9189
Weighted-f1,0.8153,0.8717,0.8186,0.8162,0.8768,0.8338,0.8162,0.8608,0.8528,0.8331,0.8561,0.8223,0.8376,0.8608,0.8223
Balanced-accuracy,0.8047,0.8789,0.8633,0.8164,0.8837,0.8516,0.8164,0.8711,0.832,0.8522,0.8789,0.832,0.832,0.8711,0.8443


Average ROC-AUC = 0.9171 (with std = 0.0209)
Average Weighted-f1 = 0.8396 (with std = 0.0209)
Average Balanced-accuracy = 0.8486 (with std = 0.0247)
CPU times: user 5min 20s, sys: 58.5 s, total: 6min 18s
Wall time: 3min 48s


### Experiment 3: All features, RandomOverSampler

In [14]:
%%time
ros = RandomOverSampler(random_state=SEED)
predictions_dict[3] = run_experiment(feature_set=features, resampler=ros, n_trials=50)

---------------Hyperparameter tuning---------------
Best trial: 10 -> Best value (AUC): 0.92259
Best hyperparameters:
learning_rate        - 0.15000000000000002
l2_leaf_reg          - 3.1
depth                - 7
rsm                  - 0.5
max_ctr_complexity   - 7
random_strength      - 2.0455983252981476
bootstrap_type       - Bernoulli
subsample            - 0.9020836064760344
-----------------Cross-validation------------------


Fold,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
ROC-AUC,0.8684,0.9452,0.9274,0.8861,0.9469,0.8988,0.8916,0.9135,0.9348,0.9147,0.9431,0.8933,0.9191,0.9197,0.929
Weighted-f1,0.8028,0.8669,0.8122,0.794,0.883,0.8169,0.7994,0.8338,0.8537,0.8588,0.85,0.8277,0.8055,0.85,0.8392
Balanced-accuracy,0.7734,0.8867,0.8359,0.7891,0.8988,0.8281,0.793,0.8516,0.8438,0.8494,0.8633,0.8359,0.8086,0.8633,0.8673


Average ROC-AUC = 0.9154 (with std = 0.0228)
Average Weighted-f1 = 0.8329 (with std = 0.0263)
Average Balanced-accuracy = 0.8392 (with std = 0.0347)
CPU times: user 9min 32s, sys: 1min 36s, total: 11min 8s
Wall time: 4min 40s


### Experiment 4: Best features, RandomOverSampler

In [15]:
%%time
predictions_dict[4] = run_experiment(feature_set=best_features, resampler=ros, n_trials=50)

---------------Hyperparameter tuning---------------
Best trial: 10 -> Best value (AUC): 0.92043
Best hyperparameters:
learning_rate        - 0.15000000000000002
l2_leaf_reg          - 3.1
depth                - 7
rsm                  - 0.5
max_ctr_complexity   - 7
random_strength      - 2.0455983252981476
bootstrap_type       - Bernoulli
subsample            - 0.9020836064760344
-----------------Cross-validation------------------


Fold,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
ROC-AUC,0.8617,0.9342,0.9021,0.8969,0.9427,0.9186,0.9009,0.9319,0.9255,0.9194,0.9233,0.8661,0.9197,0.9202,0.9194
Weighted-f1,0.8081,0.8546,0.8186,0.81,0.8705,0.8116,0.81,0.8492,0.8517,0.8797,0.8474,0.8081,0.826,0.842,0.806
Balanced-accuracy,0.7773,0.8555,0.7852,0.8008,0.8685,0.8242,0.8008,0.8516,0.8203,0.854,0.8281,0.7773,0.8125,0.8242,0.8325


Average ROC-AUC = 0.9122 (with std = 0.0224)
Average Weighted-f1 = 0.8329 (with std = 0.0241)
Average Balanced-accuracy = 0.8209 (with std = 0.0279)
CPU times: user 5min 24s, sys: 59 s, total: 6min 23s
Wall time: 3min 49s


### Experiment 5: All features, RandomUnderSampler

In [16]:
%%time
rus = RandomUnderSampler(random_state=SEED)
predictions_dict[5] = run_experiment(feature_set=features, resampler=rus, n_trials=50)

---------------Hyperparameter tuning---------------
Best trial: 10 -> Best value (AUC): 0.92259
Best hyperparameters:
learning_rate        - 0.15000000000000002
l2_leaf_reg          - 3.1
depth                - 7
rsm                  - 0.5
max_ctr_complexity   - 7
random_strength      - 2.0455983252981476
bootstrap_type       - Bernoulli
subsample            - 0.9020836064760344
-----------------Cross-validation------------------


Fold,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
ROC-AUC,0.8654,0.9465,0.9011,0.8845,0.9516,0.8948,0.8801,0.9351,0.9238,0.9269,0.9482,0.865,0.9102,0.9232,0.9143
Weighted-f1,0.81,0.8398,0.791,0.8001,0.8836,0.8376,0.8269,0.829,0.85,0.8392,0.8182,0.8038,0.8176,0.8561,0.806
Balanced-accuracy,0.8008,0.8672,0.832,0.8047,0.91,0.832,0.8242,0.8594,0.8633,0.8673,0.8516,0.7852,0.8398,0.8789,0.8325


Average ROC-AUC = 0.9114 (with std = 0.0279)
Average Weighted-f1 = 0.8273 (with std = 0.0238)
Average Balanced-accuracy = 0.8433 (with std = 0.0316)
CPU times: user 9min 33s, sys: 1min 36s, total: 11min 10s
Wall time: 4min 41s


### Experiment 6: Best features, RandomUnderSampler

In [17]:
%%time
predictions_dict[6] = run_experiment(feature_set=best_features, resampler=rus, n_trials=50)

---------------Hyperparameter tuning---------------
Best trial: 10 -> Best value (AUC): 0.92043
Best hyperparameters:
learning_rate        - 0.15000000000000002
l2_leaf_reg          - 3.1
depth                - 7
rsm                  - 0.5
max_ctr_complexity   - 7
random_strength      - 2.0455983252981476
bootstrap_type       - Bernoulli
subsample            - 0.9020836064760344
-----------------Cross-validation------------------


Fold,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
ROC-AUC,0.8595,0.9471,0.901,0.875,0.9437,0.9033,0.8821,0.928,0.9264,0.9223,0.9445,0.8683,0.9081,0.9081,0.9241
Weighted-f1,0.8153,0.8392,0.796,0.8055,0.872,0.8223,0.7985,0.8284,0.8284,0.8386,0.8176,0.8055,0.7954,0.8506,0.7903
Balanced-accuracy,0.8047,0.8555,0.8242,0.8086,0.891,0.832,0.7812,0.8477,0.8477,0.8561,0.8398,0.8086,0.8125,0.875,0.8431


Average ROC-AUC = 0.9094 (with std = 0.0272)
Average Weighted-f1 = 0.8202 (with std = 0.0223)
Average Balanced-accuracy = 0.8352 (with std = 0.0281)
CPU times: user 5min 27s, sys: 59.3 s, total: 6min 27s
Wall time: 3min 51s


* Tuning with Optuna squeezed out slightly better performance out of CatBoost, compared to our [results earlier](https://www.kaggle.com/code/stiwarids/autismprediction-gradient-boosting).
* CatBoost's internal imbalance handling using *scale_pos_weight* showed better results than external resampling.  
* We tried only random oversampling and random undersampling from imblearn package. There are more complex resampling methods which would require some preprocessing that we have avoided because of CatBoost (such as one-hot encoding, which is handled by CatBoost itself). Will try to improve the code design in future versions to incorporate other resampling methods.

# Generating submission files

In [18]:
for expmt, predictions in predictions_dict.items():
    sub = pd.DataFrame({
        'ID': test_index,
        'Class/ASD': predictions
    })
    sub.to_csv(f'sub_{expmt}.csv', index=False)

In [19]:
!head sub_2.csv

ID,Class/ASD
1,0.5491343165740652
2,0.11499922071061432
3,0.5132124021092852
4,0.08211719054398459
5,0.15903607732518024
6,0.08132252532094235
7,0.6518810850408253
8,0.299415659799436
9,0.09489812655278837


**Would appreciate any feedback about the content as well as the presentation. Thank you!**