# Setup

In [1]:
import os
import gc
import warnings
gc.enable()
warnings.filterwarnings('ignore')

import numpy as np
import scipy.stats as st
import pandas as pd
#use the fully-qualified option key: the short 'precision' pattern is ambiguous
#in recent pandas releases (it also matches the styler precision option)
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', None)

import xgboost as xgb
import lightgbm as lgbm
import catboost as cb
from typing import Any
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, f1_score

#single seed shared by numpy, the CV splitter and every model in this notebook
SEED = 23
np.random.seed(SEED)
#NOTE(review): presumably PYTHONHASHSEED set at runtime only reaches child processes,
#not the already-running interpreter - confirm if hash determinism matters here
os.environ['PYTHONHASHSEED'] = f'{SEED}'

In [2]:
#delete cell if no problems with later versions
#(each message reports the installed version so that a mismatch is easy to diagnose)
assert xgb.__version__ == '1.6.1', f'Change in XGBoost version ({xgb.__version__}). Original notebook version: 1.6.1'
assert lgbm.__version__ == '3.3.2', f'Change in LightGBM version ({lgbm.__version__}). Original notebook version: 3.3.2'
assert cb.__version__ == '1.0.6', f'Change in CatBoost version ({cb.__version__}). Original notebook version: 1.0.6'

In [3]:
#load the raw competition files (train first, then test)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/autismdiagnosis/Autism_Prediction/train.csv',
        '../input/autismdiagnosis/Autism_Prediction/test.csv',
    )
)

# Preprocessing

(based on [EDA](https://www.kaggle.com/code/stiwarids/autismprediction-eda) and [Resampling experiments](https://www.kaggle.com/code/stiwarids/autismprediction-resampling-experiments)) 

In [4]:
num_features = ['result', 'age', 'AQ10_sum'] #continuous features
cat_features = [f'A{i}_Score' for i in range(1, 11)] + [
    'AQ10_6_or_above', 'age_group', 'autism_family', 'is_White_European', 
    'jaundice', 'relation'] #categorical features
features = cat_features + num_features #all model inputs: categorical columns first, continuous last
cat_mask = [name in cat_features for name in features] #boolean mask: is categorical feature or not

#reduced feature set: everything except the columns listed in dropped_features
dropped_features = ('age', 'age_group', 'jaundice', 'relation')
best_features = [name for name in features if name not in dropped_features]
best_cat_features = [name for name in best_features if name in cat_features]

In [5]:
def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare the data in accordance with the process described 
    in EDA notebook and return the modified dataframe.
    
    The input frame is not modified: every derived column is added to a copy.
    
    Parameters
    ----------
    df : raw dataframe. The code reads the columns ``A1_Score`` ... ``A10_Score``,
        ``age``, ``ethnicity``, ``relation``, ``jaundice`` and ``austim``, plus any
        other column named in the notebook-level ``features`` list.
    
    Returns
    -------
    A new dataframe containing only the columns in ``features`` (in that order),
    with the columns in ``cat_features`` cast to the pandas 'category' dtype.
    """
    df = df.copy() #work on a copy so that the caller's frame is never mutated
    
    #creating features - AQ10_sum, AQ10_6_or_above
    aq_scores = [f'A{i}_Score' for i in range(1, 11)]
    df['AQ10_sum'] = df[aq_scores].sum(axis=1)
    df['AQ10_6_or_above'] = (df['AQ10_sum'] > 5).astype('int')
    
    #creating feature - age_group (integer code per age band)
    #conditions are checked in order and the first match wins; anything that is
    #not below 60 (including a missing age) falls through to the default
    age = df['age']
    df['age_group'] = np.select(
        [age < 13, age < 21, age < 40, age < 60],
        [0, 1, 2, 3], #child, adolescent, adult, middle-aged
        default=4) #elderly
    
    #creating feature - is_White_European
    df['is_White_European'] = (df['ethnicity'] == 'White-European').astype(int)
    
    #reclassifying column - relation (values missing from the mapping are left as they are)
    relation_mapping = {
        'Self': 2, 
        'Parent': 1, 'Relative': 1, 
        'Others': 0, '?': 0, 'Health care professional': 0
    }
    df['relation'] = df['relation'].replace(to_replace=relation_mapping)
    
    #correcting column name - austim -> autism_family
    df = df.rename(columns={'austim': 'autism_family'})
    
    #integer encoding for 'yes'/'no' categorical columns - jaundice, autism_family
    yes_no_columns = ['jaundice', 'autism_family']
    df[yes_no_columns] = df[yes_no_columns].replace(to_replace={'no': 0, 'yes': 1})
    
    #retaining only the selected features and casting the categorical ones for the
    #modeling pipeline; astype returns a new frame, so nothing is assigned into a slice
    #NOTE(review): categories are inferred from the values present in this frame only,
    #so train and test may end up with different category sets - confirm this is acceptable
    return df[features].astype({col: 'category' for col in cat_features})


In [6]:
#keep the columns that preprocessing drops: test IDs (for generating submission file) and the target
#NOTE: train/test are overwritten below, so re-running this cell needs the loading cell to be re-run first
test_index, target = test['ID'], train['Class/ASD']

train, test = preprocess_dataset(train), preprocess_dataset(test)

gc.collect()

5

# Modeling

In [7]:
def evaluate_model(
        model: Any, 
        features: list, 
        fit_params: dict = None, 
        verbose: bool = True) -> np.ndarray:
    """Cross-validate the model and return the predicted probabilities for test-set.
    
    Uses the notebook-level ``train``, ``target`` and ``test`` objects and ``SEED``.
    The model is refit on every fold of a 5-fold stratified CV repeated 3 times,
    scored on the held-out fold, and used to predict the test set.
    
    Parameters
    ----------
    model : estimator exposing ``fit(X, y, eval_set=..., verbose=..., **fit_params)``,
        ``predict`` and ``predict_proba``.
    features : names of the columns of ``train``/``test`` to feed to the model.
    fit_params : extra keyword arguments forwarded to ``model.fit`` (default: none).
    verbose : if True, display the per-fold score table. The average scores
        are printed regardless of this flag.
    
    Returns
    -------
    1-D array with the positive-class probability for every test row,
    averaged over all folds.
    
    Note: the held-out fold is also passed to ``fit`` as ``eval_set``; if the model
    early-stops on it, the reported validation scores are likely optimistic.
    """
    fit_params = {} if fit_params is None else fit_params #avoid a shared mutable default
    
    probs_test = [] #test set predicted probabilities for all folds
    scores_auc = [] #validation set AUC scores for all folds
    scores_balacc = [] #validation set balanced accuracy scores for all folds
    scores_f1 = [] #validation set weighted-f1 scores for all folds
    #(class predictions for the test set were computed but never used, so they are no longer generated)
        
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=SEED)
    for train_idx, val_idx in cv.split(train, target):
        #cv.split yields positional indices, hence .iloc for both features and target
        xtrain, ytrain = train.iloc[train_idx][features], target.iloc[train_idx]
        xval, yval = train.iloc[val_idx][features], target.iloc[val_idx]
        
        model.fit(
            xtrain, ytrain,
            eval_set=[(xval, yval)],
            verbose=False,
            **fit_params)
        
        probs_val = model.predict_proba(xval)[:, 1]
        preds_val = model.predict(xval)
        probs_test.append(model.predict_proba(test[features])[:, 1])
        
        scores_auc.append(roc_auc_score(yval, probs_val))
        scores_balacc.append(balanced_accuracy_score(yval, preds_val))
        scores_f1.append(f1_score(yval, preds_val, average='weighted'))
    
    if verbose:
        scores_df = pd.DataFrame.from_dict({
            'ROC-AUC': scores_auc,
            'Balanced-accuracy':scores_balacc,
            'Weighted-f1': scores_f1
        })
        scores_df.index.name = 'Fold'
        display(scores_df)
    
    print(f'Average ROC-AUC = {np.mean(scores_auc):.4f} (with std = {np.std(scores_auc):.4f})')
    print(f'Average Balanced-accuracy = {np.mean(scores_balacc):.4f} (with std = {np.std(scores_balacc):.4f})')
    print(f'Average Weighted-f1 = {np.mean(scores_f1):.4f} (with std = {np.std(scores_f1):.4f})')
    
    #one column per fold -> average across folds for every test row
    #only probabilities are needed for the competition metric
    return np.mean(np.column_stack(probs_test), axis=1)

In [8]:
predictions_dict = dict() #run name -> fold-averaged test-set probabilities

### XGBoost

In [9]:
#target class imbalance: ratio of negative to positive samples
n_negative, n_positive = (target == 0).sum(), (target == 1).sum()
class_ratio = n_negative / n_positive

In [10]:
#hyperparameters are gathered in one mapping and unpacked into the constructor
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=4,
    objective='binary:logistic',
    tree_method='gpu_hist',
    scale_pos_weight=class_ratio, #negative/positive ratio of the target
    enable_categorical=True,
    max_cat_to_onehot=2,
    eval_metric='auc',
    early_stopping_rounds=25,
    random_state=SEED,
)
xgb_model = xgb.XGBClassifier(**xgb_params)

In [11]:
#cross-validate on the full feature list and keep the fold-averaged test probabilities
predictions_dict['xgb_all'] = evaluate_model(xgb_model, features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8528,0.7734,0.8028
1,0.9392,0.875,0.8662
2,0.9056,0.8438,0.8073
3,0.8839,0.8008,0.81
4,0.9538,0.8797,0.8713
5,0.9001,0.8594,0.829
6,0.8727,0.7695,0.8266
7,0.9374,0.8555,0.8236
8,0.9326,0.8555,0.8546
9,0.9089,0.8298,0.8317


Average ROC-AUC = 0.9084 (with std = 0.0294)
Average Balanced-accuracy = 0.8344 (with std = 0.0328)
Average Weighted-f1 = 0.8331 (with std = 0.0229)


In [12]:
#same model object, refit inside evaluate_model on the reduced best_features list
predictions_dict['xgb_best'] = evaluate_model(xgb_model, best_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8527,0.793,0.7994
1,0.9462,0.8828,0.8771
2,0.9119,0.8359,0.8277
3,0.8651,0.8086,0.79
4,0.9457,0.8646,0.865
5,0.8899,0.8398,0.8176
6,0.8774,0.8164,0.8313
7,0.9414,0.8789,0.8404
8,0.9225,0.8594,0.8446
9,0.9005,0.8331,0.8216


Average ROC-AUC = 0.9050 (with std = 0.0300)
Average Balanced-accuracy = 0.8405 (with std = 0.0276)
Average Weighted-f1 = 0.8295 (with std = 0.0238)


### LightGBM

In [13]:
#hyperparameters are gathered in one mapping and unpacked into the constructor
#NOTE(review): the evaluation cells pass eval_metric='auc' to fit(); LightGBM presumably
#also tracks the objective's default metric and early-stops on any tracked metric unless
#first_metric_only=True - worth checking against the folds with 0.5 balanced accuracy
lgbm_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=4,
    objective='binary',
    scale_pos_weight=class_ratio, #negative/positive ratio of the target
    random_state=SEED,
    device_type='gpu',
    early_stopping_round=25,
    max_cat_to_onehot=2,
    verbosity=-1,
)
lgbm_model = lgbm.LGBMClassifier(**lgbm_params)

In [14]:
#eval_metric is forwarded to the model's fit() call through fit_params
predictions_dict['lgbm_all'] = evaluate_model(lgbm_model, features, {'eval_metric': 'auc'})

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8849,0.7891,0.8386
1,0.9408,0.8555,0.8546
2,0.8998,0.7812,0.8133
3,0.8765,0.7383,0.7991
4,0.9433,0.5,0.7025
5,0.8802,0.7578,0.7964
6,0.8619,0.7344,0.8355
7,0.9342,0.5,0.7111
8,0.9276,0.8125,0.8559
9,0.9247,0.7867,0.8719


Average ROC-AUC = 0.9070 (with std = 0.0259)
Average Balanced-accuracy = 0.7019 (with std = 0.1324)
Average Weighted-f1 = 0.7995 (with std = 0.0602)


In [15]:
#same model object and fit parameters, evaluated on the reduced best_features list
predictions_dict['lgbm_best'] = evaluate_model(lgbm_model, best_features, {'eval_metric': 'auc'})

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.871,0.6836,0.8203
1,0.9395,0.8711,0.8608
2,0.9,0.793,0.8144
3,0.8677,0.7422,0.79
4,0.9414,0.8652,0.8807
5,0.8722,0.7539,0.8199
6,0.8636,0.832,0.8528
7,0.933,0.5,0.7111
8,0.9128,0.8242,0.8571
9,0.9127,0.8528,0.8487


Average ROC-AUC = 0.9033 (with std = 0.0253)
Average Balanced-accuracy = 0.7395 (with std = 0.1287)
Average Weighted-f1 = 0.8152 (with std = 0.0564)


### CatBoost

In [16]:
#hyperparameters are gathered in one mapping and unpacked into the constructor
cb_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=4,
    boosting_type='Ordered',
    eval_metric='Logloss',
    scale_pos_weight=class_ratio, #negative/positive ratio of the target
    one_hot_max_size=2,
    use_best_model=True,
    task_type='GPU',
    verbose=False,
    early_stopping_rounds=25,
    random_state=SEED,
)
cb_model = cb.CatBoostClassifier(**cb_params)

In [17]:
#the categorical column names are forwarded to the model's fit() call through fit_params
predictions_dict['cb_all'] = evaluate_model(cb_model, features, {'cat_features': cat_features})

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8599,0.793,0.7994
1,0.9395,0.8672,0.8554
2,0.9076,0.8477,0.8284
3,0.8904,0.8047,0.8001
4,0.9435,0.8685,0.8705
5,0.8972,0.8516,0.8338
6,0.8776,0.8086,0.8055
7,0.9236,0.8594,0.8446
8,0.9274,0.8594,0.8446
9,0.9179,0.8601,0.844


Average ROC-AUC = 0.9082 (with std = 0.0257)
Average Balanced-accuracy = 0.8439 (with std = 0.0240)
Average Weighted-f1 = 0.8343 (with std = 0.0221)


In [18]:
#best_cat_features: the categorical columns that are also part of best_features
predictions_dict['cb_best'] = evaluate_model(cb_model, best_features, {'cat_features': best_cat_features})

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8661,0.8047,0.8153
1,0.9448,0.8867,0.8669
2,0.9059,0.8555,0.8236
3,0.8818,0.7969,0.7894
4,0.9485,0.8797,0.8713
5,0.8827,0.8594,0.829
6,0.8922,0.8125,0.8108
7,0.9331,0.8672,0.8398
8,0.9225,0.8789,0.8561
9,0.9166,0.8601,0.844


Average ROC-AUC = 0.9100 (with std = 0.0246)
Average Balanced-accuracy = 0.8498 (with std = 0.0278)
Average Weighted-f1 = 0.8332 (with std = 0.0226)


* Performance is slightly better than that of the [resampling + logistic regression pipelines](https://www.kaggle.com/code/stiwarids/autismprediction-resampling-experiments).
* The *learning_rate* and *max_depth* are chosen manually. We can tune all the hyperparameters using GridSearch/Optuna/HyperOpt/etc.
* In terms of AUC, all three algorithms have performed well, but the balanced accuracy and weighted-f1 scores for LightGBM are noticeably lower (in some folds the balanced accuracy is exactly 0.5), which means it is not able to classify the minority class correctly in those folds. This may be because LightGBM is optimized for large-scale datasets while our dataset is very small. It might also be because the manually chosen hyperparameters for LightGBM do not suit this dataset, in which case some hyperparameter tuning might solve the problem.
* **The notable hyperparameter is *scale_pos_weight* which makes the GBMs internally compensate for the class imbalance. We can see the effect of this hyperparameter by removing it and checking the metrics:**

In [19]:
#ablation model: no scale_pos_weight is passed to the constructor
xgb_no_scale_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=4,
    objective='binary:logistic',
    tree_method='gpu_hist',
    enable_categorical=True,
    max_cat_to_onehot=2,
    eval_metric='auc',
    early_stopping_rounds=25,
    random_state=SEED,
)
xgb_model_no_scale = xgb.XGBClassifier(**xgb_no_scale_params)

In [20]:
#ablation run on the full feature list, using the model built without scale_pos_weight
predictions_dict['xgb_all_no_scale'] = evaluate_model(xgb_model_no_scale, features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8563,0.7344,0.8355
1,0.9373,0.8164,0.8918
2,0.9038,0.7266,0.8386
3,0.8838,0.7539,0.834
4,0.9507,0.7985,0.8889
5,0.8995,0.8008,0.8695
6,0.8737,0.7109,0.831
7,0.9285,0.8281,0.8777
8,0.9243,0.7734,0.8608
9,0.9114,0.7867,0.8719


Average ROC-AUC = 0.9072 (with std = 0.0270)
Average Balanced-accuracy = 0.7733 (with std = 0.0361)
Average Weighted-f1 = 0.8618 (with std = 0.0215)


**AUC is almost the same (0.9072 vs 0.9084), but balanced accuracy has fallen (0.7733 vs 0.8344) because minority class samples are not being classified correctly anymore.**

# Generating submissions

In [21]:
#one submission file per evaluated run: test IDs next to the averaged probabilities
for model_name, predictions in predictions_dict.items():
    sub_path = f'sub_{model_name}.csv'
    sub = pd.DataFrame({'ID': test_index, 'Class/ASD': predictions})
    sub.to_csv(sub_path, index=False)

In [22]:
#peek at the first lines of one generated submission file to check its format
!head sub_xgb_all.csv

ID,Class/ASD
1,0.59519756
2,0.21645269
3,0.5770506
4,0.211017
5,0.21696441
6,0.22354202
7,0.58546287
8,0.30835214
9,0.21050224


**Would appreciate any feedback about the content as well as the presentation. Thank you!**