# Setup

In [1]:
import os
import gc
gc.enable()
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import scipy.stats as st
import pandas as pd
pd.set_option('precision', 4)
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_selector, ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, f1_score

from imblearn.over_sampling import RandomOverSampler, SMOTENC
from imblearn.under_sampling import RandomUnderSampler, NearMiss, OneSidedSelection
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import EasyEnsembleClassifier, BalancedRandomForestClassifier
from imblearn.pipeline import Pipeline

SEED = 23
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [2]:
train = pd.read_csv('../input/autismdiagnosis/Autism_Prediction/train.csv')
test = pd.read_csv('../input/autismdiagnosis/Autism_Prediction/test.csv')

# Data Preprocessing

We explored the data in the **[EDA notebook](https://www.kaggle.com/code/stiwarids/autismprediction-eda)** and processed it along the way to prepare it for some baseline modeling. Here we will condense all those preprocessing steps into a single function.

In [3]:
#selected features from EDA
selected_features = ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 
                     'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 
                     'A9_Score', 'A10_Score', 'AQ10_6_or_above', 
                     'age_group', 'autism_family', 'is_White_European', 
                     'jaundice', 'relation', 'result', 'age', 'AQ10_sum']

best_features = [f for f in selected_features 
                 if f not in ('age', 'age_group', 'relation', 'jaundice')]

num_features = ['AQ10_sum', 'result', 'age']
cat_features = [f for f in selected_features if f not in num_features]
cat_idx = list(range(16)) #will be required for resamplers

In [4]:
def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare the data in accordance with the process described 
    in EDA notebook and return the modified dataframe.
    
    """
    #creating features - AQ10_sum, AQ10_6_or_above
    aq_scores = [f'A{i}_Score' for i in range(1, 11)]
    df['AQ10_sum'] = df[aq_scores].sum(axis=1)
    df['AQ10_6_or_above'] = (df['AQ10_sum'] > 5).astype('int')
    
    #creating feature - age_group
    def create_age_group(age):
        """Determine age group and return an integer indicating the category."""
        if age < 13: return 0 #child
        elif age < 21: return 1 #adolescent
        elif age < 40: return 2 #adult
        elif age < 60: return 3 #middle-aged
        else: return 4 #elderly
        
    df['age_group'] = df['age'].apply(create_age_group)
    
    #creating feature - is_White_European
    df['is_White_European'] = (df['ethnicity'] == 'White-European').astype(int)
    
    #reclassifying column - relation
    relation_mapping = {
        'Self': 2, 
        'Parent': 1, 'Relative': 1, 
        'Others': 0, '?': 0, 'Health care professional': 0
    }
    df['relation'] = df['relation'].replace(to_replace=relation_mapping)
    
    #correcting column name - austim -> autism_family
    df.rename({'austim': 'autism_family'}, axis=1, inplace=True)
    
    #integer encoding for 'yes'/'no' categorical columns - jaundice, autism_family
    df[['jaundice', 'autism_family']] = df[['jaundice', 'autism_family']].replace(to_replace={'no': 0, 'yes': 1})
    
    df = df[selected_features] #retaining only the selected features
    df[cat_features] = df[cat_features].astype('category') #for modeling pipeline
    return df

In [5]:
test_index = test.ID #for generating submission file
target = train['Class/ASD']

train = preprocess_dataset(train)
test = preprocess_dataset(test)

gc.collect()

4

In [6]:
train.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,AQ10_6_or_above,age_group,autism_family,is_White_European,jaundice,relation,result,age,AQ10_sum
0,1,0,1,0,1,0,1,0,1,1,1,2,0,0,0,2,6.3512,38.1727,6
1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,2,2.2552,47.7505,0
2,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,2,14.8515,7.3804,10
3,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,2,2.2766,23.5619,0
4,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,2,-4.7773,43.2058,0


# Resampling + Modeling pipeline

**Note: We have used *imblearn.pipeline.Pipeline* instead of *sklearn.pipeline.Pipeline* to include resampling steps.**

In [7]:
def create_pipeline(resampler) -> Pipeline:
    """Create pipeline using predefined preprocessing and modeling
    strategy along with the resampling method provided.
    """
    categorical_pipeline = Pipeline([
        ('onehot', OneHotEncoder(drop='if_binary', handle_unknown='ignore'))
    ])
    numerical_pipeline = Pipeline([
        ('standardize', RobustScaler()),
        ('normalize', MinMaxScaler())
    ])
    preprocessor = ColumnTransformer([
        ('cat_preprocessor', categorical_pipeline, make_column_selector(dtype_include='category')),
        ('num_preprocessor', numerical_pipeline, make_column_selector(dtype_include='number'))
    ])
    model = Pipeline([
        ('preprocessing', preprocessor),
        ('resampling', resampler),
        ('modeling', LogisticRegression(class_weight='balanced', 
                                        random_state=SEED, 
                                        solver='liblinear', 
                                        max_iter=500))
    ])
    
    return model

# Model evaluation

In [41]:
def evaluate_model(
        model: Pipeline, 
        features: list, 
        verbose: bool = True) -> list:
    """Cross-validate the model and return the predicted probabilities on test-set."""
    
    probs_test = [] #test set predicted probabilities for all folds
    preds_test = [] #test set predicted classes for all folds
    scores_auc = [] #validation set AUC scores for all folds
    scores_balacc = [] #validation set balanced accuracy scores for all folds
    scores_f1 = [] #validation set weighted-f1 scores for all folds
    
    N_SPLITS = 5
    cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(train, target)):
        xtrain, ytrain = train.loc[train_idx], target.iloc[train_idx]
        xval, yval = train.loc[val_idx], target.iloc[val_idx]
        
        model.fit(xtrain[features], ytrain)
        probs_val = model.predict_proba(xval[features])[:, 1]
        preds_val = model.predict(xval[features])
        probs_test.append(model.predict_proba(test[features])[:, 1])
        preds_test.append(model.predict(test[features]))
        
        auc = roc_auc_score(yval, probs_val)
        scores_auc.append(auc)
        balacc = balanced_accuracy_score(yval, preds_val)
        scores_balacc.append(balacc)
        f1 = f1_score(yval, preds_val, average='weighted')
        scores_f1.append(f1)
    
    if verbose:
        scores_df = pd.DataFrame.from_dict({
            'ROC-AUC': scores_auc,
            'Balanced-accuracy':scores_balacc,
            'Weighted-f1': scores_f1
        })
        scores_df.index.name = 'Fold'
        display(scores_df)
    
    print(f'Average: ROC-AUC = {np.mean(scores_auc):.4f}, ' \
          f'Balanced-accuracy = {np.mean(scores_balacc):.4f}, ' \
          f'Weighted-f1 = {np.mean(scores_f1):.4f}')
    
    preds_test = st.mode(np.column_stack(preds_test), axis=1).mode
    probs_test = np.mean(np.column_stack(probs_test), axis=1)
    return probs_test #only probabilities are needed for the competition metric

In [42]:
predictions_dict = {}

### Model 1: RandomOverSampler + Selected features 

In [43]:
ros = RandomOverSampler(random_state=SEED)
_, y = ros.fit_resample(train, target)
print(f'Before resampling: {target.value_counts().to_dict()}')
print(f'After resampling: {y.value_counts().to_dict()}')

Before resampling: {0: 639, 1: 161}
After resampling: {0: 639, 1: 639}


In [44]:
predictions_dict[1] = evaluate_model(create_pipeline(ros), selected_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8357,0.7734,0.7879
1,0.9397,0.8672,0.8554
2,0.9128,0.8477,0.8284
3,0.8855,0.793,0.7994
4,0.9358,0.9022,0.8726


Average: ROC-AUC = 0.9019, Balanced-accuracy = 0.8367, Weighted-f1 = 0.8287


### Model 2: RandomOverSampler + Best features

In [45]:
predictions_dict[2] = evaluate_model(create_pipeline(ros), best_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8438,0.793,0.7994
1,0.9424,0.8711,0.8452
2,0.9153,0.8516,0.8338
3,0.8855,0.8125,0.7954
4,0.9475,0.891,0.872


Average: ROC-AUC = 0.9069, Balanced-accuracy = 0.8438, Weighted-f1 = 0.8292


### Model 3: SMOTENC + Selected features

In [46]:
smotenc_sel = SMOTENC(
    categorical_features=[f in cat_features for f in selected_features], #boolean mask 
    random_state=SEED)

_, y = smotenc_sel.fit_resample(train, target)
print(f'Before resampling: {target.value_counts().to_dict()}')
print(f'After resampling: {y.value_counts().to_dict()}')

Before resampling: {0: 639, 1: 161}
After resampling: {0: 639, 1: 639}


In [47]:
predictions_dict[3] = evaluate_model(create_pipeline(smotenc_sel), selected_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8467,0.7461,0.7952
1,0.9419,0.8594,0.8754
2,0.9058,0.8203,0.8367
3,0.8872,0.7812,0.8133
4,0.9322,0.8764,0.8815


Average: ROC-AUC = 0.9028, Balanced-accuracy = 0.8167, Weighted-f1 = 0.8404


### Model 4: SMOTENC + Best features

In [48]:
smotenc_best = SMOTENC(
    categorical_features=[f in cat_features for f in best_features], #boolean mask 
    random_state=SEED)

_, y = smotenc_best.fit_resample(train, target)
print(f'Before resampling: {target.value_counts().to_dict()}')
print(f'After resampling: {y.value_counts().to_dict()}')

Before resampling: {0: 639, 1: 161}
After resampling: {0: 639, 1: 639}


In [49]:
predictions_dict[4] = evaluate_model(create_pipeline(smotenc_best), best_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8423,0.7969,0.8047
1,0.9473,0.8789,0.8717
2,0.9099,0.8398,0.833
3,0.8899,0.7617,0.787
4,0.9477,0.8679,0.8549


Average: ROC-AUC = 0.9074, Balanced-accuracy = 0.8291, Weighted-f1 = 0.8303


### Model 5: RandomUnderSampler + Selected features

In [50]:
rus = RandomUnderSampler(random_state=SEED)
_, y = rus.fit_resample(train, target)
print(f'Before resampling: {target.value_counts().to_dict()}')
print(f'After resampling: {y.value_counts().to_dict()}')

Before resampling: {0: 639, 1: 161}
After resampling: {0: 161, 1: 161}


In [51]:
predictions_dict[5] = evaluate_model(create_pipeline(rus), selected_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8342,0.7812,0.7985
1,0.9414,0.8789,0.8561
2,0.906,0.832,0.8068
3,0.8782,0.8125,0.7954
4,0.917,0.8864,0.8507


Average: ROC-AUC = 0.8954, Balanced-accuracy = 0.8382, Weighted-f1 = 0.8215


### Model 6: RandomUnderSampler + Best features

In [52]:
predictions_dict[6] = evaluate_model(create_pipeline(rus), best_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8174,0.7891,0.8091
1,0.9468,0.8828,0.8458
2,0.9146,0.8359,0.8122
3,0.8884,0.8125,0.7954
4,0.9363,0.8673,0.8392


Average: ROC-AUC = 0.9007, Balanced-accuracy = 0.8375, Weighted-f1 = 0.8203


### Model 7: NearMiss + Selected features

In [53]:
nm = NearMiss()
_, y = nm.fit_resample(train, target)
print(f'Before resampling: {target.value_counts().to_dict()}')
print(f'After resampling: {y.value_counts().to_dict()}')

Before resampling: {0: 639, 1: 161}
After resampling: {0: 161, 1: 161}


In [54]:
predictions_dict[7] = evaluate_model(create_pipeline(nm), selected_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.7739,0.7539,0.8056
1,0.8977,0.7812,0.8425
2,0.7917,0.7227,0.764
3,0.8896,0.7969,0.8197
4,0.7993,0.7567,0.7756


Average: ROC-AUC = 0.8305, Balanced-accuracy = 0.7623, Weighted-f1 = 0.8015


### Model 8: NearMiss + Best features

In [55]:
predictions_dict[8] = evaluate_model(create_pipeline(nm), best_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.7678,0.7148,0.7959
1,0.9026,0.832,0.8679
2,0.8025,0.75,0.8004
3,0.8804,0.7891,0.8091
4,0.8125,0.7652,0.8011


Average: ROC-AUC = 0.8331, Balanced-accuracy = 0.7702, Weighted-f1 = 0.8149


### Model 9: OneSidedSelection + Selected features

In [56]:
oss = OneSidedSelection(random_state=SEED)
_, y = oss.fit_resample(train, target)
print(f'Before resampling: {target.value_counts().to_dict()}')
print(f'After resampling: {y.value_counts().to_dict()}')

Before resampling: {0: 639, 1: 161}
After resampling: {0: 597, 1: 161}


In [57]:
predictions_dict[9] = evaluate_model(create_pipeline(oss), selected_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8369,0.7812,0.7985
1,0.9429,0.875,0.8662
2,0.9065,0.8516,0.8182
3,0.8928,0.8164,0.8008
4,0.9358,0.8791,0.8556


Average: ROC-AUC = 0.9030, Balanced-accuracy = 0.8407, Weighted-f1 = 0.8278


### Model 10: OneSidedSelection + Best features

In [58]:
predictions_dict[10] = evaluate_model(create_pipeline(oss), best_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8352,0.7891,0.8091
1,0.9358,0.8906,0.8881
2,0.9155,0.8398,0.8176
3,0.8977,0.8086,0.79
4,0.9468,0.8791,0.8556


Average: ROC-AUC = 0.9062, Balanced-accuracy = 0.8415, Weighted-f1 = 0.8321


### Model 11: SMOTEENN + Selected features

In [59]:
smoteenn_sel = SMOTEENN(random_state=SEED, smote=smotenc_sel)
_, y = smoteenn_sel.fit_resample(train, target)
print(f'Before resampling: {target.value_counts().to_dict()}')
print(f'After resampling: {y.value_counts().to_dict()}')

Before resampling: {0: 639, 1: 161}
After resampling: {1: 550, 0: 470}


In [60]:
predictions_dict[11] = evaluate_model(create_pipeline(smoteenn_sel), selected_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8621,0.8125,0.8108
1,0.946,0.8555,0.8392
2,0.8901,0.8281,0.8169
3,0.8855,0.8008,0.7792
4,0.9337,0.8791,0.8556


Average: ROC-AUC = 0.9035, Balanced-accuracy = 0.8352, Weighted-f1 = 0.8203


### Model 12: SMOTEENN + Best features

In [61]:
smoteenn_best = SMOTEENN(random_state=SEED, smote=smotenc_best)
_, y = smoteenn_best.fit_resample(train, target)
print(f'Before resampling: {target.value_counts().to_dict()}')
print(f'After resampling: {y.value_counts().to_dict()}')

Before resampling: {0: 639, 1: 161}
After resampling: {1: 569, 0: 478}


In [62]:
predictions_dict[12] = evaluate_model(create_pipeline(smoteenn_best), best_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.825,0.7891,0.7787
1,0.9497,0.8828,0.8615
2,0.9199,0.8398,0.8176
3,0.8921,0.8125,0.7954
4,0.9437,0.8904,0.8562


Average: ROC-AUC = 0.9061, Balanced-accuracy = 0.8429, Weighted-f1 = 0.8219


### Model 13: SMOTETomek + Selected features

In [63]:
smotetomek_sel = SMOTETomek(random_state=SEED, smote=smotenc_sel)
_, y = smotetomek_sel.fit_resample(train, target)
print(f'Before resampling: {target.value_counts().to_dict()}')
print(f'After resampling: {y.value_counts().to_dict()}')

Before resampling: {0: 639, 1: 161}
After resampling: {0: 626, 1: 626}


In [64]:
predictions_dict[13] = evaluate_model(create_pipeline(smotetomek_sel), selected_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8455,0.7422,0.79
1,0.9436,0.8555,0.87
2,0.9033,0.8203,0.8367
3,0.8884,0.7852,0.8186
4,0.9315,0.8764,0.8815


Average: ROC-AUC = 0.9025, Balanced-accuracy = 0.8159, Weighted-f1 = 0.8394


### Model 14: SMOTETomek + Best features

In [65]:
smotetomek_best = SMOTETomek(random_state=SEED, smote=smotenc_best)
_, y = smotetomek_best.fit_resample(train, target)
print(f'Before resampling: {target.value_counts().to_dict()}')
print(f'After resampling: {y.value_counts().to_dict()}')

Before resampling: {0: 639, 1: 161}
After resampling: {0: 628, 1: 628}


In [66]:
predictions_dict[14] = evaluate_model(create_pipeline(smotetomek_best), best_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8452,0.7969,0.8047
1,0.9495,0.875,0.8818
2,0.9097,0.832,0.8223
3,0.8926,0.7617,0.787
4,0.9439,0.864,0.8495


Average: ROC-AUC = 0.9082, Balanced-accuracy = 0.8259, Weighted-f1 = 0.8290


# Ensemble methods

### Model 15: BalancedRandomForestClassifier + Selected features

In [67]:
predictions_dict[15] = evaluate_model(
    model=BalancedRandomForestClassifier(
        n_estimators=100, 
        class_weight='balanced_subsample', 
        random_state=SEED),
    features=selected_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.875,0.7969,0.8047
1,0.9508,0.8672,0.8554
2,0.8811,0.8438,0.823
3,0.8809,0.8164,0.8162
4,0.955,0.8679,0.8549


Average: ROC-AUC = 0.9086, Balanced-accuracy = 0.8384, Weighted-f1 = 0.8308


### Model 16: BalancedRandomForestClassifier + Best features

In [68]:
predictions_dict[16] = evaluate_model(
    model=BalancedRandomForestClassifier(
        n_estimators=100, 
        class_weight='balanced_subsample', 
        random_state=SEED),
    features=best_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8562,0.8008,0.81
1,0.9347,0.8633,0.85
2,0.8821,0.8516,0.8182
3,0.8616,0.8281,0.8169
4,0.9443,0.891,0.872


Average: ROC-AUC = 0.8958, Balanced-accuracy = 0.8469, Weighted-f1 = 0.8334


### Model 17: EasyEnsembleClassifier + Selected features

In [69]:
predictions_dict[17] = evaluate_model(
    model=EasyEnsembleClassifier(
        n_estimators=50,
        random_state=SEED),
    features=selected_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8418,0.7734,0.7727
1,0.9192,0.8477,0.8284
2,0.9001,0.8086,0.79
3,0.876,0.8164,0.8008
4,0.9167,0.8797,0.8713


Average: ROC-AUC = 0.8908, Balanced-accuracy = 0.8252, Weighted-f1 = 0.8126


### Model 18: EasyEnsembleClassifier + Best features

In [70]:
predictions_dict[18] = evaluate_model(
    model=EasyEnsembleClassifier(
        n_estimators=50,
        random_state=SEED),
    features=best_features)

Unnamed: 0_level_0,ROC-AUC,Balanced-accuracy,Weighted-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8259,0.7656,0.7773
1,0.9126,0.8633,0.8344
2,0.8945,0.8125,0.7954
3,0.8721,0.8164,0.7851
4,0.9194,0.8719,0.8604


Average: ROC-AUC = 0.8849, Balanced-accuracy = 0.8259, Weighted-f1 = 0.8105


# Summary

* We tried several resampling methods from the imblearn package, as part of the modelling pipeline and also as standalone classifiers. There was improvement compared to the performance we obtained from a simple Logistic Regression model.  
* OneSidedSelection, SMOTENC, SMOTEENN and RandomOverSampler were the resampling methods that performed well according to all the metrics.
* Among the ensemble methods, BalancedRandomForestClassifier performed better, but there is scope for further tuning.
* We also set up a model evaluation framework which can be used to cross-validate and compare models easily.

# Generating submissions

In [71]:
for i, predictions in predictions_dict.items():
    sub = pd.DataFrame({
        'ID': test_index,
        'Class/ASD': predictions})
    sub.to_csv(f'sub_{i}.csv', index=False)

In [72]:
!head sub_12.csv

ID,Class/ASD
1,0.43239865107493713
2,0.0016006409416714571
3,0.8490235783695544
4,0.004524598820930283
5,0.024717980451027122
6,0.008433309759310989
7,0.870881067591438
8,0.27846418153582
9,0.0033503111936771398
