In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, precision_score
from xgboost import XGBClassifier

# Notebook-wide settings used by the modelling cells.

# Reproducibility: SEED drives the CV splitter and both classifiers;
# the hold-out train/test split has its own seed.
SEED = 1970
test_train_split_SEED = 1970

# Cross-validation: number of stratified folds (10 is a previously noted alternative).
FOLDS = 5

# Reporting: print per-fold metrics, and pass verbosity 0 to XGBoost.
show_fold_stats = True
VERBOSE = 0

The dataset comes from the research paper by Zhou Dan et al., published on April 21, 2020: [Altered gut microbial profile is associated with abnormal metabolism activity of Autism Spectrum Disorder](https://www.tandfonline.com/doi/full/10.1080/19490976.2020.1747329)

From the Abstract:<br>
> Autism Spectrum Disorder (ASD) is a severe neurodevelopmental disorder. To enhance the understanding of the gut microbiota structure in ASD children at different ages as well as the relationship between gut microbiota and fecal metabolites, we first used the 16S rRNA sequencing to evaluate the gut microbial population in a cohort of 143 children aged 2–13 years old. We found that the α-diversity of ASD group showed no significant change with age, while the TD group showed increased α-diversity with age, which indicates that the compositional development of the gut microbiota in ASD varies at different ages in ways that are not consistent with TD group. Recent studies have shown that chronic constipation is one of the most commonly obvious gastrointestinal (GI) symptoms along with ASD core symptoms. To further investigate the potential interaction effects between ASD and GI symptoms, the 30 C-ASD and their aged-matched TD were picked out to perform metagenomics analysis. We observed that C-ASD group displayed decreased diversity, depletion of species of Sutterella, Prevotella, and Bacteroides as well as dysregulation of associated metabolism activities, which may involve in the pathogenesis of C-ASD. Consistent with metagenomic analysis, liquid chromatography-mass spectrometry (LC/MS) revealed some of the differential metabolites between C-ASD and TD group were involved in the metabolic network of neurotransmitters including serotonin, dopamine, histidine, and GABA. Furthermore, we found these differences in metabolites were associated with altered abundance of specific bacteria. The study suggested possible future modalities for ASD intervention through targeting the specific bacteria associated with neurotransmitter metabolism.

In [None]:
# Locations of the two input tables: the 16S rRNA OTU assignment/abundance
# table and the metagenomic ("meta") abundance table.
otu_abundance_csv = '../input/human-gut-microbiome-with-asd/GSE113690_Autism_16S_rRNA_OTU_assignment_and_abundance.csv'
meta_abundance_csv = '../input/human-gut-microbiome-with-asd/ASD meta abundance.csv'

# Read both tables into DataFrames with pandas' default CSV settings.
pd_abundance = pd.read_csv(otu_abundance_csv)
pd_meta_abundance = pd.read_csv(meta_abundance_csv)

In [None]:
# Split the OTU table into a taxonomy lookup (indexed by OTU id) and a count
# matrix that is transposed so each original column becomes a row and each
# OTU becomes a feature column.
taxa = pd_abundance[['OTU', 'taxonomy']].set_index('OTU')
pd_abundance_T = pd_abundance.drop('taxonomy', axis=1).set_index('OTU').transpose()

# Binary label derived from the row (sample) ID: 1 when it starts with 'A', else 0.
# NOTE(review): presumably IDs starting with 'A' are the ASD samples — confirm
# against the dataset description.
target = pd_abundance_T.index.to_list()
binary_target = np.array([1 if t.startswith('A') else 0 for t in target])

# Total count per sample (row sums of the transposed table).
total_species = pd_abundance_T.sum(axis=1)

# Retained so that any other cell referencing it keeps working.
# NOTE(review): presumably the expected per-sample read depth — confirm.
abs_abundance = 31757

# Relative abundance: divide every sample by its OWN total instead of a
# hard-coded constant, so each row sums to 1 whatever the sample's depth.
# A zero total is replaced by 1 so an all-zero sample stays all-zero rather
# than turning into NaN.
pd_rel_abundance = pd_abundance_T.div(total_species.replace(0, 1), axis=0)


First, we will try the 16S rRNA data, using both Random Forest (RF) and XGBoost classifiers.

In [None]:
# Hold out a small test set (5% of the samples) so that the different
# classifiers can be compared on data that takes no part in cross-validation.
(
    disease_train,
    disease_test,
    disease_y_train,
    disease_y_test,
) = train_test_split(
    pd_rel_abundance,
    binary_target,
    test_size=0.05,
    shuffle=True,
    random_state=test_train_split_SEED,
)

In [None]:
# Stratified K-fold cross-validation (FOLDS splits) on the 16S training split.
# For every fold: fit a RandomForest and an XGBoost model on the training part,
# report their metrics on the validation part, then score both models (and the
# average of their probabilities) on the hold-out test set.
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# Cut-off used to turn the averaged RF/XGBoost probability into a class label
# on the hold-out set: values below it become class 0, the rest class 1.
AVG_PRED_THRESHOLD = 0.7

for fold, (idxT, idxV) in enumerate(skf.split(disease_train, disease_y_train)):

    # Features are a DataFrame (positional .iloc); labels are a NumPy array.
    X_train = disease_train.iloc[idxT]
    X_val = disease_train.iloc[idxV]
    y_train = disease_y_train[idxT]
    y_val = disease_y_train[idxV]

    # --- Random Forest -----------------------------------------------------
    clf = RandomForestClassifier(n_estimators=500, random_state=SEED, verbose=0)
    clf.fit(X_train, y_train)

    RF_pred_class = clf.predict(X_val)
    RF_preds = clf.predict_proba(X_val)

    # Despite the "_test" suffix, these are computed on the validation fold.
    RF_AUC_test_score = roc_auc_score(y_val, RF_preds[:, 1])
    RF_f1_test = f1_score(y_val, RF_pred_class)
    RF_recall_test = recall_score(y_val, RF_pred_class)
    RF_precision_test = precision_score(y_val, RF_pred_class)

    if show_fold_stats:
        print('-' * 80)
        print('Fold : %s'%(fold+1))
        print('ROC AUC score for RandomForest model, validation set: %.4f'%RF_AUC_test_score)
        print('F1 : %.4f, Recall : %.4f , Precision : %.4f'%(RF_f1_test, RF_recall_test, RF_precision_test))
        print(confusion_matrix(y_val, RF_pred_class))

    # --- XGBoost -----------------------------------------------------------
    # `metric` is not an XGBoost parameter, so it is not passed here; the
    # evaluation metric is the `eval_metric` given to fit() below.
    XGB_model = XGBClassifier(n_estimators=5000, max_depth=None,
                        learning_rate=0.005,
                        objective='binary:logistic',
                        verbosity=VERBOSE,
                        # tree_method = 'gpu_hist',
                        use_label_encoder=False,
                        n_jobs=-1, random_state=SEED)

    # Early stopping watches logloss on this same validation fold, so the
    # XGBoost validation metrics reported below are likely optimistic.
    # NOTE(review): newer XGBoost releases expect eval_metric and
    # early_stopping_rounds on the constructor — confirm against the installed version.
    XGB_model.fit(X_train, y_train,
                    eval_set=[(X_val, y_val)],
                    eval_metric=['logloss'],
                    early_stopping_rounds=100, verbose=VERBOSE)

    XGB_preds = XGB_model.predict_proba(X_val)
    XGB_class = XGB_model.predict(X_val)

    XGB_score = roc_auc_score(y_val, XGB_preds[:, 1])
    XGB_f1 = f1_score(y_val, XGB_class)
    XGB_recall = recall_score(y_val, XGB_class)
    XGB_precision = precision_score(y_val, XGB_class)

    if show_fold_stats:
        print('ROC AUC score for XGBoost model, validation set: %.4f'%XGB_score)
        print('F1 : %.4f, Recall : %.4f , Precision : %.4f'%(XGB_f1, XGB_recall, XGB_precision))
        print(confusion_matrix(y_val, XGB_class))

    # --- Hold-out test set: each model alone, then their simple average -----
    RF_preds_test = clf.predict_proba(disease_test)
    XGB_preds_test = XGB_model.predict_proba(disease_test)
    avg_preds_test = (RF_preds_test[:, 1] + XGB_preds_test[:, 1]) / 2

    RF_test_AUC = roc_auc_score(disease_y_test, RF_preds_test[:, 1])
    print('ROC AUC score for RF for test set: %.4f'%RF_test_AUC)
    XGB_test_AUC = roc_auc_score(disease_y_test, XGB_preds_test[:, 1])
    print('ROC AUC score for XGBoost model test set: %.4f'%XGB_test_AUC)
    average_AUC = roc_auc_score(disease_y_test, avg_preds_test)
    print('ROC AUC score averaged between 2 models for test set: %.4f'%average_AUC)

    avg_class = np.where(avg_preds_test < AVG_PRED_THRESHOLD, 0, 1)
    print('F1 : %.4f, Recall : %.4f , Precision : %.4f'%(f1_score(disease_y_test, avg_class), recall_score(disease_y_test, avg_class), precision_score(disease_y_test, avg_class)))
    print(confusion_matrix(disease_y_test, avg_class))

Let's try the metagenomic data: 30 ASD samples and 30 TD samples.

In [None]:
# Exclude absent species: keep only the rows whose abundance, summed across
# all columns, is non-zero.
# numeric_only=True limits the row sum to numeric columns, so the 'Taxonomy'
# column (presumably text; it is dropped in a later cell) can neither raise
# nor distort the sum, whatever the installed pandas version.
pd_meta_abundance = pd_meta_abundance[pd_meta_abundance.sum(axis=1, numeric_only=True) != 0]

In [None]:
# Drop the taxonomy column and flip the table so each remaining original
# column becomes a row of the feature matrix.
pd_meta_abndc = pd_meta_abundance.drop(columns=['Taxonomy']).transpose()

# Row IDs, and a 0/1 label per row: 1 when the ID starts with 'A', otherwise 0.
target = pd_meta_abndc.index.tolist()
binary_target = np.array([int(sample_id.startswith('A')) for sample_id in target])

In [None]:
# This subset of the data is too small to set aside a separate test set, so
# model quality is judged from stratified K-fold cross-validation only.
# For every fold: fit a RandomForest and an XGBoost model on the training part
# and report their metrics on the validation part.
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold, (idxT, idxV) in enumerate(skf.split(pd_meta_abndc, binary_target)):

    # Features are a DataFrame (positional .iloc); labels are a NumPy array.
    X_train = pd_meta_abndc.iloc[idxT]
    X_val = pd_meta_abndc.iloc[idxV]
    y_train = binary_target[idxT]
    y_val = binary_target[idxV]

    # --- Random Forest -----------------------------------------------------
    clf = RandomForestClassifier(n_estimators=500, random_state=SEED, verbose=0)
    clf.fit(X_train, y_train)

    RF_pred_class = clf.predict(X_val)
    RF_preds = clf.predict_proba(X_val)

    # Despite the "_test" suffix, these are computed on the validation fold.
    RF_AUC_test_score = roc_auc_score(y_val, RF_preds[:, 1])
    RF_f1_test = f1_score(y_val, RF_pred_class)
    RF_recall_test = recall_score(y_val, RF_pred_class)
    RF_precision_test = precision_score(y_val, RF_pred_class)

    if show_fold_stats:
        print('-' * 80)
        print('Fold : %s'%(fold+1))
        print('ROC AUC score for RandomForest model, validation set: %.4f'%RF_AUC_test_score)
        print('F1 : %.4f, Recall : %.4f , Precision : %.4f'%(RF_f1_test, RF_recall_test, RF_precision_test))
        print(confusion_matrix(y_val, RF_pred_class))

    # --- XGBoost -----------------------------------------------------------
    # `metric` is not an XGBoost parameter, so it is not passed here; the
    # evaluation metric is the `eval_metric` given to fit() below.
    XGB_model = XGBClassifier(n_estimators=5000, max_depth=None,
                        learning_rate=0.005,
                        objective='binary:logistic',
                        verbosity=VERBOSE,
                        # tree_method = 'gpu_hist',
                        use_label_encoder=False,
                        n_jobs=-1, random_state=SEED)

    # Early stopping watches logloss on this same validation fold, so the
    # XGBoost validation metrics reported below are likely optimistic.
    # NOTE(review): newer XGBoost releases expect eval_metric and
    # early_stopping_rounds on the constructor — confirm against the installed version.
    XGB_model.fit(X_train, y_train,
                    eval_set=[(X_val, y_val)],
                    eval_metric=['logloss'],
                    early_stopping_rounds=100, verbose=VERBOSE)

    XGB_preds = XGB_model.predict_proba(X_val)
    XGB_class = XGB_model.predict(X_val)

    XGB_score = roc_auc_score(y_val, XGB_preds[:, 1])
    XGB_f1 = f1_score(y_val, XGB_class)
    XGB_recall = recall_score(y_val, XGB_class)
    XGB_precision = precision_score(y_val, XGB_class)

    if show_fold_stats:
        print('ROC AUC score for XGBoost model, validation set: %.4f'%XGB_score)
        print('F1 : %.4f, Recall : %.4f , Precision : %.4f'%(XGB_f1, XGB_recall, XGB_precision))
        print(confusion_matrix(y_val, XGB_class))