### Install packages required:

In [None]:
!pip install --upgrade pip
! pip install composition_stats
! pip install --upgrade fastsrm
! pip install xgboost
! pip install pandas
! pip install boruta

### Import packages and functions required:

In [None]:
#for data manipulation:
import numpy as np
import pandas as pd
from collections import defaultdict
from itertools import chain
#for preprocessing:
import composition_stats as cs #for clr function
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import scipy.stats as stats
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
import time
import warnings
warnings.filterwarnings("ignore")
from sklearn.pipeline import Pipeline
#for model development:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from boruta import BorutaPy
#for model performance evaluation and visualization:
from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer, confusion_matrix, roc_auc_score, roc_curve, auc
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

### Pre-processing function:

In [None]:
#pre-processing function for microbe data:
def pre_pros(x):
    """CLR-transform and min-max scale a microbe abundance table.

    Parameters
    ----------
    x : pandas.DataFrame
        Numeric table to transform (presumably samples as rows and genera
        as columns - confirm against the caller).

    Returns
    -------
    pandas.DataFrame
        The transformed values, labelled with the same row and column
        labels as ``x``.
    """
    #offset every value by 0.00001 before cs.clr (presumably so zero counts can be log-transformed):
    shifted = x.values + 0.00001
    #the scaler is fitted on the very table that is passed in:
    scaled = MinMaxScaler().fit_transform(cs.clr(shifted))
    #labels are handed over as plain lists, so only the labels themselves are carried across:
    return pd.DataFrame(scaled, index=list(x.index), columns=list(x.columns))

In [None]:
#pre-processing function for metabolite data:
def pre_pros_met(x):
    """CLR-transform and z-score a metabolite abundance table.

    Parameters
    ----------
    x : pandas.DataFrame
        Numeric table to transform (presumably samples as rows and
        metabolites as columns - confirm against the caller).

    Returns
    -------
    pandas.DataFrame
        The transformed values, labelled with the same row and column
        labels as ``x``.
    """
    #offset every value by 1 before cs.clr (presumably so zero abundances can be log-transformed):
    shifted = x.values + 1
    #stats.zscore is called with its default settings:
    standardised = stats.zscore(cs.clr(shifted))
    #labels are handed over as plain lists, so only the labels themselves are carried across:
    return pd.DataFrame(standardised, index=list(x.index), columns=list(x.columns))

Wrapping the preprocessing in these functions allows each dataset to be pre-processed with a single line when used in other code.

## Import required data:

### Validation data:

In [None]:
#import the microbe data from the validation data set,
#name the unnamed first column 'Sample' and use it as the index:
val_genus = (
    pd.read_csv('hmp_genera.csv')
    .rename(columns={'Unnamed: 0': 'Sample'})
    .set_index('Sample')
)
#select the 9 important microbes only:
hmp_genus = val_genus.loc[:, [
    'CAJJNI01',
    'CAJOIG01',
    'Flavonifractor',
    'JAGTTR01',
    'RGIG426',
    'RUG762',
    'UBA11774',
    'UBA1774',
    'Ventricola',
]]

In [None]:
#import the metadata of the validation data set,
#index it by the 'Sample' column and discard the leftover 'Unnamed: 0' column:
val_meta = (
    pd.read_csv('hmp_meta.csv')
    .set_index('Sample')
    .drop(columns='Unnamed: 0')
)

In [None]:
#import the metabolite data from the validation data set,
#name the unnamed first column 'Sample' and use it as the index:
val_mtb = (
    pd.read_csv('hmp_mtb.csv')
    .rename(columns={'Unnamed: 0': 'Sample'})
    .set_index('Sample')
)

### Required main dataset for training:

In [None]:
#import the metadata of the main data set,
#index it by the 'Sample' column and discard the leftover 'Unnamed: 0' column:
data_meta = 'fran_metadata.csv'
df_meta = (
    pd.read_csv(data_meta)
    .set_index('Sample')
    .drop(columns='Unnamed: 0')
)

In [None]:
#import the data on the associated 14 metabolites from the main dataset,
#name the unnamed first column 'Sample' and use it as the index:
data_mtb = 'fran_mtb_val.csv'
df_mtb = (
    pd.read_csv(data_mtb)
    .rename(columns={'Unnamed: 0': 'Sample'})
    .set_index('Sample')
)

In [None]:
#import the microbe data on the 74 selected microbes from the main dataset,
#name the unnamed first column 'Sample' and use it as the index:
data_genus = 'fran_genera_diff.csv'
df_genus = (
    pd.read_csv(data_genus)
    .rename(columns={'Unnamed: 0': 'Sample'})
    .set_index('Sample')
)
#select the 9 important microbes only:
fran_genus = df_genus.loc[:, [
    'CAJJNI01',
    'CAJOIG01',
    'Flavonifractor',
    'JAGTTR01',
    'RGIG426',
    'RUG762',
    'UBA11774',
    'UBA1774',
    'Ventricola',
]]

## Microbe model data:

In [None]:
#extract the disease group labels for the y vectors and encode them (IBD -> 1, Control -> 0):
y_train = df_meta['Disease.Group'].replace({'IBD': 1, 'Control': 0})
y_test = val_meta['Disease.Group'].replace({'IBD': 1, 'Control': 0})

In [None]:
#training data: pre-processed genus data of the main dataset;
#testing data: pre-processed genus data of the validation dataset:
x_train, x_test = map(pre_pros, (fran_genus, hmp_genus))

The random forest model with the parameters criterion = entropy, max_depth = 10, max_features = 0.75, max_samples = 0.9 and n_estimators = 100 performed best on the main dataset, so it is used for all models here.

In [None]:
#make the microbe model:
model_mic = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=10,
    max_features=0.75,
    max_samples=0.9,
    random_state=23,
)
#fit model to training data:
model_mic.fit(x_train, y_train)
#true labels of the testing data (val data):
y_true_lab = y_test
#predicted class probabilities and predicted labels on the testing data (val data):
y_pred_prob = model_mic.predict_proba(x_test)
y_pred_lab = model_mic.predict(x_test)

In [None]:
#obtain the performance metrics of the microbe model:
recall = recall_score(y_true_lab, y_pred_lab)
precision = precision_score(y_true_lab, y_pred_lab)
f1 = f1_score(y_true_lab, y_pred_lab)
confusion = confusion_matrix(y_true_lab, y_pred_lab)
#for the 0/1 labels the flattened matrix reads [0,0], [0,1], [1,0], [1,1] = TN, FP, FN, TP:
TN, FP, FN, TP = confusion.ravel()
specificity = TN / (TN + FP)
sensitivity = TP / (TP + FN)

#ROC curve and AUC from the predicted probability of class 1;
#kept under *_mic names so the final ROC plot can use them:
fpr_num_mic, tpr_num_mic, _ = roc_curve(y_test, y_pred_prob[:,1])
auc_score_mic = auc(fpr_num_mic, tpr_num_mic)
#print this model's own AUC: `auc_score` is only assigned further down (combined model),
#so using it here fails on a fresh kernel or shows a stale value from another model:
print(recall, precision, f1, specificity, sensitivity, auc_score_mic)

## Metabolite data model:

There are missing values for the abundance of the Carnosol and L-1,2,3,4-tetrahydro-beta-carboline-3-carboxylic acid* metabolites in the validation dataset, so imputation is required:

In [None]:
#the two metabolite columns of the validation data that need imputing:
numeric_features = ['Carnosol', 'L-1,2,3,4-tetrahydro-beta-carboline-3-carboxylic acid*']

#set up the numeric transformer (imputing using 5 nearest neighbours):
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5))])

#apply the transformer to the corresponding columns of the validation dataset;
#only the columns in numeric_features are handed to the imputer:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)])
clt = Pipeline(steps=[('preprocessor', preprocessor)])
imputed_values = clt.fit_transform(val_mtb)

#reorganise the data: label the imputed values with the metabolite names and the sample index.
#The column labels are passed as a flat list; a nested list ([numeric_features]) would create
#MultiIndex columns instead of plain metabolite names:
new_df = pd.DataFrame(imputed_values, index=val_mtb.index, columns=numeric_features)

#replace the columns of the original data frame with the imputed columns:
val_mtb[numeric_features] = new_df[numeric_features]

The y vectors remain the same.

In [None]:
#training data: pre-processed metabolite data of the main dataset;
#testing data: pre-processed metabolite data of the validation dataset:
x_train_met, x_test_met = map(pre_pros_met, (df_mtb, val_mtb))

In [None]:
#put the test columns in the same order as the training columns:
cols = x_train_met.columns.tolist()
x_test_met = x_test_met.reindex(cols, axis='columns')

In [None]:
#make the metabolite model:
model_met = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=10,
    max_features=0.75,
    max_samples=0.9,
    random_state=23,
)
#fit model to training data:
model_met.fit(x_train_met, y_train)
#true labels of the test data:
y_true_lab = y_test
#predicted class probabilities and predicted labels on the test data:
y_pred_prob = model_met.predict_proba(x_test_met)
y_pred_lab = model_met.predict(x_test_met)

In [None]:
#obtain the performance metrics of the metabolite model:
confusion = confusion_matrix(y_true_lab, y_pred_lab)
#for the 0/1 labels the flattened matrix reads [0,0], [0,1], [1,0], [1,1] = TN, FP, FN, TP:
TN, FP, FN, TP = confusion.ravel()
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)
f1 = f1_score(y_true_lab, y_pred_lab)
precision = precision_score(y_true_lab, y_pred_lab)
recall = recall_score(y_true_lab, y_pred_lab)

#ROC curve and AUC from the predicted probability of class 1;
#kept under *_mtb names so the final ROC plot can use them:
fpr_num_mtb, tpr_num_mtb, _ = roc_curve(y_test, y_pred_prob[:, 1])
auc_score_mtb = auc(fpr_num_mtb, tpr_num_mtb)
print(recall, precision, f1, specificity, sensitivity, auc_score_mtb)

## Microbe and metabolite combined model:

The y vectors remain the same.

In [None]:
#combine the microbe and metabolite datasets side by side into one data frame
#and pre-process the combined frame with the metabolite pre-processing function:
x_train_both = pre_pros_met(pd.concat([fran_genus, df_mtb], axis=1))
x_test_both = pre_pros_met(pd.concat([hmp_genus, val_mtb], axis=1))

In [None]:
#put the test columns in the same order as the training columns:
cols = x_train_both.columns.tolist()
x_test_both = x_test_both.reindex(cols, axis='columns')

In [None]:
#make the combined microbe and metabolite model:
model_both = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=10,
    max_features=0.75,
    max_samples=0.9,
    random_state=23,
)
#fit model to training data:
model_both.fit(x_train_both, y_train)
#true labels of the test data:
y_true_lab = y_test
#predicted class probabilities and predicted labels on the test data:
y_pred_prob = model_both.predict_proba(x_test_both)
y_pred_lab = model_both.predict(x_test_both)

In [None]:
#obtain the performance metrics of the combined model:
confusion = confusion_matrix(y_true_lab, y_pred_lab)
#for the 0/1 labels the flattened matrix reads [0,0], [0,1], [1,0], [1,1] = TN, FP, FN, TP:
TN, FP, FN, TP = confusion.ravel()
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)
f1 = f1_score(y_true_lab, y_pred_lab)
precision = precision_score(y_true_lab, y_pred_lab)
recall = recall_score(y_true_lab, y_pred_lab)

#ROC curve and AUC from the predicted probability of class 1;
#the final ROC plot reads fpr_num, tpr_num and auc_score for this model:
fpr_num, tpr_num, _ = roc_curve(y_test, y_pred_prob[:, 1])
auc_score = auc(fpr_num, tpr_num)
print(recall, precision, f1, specificity, sensitivity, auc_score)

## ROC plot of all models:

In [None]:
#draw the ROC curves of the three validation models on one figure:
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_aspect('equal', 'datalim')

#the microbe-only curve reads the *_mic variables produced by the microbe metrics cell
#(the *_smi names used previously are not defined anywhere in this notebook):
ax.plot(fpr_num_mic, tpr_num_mic, 'limegreen', label=r'ROC Microbe Features Only (AUC = {})'.format(round(auc_score_mic,3)))
ax.plot(fpr_num_mtb, tpr_num_mtb, 'darkblue', label=r'ROC Metabolite Features Only (AUC = {})'.format(round(auc_score_mtb,3)))
ax.plot(fpr_num, tpr_num, 'purple', label=r'ROC Microbe and Metabolite Features (AUC = {})'.format(round(auc_score,3)))

#dashed diagonal as a reference line:
ax.plot([0, 1], [0, 1],'r--')
ax.set_xlim([-0.01, 1.01])
ax.set_ylim([-0.01, 1.01])
ax.legend(loc="lower right")
ax.set_title('Receiver Operating Characteristic Validation Models')
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
#save before showing so the written file contains the drawn figure:
fig.savefig('val_roc.pdf', format = 'pdf', dpi = 300, bbox_inches = 'tight')
plt.show()