# Interactive predictive model fitting for classifying case/control  

Quang Nguyen   
Last updated 2022-05-03

In [1]:
import numpy as np 
import pandas as pd
import sys
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import KFold, cross_val_score, cross_validate, train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, brier_score_loss, roc_auc_score
from sklearn.inspection import permutation_importance
from skbio.stats.composition import clr, multiplicative_replacement
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import pickle
import timeit

Loading custom functions from the `pred_eval` module

In [2]:
# Put the repository's local Python directory on the import path (at index 1,
# i.e. directly after the first entry) so the `pred_eval` helpers resolve.
sys.path.insert(1, '../python/')
from pred_eval import (
    prior_preprocess,
    clr_transform,
    create_pipeline,
)

In [3]:
# Seed NumPy's legacy global RNG once, up front, so the stochastic steps that
# draw from it are repeatable on a fresh Restart & Run All.
np.random.seed(seed=160497)

## Example using IBD pathways data 

In [8]:
# Trait-level feature table for the CRC 16S dataset (per the file name); the
# first CSV column, `sample_ids`, becomes the row index.
# Bound to its own name so it cannot be confused with -- or silently replaced
# by -- the IBD pathway `feat` table that a later cell loads and passes to
# `prior_preprocess`. Sharing one name made the result depend on which cell
# happened to run last.
feat_crc = pd.read_csv("../data/trait_crc_16s_feat.csv", index_col=0)
feat_crc

Unnamed: 0_level_0,5ketogluconate;substrate,acetate;substrate,adipate;substrate,adonitol;substrate,aerobic;metabolism,aerobic_chemo_heterotrophy;pathways,alaninamide;substrate,alanine;substrate,alpha_ketovalerate;substrate,anaerobic;metabolism,...,tween_80;substrate,urea;substrate,uridine;substrate,urocanate;substrate,vibrio;cell_shape,xylitol;substrate,xylose;substrate,yeast_extract;substrate,yes;motility,yes;sporulation
sample_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DE-013,-0.351673,3.418530,-0.351673,-0.351673,-0.930896,-0.703519,3.965787,5.107585,-0.703519,1.255473,...,2.256800,-0.930896,-0.786622,11.804102,-0.786622,-0.497382,6.914443,-1.167321,9.445156,9.944103
DE-029,-0.769044,-2.197484,-0.769044,-0.769044,-2.035695,-1.538465,1.709627,-2.307511,16.555985,-0.726200,...,2.138774,-2.035695,19.206194,-0.769044,-1.720196,-1.087681,12.592054,0.712508,4.119880,9.707296
DE-031,-0.529460,4.248113,-0.529460,-0.529460,-1.401506,-1.059179,3.195194,4.711146,-1.059179,6.575038,...,10.011262,-1.401506,-1.184295,-0.529460,-1.184295,-0.748831,7.573442,7.377561,6.341171,7.123596
DE-034,-0.554554,4.266896,-0.554554,-0.554554,-1.467931,-1.109380,9.086601,2.201607,-1.109380,5.611424,...,1.623422,3.557283,-1.240426,-0.554554,-1.240426,-0.784322,9.597736,-1.840750,5.068540,8.541806
DE-037,-0.569359,6.855277,-0.569359,-0.569359,-1.507120,-1.138997,-1.507120,3.406212,5.325069,4.053391,...,5.036675,-1.507120,9.869683,-0.569359,-1.273541,-0.805261,6.985395,1.221175,7.183815,2.496082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FR-824,-0.386614,-1.317490,-0.386614,16.161361,-1.023385,-0.773417,3.825363,0.299747,-0.773417,-2.165747,...,1.145926,5.234223,-0.864777,-0.386614,-0.864777,-0.546799,6.921541,-1.283300,5.719927,8.498497
FR-825,-0.657416,-4.460092,-0.657416,-0.657416,-1.740211,-1.315154,3.543919,3.201706,7.185648,6.125044,...,-3.843715,-1.740211,6.133462,-0.657416,-1.470507,-0.929803,6.514876,-2.182182,0.955234,14.570773
FR-827,-0.738194,0.342690,-0.738194,-0.738194,-1.954033,-1.476749,3.534742,6.099145,5.055626,-0.675354,...,1.547091,-1.954033,4.192020,-0.738194,-1.651190,-1.044049,9.246186,1.165924,2.989604,6.683158
FR-828,-0.865070,7.659595,-0.865070,-0.865070,-2.289880,-1.730563,3.765820,-1.688670,4.940350,-2.221274,...,2.651769,-2.289880,4.032147,-0.865070,-1.934986,-1.223493,5.276623,0.725304,7.252537,17.015238


In [7]:
# IBD pathway dataset: feature table and its accompanying metadata table.
# Both files use their first column as the row index.
feat = pd.read_csv(
    filepath_or_buffer="../data/pred_pathway_ibd_feat.csv",
    index_col=0,
)
lab = pd.read_csv(
    filepath_or_buffer="../data/pred_pathway_ibd_metadata.csv",
    index_col=0,
)

In [8]:
# Turn the feature and metadata tables into model inputs (X) and labels (y).
# NOTE(review): presumably "study_condition" is the metadata column holding the
# outcome and "IBD" is the value treated as the positive (case) class -- confirm
# against `prior_preprocess` in pred_eval.
X, y = prior_preprocess(feat, lab, "study_condition", "IBD")
# Build the (unfitted) modelling pipeline. The repr displayed below shows a CLR
# FunctionTransformer followed by a cross-validated, calibrated random forest.
# NOTE(review): what the boolean flag toggles is defined in pred_eval -- confirm.
pipe = create_pipeline(True)
# Last expression: render the pipeline so its steps are visible in the output.
pipe

Pipeline(steps=[('clr_transformer',
                 FunctionTransformer(func=<function clr_transform at 0x7f39c0236040>,
                                     validate=True)),
                ('calib_rf',
                 CalibratedClassifierCV(base_estimator=RandomForestClassifier(max_features='sqrt',
                                                                              n_estimators=500),
                                        cv=5))])

In [10]:
# Hold out a shuffled test set for evaluation (25% -- scikit-learn's default,
# spelled out here). No random_state is passed, so the shuffle draws from
# NumPy's global RNG and is only as reproducible as the seed set earlier in
# the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
)

In [13]:
# Fit the whole pipeline on the training split only, and keep a handle on the
# value returned by `fit` for the evaluation cells that follow.
fitted_mod = pipe.fit(X=X_train, y=y_train)

In [None]:
# Permutation feature importance on the held-out split, scored by ROC AUC:
# each feature is shuffled 5 times, with the work spread over 2 parallel jobs.
r = permutation_importance(
    estimator=fitted_mod,
    X=X_test,
    y=y_test,
    scoring="roc_auc",
    n_repeats=5,
    n_jobs=2,
)

In [None]:
# Display the permutation-importance result object held in `r`.
r

## Calibration curves

In [None]:
# Predicted class probabilities for the held-out samples, one row per sample
# and one column per class. `predict_proba` requires the feature matrix; the
# original call passed none and raised a TypeError.
# NOTE(review): a calibration curve needs the positive-class column only --
# pick it via the estimator's `classes_` ordering when plotting.
y_pred = fitted_mod.predict_proba(X_test)

In [None]:
# Print the docstring of the fitted pipeline's `predict_proba` method.
# NOTE(review): exploratory lookup -- consider removing it from the finished
# notebook once the call above is settled.
help(fitted_mod.predict_proba)

# Performing the same analysis for other datasets