# Interactive predictive model fitting for classifying case/control  

Quang Nguyen   
Last updated 2022-05-03

In [9]:
import numpy as np 
import pandas as pd
import sys
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import KFold, cross_val_score, cross_validate, train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, brier_score_loss, roc_auc_score
from sklearn.inspection import permutation_importance
from skbio.stats.composition import clr, multiplicative_replacement
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import pickle
import timeit

Loading custom functions from the `pred_eval` module

In [5]:
# Add the relative ../python/ directory to the module search path (at index 1)
# so that the `pred_eval` import below can be resolved.
sys.path.insert(1, '../python/')
from pred_eval import prior_preprocess, clr_transform, create_pipeline

In [4]:
# Seed NumPy's global random number generator.
# NOTE(review): none of the later cells pass an explicit random_state, so they
# presumably rely on this global seed (and on cells being run in order) for
# repeatability — confirm.
np.random.seed(160497)

## Example using IBD pathways data 

In [7]:
# Load the IBD pathway feature table and its metadata table; in both files the
# first CSV column is used as the row index.
feat = pd.read_csv("../data/pred_pathway_ibd_feat.csv", index_col=0)
lab = pd.read_csv("../data/pred_pathway_ibd_metadata.csv", index_col=0)

In [8]:
# Build the model inputs from the feature table and the metadata.
# NOTE(review): presumably "study_condition" is the label column and "IBD" the
# positive (case) level — confirm against pred_eval.prior_preprocess.
X, y = prior_preprocess(feat, lab, "study_condition", "IBD")
# NOTE(review): the meaning of the positional True flag is defined in
# pred_eval.create_pipeline — confirm there.
pipe = create_pipeline(True)
# Bare expression as the last line of the cell so the notebook displays the
# pipeline's repr.
pipe

Pipeline(steps=[('clr_transformer',
                 FunctionTransformer(func=<function clr_transform at 0x7f39c0236040>,
                                     validate=True)),
                ('calib_rf',
                 CalibratedClassifierCV(base_estimator=RandomForestClassifier(max_features='sqrt',
                                                                              n_estimators=500),
                                        cv=5))])

In [10]:
# Split into train and held-out test sets. No test_size, random_state or
# stratify is passed, so scikit-learn's defaults apply.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
# Fit the whole pipeline on the training split. Pipeline.fit returns the
# pipeline itself, so `fitted_mod` and `pipe` refer to the same fitted object.
fitted_mod = pipe.fit(X_train, y_train)

In [None]:
# Permutation importance of the fitted pipeline on the held-out split, scored
# by ROC AUC, with 5 repeats per feature and 2 parallel jobs.
r = permutation_importance(
    fitted_mod,
    X_test,
    y_test,
    scoring="roc_auc",
    n_repeats=5,
    n_jobs=2,
)

In [None]:
# Bare expression so the notebook displays the permutation-importance result.
r

## Calibration curves

In [None]:
# Predicted class probabilities on the held-out split, as input for the
# calibration curves. predict_proba needs the samples to score; calling it
# with no arguments raises a TypeError.
# The result has one column per class, ordered as in `fitted_mod.classes_`;
# select the positive-class column before plotting a calibration curve.
y_pred = fitted_mod.predict_proba(X_test)

In [None]:
# Print the docstring of the fitted pipeline's predict_proba method.
help(fitted_mod.predict_proba)

# Performing the same analysis for other datasets