# Evaluating machine learning models 

This 



In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import sys, os
import numpy as np
import pandas as pd
import pickle

sys.path.insert(0, '..')

import warnings 
warnings.filterwarnings('ignore')

## Step 1. Preprocessing by removing samples with fewer than 6000 reads and convert metadata to one hot

In [2]:
reads_threshold = 6000

data_directory = '../data/'
data_filename = 'metaanalysis_data.pickle'
metadata_filename = 'metaanalysis_metadata.pickle' 

data = pickle.load(open(os.path.join(data_directory, data_filename), 'rb'))
metadata = pickle.load(open(os.path.join(data_directory, metadata_filename), 'rb'))

data = data.loc[data.sum(axis = 1) >= reads_threshold]
metadata = metadata.loc[data.index]

In [3]:
# label encoder

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
metadata['Status'] = label_encoder.fit_transform(metadata['Status'])

## Step 2. Define models, pipelines and metrics

In [4]:
# screen through models and different levels of collapse
# unity scaler and collase taxa transformer

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold

classifiers = {
    'KNN':  KNeighborsClassifier(3),
    'SVC Kernel': SVC(kernel="linear", C=0.025),
    'SVC Gamma': SVC(gamma=2, C=1),
    'Gaussian Process': GaussianProcessClassifier(1.0 * RBF(1.0)),
    'Decision Tree': DecisionTreeClassifier(max_depth=5),
    'Random Forest': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    'Neural Networks': MLPClassifier(alpha=1, max_iter=1000),
    'AdaBoost': AdaBoostClassifier(),
    'Gaussian NB': GaussianNB(),
    'Quadratic Discriminant': QuadraticDiscriminantAnalysis(),
    'XGB': GradientBoostingClassifier(n_estimators = 100, learning_rate = 1, max_depth = 2, random_state = 0),
}

metrics = {'AUC': roc_auc_score,
           'F1': f1_score,
           'Accuracy': accuracy_score
          }

kf = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 10, random_state = 1)

taxa = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'none']

## Step 3. Iterate through pipelines and save resulting dataframe

In [5]:
from utils import CollapseTaxaTransformer, UnityScaler
from sklearn.pipeline import make_pipeline

# collapse at different levels
# train test split kfolds -> iterations
results = {}

def open_pipeline(study_name, x, y):
    results = {}
    for level in taxa :

        pipeline = make_pipeline(CollapseTaxaTransformer(level = level),
                                UnityScaler(axis = 1))
        df = pipeline.fit_transform(x)
        iteration = 0
        for train_index, test_index in kf.split(df, y):
            xtrain, xtest = df.iloc[train_index], df.iloc[test_index]
            ytrain, ytest = y.iloc[train_index], y.iloc[test_index]
            for model_name, model in classifiers.items():
                model.fit(xtrain, ytrain)
                predictions = model.predict(xtest)
                for metric_name, metric in metrics.items():
                    results[study_name, level, model_name, metric_name, iteration] = metric(ytest, predictions)
            iteration += 1
    return results

for study_name, x in data.groupby('Filename'):
    print(study_name)
    results.update(open_pipeline(study_name, x, metadata.loc[study_name, 'Status']))


AmericanGut
Berding2020
Cao2021
Chen2020
Dan2020
David2021
Fasano2020
Fouquier2021
Kang2017
Kong2019
Liu2019
Son2015
Zou2020
Zurita2019


In [6]:
# aggregate data
results.update(open_pipeline('Aggregate', data, metadata['Status']))

In [7]:
df = pd.melt(pd.DataFrame(results, index = [0]))
df.columns = ['study','level', 'model', 'metric', 'iteration', 'value']

In [8]:
# save
df.to_csv('ML Screen Results.csv', sep = '\t', index = False)

In [9]:
# load
df = pd.read_csv('ML Screen Results.csv', delimiter ='\t')
df

Unnamed: 0,study,level,model,metric,iteration,value
0,AmericanGut,kingdom,KNN,AUC,0,0.500000
1,AmericanGut,kingdom,KNN,F1,0,0.660000
2,AmericanGut,kingdom,KNN,Accuracy,0,0.492537
3,AmericanGut,kingdom,SVC Kernel,AUC,0,0.500000
4,AmericanGut,kingdom,SVC Kernel,F1,0,0.660000
...,...,...,...,...,...,...
197995,Aggregate,none,Quadratic Discriminant,F1,49,0.481203
197996,Aggregate,none,Quadratic Discriminant,Accuracy,49,0.536913
197997,Aggregate,none,XGB,AUC,49,0.593960
197998,Aggregate,none,XGB,F1,49,0.627692
