In [1]:
import glob
import sklearn.metrics
import pandas as pd
import numpy as np
from tqdm import tqdm
import pathlib
import itertools
import os
from functools import partial

## Computing CIs for Overall Metrics

In [2]:
# Seed NumPy's legacy global RNG so the np.random.choice bootstrap draws made
# by the functions in this cell are repeatable from run to run.
np.random.seed(0)

class Model():
    """Predictions of one model, loaded from a single CSV file.

    Attributes:
        name: file name of ``path`` without its extension.
        type: name of the directory that directly contains the file.
        df: the CSV parsed into a DataFrame.
    """

    def __init__(self, path):
        parent_dir, file_name = os.path.split(path)
        stem, _extension = os.path.splitext(file_name)
        self.name = stem
        self.type = os.path.basename(parent_dir)
        # keep_default_na=False: strings such as "NA" stay as text instead of
        # being parsed as missing values.
        self.df = pd.read_csv(path, keep_default_na=False)


def single_replicate_all_models(models, metric_str, replicate_num):
    """Score every model on one bootstrap resample of its predictions.

    Args:
        models: iterable of objects exposing ``name`` and a ``df`` that has
            "label" and "prediction" columns.
        metric_str: 'acc' for accuracy, 'f1' for weighted F1.
        replicate_num: replicate index; not used by the computation.

    Returns:
        dict mapping each model's name to its metric value on the resample.

    Raises:
        ValueError: if ``metric_str`` is neither 'acc' nor 'f1'.
    """
    scores = {}
    for model in models:
        labels = model.df["label"]
        predictions = model.df["prediction"]
        n_rows = len(labels)

        # Row positions drawn with replacement. A fresh draw is made for every
        # model, so different models are scored on different resamples.
        # NOTE(review): confirm this is intended, since differences between
        # models are computed from these replicates later in the file.
        # For a sanity check, use ``labels.index`` here instead to score the
        # full test set.
        resampled_rows = np.random.choice(n_rows, size=n_rows, replace=True)

        y_true = labels.iloc[resampled_rows].to_numpy().ravel()
        y_score = predictions.iloc[resampled_rows].to_numpy().ravel()

        if metric_str == 'acc':
            scores[model.name] = sklearn.metrics.accuracy_score(y_true, y_score)
        elif metric_str == 'f1':
            scores[model.name] = sklearn.metrics.f1_score(y_true, y_score, average="weighted")
        else:
            raise ValueError('Metric Not Defined')
    return scores


def multiple_replicate_all_models(models, num_replicates, metric_str):
    """Run ``num_replicates`` bootstrap replicates over all models.

    Returns:
        list with one entry per replicate index ``0 .. num_replicates - 1``,
        each being the value returned by ``single_replicate_all_models``.
    """
    return [
        single_replicate_all_models(models, metric_str, replicate_num)
        for replicate_num in range(num_replicates)
    ]


class ConfidenceGenerator():
    """Turns per-replicate scores into confidence-interval records.

    ``confidence_level`` is the total tail mass left outside the interval
    (0.05 gives a 95% interval), not the coverage itself. Records produced by
    ``create_ci_record`` accumulate in ``self.records`` for the lifetime of
    the instance.
    """

    def __init__(self, confidence_level):
        self.records = []
        self.confidence_level = confidence_level

    @staticmethod
    def compute_cis(series, confidence_level):
        """Return ``(lower, mean, upper)`` for ``series``, rounded to 3 decimals.

        The bounds are the sorted values at positions
        ``int(confidence_level/2 * n) - 1`` and
        ``int((1 - confidence_level/2) * n) - 1``, clamped to ``[0, n - 1]``.

        Raises:
            IndexError: if ``series`` is empty.
        """
        sorted_perfs = series.sort_values()
        n = len(sorted_perfs)
        last = n - 1
        lower_index = int(confidence_level / 2 * n) - 1
        upper_index = int((1 - confidence_level / 2) * n) - 1
        # With few replicates the raw position is -1, which iloc would wrap
        # around to the LARGEST value; clamp so the bounds stay in order.
        lower_index = min(max(lower_index, 0), last)
        upper_index = min(max(upper_index, 0), last)
        lower = round(sorted_perfs.iloc[lower_index], 3)
        upper = round(sorted_perfs.iloc[upper_index], 3)
        mean = round(sorted_perfs.mean(), 3)
        return lower, mean, upper

    def create_ci_record(self, perfs, name, perf_type):
        """Compute the interval for ``perfs`` and append it to ``self.records``."""
        lower, mean, upper = ConfidenceGenerator.compute_cis(
            perfs, self.confidence_level)
        record = {"name": name,
                  "type": perf_type,
                  "lower": lower,
                  "mean": mean,
                  "upper": upper}
        self.records.append(record)

    def generate_cis(self, df):
        """Build the interval table for every column and every column pair.

        Each column of ``df`` yields an 'individual' record; each pair of
        columns (in column order) yields a 'difference' record computed on
        ``df[name1] - df[name2]``.

        Returns:
            DataFrame built from all records held in ``self.records``.
        """
        for name in df.columns:
            self.create_ci_record(df[name], name, 'individual')

        for name1, name2 in itertools.combinations(df.columns, 2):
            model_diffs = df[name1] - df[name2]
            self.create_ci_record(model_diffs, f"{name1}-{name2}", 'difference')

        df = pd.DataFrame.from_records(self.records)
        return df

def run_stage_1_models(models, num_replicates, metric_str, save_path):
    """Bootstrap all models and collect the per-replicate scores in a table.

    Args:
        models: passed through to ``multiple_replicate_all_models``.
        num_replicates: number of bootstrap replicates to run.
        metric_str: metric identifier passed through to the replicate runner.
        save_path: output path without extension; when truthy the table is
            written to ``"{save_path}.csv"`` without the index column.

    Returns:
        DataFrame built from the replicate records (one row per replicate).
        It is returned so the result is not lost when ``save_path`` is falsy.
    """
    evaluations = multiple_replicate_all_models(
        models,
        num_replicates,
        metric_str)
    replicate_scores = pd.DataFrame.from_records(evaluations)
    if save_path:
        replicate_scores.to_csv(f"{save_path}.csv", index=False)
    return replicate_scores


def run_stage1(num_replicates, metric_str, read_path, save_path=None):
    """Load every ``*.csv`` prediction file in ``read_path`` and run stage 1.

    Args:
        num_replicates: number of bootstrap replicates.
        metric_str: 'f1' or 'acc'.
        read_path: directory whose ``*.csv`` files each become one ``Model``.
        save_path: forwarded to ``run_stage_1_models``.

    Raises:
        ValueError: if ``metric_str`` is not 'f1' or 'acc'.
    """
    if metric_str not in ('f1', 'acc'):
        raise ValueError("Metric path not defined")

    # glob returns matches in filesystem-dependent order. The models consume
    # the seeded global RNG one after another, so the order is sorted to keep
    # the draws (and the output column order) identical across machines.
    model_paths = sorted(glob.glob(f'{read_path}/*.csv', recursive=True))

    models = [Model(path) for path in model_paths]

    run_stage_1_models(models, num_replicates, metric_str, save_path)


def run_stage2(confidence_level, read_path, save_path=None):
    """Turn the stage-1 replicate table into a confidence-interval table.

    Args:
        confidence_level: passed to ``ConfidenceGenerator``.
        read_path: stage-1 output path without extension; ``"{read_path}.csv"``
            is read.
        save_path: output path without extension; when truthy the table is
            written to ``"{save_path}.csv"`` without the index column.

    Returns:
        The DataFrame produced by ``ConfidenceGenerator.generate_cis``. It is
        returned so the result is not lost with the default ``save_path=None``.
    """
    perfs = pd.read_csv(f"{read_path}.csv")
    generator = ConfidenceGenerator(confidence_level=confidence_level)
    ci_table = generator.generate_cis(perfs)
    if save_path:
        ci_table.to_csv(f"{save_path}.csv", index=False)
    return ci_table

def main(working_dir, num_replicates):
    """Run stage 1 then stage 2 for accuracy and for F1.

    Predictions are read from ``{working_dir}/predictions``; raw replicate
    scores and processed intervals are written under ``{working_dir}/stats``.
    Stage 2 is always called with ``confidence_level=0.05``.
    """
    predictions_dir = f'{working_dir}/predictions'

    for metric_str in ('acc', 'f1'):
        raw_path = f'{working_dir}/stats/raw_{metric_str}_95'
        processed_path = f'{working_dir}/stats/processed_{metric_str}_95'

        # Stage 1 writes the replicate scores that stage 2 reads back.
        run_stage1(num_replicates=num_replicates,
                   metric_str=metric_str,
                   read_path=predictions_dir,
                   save_path=raw_path)
        run_stage2(read_path=raw_path,
                   save_path=processed_path,
                   confidence_level=0.05)


if __name__ == "__main__":
    # Entry point for the overall-metric run: results directory and number of
    # bootstrap replicates are fixed here.
    working_dir = "/deep/group/aihc-bootcamp-fall2021/lymphoma/results"
    num_replicates = 1000
    main(working_dir=working_dir, num_replicates=num_replicates)

## Computing CIs for Per-class Metrics

In [None]:
# Maps each integer class index to its string class name.
label_to_label = dict(enumerate(
    ['DLBCL', 'HL', 'Agg BCL', 'FL', 'MCL', 'MZL', 'NKTCL', 'TCL']
))

def compute_per_class_f1(y_true, y_score, per_class):
    """F1 score of one class, for labels given as indices or as class names.

    Args:
        y_true: array of ground-truth labels, either integer class indices
            or the class names stored in ``label_to_label``.
        y_score: array of predicted labels.
        per_class: integer class index (a key of ``label_to_label``).

    Returns:
        The F1 score of ``per_class``, or NaN when the class occurs in
        ``y_true`` neither as an index nor as a name.
    """
    class_ids = list(label_to_label)
    if per_class in y_true:
        class_labels = class_ids
    elif label_to_label[per_class] in y_true:
        # Same order as class_ids, so position ``per_class`` is still the
        # requested class.
        class_labels = [label_to_label[class_id] for class_id in class_ids]
    else:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

    # Qualified through sklearn.metrics: a bare ``f1_score`` is never imported.
    per_class_scores = sklearn.metrics.f1_score(
        y_true, y_score, average=None, labels=class_labels)
    return per_class_scores[per_class]

def compute_per_class_acc(df, per_class):
    """One-vs-rest accuracy of ``per_class``.

    A row counts as correct when its label and its prediction agree on whether
    they equal ``per_class`` (true positives plus true negatives).

    Args:
        df: DataFrame with "label" and "prediction" columns.
        per_class: class value compared against both columns.
            NOTE(review): the comparison uses ``per_class`` as given; if the
            columns hold class names rather than indices, nothing matches.
            Confirm the encoding against the prediction files.

    Returns:
        Fraction of correct rows, or NaN for an empty frame (previously a
        ZeroDivisionError).
    """
    n_rows = len(df)
    if n_rows == 0:
        return float('nan')

    label_is_class = df['label'] == per_class
    prediction_is_class = df['prediction'] == per_class
    # Both True is a true positive, both False a true negative.
    n_correct = int((label_is_class == prediction_is_class).sum())
    return n_correct / n_rows

In [None]:
# Seed NumPy's legacy global RNG so the np.random.choice bootstrap draws made
# by the functions in this cell are repeatable from run to run.
np.random.seed(0)

class Model():
    """Predictions of one model plus the class it is evaluated on.

    Attributes:
        name: file name of ``path`` without its extension.
        type: name of the directory that directly contains the file.
        df: the CSV parsed into a DataFrame.
        per_class: the ``per_class`` value given to the constructor.
    """

    def __init__(self, path, per_class):
        parent_dir, file_name = os.path.split(path)
        stem, _extension = os.path.splitext(file_name)
        self.name = stem
        self.type = os.path.basename(parent_dir)
        # keep_default_na=False: strings such as "NA" stay as text instead of
        # being parsed as missing values.
        self.df = pd.read_csv(path, keep_default_na=False)
        self.per_class = per_class


def single_replicate_all_models(models, metric_str, replicate_num):
    """Score every model's target class on one bootstrap resample.

    Args:
        models: iterable of objects exposing ``name``, ``per_class`` and a
            ``df`` that has "label" and "prediction" columns.
        metric_str: 'acc' to use ``compute_per_class_acc``, 'f1' to use
            ``compute_per_class_f1``.
        replicate_num: replicate index; not used by the computation.

    Returns:
        dict mapping each model's name to its per-class metric value.

    Raises:
        ValueError: if ``metric_str`` is neither 'acc' nor 'f1'.
    """
    scores = {}
    for model in models:
        labels = model.df["label"]
        predictions = model.df["prediction"]
        n_rows = len(labels)

        # Row positions drawn with replacement; a fresh draw is made for
        # every model. For a sanity check, use ``labels.index`` here instead
        # to score the full test set.
        resampled_rows = np.random.choice(n_rows, size=n_rows, replace=True)

        y_true = labels.iloc[resampled_rows].to_numpy().ravel()
        y_score = predictions.iloc[resampled_rows].to_numpy().ravel()
        resampled_frame = pd.DataFrame({'label': y_true, 'prediction': y_score})

        if metric_str == 'acc':
            scores[model.name] = compute_per_class_acc(resampled_frame, model.per_class)
        elif metric_str == 'f1':
            scores[model.name] = compute_per_class_f1(y_true, y_score, model.per_class)
        else:
            raise ValueError('Metric Not Defined')
    return scores


def multiple_replicate_all_models(models, num_replicates, metric_str):
    """Run ``num_replicates`` bootstrap replicates over all models.

    Returns:
        list with one entry per replicate index ``0 .. num_replicates - 1``,
        each being the value returned by ``single_replicate_all_models``.
    """
    return [
        single_replicate_all_models(models, metric_str, replicate_num)
        for replicate_num in range(num_replicates)
    ]


class ConfidenceGenerator():
    """Turns per-replicate scores into confidence-interval records.

    ``confidence_level`` is the total tail mass left outside the interval
    (0.05 gives a 95% interval), not the coverage itself. Records produced by
    ``create_ci_record`` accumulate in ``self.records`` for the lifetime of
    the instance.
    """

    def __init__(self, confidence_level):
        self.records = []
        self.confidence_level = confidence_level

    @staticmethod
    def compute_cis(series, confidence_level):
        """Return ``(lower, mean, upper)`` for ``series``, rounded to 3 decimals.

        Missing values are dropped first; if nothing remains the result is
        ``(0, 0, 0)``. Otherwise the bounds are the sorted values at positions
        ``int(confidence_level/2 * n) - 1`` and
        ``int((1 - confidence_level/2) * n) - 1``, clamped to ``[0, n - 1]``.
        """
        series = series.dropna()
        if len(series) == 0:
            return 0, 0, 0
        sorted_perfs = series.sort_values()
        n = len(sorted_perfs)
        last = n - 1
        lower_index = int(confidence_level / 2 * n) - 1
        upper_index = int((1 - confidence_level / 2) * n) - 1
        # With few usable replicates the raw position is -1, which iloc would
        # wrap around to the LARGEST value; clamp so the bounds stay in order.
        lower_index = min(max(lower_index, 0), last)
        upper_index = min(max(upper_index, 0), last)
        lower = round(sorted_perfs.iloc[lower_index], 3)
        upper = round(sorted_perfs.iloc[upper_index], 3)
        mean = round(sorted_perfs.mean(), 3)
        return lower, mean, upper

    def create_ci_record(self, perfs, name, perf_type):
        """Compute the interval for ``perfs`` and append it to ``self.records``."""
        lower, mean, upper = ConfidenceGenerator.compute_cis(
            perfs, self.confidence_level)
        record = {"name": name,
                  "type": perf_type,
                  "lower": lower,
                  "mean": mean,
                  "upper": upper}
        self.records.append(record)

    def generate_cis(self, df):
        """Build the interval table for every column and every column pair.

        Each column of ``df`` yields an 'individual' record; each pair of
        columns (in column order) yields a 'difference' record computed on
        ``df[name1] - df[name2]``.

        Returns:
            DataFrame built from all records held in ``self.records``.
        """
        for name in df.columns:
            self.create_ci_record(df[name], name, 'individual')

        for name1, name2 in itertools.combinations(df.columns, 2):
            model_diffs = df[name1] - df[name2]
            self.create_ci_record(model_diffs, f"{name1}-{name2}", 'difference')

        df = pd.DataFrame.from_records(self.records)
        return df

def run_stage_1_models(models, num_replicates, metric_str, save_path):
    """Bootstrap all models and collect the per-replicate scores in a table.

    Args:
        models: passed through to ``multiple_replicate_all_models``.
        num_replicates: number of bootstrap replicates to run.
        metric_str: metric identifier passed through to the replicate runner.
        save_path: output path without extension; when truthy the table is
            written to ``"{save_path}.csv"`` without the index column.

    Returns:
        DataFrame built from the replicate records (one row per replicate).
        It is returned so the result is not lost when ``save_path`` is falsy.
    """
    evaluations = multiple_replicate_all_models(
        models,
        num_replicates,
        metric_str)
    replicate_scores = pd.DataFrame.from_records(evaluations)
    if save_path:
        replicate_scores.to_csv(f"{save_path}.csv", index=False)
    return replicate_scores


def run_stage1(num_replicates, metric_str, read_path, save_path=None, per_class=None):
    """Load every ``*.csv`` prediction file in ``read_path`` and run stage 1.

    Args:
        num_replicates: number of bootstrap replicates.
        metric_str: 'f1' or 'acc'.
        read_path: directory whose ``*.csv`` files each become one ``Model``.
        save_path: forwarded to ``run_stage_1_models``.
        per_class: class handed to every ``Model``.

    Raises:
        ValueError: if ``metric_str`` is not 'f1' or 'acc'.
    """
    if metric_str not in ('f1', 'acc'):
        raise ValueError("Metric path not defined")

    # glob returns matches in filesystem-dependent order. The models consume
    # the seeded global RNG one after another, so the order is sorted to keep
    # the draws (and the output column order) identical across machines.
    model_paths = sorted(glob.glob(f'{read_path}/*.csv', recursive=True))

    models = [Model(path, per_class) for path in model_paths]

    run_stage_1_models(models, num_replicates, metric_str, save_path)


def run_stage2(confidence_level, read_path, save_path=None):
    """Turn the stage-1 replicate table into a confidence-interval table.

    Args:
        confidence_level: passed to ``ConfidenceGenerator``.
        read_path: stage-1 output path without extension; ``"{read_path}.csv"``
            is read.
        save_path: output path without extension; when truthy the table is
            written to ``"{save_path}.csv"`` without the index column.

    Returns:
        The DataFrame produced by ``ConfidenceGenerator.generate_cis``. It is
        returned so the result is not lost with the default ``save_path=None``.
    """
    perfs = pd.read_csv(f"{read_path}.csv")
    generator = ConfidenceGenerator(confidence_level=confidence_level)
    ci_table = generator.generate_cis(perfs)
    if save_path:
        ci_table.to_csv(f"{save_path}.csv", index=False)
    return ci_table

def main(working_dir, num_replicates, per_class):
    """Run stage 1 then stage 2 for per-class accuracy and per-class F1.

    Predictions are read from ``{working_dir}/predictions``. The raw
    replicate file name depends only on the metric, while the processed file
    name also carries ``per_class``. Stage 2 is always called with
    ``confidence_level=0.05``.
    """
    predictions_dir = f'{working_dir}/predictions'

    for metric_str in ('acc', 'f1'):
        raw_path = f'{working_dir}/stats/raw_{metric_str}'
        processed_path = f'{working_dir}/stats/processed_{metric_str}_class_{per_class}'

        # Stage 1 writes the replicate scores that stage 2 reads back.
        run_stage1(num_replicates=num_replicates,
                   metric_str=metric_str,
                   read_path=predictions_dir,
                   save_path=raw_path,
                   per_class=per_class)
        run_stage2(read_path=raw_path,
                   save_path=processed_path,
                   confidence_level=0.05)


if __name__ == "__main__":
    # Entry point for the per-class run: target class, number of bootstrap
    # replicates and results directory are fixed here.
    working_dir = "/deep/group/aihc-bootcamp-fall2021/lymphoma/results"
    num_replicates = 1000
    per_class = 0
    main(working_dir=working_dir,
         num_replicates=num_replicates,
         per_class=per_class)