In [1]:
import glob
import sklearn.metrics
import pandas as pd
import numpy as np
from tqdm import tqdm
import pathlib
import itertools
import os
from functools import partial
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score

### Prep Step: Add Labels and Normalize All Pred Probs Df

In [2]:
SCORE_DIR = '/deep2/group/aihc-bootcamp-fall2021/lymphoma/results/model_scores/scores'
# SCORE_DIR = '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/model_out/preds-20230226/dl/dl'
TEST_LABELS = pd.read_csv(f'/deep/group/aihc-bootcamp-fall2021/lymphoma/results/model_scores/section_1_2_Nuclei_test_df.csv').drop(columns=['Unnamed: 0'])[['patient_id', 'label']].drop_duplicates('patient_id')
INM_DIR = '/deep2/group/aihc-bootcamp-fall2021/lymphoma/results/model_scores/mid_scores/'
TEST_LABELS.head()

Unnamed: 0,patient_id,label
0,tma_1_E0003 B,1
8,tma_1_E0008 C,0
12,tma_1_E0011 B,3
20,tma_1_E0017 B,7
28,tma_1_E0023 B,0


In [3]:
# drop all label columns first
def drop_labels(df):
    df.drop([i for i in df.columns if 'label' in i],
               axis=1, inplace=True)
    return df

def scale_probs(df):
    # assume that df does not have label columns
    # remove Unnamed: 0 if any
    df.drop([i for i in df.columns if 'Unnamed' in i],
               axis=1, inplace=True)
    df = df.set_index('patient_id')
    probs_scale = df.apply(np.sum, axis=1)
    scaled_df = df.apply(lambda x: x/probs_scale, axis=0).reset_index()
    return scaled_df

def get_core_probs(df):
    df = df.groupby('patient_id').aggregate(pd.DataFrame.mean)
    return df

def get_core_preds(df):
    df['prediction'] = df.idxmax(axis=1).astype('int').to_numpy()
    df = df.reset_index()
    return df

def add_labels(df):
    new_pred_probs_df = df.merge(TEST_LABELS, how="inner", on="patient_id")
    return new_pred_probs_df

In [4]:
## add labels to it
for file in os.listdir(SCORE_DIR):
    if file == ".ipynb_checkpoints":
        continue
    df = pd.read_csv(os.path.join(SCORE_DIR, file))
    result = drop_labels(df)
    result = scale_probs(result)
    result = get_core_probs(result)
    result = get_core_preds(result)
    result = add_labels(result)
    result.to_csv(os.path.join(INM_DIR, file), index=False)

### Define per-class metric functions

In [3]:
def per_class_auc(core_labels, core_probs, i):
    y_test = core_labels == i
    y_score = core_probs
    auc = roc_auc_score(y_test, y_score)
    return auc

# assume df is core-level, (patient_id, 0, 1, ..., label)

def sensitivity(cnf_matrix, total, core_preds, i):
    actual_positives = cnf_matrix[i,:].sum()
    pred_positives = cnf_matrix[:,i].sum()

    TP = cnf_matrix[i][i]
    FP = pred_positives - TP
    FN = actual_positives - TP
    TN = total - (TP + FP + FN)

    sensitivity = np.nan_to_num(TP / (TP + FN))
    return sensitivity

def precision(cnf_matrix, total, core_preds, i):
    actual_positives = cnf_matrix[i,:].sum()
    pred_positives = cnf_matrix[:,i].sum()

    TP = cnf_matrix[i][i]
    FP = pred_positives - TP
    FN = actual_positives - TP
    TN = total - (TP + FP + FN)

    precision = np.nan_to_num(TP / (TP + FP))
    return precision

def specificity(cnf_matrix, total, core_preds, i):
    actual_positives = cnf_matrix[i,:].sum()
    pred_positives = cnf_matrix[:,i].sum()

    TP = cnf_matrix[i][i]
    FP = pred_positives - TP
    FN = actual_positives - TP
    TN = total - (TP + FP + FN)

    specificity = np.nan_to_num(TN / (TN + FP))
    return specificity


## Computing CIs for Per-class Metrics

In [54]:
np.random.seed(0)

class Model():
    def __init__(self, path):
        self.name = os.path.splitext(os.path.basename(path))[0]
        self.type = os.path.basename(os.path.dirname(path))
        self.df = pd.read_csv(path, keep_default_na=False)


def single_replicate_all_models(models, metric_str, i, replicate_num):
    performances = {}
    for model in models:
        df = model.df
        core_labels_full = df['label'].to_numpy()
        core_preds_full = df['prediction'].to_numpy()
        core_probs_full = df[str(i)].to_numpy()
        
        sample_ids = np.random.choice(len(df), size=len(df), replace=True)
        
        core_labels = core_labels[sample_ids]
        core_preds_full = core_preds[sample_ids]
        core_probs_full = core_preds[sample_ids]
    
        cnf_matrix = confusion_matrix(core_labels, core_preds)
        total = cnf_matrix.sum()
        class_counts = cnf_matrix.sum(axis=1)

        if metric_str == 'auc':
            performance = per_class_auc(core_labels, core_probs, i)
        elif metric_str == 'sensitivity':
            performance = sensitivity(cnf_matrix, total, core_preds, i)
        elif metric_str == 'specificity':
            performance = specificity(cnf_matrix, total, core_preds, i)
        elif metric_str == 'precision':
            performance = precision(cnf_matrix, total, core_preds, i)
        else:
            raise ValueError('Metric Not Defined')
        performances[model.name] = performance
    return performances


def multiple_replicate_all_models(models, num_replicates, metric_str, i):
    func = partial(single_replicate_all_models, models, metric_str, i)
    results = list(map(func, range(num_replicates)))
    return results


class ConfidenceGenerator():
    def __init__(self, confidence_level):
        self.records = []
        self.confidence_level = confidence_level

    @staticmethod
    def compute_cis(series, confidence_level):
        sorted_perfs = series.sort_values()
        lower_index = int(confidence_level/2 * len(sorted_perfs)) - 1
        upper_index = int((1 - confidence_level/2) * len(sorted_perfs)) - 1
        lower = round(sorted_perfs.iloc[lower_index], 3)
        upper = round(sorted_perfs.iloc[upper_index], 3)
        mean = round(sorted_perfs.mean(), 3)
        return lower, mean, upper

    def create_ci_record(self, perfs, name, perf_type):
        lower, mean, upper = ConfidenceGenerator.compute_cis(
            perfs, self.confidence_level)
        record = {"name": name,
                  "type": perf_type,
                  "lower": lower,
                  "mean": mean,
                  "upper": upper}
        self.records.append(record)

    def generate_cis(self, df):
        for name in df.columns:
            self.create_ci_record(df[name], name, 'individual')
            
        for name1, name2 in itertools.combinations(df.columns, 2):
            model_diffs = df[name1] - df[name2]
            self.create_ci_record(model_diffs, f"{name1}-{name2}", 'difference')

        df = pd.DataFrame.from_records(self.records)
        return df

def run_stage_1_models(models, num_replicates, metric_str, i, save_path):
    evaluations = multiple_replicate_all_models(
        models,
        num_replicates,
        metric_str,
        i
    )
    df = pd.DataFrame.from_records(evaluations)
    if save_path:
        df.to_csv(f"{save_path}.csv", index=False)


def run_stage1(num_replicates, metric_str, i, read_path, save_path=None):
    if metric_str in ['auc', 'sensitivity', 'specificity', 'precision']:
        model_paths = glob.glob(f'{read_path}/*.csv', recursive=True)
    else:
        raise ValueError("Metric path not defined")

    models = [Model(path) for path in model_paths]

    run_stage_1_models(models, num_replicates, metric_str, i, save_path)


def run_stage2(confidence_level, read_path, save_path=None):
    perfs = pd.read_csv(f"{read_path}.csv")
    cb = ConfidenceGenerator(confidence_level=confidence_level)
    df = cb.generate_cis(perfs)
    if save_path:
        df.to_csv(f"{save_path}.csv", index=False)

def main(working_dir, num_replicates, i):
    metrics = ['sensitivity', 'specificity', 'precision']

    for metric_str in metrics:
        stage_1_save_path = f'{working_dir}/stats/raw_{metric_str}_95'
        run_stage1(num_replicates=num_replicates,
                   metric_str=metric_str,
                   read_path=f'{working_dir}/model_scores/mid_scores/dl',
                   save_path=stage_1_save_path,
                   i = i
                   )

        stage_2_save_path = f'{working_dir}/stats/processed_{metric_str}_95_{i}'
        run_stage2(read_path=stage_1_save_path,
                   save_path=stage_2_save_path,
                   confidence_level=0.025,
                  )


if __name__ == "__main__":
    num_replicates = 1000
    i = 0
    working_dir = "/deep/group/aihc-bootcamp-fall2021/lymphoma/results"
    main(working_dir, num_replicates, i)

/deep/group/aihc-bootcamp-fall2021/lymphoma/results/model_scores/mid_scores/dl
columns Index(['patient_id', '0', '1', '2', '3', '4', '5', '6', '7', '8', 'prediction',
       'label'],
      dtype='object')
columns Index(['patient_id', '0', '1', '2', '3', '4', '5', '6', '7', '8', 'prediction',
       'label'],
      dtype='object')
columns Index(['patient_id', '0', '1', '2', '3', '4', '5', '6', '7', '8', 'prediction',
       'label'],
      dtype='object')
columns Index(['patient_id', '0', '1', '2', '3', '4', '5', '6', '7', '8', 'prediction',
       'label'],
      dtype='object')
columns Index(['patient_id', '0', '1', '2', '3', '4', '5', '6', '7', '8', 'prediction',
       'label'],
      dtype='object')
columns Index(['patient_id', '0', '1', '2', '3', '4', '5', '6', '7', '8', 'prediction',
       'label'],
      dtype='object')
columns Index(['patient_id', '0', '1', '2', '3', '4', '5', '6', '7', '8', 'prediction',
       'label'],
      dtype='object')
columns Index(['patient_id', '0'

## Computing CIs for Overall Metrics - Models

In [4]:
def weighted_metric(class_counts, metrics):
    total_value = 0
    total_count = 0
    for class_count, metric_value in zip(class_counts, metrics):
        total_value += class_count * metric_value
        total_count += class_count
    return total_value / total_count

In [5]:
label_mapping = {0 : 0, # DLBCL -> DLBCL, Agg BCL (0)
                 1 : 1, # HL -> non-DLBCL (1)
                 2 : 0, # Agg BCL -> DLBCL, Agg BCL (0)
                 3 : 1, # FL -> non-DLBCL (1)
                 4 : 1, # MCL -> non-DLBCL (1)
                 5 : 1, # MZL -> non-DLBCL (1)
                 6 : 1, # TCL -> non-DLBCL (1)
                 7 : 1, # NKTCL -> non-DLBCL (1)
                 8 : 1} # Nonmalignant -> non-DLBCL (1)

In [17]:
np.random.seed(0)

class Model():
    def __init__(self, path):
        self.name = os.path.splitext(os.path.basename(path))[0]
        self.type = os.path.basename(os.path.dirname(path))
        self.df = pd.read_csv(path, keep_default_na=False)


def single_replicate_all_models(models, metric_str, replicate_num):
    performances = {}
    for model in models:
        df = model.df
        df['label'] = df['label'].map(label_mapping)
        df['prediction'] = df['prediction'].map(label_mapping)
        core_labels_full = df['label'].to_numpy()
        core_preds_full = df['prediction'].to_numpy()
        
        sample_ids = np.random.choice(len(df), size=len(df), replace=True)
        core_labels = core_labels_full[sample_ids]
        core_preds = core_preds_full[sample_ids]
        
        cnf_matrix = confusion_matrix(core_labels, core_preds)
        total = cnf_matrix.sum()
        class_counts = cnf_matrix.sum(axis=1)
        num_classes = len(class_counts)
        
        results = []
        for i in range(num_classes):
            seed = 1
            core_probs_full = df[str(i)].to_numpy()
            core_probs = core_probs_full[sample_ids]

            while len(set(core_labels == i)) < 2:
                np.random.seed(seed)
                seed += 1
                sample_ids = np.random.choice(len(df), size=len(df), replace=True)
                core_labels = core_labels_full[sample_ids]
                core_preds = core_preds_full[sample_ids]
                core_probs = core_probs_full[sample_ids]
                
            cnf_matrix = confusion_matrix(core_labels, core_preds)
            total = cnf_matrix.sum()
            class_counts = cnf_matrix.sum(axis=1)
            num_classes = len(class_counts)
            
            auc = per_class_auc(core_labels, core_probs, i)
            sens = sensitivity(cnf_matrix, total, core_preds, i)
            spec = specificity(cnf_matrix, total, core_preds, i)
            prec = precision(cnf_matrix, total, core_preds, i)
            f1 = sklearn.metrics.f1_score(core_labels, core_preds, average="weighted")
            print('class', i, 'auc', auc)
            results.append([i, class_counts[i], auc, sens, spec, prec, f1])

        metrics_df = pd.DataFrame(results, columns = ['class', 'n', 'auc', 'sensitivity', 'specificity', 'precision', 'f1'])
    

        if metric_str == 'auc':
            performance = weighted_metric(metrics_df['n'], metrics_df['auc'])
        elif metric_str == 'sensitivity':
            performance = weighted_metric(metrics_df['n'], metrics_df['sensitivity'])
        elif metric_str == 'specificity':
            performance = weighted_metric(metrics_df['n'], metrics_df['specificity'])
        elif metric_str == 'precision':
            performance = weighted_metric(metrics_df['n'], metrics_df['precision'])
        elif metric_str == 'f1':
            performance = f1
        else:
            raise ValueError('Metric Not Defined')
        performances[model.name] = performance
    return performances


def multiple_replicate_all_models(models, num_replicates, metric_str):
    func = partial(single_replicate_all_models, models, metric_str)
    results = list(map(func, range(num_replicates)))
    return results


class ConfidenceGenerator():
    def __init__(self, confidence_level):
        self.records = []
        self.confidence_level = confidence_level

    @staticmethod
    def compute_cis(series, confidence_level):
        sorted_perfs = series.sort_values()
        lower_index = int(confidence_level/2 * len(sorted_perfs)) - 1
        upper_index = int((1 - confidence_level/2) * len(sorted_perfs)) - 1
        lower = round(sorted_perfs.iloc[lower_index], 3)
        upper = round(sorted_perfs.iloc[upper_index], 3)
        mean = round(sorted_perfs.mean(), 3)
        return lower, mean, upper

    def create_ci_record(self, perfs, name, perf_type):
        lower, mean, upper = ConfidenceGenerator.compute_cis(
            perfs, self.confidence_level)
        record = {"name": name,
                  "type": perf_type,
                  "lower": lower,
                  "mean": mean,
                  "upper": upper}
        self.records.append(record)

    def generate_cis(self, df):
        for name in df.columns:
            self.create_ci_record(df[name], name, 'individual')
            
        for name1, name2 in itertools.combinations(df.columns, 2):
            model_diffs = df[name1] - df[name2]
            self.create_ci_record(model_diffs, f"{name1}-{name2}", 'difference')

        df = pd.DataFrame.from_records(self.records)
        return df

def run_stage_1_models(models, num_replicates, metric_str, save_path):
    evaluations = multiple_replicate_all_models(
        models,
        num_replicates,
        metric_str,
        )
    df = pd.DataFrame.from_records(evaluations)
    if save_path:
        df.to_csv(f"{save_path}.csv", index=False)


def run_stage1(num_replicates, metric_str, read_path, save_path=None):
    if metric_str in ['f1', 'auc', 'sensitivity', 'specificity', 'precision']:
        model_paths = glob.glob(f'{read_path}/*.csv', recursive=True)
    else:
        raise ValueError("Metric path not defined")

    models = [Model(path) for path in model_paths]

    run_stage_1_models(models, num_replicates, metric_str, save_path)


def run_stage2(confidence_level, read_path, save_path=None):
    perfs = pd.read_csv(f"{read_path}.csv")
    cb = ConfidenceGenerator(confidence_level=confidence_level)
    df = cb.generate_cis(perfs)
    if save_path:
        df.to_csv(f"{save_path}.csv", index=False)

def main(working_dir, num_replicates):
#     metrics = ['auc', 'sensitivity', 'specificity', 'precision']
    metrics = ['f1']

    for metric_str in metrics:
        stage_1_save_path = f'{working_dir}/stats/raw_{metric_str}_95'
        run_stage1(num_replicates=num_replicates,
                   metric_str=metric_str,
                   read_path=f'{working_dir}/model_scores/mid_scores/temp',
                   save_path=stage_1_save_path,
                   )

        stage_2_save_path = f'{working_dir}/stats/processed_{metric_str}_95-best-dlbcl-f1'
        run_stage2(read_path=stage_1_save_path,
                   save_path=stage_2_save_path,
                   confidence_level=0.025,
                  )


if __name__ == "__main__":
    num_replicates = 1000
    i = 0
    working_dir = "/deep/group/aihc-bootcamp-fall2021/lymphoma/results"
    main(working_dir, num_replicates)

class 0 auc 0.8313440581214324
class 1 auc 0.7638816813700052
class 0 auc 0.8826043237807945
class 1 auc 0.7677224736048266
class 0 auc 0.7894736842105263
class 1 auc 0.7132571996027806
class 0 auc 0.7732186732186732
class 1 auc 0.7312039312039311
class 0 auc 0.792548076923077
class 1 auc 0.7197115384615385
class 0 auc 0.924720738222438
class 1 auc 0.7537639630888782
class 0 auc 0.8722222222222221
class 1 auc 0.7864734299516908
class 0 auc 0.8584860173577628
class 1 auc 0.8606557377049181
class 0 auc 0.7948532948532948
class 1 auc 0.7823472823472823
class 0 auc 0.8018590998043053
class 1 auc 0.6756360078277887
class 0 auc 0.7710843373493976
class 1 auc 0.6707700366684127
class 0 auc 0.8528846153846155
class 1 auc 0.7149038461538462
class 0 auc 0.763500482160077
class 1 auc 0.6870781099324976
class 0 auc 0.819672131147541
class 1 auc 0.7509643201542913
class 0 auc 0.8564164648910412
class 1 auc 0.673365617433414
class 0 auc 0.8225961538461539
class 1 auc 0.7103365384615385
class 0 auc 0

## Computing CIs for Overall Metrics - Pathologists

In [4]:
np.random.seed(0)

class Model():
    def __init__(self, path):
        self.name = os.path.splitext(os.path.basename(path))[0]
        self.type = os.path.basename(os.path.dirname(path))
        self.df = pd.read_csv(path, keep_default_na=False)


def single_replicate_all_models(models, metric_str, replicate_num):
    performances = {}
    for model in models:
        df = model.df
        core_labels_full = df['label'].to_numpy()
        core_preds_full = df['prediction'].to_numpy()
        
        sample_ids = np.random.choice(len(df), size=len(df), replace=True)
        
        core_labels = core_labels_full[sample_ids]
        core_preds = core_preds_full[sample_ids]
        
        cnf_matrix = confusion_matrix(core_labels, core_preds)
        total = cnf_matrix.sum()
        class_counts = cnf_matrix.sum(axis=1)
        num_classes = len(class_counts)
        
        results = []
        for i in range(num_classes):
#             core_probs_full = df[str(i)].to_numpy()
#             core_probs = core_probs_full[sample_ids]
            seed = 1
            while len(set(core_labels == i)) < 2:
                np.random.seed(seed)
                seed += 1
                sample_ids = np.random.choice(len(df), size=len(df), replace=True)
                core_labels = core_labels_full[sample_ids]
                core_preds = core_preds_full[sample_ids]
#                 core_probs = core_probs_full[sample_ids]
#             auc = per_class_auc(core_labels, core_probs, i)
#             print('class', i, 'auc', auc)
            sens = sensitivity(cnf_matrix, total, core_preds, i)
            spec = specificity(cnf_matrix, total, core_preds, i)
            prec = precision(cnf_matrix, total, core_preds, i)
            results.append([i, class_counts[i], sens, spec, prec])

        metrics_df = pd.DataFrame(results, columns = ['class', 'n', 'sensitivity', 'specificity', 'precision'])
    


        if metric_str == 'auc':
#             performance = weighted_metric(metrics_df['n'], metrics_df['auc'])
            pass
        elif metric_str == 'sensitivity':
            performance = weighted_metric(metrics_df['n'], metrics_df['sensitivity'])
        elif metric_str == 'specificity':
            performance = weighted_metric(metrics_df['n'], metrics_df['specificity'])
        elif metric_str == 'precision':
            performance = weighted_metric(metrics_df['n'], metrics_df['precision'])
        else:
            raise ValueError('Metric Not Defined')
        performances[model.name] = performance
    return performances


def multiple_replicate_all_models(models, num_replicates, metric_str):
    func = partial(single_replicate_all_models, models, metric_str)
    results = list(map(func, range(num_replicates)))
    return results


class ConfidenceGenerator():
    def __init__(self, confidence_level):
        self.records = []
        self.confidence_level = confidence_level

    @staticmethod
    def compute_cis(series, confidence_level):
        sorted_perfs = series.sort_values()
        lower_index = int(confidence_level/2 * len(sorted_perfs)) - 1
        upper_index = int((1 - confidence_level/2) * len(sorted_perfs)) - 1
        lower = round(sorted_perfs.iloc[lower_index], 3)
        upper = round(sorted_perfs.iloc[upper_index], 3)
        mean = round(sorted_perfs.mean(), 3)
        return lower, mean, upper

    def create_ci_record(self, perfs, name, perf_type):
        lower, mean, upper = ConfidenceGenerator.compute_cis(
            perfs, self.confidence_level)
        record = {"name": name,
                  "type": perf_type,
                  "lower": lower,
                  "mean": mean,
                  "upper": upper}
        self.records.append(record)

    def generate_cis(self, df):
        for name in df.columns:
            self.create_ci_record(df[name], name, 'individual')
            
        for name1, name2 in itertools.combinations(df.columns, 2):
            model_diffs = df[name1] - df[name2]
            self.create_ci_record(model_diffs, f"{name1}-{name2}", 'difference')

        df = pd.DataFrame.from_records(self.records)
        return df

def run_stage_1_models(models, num_replicates, metric_str, save_path):
    evaluations = multiple_replicate_all_models(
        models,
        num_replicates,
        metric_str,
        )
    df = pd.DataFrame.from_records(evaluations)
    if save_path:
        df.to_csv(f"{save_path}.csv", index=False)


def run_stage1(num_replicates, metric_str, read_path, save_path=None):
    if metric_str in ['sensitivity', 'specificity', 'precision']:
        model_paths = glob.glob(f'{read_path}/*.csv', recursive=True)
    else:
        raise ValueError("Metric path not defined")

    models = [Model(path) for path in model_paths]

    run_stage_1_models(models, num_replicates, metric_str, save_path)


def run_stage2(confidence_level, read_path, save_path=None):
    perfs = pd.read_csv(f"{read_path}.csv")
    cb = ConfidenceGenerator(confidence_level=confidence_level)
    df = cb.generate_cis(perfs)
    if save_path:
        df.to_csv(f"{save_path}.csv", index=False)

def main(working_dir, num_replicates):
    metrics = ['sensitivity', 'specificity', 'precision']

    for metric_str in metrics:
        stage_1_save_path = f'{working_dir}/stats/raw_{metric_str}_95'
        run_stage1(num_replicates=num_replicates,
                   metric_str=metric_str,
                   read_path=f'{working_dir}/model_scores/mid_scores/pathologists/pathologist_int',
                   save_path=stage_1_save_path,
                   )

        stage_2_save_path = f'{working_dir}/stats/processed_{metric_str}_95_path'
        run_stage2(read_path=stage_1_save_path,
                   save_path=stage_2_save_path,
                   confidence_level=0.025,
                  )


if __name__ == "__main__":
    num_replicates = 1000
    i = 0
    working_dir = "/deep/group/aihc-bootcamp-fall2021/lymphoma/results"
    main(working_dir, num_replicates)



KeyboardInterrupt: 