In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sys
import random
from pomegranate import *
from hmm_visualization_methods import *
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns

# File paths
file_path_base_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_nb_saved_splits\metrics.json"
file_path_base_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_randomes\metrics.json"
file_path_new_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_nb\metrics.json"
file_path_new_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_randomes\metrics.json"
file_path_mhcpred = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\2\10fold\10foldmetrics.json"
files = [file_path_base_nb, file_path_base_r, file_path_new_nb, file_path_new_r, file_path_mhcpred]


In [5]:
import pandas as pd
from sklearn.metrics import roc_curve, auc
import os
import json
import subprocess

# MixMHC2pred functions
def MixMHC2pred_for_allele(allele, input_file, output_file):
    mixmhc2pred_path = "C:\\Tools\\MixMHC2pred-2.0\\MixMHC2pred.exe"
    
    command = [
        mixmhc2pred_path,
        "-i", input_file,
        "-o", output_file,
        "-a", allele,
        "--no_context"  
    ]
    try:
        subprocess.run(command, check=True)
        print(f"Prediction completed! Results saved in {output_file}")
    except subprocess.CalledProcessError as e:
        print(f"Error occurred: {e}")

def is_binder_mhc_pred(file_path, allele, target_path, ind, threshold=5.0):
    def read_mixmhc2pred_file(file_path):
        with open(file_path, 'r') as file:
            lines = file.readlines()
        start_line = next(i for i, line in enumerate(lines) if not line.startswith('#'))
        df = pd.read_csv(file_path, skiprows=start_line, delimiter='\t')
        return df

    df = read_mixmhc2pred_file(file_path)
    df['Binder'] = df.apply(
        lambda row: '1' if (row[f"%Rank_{allele}"] < threshold) else '0',
        axis=1
    )

    output_file = os.path.join(target_path, f'classified_peptides_{allele}-{ind}.csv')
    df.to_csv(output_file, index=False)
    print(f"Prediction completed! Results saved in {output_file}")
    return df

# Function to format allele name
def format_allele_name(allele):
    allele_formatted = allele.replace("HLA-", "")  # Remove 'HLA-'
    allele_formatted = allele_formatted.replace("-", "")  # Replace '-' with ''
    
    parts = allele_formatted.split('DRB')
    formatted_allele = f"DRB{parts[1][0:1]}_{parts[1][1:3]}_{parts[1][3:]}"
    return formatted_allele

def load_y_test(allele, fold, peptide_length):
    y_test_path = rf"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_nb_saved_splits\{allele}\fold_{fold}\test_data.csv"
    y_test_df = pd.read_csv(y_test_path)
    
    # Filter y_test based on peptide length
    if peptide_length != 'all':
        y_test_df = y_test_df[y_test_df['peptide'].apply(lambda x: len(x) == peptide_length)]
    else:
        y_test_df = y_test_df[y_test_df['peptide'].apply(lambda x: len(x) >= 12)]
    
    return y_test_df

def run_mixmhc(allele, fold, target_path_allele, peptide_length):
    allele_formatted = allele.replace("HLA-", "").replace("-", "")
    parts = allele_formatted.split('DRB')
    formatted_allele = f"DRB{parts[1][0:1]}_{parts[1][1:3]}_{parts[1][3:]}"
    
    y_test_df = load_y_test(allele, fold, peptide_length)
    test_file = os.path.join(target_path_allele, f"test_peptides_{fold}.txt")
    y_test_df["peptide"].to_csv(test_file, index=False, header=False)
    
    output_file = os.path.join(target_path_allele, f"mixmhc2pred_results_{fold}.txt")
    MixMHC2pred_for_allele(formatted_allele, test_file, output_file)
    
    if not os.path.exists(output_file):
        raise FileNotFoundError(f"MixMHC2pred output file not found: {output_file}")
    
    df_pred = is_binder_mhc_pred(output_file, formatted_allele, target_path_allele, fold)
    return (100 - df_pred[f"%Rank_{formatted_allele}"]).to_list()

# Initialize dictionary for storing results
metrics_dict = {}

alleles = [
    'HLA-DRB1*03:01', 'HLA-DRB3*01:01', 'HLA-DRB1*07:01',
    'HLA-DRB1*11:01', 'HLA-DRB1*12:01', 'HLA-DRB1*15:01',
    'HLA-DRB4*01:01', 'HLA-DRB3*02:02', 'HLA-DRB5*01:01', 'HLA-DRB1*01:01'
]
peptide_lengths = ['all']

target_path_allele = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\model_output"

for allele in alleles:
    allele_formatted = allele.replace(":", "-").replace("*", "").replace("/", "_")
    metrics_dict[allele_formatted] = {}
    
    for peptide_length in peptide_lengths:
        metrics_dict[allele_formatted][peptide_length] = {}
        
        for fold in range(1, 11):
            y_test_df = load_y_test(allele_formatted, fold, peptide_length)
            y_test = y_test_df['label']
            
            mixmhc_scores = run_mixmhc(allele_formatted, fold, target_path_allele, peptide_length)
            
            fpr, tpr, _ = roc_curve(y_test, mixmhc_scores)
            roc_auc = auc(fpr, tpr)
            
            metrics_dict[allele_formatted][peptide_length][fold] = {
                "auc": roc_auc,
                "fpr": fpr.tolist(),
                "tpr": tpr.tolist()
            }

# Save metrics dictionary
metrics_path = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\model_output\mixmhc_metrics.json"
with open(metrics_path, "w") as f:
    json.dump(metrics_dict, f, indent=4)

print(f"Metrics saved to {metrics_path}")


Prediction completed! Results saved in C:\Projects\grandmaster\notebooks\viterbi\mhc2\model_output\mixmhc2pred_results_1.txt
Prediction completed! Results saved in C:\Projects\grandmaster\notebooks\viterbi\mhc2\model_output\classified_peptides_DRB1_03_01-1.csv
Prediction completed! Results saved in C:\Projects\grandmaster\notebooks\viterbi\mhc2\model_output\mixmhc2pred_results_2.txt
Prediction completed! Results saved in C:\Projects\grandmaster\notebooks\viterbi\mhc2\model_output\classified_peptides_DRB1_03_01-2.csv
Prediction completed! Results saved in C:\Projects\grandmaster\notebooks\viterbi\mhc2\model_output\mixmhc2pred_results_3.txt
Prediction completed! Results saved in C:\Projects\grandmaster\notebooks\viterbi\mhc2\model_output\classified_peptides_DRB1_03_01-3.csv
Prediction completed! Results saved in C:\Projects\grandmaster\notebooks\viterbi\mhc2\model_output\mixmhc2pred_results_4.txt
Prediction completed! Results saved in C:\Projects\grandmaster\notebooks\viterbi\mhc2\model_

In [27]:
import pandas as pd
import json
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score
import os
from collections import defaultdict
def format_allele_name(allele):
    allele_formatted = allele.replace("HLA-", "")  # Remove 'HLA-'
    allele_formatted = allele_formatted.replace("-", "")  # Replace '-' with ''
    
    parts = allele_formatted.split('DRB')
    formatted_allele = f"DRB{parts[1][0:1]}_{parts[1][1:3]}_{parts[1][3:]}"
    return formatted_allele


def load_y_test(allele, fold, peptide_length):
    y_test_path = rf"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_nb_saved_splits\{allele}\fold_{fold}\test_data.csv"
    y_test_df = pd.read_csv(y_test_path)
    
    if peptide_length != 'all':
        y_test_df = y_test_df[y_test_df['peptide'].apply(lambda x: len(x) == peptide_length)]
    else:
        y_test_df = y_test_df[y_test_df['peptide'].apply(lambda x: len(x) >= 12)]
    
    return y_test_df

def load_saved_mixmhc_scores(allele, fold, target_path_allele):
    output_file = os.path.join(target_path_allele, f'classified_peptides_{allele}-{fold}.csv')
    if not os.path.exists(output_file):
        raise FileNotFoundError(f"Saved MixMHC output file not found: {output_file}")
    
    df_pred = pd.read_csv(output_file)
    scores = (100 - df_pred[f"%Rank_{allele}"]).to_list()
    return scores

def calculate_metrics(y_test, y_scores):
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    auc_score = auc(fpr, tpr)
    accuracy = accuracy_score(y_test, [1 if score > 50 else 0 for score in y_scores])
    precision = precision_score(y_test, [1 if score > 50 else 0 for score in y_scores])
    recall = recall_score(y_test, [1 if score > 50 else 0 for score in y_scores])
    f1 = f1_score(y_test, [1 if score > 50 else 0 for score in y_scores])
    
    return {
        "auc_score": auc_score,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }


metrics_dict =  defaultdict(list)
alleles = ['HLA-DRB1*03:01',
           'HLA-DRB3*01:01',
           'HLA-DRB1*07:01',
           'HLA-DRB1*11:01',
           'HLA-DRB1*12:01',
           'HLA-DRB1*15:01',
           'HLA-DRB4*01:01',
           'HLA-DRB3*02:02',
           'HLA-DRB5*01:01',
           'HLA-DRB1*01:01'] 
peptide_lengths = ['all']

for allele in alleles:
    allele = allele.replace(":", "-").replace("*", "").replace("/", "_")
    allele_f = format_allele_name(allele)
    for peptide_length in peptide_lengths:
        for fold in range(1, 11):
            y_test_df = load_y_test(allele, fold, peptide_length)
            y_test = y_test_df['label']
            
            target_path_allele = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\model_output"
            mixmhc_scores = load_saved_mixmhc_scores(allele_f, fold, target_path_allele)
            
            metrics = calculate_metrics(y_test, mixmhc_scores)
            metrics_dict[allele].append(metrics)

with open("mixmhc_metrics.json", "w") as f:
    json.dump(metrics_dict, f, indent=4)



In [None]:
import pandas as pd
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import os

# Function to load y_test (true labels)
def load_y_test(allele, fold, peptide_length):
    y_test_path = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_nb_saved_splits\{allele}\fold_{fold}\test_data.csv"
    y_test_file = y_test_path.format(allele=allele, fold=fold)
    y_test_df = pd.read_csv(y_test_file)

    # Filter y_test based on peptide length
    if peptide_length != 'all':
        y_test_df = y_test_df[y_test_df['peptide'].apply(lambda x: len(x) == peptide_length)]
    else:
        y_test_df = y_test_df[y_test_df['peptide'].apply(lambda x: len(x) >=12)]
    
    return y_test_df

# Function to load y_scores from a model's result file
def load_y_scores(file_path, peptide_length):
    try:
        df = pd.read_csv(file_path)
        if peptide_length != 'all':
            df = df[df['peptide'].apply(lambda x: len(x) == peptide_length)]
        else:
            df = df[df['peptide'].apply(lambda x: len(x) >=12)]
        
        # Debugging step: check if 'y_score' column exists
        if 'y_score' not in df.columns:
            print(f"Warning: 'y_score' column not found in {file_path}")
            return None  # Return None if the column is missing
        
        return df
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None  # Return None if the file is missing

# Function to run MixMHC2pred
def run_mixmhc(allele, fold, target_path_allele, peptide_length):
    allele_formatted = allele.replace("HLA-", "")  # Remove 'HLA-'
    allele_formatted = allele_formatted.replace("-", "")  # Replace '-' with '_'
    parts = allele_formatted.split('DRB')
    formatted_allele = f"DRB{parts[1][0:1]}_{parts[1][1:3]}_{parts[1][3:]}"
    
    # Load test peptides
    y_test_df = load_y_test(allele, fold, peptide_length)
    test_file = os.path.join(target_path_allele, f"test_peptides_{fold}.txt")
    y_test_df["peptide"].to_csv(test_file, index=False, header=False)

    # Define output file for MixMHC
    output_file = os.path.join(target_path_allele, f"mixmhc2pred_results_{fold}.txt")
    
    # Call MixMHC2pred function (assumes it's defined elsewhere)
    MixMHC2pred_for_allele(formatted_allele, test_file, output_file)

    # Check if MixMHC output file exists
    if not os.path.exists(output_file):
        raise FileNotFoundError(f"MixMHC2pred output file not found: {output_file}")

    # Process MixMHC2pred results
    df_pred = is_binder_mhc_pred(output_file, formatted_allele, target_path_allele, fold)
    return (100 - df_pred[f"%Rank_{formatted_allele}"]).to_list()  # Return MixMHC scores

# Initialize the color map for models
model_colors = {
    'base_nb': 'blue',
    'base_r': 'green',
    'new_nb': 'red',
    'new_r': 'purple',
    'mixmhc': 'orange'
}

# List of alleles and peptide lengths
alleles = ['HLA-DRB1*03:01',
           'HLA-DRB3*01:01',
           'HLA-DRB1*07:01',
           'HLA-DRB1*11:01',
           'HLA-DRB1*12:01',
           'HLA-DRB1*15:01',
           'HLA-DRB4*01:01',
           'HLA-DRB3*02:02',
           'HLA-DRB5*01:01',
           'HLA-DRB1*01:01']  # Example alleles, replace with your list
peptide_lengths = [12, 15, 'all']

# Iterate over alleles and peptide lengths (k-mers)
for allele in alleles:
    allele = allele.replace(":", "-").replace("*", "").replace("/", "_")
    for peptide_length in peptide_lengths:
        plt.figure(figsize=(10, 8))  # Create a new plot for each allele and peptide length
        
        # Set a flag to track which models have already been added to the legend
        legend_added = {model: False for model in model_colors}
        
        # Iterate over folds
        for fold in range(1, 11):
            # Load true labels (y_test)
            y_test_df = load_y_test(allele, fold, peptide_length)
            y_test = y_test_df['label']

            # Define paths for each model (base_nb, base_r, new_nb, new_r)
            base_nb_path = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_nb_saved_splits\{allele}\fold_{fold_number}\y_scores.csv"
            base_r_path = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\random_model_natural_distr\{allele}\fold_{fold_number}\y_scores.csv"
            new_nb_path = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_natural_randomes\{allele}\fold_{fold_number}\y_scores_nb.csv"
            new_r_path = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_natural_randomes\{allele}\fold_{fold_number}\y_scores_r.csv"

            # Iterate over models
            for model_name, file_path_template in zip(['base_nb', 'base_r', 'new_nb', 'new_r'],
                                                      [base_nb_path, base_r_path, new_nb_path, new_r_path]):
                # Update file path with current allele and fold number
                file_path = file_path_template.format(allele=allele, fold_number=fold)

                # Load model's y_scores
                y_scores = load_y_scores(file_path, peptide_length)
                if y_scores is None or y_scores.empty:
                    continue  # Skip if y_scores couldn't be loaded or is empty
                
                # Ensure peptides in y_scores match those in y_test (align by peptide sequence)
                if len(y_test) != len(y_scores):
                    print(f"Warning: Mismatch in number of peptides between y_test and y_scores for {allele}, Fold {fold}, Peptide Length {peptide_length}")
                    print(f"y_test length: {len(y_test)}, y_scores length: {len(y_scores)}")
                    # Align the data based on peptide sequence
                    y_scores = y_scores[y_scores['peptide'].isin(y_test_df['peptide'])]
                    print(f"After alignment, y_scores length: {len(y_scores)}")

                # Ensure lengths match before calculating ROC
                if len(y_test) != len(y_scores):
                    print(f"Skipping {allele}, Fold {fold}, Length {peptide_length} due to length mismatch.")
                    continue

                # Calculate ROC curve
                fpr, tpr, _ = roc_curve(y_test, y_scores['y_score'])
                roc_auc = auc(fpr, tpr)

                # Plot the ROC curve for the current model
                plt.plot(fpr, tpr, color=model_colors[model_name], lw=2)

                # Add model name to the legend only once
                if not legend_added[model_name]:
                    plt.plot([], [], color=model_colors[model_name], label=model_name)
                    legend_added[model_name] = True

            # Run MixMHC for current fold and allele
            target_path_allele = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\model_output"
            mixmhc_scores = run_mixmhc(allele, fold, target_path_allele, peptide_length)

            # Calculate ROC curve for MixMHC
            fpr, tpr, _ = roc_curve(y_test, mixmhc_scores)
            roc_auc = auc(fpr, tpr)

            # Plot the ROC curve for MixMHC
            plt.plot(fpr, tpr, color=model_colors['mixmhc'], lw=2)

            # Add MixMHC to legend
            if not legend_added['mixmhc']:
                plt.plot([], [], color=model_colors['mixmhc'], label='MixMHC')
                legend_added['mixmhc'] = True

        # Final plot setup for the current allele and peptide length
        plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curves for {allele} (Peptide Length: {peptide_length})')
        plt.legend(loc='lower right')
        plt.show()

In [5]:
# Load metrics
def load_metrics(file_path):
    with open(file_path, "r") as f:
        return json.load(f)

In [8]:
metrics_per_allele = {}
for file_path in files:
    metrics = load_metrics(file_path)
    for allele, folds in metrics.items():
        if allele not in metrics_per_allele:
            metrics_per_allele[allele] = {}

        for fold_data in folds:  # Check if folds is a list of dicts
            if not isinstance(fold_data, dict):  
                print(f"Unexpected format in {file_path} for allele {allele}: {type(fold_data)}")
                continue

            for metric, values in fold_data.items():
                if file_path == file_path_mhcpred and not metric.endswith("_mixmhc"):
                    continue  # Keep only _mixmhc metrics for file_path_mhcpred
                
                if metric not in metrics_per_allele[allele]:
                    metrics_per_allele[allele][metric] = []

                # Ensure values is a list
                if isinstance(values, list):
                    metrics_per_allele[allele][metric].extend(values)
                else:  # If it's a float or single value, wrap it in a list
                    metrics_per_allele[allele][metric].append(values)





In [9]:
metrics_per_allele

{'HLA-DRB101-01': {'accuracy': [0.7478260869565218,
   0.7717391304347826,
   0.7706521739130435,
   0.7429347826086956,
   0.7347826086956522,
   0.7396739130434783,
   0.7326086956521739,
   0.7478260869565218,
   0.75,
   0.7438825448613376,
   0.7432909604519774,
   0.7387005649717514,
   0.713276836158192,
   0.7588276836158192,
   0.7521186440677966,
   0.7630649717514124,
   0.7213983050847458,
   0.7482344632768362,
   0.7470858353938538,
   0.7559166372306605,
   0.6923913043478261,
   0.6864130434782608,
   0.6918478260869565,
   0.6695652173913044,
   0.6652173913043479,
   0.6679347826086957,
   0.6755434782608696,
   0.6923913043478261,
   0.6934782608695652,
   0.6884176182707994,
   0.7817796610169492,
   0.7623587570621468,
   0.7680084745762712,
   0.7980225988700564,
   0.7786016949152542,
   0.7612994350282486,
   0.763771186440678,
   0.763771186440678,
   0.7813493465206641,
   0.7855881314023313],
  'precision': [0.9174630755864466,
   0.9286912751677853,
   0.915

# Boxplotes

In [None]:

# Load metrics
def load_metrics(file_path):
    with open(file_path, "r") as f:
        return json.load(f)

# Dictionary to store metrics per allele
metrics_per_allele = {}

for file_path in files:
    metrics = load_metrics(file_path)
    for allele, folds in metrics.items():
        if allele not in metrics_per_allele:
            metrics_per_allele[allele] = {}
        
        for metric, values in folds.items():
            if file_path == file_path_mhcpred and not metric.endswith("_mixmhc"):
                continue  # Keep only _mixmhc metrics for file_path_mhcpred
            
            if metric not in metrics_per_allele[allele]:
                metrics_per_allele[allele][metric] = []
            
            metrics_per_allele[allele][metric].extend(values)

# Generate boxplots
output_dir = "C:\Projects\grandmaster\notebooks\plots"
os.makedirs(output_dir, exist_ok=True)

for allele, metrics in metrics_per_allele.items():
    for metric, values in metrics.items():
        plt.figure(figsize=(8, 6))
        sns.boxplot(y=values)
        plt.title(f"{allele} - {metric}")
        plt.ylabel(metric)
        plt.savefig(os.path.join(output_dir, f"{allele}_{metric}.png"))
        plt.close()


In [15]:
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# File paths
file_path_base_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_nb\metrics.json"
file_path_base_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_randomes\metrics.json"
file_path_new_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_nb\metrics.json"
file_path_new_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_randomes\metrics.json"
file_path_mhcpred = r"C:\Projects\grandmaster\notebooks\mixmhc_metrics.json"
files = [file_path_base_nb, file_path_base_r, file_path_new_nb, file_path_new_r, file_path_mhcpred]

# Load metrics
def load_metrics(file_path):
    with open(file_path, "r") as f:
        return json.load(f)

# Dictionary to store metrics per allele
metrics_per_allele = {}
file_labels = {
    file_path_base_nb: "Base_NB",
    file_path_base_r: "Base_R",
    file_path_new_nb: "New_NB",
    file_path_new_r: "New_R",
    file_path_mhcpred: "MHCPred"
}

for file_path in files:
    metrics = load_metrics(file_path)
    for allele, folds in metrics.items():
        if allele not in metrics_per_allele:
            metrics_per_allele[allele] = []
        
        for fold_data in folds:  # Iterate through each fold's dictionary
            if not isinstance(fold_data, dict):
                print(f"Unexpected format in {file_path} for allele {allele}: {type(fold_data)}")
                continue

            for metric, values in fold_data.items():
                if file_path == file_path_mhcpred and not metric.endswith("_mixmhc"):
                    continue  # Keep only _mixmhc metrics for file_path_mhcpred
                
                for fold_idx, value in enumerate(values if isinstance(values, list) else [values]):
                    metrics_per_allele[allele].append({
                        "Metric": metric,
                        "Value": value,
                        "Fold": fold_idx,
                        "Dataset": file_labels[file_path]
                    })

# Generate boxplots
output_dir = r"C:\Projects\grandmaster\notebooks\plots"
os.makedirs(output_dir, exist_ok=True)

for allele, data in metrics_per_allele.items():
    df = pd.DataFrame(data)
    for metric in df["Metric"].unique():
        plt.figure(figsize=(10, 6))
        sns.boxplot(x="Dataset", y="Value", data=df[df["Metric"] == metric])
        plt.title(f"{allele} - {metric}")
        plt.ylabel(metric)
        plt.xlabel("Dataset")
        plt.xticks(rotation=45)
        plt.savefig(os.path.join(output_dir, f"{allele}_{metric}.png"))
        plt.close()


In [21]:
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re

# File paths
file_path_base_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_nb\metrics.json"
file_path_base_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_randomes\metrics.json"
file_path_new_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_nb\metrics.json"
file_path_new_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_randomes\metrics.json"
file_path_mhcpred = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\2\10fold\10foldmetrics.json"
files = [file_path_base_nb, file_path_base_r, file_path_new_nb, file_path_new_r, file_path_mhcpred]

# Load metrics
def load_metrics(file_path):
    with open(file_path, "r") as f:
        return json.load(f)

# Dictionary to store metrics per allele
metrics_per_allele = {}
file_labels = {
    file_path_base_nb: "Base_NB",
    file_path_base_r: "Base_R",
    file_path_new_nb: "New_NB",
    file_path_new_r: "New_R",
    file_path_mhcpred: "MHCPred"
}

# Mapping MHCPred metrics to match other dataset names
metric_name_mapping = {
    "auc_score": "auc",
    "accuracy_mixmhc":'accuracy',
    'precision_mixmhc':'precision',
    'recall_mixmhc':"recall",
    'f1_score':'f1_score_mixmhc'

}
for file_path in files:
    metrics = load_metrics(file_path)
    for allele, folds in metrics.items():
        if allele not in metrics_per_allele:
            metrics_per_allele[allele] = []
        
        for fold_idx, fold_data in enumerate(folds):  # Iterate through each fold's dictionary
            if not isinstance(fold_data, dict):
                print(f"Unexpected format in {file_path} for allele {allele}: {type(fold_data)}")
                continue

            for metric, value in fold_data.items():
                # Normalize MHCPred metric names
                base_metric = metric_name_mapping.get(metric, metric)  # Map specific names
                
                if file_path == file_path_mhcpred and not metric.endswith("_mixmhc"):
                    continue  # Keep only _mixmhc metrics for file_path_mhcpred
                
                metrics_per_allele[allele].append({
                    "Metric": base_metric,
                    "Value": value,
                    "Dataset": file_labels[file_path]
                })

# Generate boxplots
output_dir = r"C:\Projects\grandmaster\notebooks\plots"
os.makedirs(output_dir, exist_ok=True)

for allele, data in metrics_per_allele.items():
    df = pd.DataFrame(data)
    for metric in df["Metric"].unique():
        plt.figure(figsize=(10, 6))
        sns.boxplot(x="Dataset", y="Value", data=df[df["Metric"] == metric])
        plt.title(f"{allele} - {metric}")
        plt.ylabel(metric)
        plt.xlabel("Dataset")
        plt.xticks(rotation=45)
        plt.savefig(os.path.join(output_dir, f"{allele}_{metric}.png"))
        plt.close()



In [21]:
metrics_per_alleleimport json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# File paths
file_path_base_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_nb\metrics.json"
file_path_base_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_randomes\metrics.json"
file_path_new_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_nb\metrics.json"
file_path_new_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_randomes\metrics.json"
file_path_mhcpred = r"C:\Projects\grandmaster\notebooks\mixmhc_metrics.json"
files = [file_path_base_nb, file_path_base_r, file_path_new_nb, file_path_new_r, file_path_mhcpred]

# Load metrics
def load_metrics(file_path):
    with open(file_path, "r") as f:
        return json.load(f)

# Dictionary to store metrics per allele
metrics_per_allele = {}
file_labels = {
    file_path_base_nb: "Base_NB",
    file_path_base_r: "Base_R",
    file_path_new_nb: "New_NB",
    file_path_new_r: "New_R",
    file_path_mhcpred: "MHCPred"
}

for file_path in files:
    metrics = load_metrics(file_path)
    for allele, folds in metrics.items():
        if allele not in metrics_per_allele:
            metrics_per_allele[allele] = []
        
        for fold_data in folds:  # Iterate through each fold's dictionary
            if not isinstance(fold_data, dict):
                print(f"Unexpected format in {file_path} for allele {allele}: {type(fold_data)}")
                continue

            for metric, values in fold_data.items():
                if file_path == file_path_mhcpred and not metric.endswith("_mixmhc"):
                    continue  # Keep only _mixmhc metrics for file_path_mhcpred
                
                for fold_idx, value in enumerate(values if isinstance(values, list) else [values]):
                    metrics_per_allele[allele].append({
                        "Metric": metric,
                        "Value": value,
                        "Fold": fold_idx,
                        "Dataset": file_labels[file_path]
                    })

# Generate boxplots
output_dir = r"C:\Projects\grandmaster\notebooks\plots"
os.makedirs(output_dir, exist_ok=True)

for allele, data in metrics_per_allele.items():
    df = pd.DataFrame(data)
    for metric in df["Metric"].unique():
        plt.figure(figsize=(10, 6))
        sns.boxplot(x="Dataset", y="Value", data=df[df["Metric"] == metric])
        plt.title(f"{allele} - {metric}")
        plt.ylabel(metric)
        plt.xlabel("Dataset")
        plt.xticks(rotation=45)
        plt.savefig(os.path.join(output_dir, f"{allele}_{metric}.png"))
        plt.close()


{'HLA-DRB101-01': [{'Metric': 'accuracy',
   'Value': 0.7478260869565218,
   'Fold': 0,
   'Dataset': 'Base_NB'},
  {'Metric': 'precision',
   'Value': 0.9174630755864466,
   'Fold': 0,
   'Dataset': 'Base_NB'},
  {'Metric': 'recall',
   'Value': 0.7410526315789474,
   'Fold': 0,
   'Dataset': 'Base_NB'},
  {'Metric': 'f1_score',
   'Value': 0.8198757763975155,
   'Fold': 0,
   'Dataset': 'Base_NB'},
  {'Metric': 'auc',
   'Value': 0.8311824138659902,
   'Fold': 0,
   'Dataset': 'Base_NB'},
  {'Metric': 'accuracy',
   'Value': 0.7717391304347826,
   'Fold': 0,
   'Dataset': 'Base_NB'},
  {'Metric': 'precision',
   'Value': 0.9286912751677853,
   'Fold': 0,
   'Dataset': 'Base_NB'},
  {'Metric': 'recall',
   'Value': 0.7676837725381415,
   'Fold': 0,
   'Dataset': 'Base_NB'},
  {'Metric': 'f1_score',
   'Value': 0.8405466970387244,
   'Fold': 0,
   'Dataset': 'Base_NB'},
  {'Metric': 'auc',
   'Value': 0.8462074589312721,
   'Fold': 0,
   'Dataset': 'Base_NB'},
  {'Metric': 'accuracy',


In [28]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sci_stats
from statannotations.Annotator import Annotator  # For adding statistical annotations

file_path_base_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_nb\metrics.json"
file_path_base_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_randomes\metrics.json"
file_path_new_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_nb\metrics.json"
file_path_new_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_randomes\metrics.json"
file_path_mhcpred = r"C:\Projects\grandmaster\notebooks\mixmhc_metrics.json"
files = [file_path_base_nb, file_path_base_r, file_path_new_nb, file_path_new_r, file_path_mhcpred]

# Load metrics
def load_metrics(file_path):
    with open(file_path, "r") as f:
        return json.load(f)

# Dictionary to store metrics per allele
metrics_per_allele = {}
file_labels = {
    file_path_base_nb: "Base_NB",
    file_path_base_r: "Base_R",
    file_path_new_nb: "New_NB",
    file_path_new_r: "New_R",
    file_path_mhcpred: "MHCPred"
}

metric_name_mapping = {
    "auc_score": "auc"
}

for file_path in files:
    metrics = load_metrics(file_path)
    for allele, folds in metrics.items():
        if allele not in metrics_per_allele:
            metrics_per_allele[allele] = []
        
        for fold_idx, fold_data in enumerate(folds):  # Iterate through each fold's dictionary
            if not isinstance(fold_data, dict):
                print(f"Unexpected format in {file_path} for allele {allele}: {type(fold_data)}")
                continue

            for metric, value in fold_data.items():
                # Normalize MHCPred metric names
                base_metric = metric_name_mapping.get(metric, metric)  # Map specific names
                
                if file_path == file_path_mhcpred and not metric.endswith("_mixmhc"):
                    continue  # Keep only _mixmhc metrics for file_path_mhcpred
                
                metrics_per_allele[allele].append({
                    "Metric": base_metric,
                    "Value": value,
                    "Dataset": file_labels[file_path]
                })



output_dir = r"C:\Projects\grandmaster\notebooks\plots\stars"
os.makedirs(output_dir, exist_ok=True)

# Iterate through each allele's data
for allele, data in metrics_per_allele.items():
    df = pd.DataFrame(data)

    # Iterate through each metric separately
    for metric in df["Metric"].unique():
        plt.figure(figsize=(10, 6))
        ax = sns.boxplot(x="Dataset", y="Value", data=df[df["Metric"] == metric])

        # Extract values for comparisons
        datasets = df["Dataset"].unique()
        dataset_pairs = [(datasets[i], datasets[j]) for i in range(len(datasets)) for j in range(i + 1, len(datasets))]

        # Compute p-values
        pvalues = [
            sci_stats.mannwhitneyu(
                df[(df["Dataset"] == pair[0]) & (df["Metric"] == metric)]["Value"],
                df[(df["Dataset"] == pair[1]) & (df["Metric"] == metric)]["Value"],
                alternative="two-sided"
            ).pvalue
            for pair in dataset_pairs
        ]

        # Format p-values
        formatted_pvalues = [f'p={pval:.2e}' for pval in pvalues]

        # Annotate significance with stars
        def pval_to_stars(p):
            if p < 0.001:
                return "***"
            elif p < 0.01:
                return "**"
            elif p < 0.05:
                return "*"
            else:
                return "ns"  # Not significant

        stars = [pval_to_stars(p) for p in pvalues]

        # Add annotations
        annotator = Annotator(ax, dataset_pairs, data=df[df["Metric"] == metric], x="Dataset", y="Value")
        annotator.set_pvalues(pvalues)
        annotator.annotate()


        plt.title(f"{allele} - {metric}")
        plt.ylabel(metric)
        plt.xlabel("Dataset")
        plt.xticks(rotation=45)
        plt.savefig(os.path.join(output_dir, f"{allele}_{metric}.png"))
        plt.close()


p-value annotation legend:
      ns: 5.00e-02 < p <= 1.00e+00
       *: 1.00e-02 < p <= 5.00e-02
      **: 1.00e-03 < p <= 1.00e-02
     ***: 1.00e-04 < p <= 1.00e-03
    ****: p <= 1.00e-04

Base_NB vs. Base_R: Custom statistical test, P_val:1.000e+00
Base_R vs. New_NB: Custom statistical test, P_val:1.817e-04
New_NB vs. New_R: Custom statistical test, P_val:1.806e-04
Base_NB vs. New_NB: Custom statistical test, P_val:1.806e-04
Base_R vs. New_R: Custom statistical test, P_val:3.281e-04
Base_NB vs. New_R: Custom statistical test, P_val:2.807e-03
p-value annotation legend:
      ns: 5.00e-02 < p <= 1.00e+00
       *: 1.00e-02 < p <= 5.00e-02
      **: 1.00e-03 < p <= 1.00e-02
     ***: 1.00e-04 < p <= 1.00e-03
    ****: p <= 1.00e-04

Base_NB vs. Base_R: Custom statistical test, P_val:1.827e-04
Base_R vs. New_NB: Custom statistical test, P_val:1.827e-04
New_NB vs. New_R: Custom statistical test, P_val:1.827e-04
Base_NB vs. New_NB: Custom statistical test, P_val:2.461e-04
Base_R vs. New_

In [29]:
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sci_stats
from statannotations.Annotator import Annotator  # For adding statistical annotations

file_path_base_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_nb\metrics.json"
file_path_base_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_randomes\metrics.json"
file_path_new_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_nb\metrics.json"
file_path_new_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_randomes\metrics.json"
file_path_mhcpred = r"C:\Projects\grandmaster\notebooks\mixmhc_metrics.json"
files = [file_path_base_nb, file_path_base_r, file_path_new_nb, file_path_new_r, file_path_mhcpred]

# Load metrics
def load_metrics(file_path):
    with open(file_path, "r") as f:
        return json.load(f)

# Dictionary to store metrics per allele
metrics_per_allele = {}
file_labels = {
    file_path_base_nb: "Base_NB",
    file_path_base_r: "Base_R",
    file_path_new_nb: "New_NB",
    file_path_new_r: "New_R",
    file_path_mhcpred: "MHCPred"
}

metric_name_mapping = {
    "auc_score": "auc"
}

# Collect allele names from all files
allele_sets = [set(load_metrics(fp).keys()) for fp in files]
common_alleles = set.intersection(*allele_sets)  # Find alleles that exist in all files

# Process metrics ensuring MHCPred aligns
for file_path in files:
    metrics = load_metrics(file_path)
    
    for allele in common_alleles:  # Only process alleles present in all datasets
        if allele not in metrics_per_allele:
            metrics_per_allele[allele] = []

        if allele not in metrics:
            print(f"Warning: {allele} missing in {file_path}")
            continue

        for fold_idx, fold_data in enumerate(metrics[allele]):  
            if not isinstance(fold_data, dict):
                print(f"Unexpected format in {file_path} for allele {allele}: {type(fold_data)}")
                continue

            for metric, value in fold_data.items():
                base_metric = metric_name_mapping.get(metric, metric)  # Normalize metric names
                
                if file_path == file_path_mhcpred and not metric.endswith("_mixmhc"):
                    continue  # Keep only _mixmhc metrics for file_path_mhcpred
                
                metrics_per_allele[allele].append({
                    "Metric": base_metric,
                    "Value": value,
                    "Dataset": file_labels[file_path]
                })

output_dir = r"C:\Projects\grandmaster\notebooks\plots\stars"
os.makedirs(output_dir, exist_ok=True)

# Plot data
for allele, data in metrics_per_allele.items():
    df = pd.DataFrame(data)

    for metric in df["Metric"].unique():
        plt.figure(figsize=(10, 6))
        ax = sns.boxplot(x="Dataset", y="Value", data=df[df["Metric"] == metric])

        # Extract values for comparisons
        datasets = df["Dataset"].unique()
        dataset_pairs = [(datasets[i], datasets[j]) for i in range(len(datasets)) for j in range(i + 1, len(datasets))]

        # Compute p-values
        pvalues = [
            sci_stats.mannwhitneyu(
                df[(df["Dataset"] == pair[0]) & (df["Metric"] == metric)]["Value"],
                df[(df["Dataset"] == pair[1]) & (df["Metric"] == metric)]["Value"],
                alternative="two-sided"
            ).pvalue
            for pair in dataset_pairs
        ]

        # Format p-values
        def pval_to_stars(p):
            if p < 0.001:
                return "***"
            elif p < 0.01:
                return "**"
            elif p < 0.05:
                return "*"
            else:
                return "ns"

        stars = [pval_to_stars(p) for p in pvalues]

        # Add annotations
        annotator = Annotator(ax, dataset_pairs, data=df[df["Metric"] == metric], x="Dataset", y="Value")
        annotator.set_pvalues(pvalues)
        annotator.annotate()

        plt.title(f"{allele} - {metric}")
        plt.ylabel(metric)
        plt.xlabel("Dataset")
        plt.xticks(rotation=45)
        plt.savefig(os.path.join(output_dir, f"{allele}_{metric}.png"))
        plt.close()


p-value annotation legend:
      ns: 5.00e-02 < p <= 1.00e+00
       *: 1.00e-02 < p <= 5.00e-02
      **: 1.00e-03 < p <= 1.00e-02
     ***: 1.00e-04 < p <= 1.00e-03
    ****: p <= 1.00e-04

Base_NB vs. Base_R: Custom statistical test, P_val:1.040e-01
Base_R vs. New_NB: Custom statistical test, P_val:3.281e-04
New_NB vs. New_R: Custom statistical test, P_val:1.817e-04
Base_NB vs. New_NB: Custom statistical test, P_val:1.388e-02
Base_R vs. New_R: Custom statistical test, P_val:5.828e-04
Base_NB vs. New_R: Custom statistical test, P_val:1.817e-04
p-value annotation legend:
      ns: 5.00e-02 < p <= 1.00e+00
       *: 1.00e-02 < p <= 5.00e-02
      **: 1.00e-03 < p <= 1.00e-02
     ***: 1.00e-04 < p <= 1.00e-03
    ****: p <= 1.00e-04

Base_NB vs. Base_R: Custom statistical test, P_val:1.827e-04
Base_R vs. New_NB: Custom statistical test, P_val:1.827e-04
New_NB vs. New_R: Custom statistical test, P_val:1.827e-04
Base_NB vs. New_NB: Custom statistical test, P_val:3.298e-04
Base_R vs. New_

In [32]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sci_stats
from statannotations.Annotator import Annotator  # For adding statistical annotations

file_path_base_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_nb\metrics.json"
file_path_base_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\base_model\adjusted_scores\model_with_randomes\metrics.json"
file_path_new_nb = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_nb\metrics.json"
file_path_new_r = r"C:\Projects\grandmaster\notebooks\viterbi\mhc2\start9_end9\reculculation_with_length_devision\model_with_randomes\metrics.json"
file_path_mhcpred = r"C:\Projects\grandmaster\notebooks\mixmhc_metrics.json"

files = [file_path_base_nb, file_path_base_r, file_path_new_nb, file_path_new_r, file_path_mhcpred]

# Load metrics
def load_metrics(file_path):
    with open(file_path, "r") as f:
        return json.load(f)

# Dictionary to store metrics per allele
metrics_per_allele = {}
all_alleles = set()  # Keep track of all alleles seen
file_labels = {
    file_path_base_nb: "Base_NB",
    file_path_base_r: "Base_R",
    file_path_new_nb: "New_NB",
    file_path_new_r: "New_R",
    file_path_mhcpred: "MHCPred"
}

metric_name_mapping = {
    "auc_score": "auc"
}

# Collect all allele names first
for file_path in files:
    metrics = load_metrics(file_path)
    all_alleles.update(metrics.keys())  # Add alleles to the set

# Initialize data structure with all alleles
for allele in all_alleles:
    metrics_per_allele[allele] = []

# Process files and ensure all alleles exist in each dataset
for file_path in files:
    metrics = load_metrics(file_path)
    dataset_label = file_labels[file_path]

    for allele in all_alleles:
        folds = metrics.get(allele, [])  # Use an empty list if allele is missing

        for fold_idx, fold_data in enumerate(folds):
            if not isinstance(fold_data, dict):
                print(f"Unexpected format in {file_path} for allele {allele}: {type(fold_data)}")
                continue

            for metric, value in fold_data.items():
                base_metric = metric_name_mapping.get(metric, metric)  # Standardize metric names


                metrics_per_allele[allele].append({
                    "Metric": base_metric,
                    "Value": value,
                    "Dataset": dataset_label
                })

# Output directory
output_dir = r"C:\Projects\grandmaster\notebooks\plots\stars"
os.makedirs(output_dir, exist_ok=True)

# Generate plots
for allele, data in metrics_per_allele.items():
    df = pd.DataFrame(data)

    if df.empty:
        print(f"Skipping {allele} due to no data.")
        continue

    for metric in df["Metric"].unique():
        plt.figure(figsize=(10, 6))
        ax = sns.boxplot(x="Dataset", y="Value", data=df[df["Metric"] == metric])

        # Extract values for comparisons
        datasets = df["Dataset"].unique()
        dataset_pairs = [(datasets[i], datasets[j]) for i in range(len(datasets)) for j in range(i + 1, len(datasets))]

        # Compute p-values
        pvalues = [
            sci_stats.mannwhitneyu(
                df[(df["Dataset"] == pair[0]) & (df["Metric"] == metric)]["Value"],
                df[(df["Dataset"] == pair[1]) & (df["Metric"] == metric)]["Value"],
                alternative="two-sided"
            ).pvalue
            for pair in dataset_pairs
        ]

        # Format p-values
        def pval_to_stars(p):
            if p < 0.001:
                return "***"
            elif p < 0.01:
                return "**"
            elif p < 0.05:
                return "*"
            else:
                return "ns"

        stars = [pval_to_stars(p) for p in pvalues]

        # Add annotations
        annotator = Annotator(ax, dataset_pairs, data=df[df["Metric"] == metric], x="Dataset", y="Value")
        annotator.set_pvalues(pvalues)
        annotator.annotate()

        plt.title(f"{allele} - {metric}")
        plt.ylabel(metric)
        plt.xlabel("Dataset")
        plt.xticks(rotation=45)
        plt.savefig(os.path.join(output_dir, f"{allele}_{metric}.png"))
        plt.close()


p-value annotation legend:
      ns: 5.00e-02 < p <= 1.00e+00
       *: 1.00e-02 < p <= 5.00e-02
      **: 1.00e-03 < p <= 1.00e-02
     ***: 1.00e-04 < p <= 1.00e-03
    ****: p <= 1.00e-04

Base_NB vs. Base_R: Custom statistical test, P_val:1.040e-01
Base_R vs. New_NB: Custom statistical test, P_val:3.281e-04
New_NB vs. New_R: Custom statistical test, P_val:1.817e-04
New_R vs. MHCPred: Custom statistical test, P_val:1.827e-04
Base_NB vs. New_NB: Custom statistical test, P_val:1.388e-02
Base_R vs. New_R: Custom statistical test, P_val:5.828e-04
New_NB vs. MHCPred: Custom statistical test, P_val:4.375e-04
Base_NB vs. New_R: Custom statistical test, P_val:1.817e-04
Base_R vs. MHCPred: Custom statistical test, P_val:5.205e-01
Base_NB vs. MHCPred: Custom statistical test, P_val:2.411e-01
p-value annotation legend:
      ns: 5.00e-02 < p <= 1.00e+00
       *: 1.00e-02 < p <= 5.00e-02
      **: 1.00e-03 < p <= 1.00e-02
     ***: 1.00e-04 < p <= 1.00e-03
    ****: p <= 1.00e-04

Base_NB vs. 

In [31]:
metrics_per_allele

{'HLA-DRB501-01': [{'Metric': 'accuracy',
   'Value': 0.7740384615384616,
   'Dataset': 'Base_NB'},
  {'Metric': 'precision', 'Value': 0.8734793187347932, 'Dataset': 'Base_NB'},
  {'Metric': 'recall', 'Value': 0.8013392857142857, 'Dataset': 'Base_NB'},
  {'Metric': 'f1_score', 'Value': 0.8358556461001164, 'Dataset': 'Base_NB'},
  {'Metric': 'auc', 'Value': 0.8332741477272729, 'Dataset': 'Base_NB'},
  {'Metric': 'accuracy', 'Value': 0.7387820512820513, 'Dataset': 'Base_NB'},
  {'Metric': 'precision', 'Value': 0.8909574468085106, 'Dataset': 'Base_NB'},
  {'Metric': 'recall', 'Value': 0.7330415754923414, 'Dataset': 'Base_NB'},
  {'Metric': 'f1_score', 'Value': 0.8043217286914766, 'Dataset': 'Base_NB'},
  {'Metric': 'auc', 'Value': 0.8206213393781364, 'Dataset': 'Base_NB'},
  {'Metric': 'accuracy', 'Value': 0.782051282051282, 'Dataset': 'Base_NB'},
  {'Metric': 'precision', 'Value': 0.9122807017543859, 'Dataset': 'Base_NB'},
  {'Metric': 'recall', 'Value': 0.7827956989247312, 'Dataset': 'B

In [24]:
data

[{'Metric': 'accuracy', 'Value': 0.7740384615384616, 'Dataset': 'Base_NB'},
 {'Metric': 'precision', 'Value': 0.8734793187347932, 'Dataset': 'Base_NB'},
 {'Metric': 'recall', 'Value': 0.8013392857142857, 'Dataset': 'Base_NB'},
 {'Metric': 'f1_score', 'Value': 0.8358556461001164, 'Dataset': 'Base_NB'},
 {'Metric': 'auc', 'Value': 0.8332741477272729, 'Dataset': 'Base_NB'},
 {'Metric': 'accuracy', 'Value': 0.7387820512820513, 'Dataset': 'Base_NB'},
 {'Metric': 'precision', 'Value': 0.8909574468085106, 'Dataset': 'Base_NB'},
 {'Metric': 'recall', 'Value': 0.7330415754923414, 'Dataset': 'Base_NB'},
 {'Metric': 'f1_score', 'Value': 0.8043217286914766, 'Dataset': 'Base_NB'},
 {'Metric': 'auc', 'Value': 0.8206213393781364, 'Dataset': 'Base_NB'},
 {'Metric': 'accuracy', 'Value': 0.782051282051282, 'Dataset': 'Base_NB'},
 {'Metric': 'precision', 'Value': 0.9122807017543859, 'Dataset': 'Base_NB'},
 {'Metric': 'recall', 'Value': 0.7827956989247312, 'Dataset': 'Base_NB'},
 {'Metric': 'f1_score', 'V