This notebook is a continuation of 1_allele.ipynb. It shows how the functions work with multiple alleles and multiple splits, with several training runs performed within each split.


*Data Functions (data_funcs) – Contains rewritten and extended functions for data preprocessing, feature extraction, and dataset handling to ensure compatibility with the HMM structure.*

*Logic (logic) – Implements custom HMM-related algorithms, state transitions, and probability computations, refining or replacing certain pomegranate functions as needed.*

*Visualization (visualisation) – Provides tailored plotting and analysis tools to interpret model performance, state sequences, and emission distributions.*

## Imports

In [2]:
import pomegranate.io
from pomegranate import *

In [3]:
import json
import numpy as np
import pandas as pd

In [4]:
from networkx import all_simple_paths
from tqdm import tqdm
from collections import defaultdict
from itertools import islice

In [5]:
import sys
PATH_TO_PREDICTOR_HOME = "../.."
sys.path.append(PATH_TO_PREDICTOR_HOME)
METHOD = "hmm_pomegranate"

In [6]:
from copy import deepcopy
from peptides_utils import split_to_dicts, join_dicts, get_frequencies

In [7]:
from hmm_visualization_methods import *
from training_parameters import *

In [8]:
from data_reading_methods import *

In [9]:
import training_parameters
from peptides_utils import defineClass, defineOrganism, make_logo_for_data
from data_reading_methods import remove_unused_lengths, transform_data_to_properties_and_join_alleles, calculate_weights_based_on_length_counts

In [10]:
import logomaker
from collections import deque
import sys
import matplotlib.pyplot as plt
from scipy.spatial.distance import jensenshannon

## data functions

In [19]:
def _read_preprocessed_splits(data_path, type_of_peptides, total_splits, **reader_kwargs):
    """Read pre-split train/test peptide dictionaries for every allele in `data_path`.

    `reader_kwargs` is forwarded unchanged to both `get_available_alleles` and
    `read_data`, so the callers decide whether `do_not_parse_alleles` is passed.

    Returns a `(train, test, per_allele_df)` tuple, where `per_allele_df` is the
    result of `join_dicts(train)` with an extra 'allele' column per allele.
    """
    alleles = get_available_alleles(data_path, **reader_kwargs)
    train = read_data(data_path, alleles, "train", type_of_peptides, **reader_kwargs)
    test = read_data(data_path, alleles, "test", type_of_peptides, **reader_kwargs)
    sample_allele = list(train.keys())[0]
    per_allele_df = join_dicts(train)
    for allele_name in alleles:
        per_allele_df[allele_name]['allele'] = allele_name
    assert len(train[sample_allele].keys()) >= total_splits  # check number of splits
    return train, test, per_allele_df


def _split_into_train_and_test(per_allele_df, alleles, target_lengths, total_splits):
    """Build the train and test dictionaries from the same per-allele frames.

    Both dictionaries come from identical `split_to_dicts` calls, i.e. train and
    test contain the same peptides in these scenarios.
    """
    train = split_to_dicts(per_allele_df,
                           ALLELES=alleles,
                           TARGET_LENGTHS=target_lengths,
                           TOTAL_SPLITS=np.arange(total_splits))
    test = split_to_dicts(per_allele_df,
                          ALLELES=alleles,
                          TARGET_LENGTHS=target_lengths,
                          TOTAL_SPLITS=np.arange(total_splits))
    return train, test


def get_train_test_data(experiment_params: ExperimentParams, type_of_peptides="binders"):
    """Load train and test peptides for the data scenario configured in `experiment_params`.

    The concrete type of `experiment_params.data_scenario_params` selects the reader:

    * `PreprocessedIEDBDataParams` / `SimulatedPreprocessedDataParams`: data is
      already split on disk and is read with `read_data`.
    * `SimulatedDataParams`: one CSV is read and replicated for every split.
    * `MixMHCpredDataParams`: one table is filtered for the configured mixture
      and replicated for every split.

    Args:
        experiment_params: experiment configuration holding `data_scenario_params`.
        type_of_peptides: forwarded to `read_data`; only used by the two
            preprocessed scenarios.

    Returns:
        `(train, test, additional_return)`. `additional_return` is a list whose
        first element is the per-allele DataFrame dictionary; the MixMHCpred
        scenario appends the filtered source table as a second element.

    Raises:
        AssertionError: unknown `data_scenario`, or fewer splits on disk than
            `splits_to_read`.
        TypeError: `data_scenario_params` is of an unsupported type.
    """
    data_params: DataScenarioParams = experiment_params.data_scenario_params
    data_scenario = data_params.data_scenario
    DATA_PATH = data_params.input_data_path
    TOTAL_SPLITS = data_params.splits_to_read
    print(f"Will read files from the folder {DATA_PATH}")
    assert data_scenario in ["IEDB_preprocessed", "simulated", "simulated_preprocessed", 'MixMHCpred']
    additional_return = list()

    if isinstance(data_params, PreprocessedIEDBDataParams):
        (per_allele_per_kfold_per_length_binders_train,
         per_allele_per_kfold_per_length_binders_test,
         per_allele_df) = _read_preprocessed_splits(DATA_PATH, type_of_peptides, TOTAL_SPLITS)
        additional_return.append(per_allele_df)
    elif isinstance(data_params, SimulatedPreprocessedDataParams):
        (per_allele_per_kfold_per_length_binders_train,
         per_allele_per_kfold_per_length_binders_test,
         per_allele_df) = _read_preprocessed_splits(DATA_PATH, type_of_peptides, TOTAL_SPLITS,
                                                    do_not_parse_alleles=True)
        additional_return.append(per_allele_df)
    elif isinstance(data_params, SimulatedDataParams):
        simulated_exact_file = data_params.simulated_exact_file_name
        dummy_allele_name = data_params.dummy_allele_name
        simulated_scenario = data_params.simulated_scenario
        SIMULATED_DATA_PATH = f"{DATA_PATH}/{simulated_scenario}/{simulated_exact_file}"
        ALLELES = [dummy_allele_name]
        per_allele_df = dict()
        # For now just read the same data multiple times for alleles/splits
        for allele_name in ALLELES:
            allele_df = pd.read_csv(SIMULATED_DATA_PATH, sep=";")
            list_dfs = [allele_df.copy() for _ in range(TOTAL_SPLITS)]
            for split_num, split_df in enumerate(list_dfs):
                split_df['split'] = split_num
                # NOTE(review): the other scenarios name this column 'allele';
                # confirm which column split_to_dicts expects.
                split_df['allele_name'] = allele_name
            # Concatenate once, after every split copy has been labelled.
            result_allele_df = pd.concat(list_dfs)
            # Compute the length on the frame that is actually stored; the
            # per-split copies never receive a 'length' column here.
            result_allele_df['length'] = result_allele_df.peptide.str.len()
            per_allele_df[allele_name] = result_allele_df
            TARGET_LENGTHS = list(result_allele_df['length'].unique())
        # split data into dicts
        (per_allele_per_kfold_per_length_binders_train,
         per_allele_per_kfold_per_length_binders_test) = _split_into_train_and_test(
            per_allele_df, ALLELES, TARGET_LENGTHS, TOTAL_SPLITS)
        additional_return.append(per_allele_df)
    elif isinstance(data_params, MixMHCpredDataParams):
        mixture_name = data_params.mixmhc_mixture_name
        dummy_allele_name = data_params.dummy_allele_name
        df = pd.read_csv(DATA_PATH, sep=';')
        print(df.columns)
        # Keep only peptides made of the 20 standard amino-acid letters.
        df = df.loc[
            df.Peptide.str.match("^[ACDEFGHIKLMNPQRSTVWY]+$")
        ]
        print("Total table length", len(df))
        # Filter out selected mixture
        df = df.loc[df.Sample_IDs.str.split(', ').apply(lambda x: mixture_name in x),]
        print("Filtered for given mixture", len(df))
        sample_data = pd.DataFrame(
            {"peptide": df.Peptide.values,
             "old_sample_id": df.Sample_IDs,
             "sample_id": mixture_name,
             "mixmhc_predicted_mixed_alleles": df.Allele.values})

        list_dfs = [sample_data.copy() for _ in range(TOTAL_SPLITS)]
        per_allele_df = dict()
        allele_name = "d_" + dummy_allele_name
        ALLELES = [allele_name]
        for allele_name in ALLELES:
            for split_num, split_df in enumerate(list_dfs):
                split_df['split'] = split_num
                split_df['allele'] = allele_name
                split_df['length'] = split_df.peptide.str.len()
            result_allele_df = pd.concat(list_dfs)
            # De-duplicate within each split. Every split holds the same
            # peptides, so de-duplicating on 'peptide' alone would keep only
            # the rows of split 0 and leave the other splits empty.
            result_allele_df = result_allele_df.drop_duplicates(subset=['peptide', 'split'])
            TARGET_LENGTHS = list(result_allele_df['length'].unique())
            per_allele_df[allele_name] = result_allele_df
        (per_allele_per_kfold_per_length_binders_train,
         per_allele_per_kfold_per_length_binders_test) = _split_into_train_and_test(
            per_allele_df, ALLELES, TARGET_LENGTHS, TOTAL_SPLITS)
        additional_return.append(per_allele_df)
        additional_return.append(df)
    else:
        # Without this branch the function failed with an UnboundLocalError on return.
        raise TypeError(
            f"Unsupported data scenario params type: {type(data_params).__name__}"
        )
    return per_allele_per_kfold_per_length_binders_train, per_allele_per_kfold_per_length_binders_test, additional_return

from pomegranate.io import BatchedDataGenerator, SequenceGenerator
def create_char_arrays(peptide_sequences):
    """Turn each peptide into a list of its characters, packed in an object-dtype array."""
    per_peptide_chars = [list(peptide) for peptide in peptide_sequences]
    return np.array(per_peptide_chars, dtype=object)

def prepare_split_data_separeted_length(per_length_data, per_length_weights, per_length_test_data, target_lengths):
    """Flatten the per-length containers into three object-dtype arrays.

    For each length in `target_lengths` (in that order) the entries of the
    corresponding container are read by position and appended.

    Returns:
        (binders_array, weights_array, binders_test_array)
    """
    def _flatten(per_length):
        # Positional access mirrors how the containers are indexed per length.
        collected = []
        for length in target_lengths:
            for position in range(len(per_length[length])):
                collected.append(per_length[length][position])
        return np.array(collected, dtype=object)

    return _flatten(per_length_data), _flatten(per_length_weights), _flatten(per_length_test_data)

## logic functions

In [12]:

def make_models_for_runs(allele_name, experiment_params: ExperimentParams, split_num=1, custom_models_id: str = "root"):
    """Build one untrained model per run for a single allele and split.

    The number of models is `num_runs * decrease_anchor_aas_steps`. Every block
    of `num_runs` consecutive runs uses a copy of the training parameters whose
    `anchor_top_aas` is reduced by the block's index.

    Returns:
        dict mapping the run key `"<custom_models_id>[<run number, 4 digits>]"`
        to the prepared model.
    """
    base_params = experiment_params.model_training_params
    runs_per_step = base_params.num_runs
    run_count = runs_per_step * base_params.decrease_anchor_aas_steps

    # Replace the characters '-', '*', ':' and '/' of the allele name by '_'.
    safe_allele_name = allele_name.translate(str.maketrans("-*:/", "____"))

    prepared = {}
    for run_number in range(run_count):
        run_key = f"{custom_models_id}[{run_number:04d}]"

        # Work on a copy so the shared parameters object is never modified.
        step_params = deepcopy(base_params)
        step_params.anchor_top_aas -= run_number // runs_per_step

        new_model = build_model_based_on_params(step_params)
        new_model.name = (
            f'{run_key}_run-{step_params.get_model_common_names()}_model-{safe_allele_name}-{split_num}'
        )
        prepared[run_key] = new_model

    return prepared

def prepare_multiple_models(experiment_params: ExperimentParams):
    """Prepare untrained models for every configured allele and split.

    Returns:
        Nested dict `{allele_name: {split_num: {run_key: model}}}` covering
        `model_training_params.alleles_to_use` and
        `range(data_scenario_params.splits_to_read)`.
    """
    scenario = experiment_params.data_scenario_params
    alleles = experiment_params.model_training_params.alleles_to_use

    return {
        allele: {
            split: make_models_for_runs(allele, experiment_params, split)
            for split in range(scenario.splits_to_read)
        }
        for allele in alleles
    }

In [13]:

def create_char_arrays(peptide_sequences):
    """Return an object-dtype array holding one list of characters per peptide."""
    return np.array(list(map(list, peptide_sequences)), dtype=object)

def prepare_split_data_separeted_length(per_length_data, per_length_weights, per_length_test_data, target_lengths):
    """Concatenate per-length training data, weights and test data.

    Each input is indexed first by length, then by position; lengths are
    visited in the order given by `target_lengths`.

    Returns:
        Three object-dtype arrays: training items, training weights, test items.
    """
    flattened = []
    for per_length in (per_length_data, per_length_weights, per_length_test_data):
        values = [per_length[length][idx]
                  for length in target_lengths
                  for idx in range(len(per_length[length]))]
        flattened.append(np.array(values, dtype=object))
    binders_array, weights_array, binders_test_array = flattened
    return binders_array, weights_array, binders_test_array

def process_split_data(per_length_data, per_length_weights, per_length_test_data, target_lengths):
    """Flatten, shuffle and character-split the data of one split.

    Training peptides and their weights are reordered with the same random
    permutation so they stay aligned; test peptides are shuffled independently.
    A fresh, unseeded NumPy generator is created on every call.

    Returns:
        (sample_X, sample_X_test, weights_array)
    """
    train_peptides, train_weights, test_peptides = prepare_split_data_separeted_length(
        per_length_data, per_length_weights, per_length_test_data, target_lengths
    )

    generator = np.random.default_rng()
    order = generator.permutation(len(train_peptides))
    generator.shuffle(test_peptides)  # in place

    return (
        create_char_arrays(train_peptides[order]),
        create_char_arrays(test_peptides),
        train_weights[order],
    )

In [14]:
def train_single_model( model, sample_X, sample_X_test, weights_array, model_training_params):
    """Fit `model` on the training sequences and return it with its history.

    All fitting options are read from `model_training_params`. When
    `minibatch_training` is set, the training data is wrapped in a
    `BatchedDataGenerator`; otherwise the raw array is passed to `fit`.

    Returns:
        (model, history) as returned by `model.fit(..., return_history=True)`.
    """
    params = model_training_params
    use_minibatches = params.minibatch_training
    batch_size = params.batch_size

    fit_options = {
        "stop_threshold": params.stop_threshold,
        "return_history": True,
        "verbose": params.verbose,
        "multiple_check_input": params.multiple_check_inpit,
        "n_jobs": 1,  # Modify if needed
        "algorithm": params.algorithm,
        "lr_decay": params.lr_decay,
        "distribution_inertia": params.distribution_inertia,
        "edge_inertia": params.edge_inertia,
        "batches_per_epoch": params.batches_per_epoch,
        "min_iterations": params.min_iters,
        "max_iterations": params.maxiters,
        "emission_pseudocount": params.emission_pseudocount,
        "transition_pseudocount": params.transition_pseudocount,
        "use_pseudocount": params.use_pseudocounts,
    }

    test_sequences = SequenceGenerator(sample_X_test)

    if not use_minibatches:
        train_sequences = sample_X
    else:
        train_sequences = BatchedDataGenerator(sample_X,
                                               batches_per_epoch=fit_options["batches_per_epoch"],
                                               batch_size=batch_size)
        train_sequences.reset()

    fitted_model, history = model.fit(train_sequences,
                                      sequences_test=test_sequences,
                                      weights=weights_array,
                                      **fit_options)
    return fitted_model, history


In [15]:
def train_models_for_runs(prepared_models, train_data, test_data, train_data_weights, model_training_params, path_to_save_models):
    """
    Train models for different runs within a single split.

    Every run gets its own freshly shuffled copy of the split data. After
    fitting, each model is saved together with its history via `save_model`.

    Args:
        prepared_models: dict mapping run key to an untrained model.
        train_data, test_data, train_data_weights: per-length data of one split.
        model_training_params: training configuration; `lengths_to_use` selects
            the peptide lengths.
        path_to_save_models: folder passed to `save_model` for every run.

    Returns:
        (per_run_models, per_run_histories), both keyed by run key. Runs whose
        model is missing (falsy) are skipped and absent from both dicts.
    """
    target_lengths = model_training_params.lengths_to_use

    per_run_models = dict()
    per_run_histories = dict()

    for run_index, model in prepared_models.items():
        print(f"Run {run_index}", end=" ")
        if not model:
            # Skip only this run; previously a single missing model aborted
            # training of every run that followed it.
            print(f"Skipping training for run {run_index}, model not provided.")
            continue

        sample_X, sample_X_test, weights_array = process_split_data(train_data, train_data_weights, test_data, target_lengths)
        model, history = train_single_model(model, sample_X, sample_X_test, weights_array, model_training_params)
        per_run_models[run_index] = model
        per_run_histories[run_index] = history

        save_model(path_to_save_models, model, history)

    print(" ")
    return per_run_models, per_run_histories

In [16]:
def train_models_for_splits(prepared_models, train_data, test_data, train_data_weights, model_training_params, experiment_params, path_to_save_models):
    """
    Train models for different splits within a single allele.

    For each split all runs are trained once. When
    `model_training_params.check_model_integrity` is set and
    `check_models_integrity_and_shift_params` returns a truthy value, the runs
    of that split are trained a second time and the new models/histories
    overwrite the first-pass entries with the same run key.

    Returns:
        (models, histories), each a dict `{split_num: {run_key: ...}}`.
    """
    models_by_split = {}
    histories_by_split = {}

    for split_num, split_models in prepared_models.items():
        print(f"Split {split_num}", end=" ")

        # Same arguments are used for the first pass and the optional retraining.
        run_arguments = dict(
            prepared_models=split_models,
            train_data=train_data[split_num],
            test_data=test_data[split_num],
            train_data_weights=train_data_weights[split_num],
            model_training_params=model_training_params,
            path_to_save_models=f"{path_to_save_models}/s{split_num}/",
        )
        trained, histories = train_models_for_runs(**run_arguments)

        if model_training_params.check_model_integrity:
            shifted = check_models_integrity_and_shift_params(trained, train_data, experiment_params=experiment_params)
            if shifted:
                print("some models had shifts, we will try one more iteration of training")
                retrained, retrained_histories = train_models_for_runs(**run_arguments)
                trained.update(retrained)
                histories.update(retrained_histories)

        models_by_split[split_num] = trained
        histories_by_split[split_num] = histories

    return models_by_split, histories_by_split

In [17]:
def train_models_for_alleles(prepared_models, train_data, test_data, train_data_weights, experiment_params: ExperimentParams, subfolder_to_save_result: str):
    """
    Train models for different alleles, iterating over splits first.

    The alleles are taken from the keys of `train_data`. Models of an allele are
    saved below
    `<experiment_result_data_path>/<subfolder_to_save_result>//<allele>/`,
    where '-', '*' and ':' in the allele name are replaced by '_'.

    Returns:
        (models, histories), each a dict
        `{allele_name: {split_num: {run_key: ...}}}`.
    """
    training_params = experiment_params.model_training_params
    output_root = f"{experiment_params.experiment_result_data_path}/{subfolder_to_save_result}/"
    models_by_allele = {}
    histories_by_allele = {}

    for allele_name in train_data.keys():
        print(f"Allele {allele_name}:", end=" ")
        folder_name = allele_name.translate(str.maketrans("-*:", "___"))

        models_by_allele[allele_name], histories_by_allele[allele_name] = train_models_for_splits(
            prepared_models=prepared_models[allele_name],
            train_data=train_data[allele_name],
            test_data=test_data[allele_name],
            train_data_weights=train_data_weights[allele_name],
            model_training_params=training_params,
            experiment_params=experiment_params,
            path_to_save_models=f"{output_root}/{folder_name}/",
        )

    return models_by_allele, histories_by_allele

## visualisation functions

In [20]:
import os
import matplotlib.pyplot as plt
import seaborn as sns

def save_visualizations_for_model(model, history, split_path, experiment_params, predefined_hierarchical_layout):
    """
    Write the visualizations of one trained model to disk.

    Creates `<split_path>/<model.name>/` if needed and stores the learning
    curve and the state graph there. The state distributions and the PyViz
    graph are produced by `plot_distributions_for_states` and
    `make_pyviz_graph`, which both receive `split_path`.
    """
    model_dir = f"{split_path}/{model.name}/"
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Learning curve: log-probabilities recorded in the training history.
    figure, axis = plt.subplots(1, 1)
    sns.lineplot(history.log_probabilities, ax=axis)
    plt.savefig(model_dir + "LearningCurve")
    plt.close(figure)

    # State graph
    with open(model_dir + "ModelGraph.png", 'w+') as graph_file:
        model.plot(file=graph_file, crop_zero=True)

    # Emission distributions per state
    training_params = experiment_params.model_training_params
    plot_distributions_for_states(
        model,
        split_path,
        horizontal=False,
        discrete=training_params.aa_labels_training,
        initial_params=training_params.initial_params,
    )

    # Interactive PyViz graph
    make_pyviz_graph(
        model,
        split_path,
        precision=3,
        prefefined_hierarchical_layout=predefined_hierarchical_layout,
    )
    
def save_visualizations_for_runs(run_model, history_per_run, experiment_params, split_path, predefined_hierarchical_layout=True):
    """Save the visualizations of every run of one split.

    `run_model` and `history_per_run` are dicts keyed by run key; the history of
    each model is looked up under the model's own key.
    """
    for run_key in run_model:
        save_visualizations_for_model(
            run_model[run_key],
            history_per_run[run_key],
            split_path,
            experiment_params,
            predefined_hierarchical_layout,
        )

def save_visualizations_for_multiple_models(per_allele_new_models, per_allele_histories, 
                                           experiment_params, subfolder_to_safe_result,
                                           predefined_hierarchical_layout=True):
    """
    Processes the models in the hierarchical structure and calls save_visualizations_for_model
    for each one found in the nested dictionaries.

    Args:
        per_allele_new_models: `{allele_name: {split_index: {run_key: model}}}`.
        per_allele_histories: histories in the same nested layout.
        experiment_params: provides `experiment_result_data_path` and is passed
            on to the per-model visualization.
        subfolder_to_safe_result: subfolder below the experiment result path.
        predefined_hierarchical_layout: forwarded to every per-run call.

    Output of a split goes to `<result path>/<subfolder>/<allele>/s<split>`,
    where '-', '*' and ':' in the allele name are replaced by '_'.
    """
    base_path = f"{experiment_params.experiment_result_data_path}/{subfolder_to_safe_result}"

    for allele_name, splits in per_allele_new_models.items():
        target_allele_name = allele_name.replace('-', '_').replace('*', '_').replace(':', '_')
        allele_path = f"{base_path}/{target_allele_name}"
        print("Allele: ", allele_name)

        for split_index, run_model in splits.items():
            split_path = f"{allele_path}/s{split_index}"
            history_per_run = per_allele_histories[allele_name][split_index]
            # Forward the caller's choice; it used to be hard-coded to True here.
            save_visualizations_for_runs(run_model, history_per_run, experiment_params, split_path,
                                         predefined_hierarchical_layout=predefined_hierarchical_layout)


# For multiple alleles

## Initialization

In [21]:

# Experiment configuration: a named ExperimentParams object with the
# model-training parameters and the data-scenario parameters attached.
# PreprocessedIEDBDataParams selects the pre-split IEDB reader in get_train_test_data.
hmm_params = training_parameters.ExperimentParams(experiment_name="no_allele_no_split")
hmm_params.model_training_params = training_parameters.SimpleModelClassIIParamsMixMHC()
hmm_params.data_scenario_params = training_parameters.PreprocessedIEDBDataParams()

In [22]:
# Folder with the pre-split input data. This is a machine-specific absolute
# path and has to be adjusted when the notebook runs elsewhere.
hmm_params.data_scenario_params.input_data_path = r'C:\Projects\grandmaster\notebooks\alleles_data\simple_model_enrichment\per_length_per_kfold_split'
# Number of splits to use: get_train_test_data asserts that at least this many
# splits were read, and prepare_multiple_models builds models for range(splits_to_read).
hmm_params.data_scenario_params.splits_to_read = 2

In [23]:
# Alias to the training parameters stored on hmm_params; every assignment
# below modifies that same object.
model_training_params = hmm_params.model_training_params
model_training_params.num_runs = 2  # runs per anchor step within each split
model_training_params.cycle_chain = True
model_training_params.cycle_chain_length = 9

In [24]:
# Starting value of anchor_top_aas. make_models_for_runs lowers it by
# run_num // num_runs on a per-run copy of the parameters.
hmm_params.model_training_params.anchor_top_aas = 7

## Data preparing

In [25]:
# Read the binder peptides (default type_of_peptides="binders").
per_allele_per_kfold_per_length_binders_train, \
per_allele_per_kfold_per_length_binders_test, additional_data = get_train_test_data(hmm_params)

# Read the non-binder peptides from the same folder.
# NOTE(review): this call rebinds `additional_data`, so from here on it holds
# the extra data of the non-binder read, not the binder read — confirm that
# later cells expect this.
per_allele_per_kfold_per_length_non_binders_train, \
per_allele_per_kfold_per_length_non_binders_test, additional_data = get_train_test_data(hmm_params, "nonbinders")


# Alleles present in each of the two training dictionaries.
PARSED_ALLELES = list(per_allele_per_kfold_per_length_binders_train.keys())
PARSED_ALLELES_NB = list(per_allele_per_kfold_per_length_non_binders_train.keys())

print(sorted(PARSED_ALLELES))
print(sorted(PARSED_ALLELES_NB))

Will read files from the folder C:\Projects\grandmaster\notebooks\alleles_data\simple_model_enrichment\per_length_per_kfold_split
Will read files from the folder C:\Projects\grandmaster\notebooks\alleles_data\simple_model_enrichment\per_length_per_kfold_split
['HLA-DRB1*03:01', 'HLA-DRB1*07:01', 'HLA-DRB1*10:01', 'HLA-DRB1*11:01', 'HLA-DRB1*12:01', 'HLA-DRB1*13:03', 'HLA-DRB1*15:01', 'HLA-DRB3*01:01', 'HLA-DRB3*02:02', 'HLA-DRB4*01:01']
['HLA-DRB1*03:01', 'HLA-DRB1*07:01', 'HLA-DRB1*10:01', 'HLA-DRB1*11:01', 'HLA-DRB1*12:01', 'HLA-DRB1*13:03', 'HLA-DRB1*15:01', 'HLA-DRB3*01:01', 'HLA-DRB3*02:02', 'HLA-DRB4*01:01']


In [26]:
# Peptide lengths used for training; train_models_for_runs reads this as target_lengths.
model_training_params.lengths_to_use = [12, 13, 14, 15, 16, 17, 18, 19, 20]
# DataFrame of the first allele in the per-allele dictionary.
# NOTE(review): `df` is not used again in the visible code.
df = additional_data[0][list(additional_data[0].keys())[0]]
# NOTE(review): the next expression builds a list and discards it; it is not the
# last expression of the cell, so it is presumably a leftover from inspection.
[additional_data[0][key] for key in list(additional_data[0].keys())[:3]]
# Alleles to train on (8 of the 10 alleles printed by the loading cell).
current_mix = ['HLA-DRB1*03:01', 'HLA-DRB1*07:01', 
               'HLA-DRB1*12:01', 'HLA-DRB1*11:01', 
               'HLA-DRB1*15:01', 'HLA-DRB3*01:01', 
               'HLA-DRB3*02:02', 'HLA-DRB4*01:01']
model_training_params: training_parameters.ModelTrainingParams = hmm_params.model_training_params
# Store a copy of the list so later edits of current_mix do not change the parameters.
model_training_params.alleles_to_use = [ item for item in current_mix]

# NOTE(review): the disabled alternative below tests `item in [current_mix]`,
# i.e. membership in a one-element list holding the whole list; if re-enabled
# it should test `item in current_mix`.
#model_training_params.alleles_to_use = [ item for item in PARSED_ALLELES if item in [current_mix]]
# NOTE(review): `t` is not used in the visible code; the same call is repeated
# in the next cell with its result assigned back to the training dictionary.
t = remove_unused_lengths(per_allele_per_kfold_per_length_binders_train, experiment_params=hmm_params)

In [27]:
# Binders: apply remove_unused_lengths to the train and test dictionaries
# (presumably keeps only model_training_params.lengths_to_use — confirm in
# data_reading_methods).
per_allele_per_kfold_per_length_binders_train = remove_unused_lengths(per_allele_per_kfold_per_length_binders_train, experiment_params=hmm_params)
per_allele_per_kfold_per_length_binders_test = remove_unused_lengths(per_allele_per_kfold_per_length_binders_test,experiment_params=hmm_params)

# transform to properties if needed and join multiple alleles
train_data_b, test_data_b, NEW_ALLELES, \
old_train_data_b, old_test_data_b, OLD_ALLELES = transform_data_to_properties_and_join_alleles(
    per_allele_per_kfold_per_length_binders_train,
    per_allele_per_kfold_per_length_binders_test,
    hmm_params.model_training_params
)
# Calculate weights for the training data based on peptide counts/lengths for unmerged data
train_data_weigths_b = calculate_weights_based_on_length_counts(old_train_data_b, experiment_params=hmm_params)

HLA-DRB1*03:01
Length 12: HLA-DRB1*03:01 - 135 
Length 13: HLA-DRB1*03:01 - 1555 
Length 14: HLA-DRB1*03:01 - 508 
Length 15: HLA-DRB1*03:01 - 1240 
Length 16: HLA-DRB1*03:01 - 794 
Length 17: HLA-DRB1*03:01 - 651 
Length 18: HLA-DRB1*03:01 - 460 
Length 19: HLA-DRB1*03:01 - 282 
Length 20: HLA-DRB1*03:01 - 408 
HLA-DRB1*07:01
Length 12: HLA-DRB1*07:01 - 163 
Length 13: HLA-DRB1*07:01 - 502 
Length 14: HLA-DRB1*07:01 - 732 
Length 15: HLA-DRB1*07:01 - 3366 
Length 16: HLA-DRB1*07:01 - 1000 
Length 17: HLA-DRB1*07:01 - 654 
Length 18: HLA-DRB1*07:01 - 365 
Length 19: HLA-DRB1*07:01 - 199 
Length 20: HLA-DRB1*07:01 - 330 
HLA-DRB1*11:01
Length 12: HLA-DRB1*11:01 - 54 
Length 13: HLA-DRB1*11:01 - 281 
Length 14: HLA-DRB1*11:01 - 532 
Length 15: HLA-DRB1*11:01 - 1634 
Length 16: HLA-DRB1*11:01 - 591 
Length 17: HLA-DRB1*11:01 - 441 
Length 18: HLA-DRB1*11:01 - 310 
Length 19: HLA-DRB1*11:01 - 170 
Length 20: HLA-DRB1*11:01 - 356 
HLA-DRB1*12:01
Length 12: HLA-DRB1*12:01 - 116 
Length 13: H

In [28]:
# Non-binders: same preparation steps as for the binders.
per_allele_per_kfold_per_length_non_binders_train = remove_unused_lengths(per_allele_per_kfold_per_length_non_binders_train, experiment_params=hmm_params)
per_allele_per_kfold_per_length_non_binders_test = remove_unused_lengths(per_allele_per_kfold_per_length_non_binders_test,experiment_params=hmm_params)

# transform to properties if needed and join multiple alleles
# NOTE: NEW_ALLELES and OLD_ALLELES are rebound here and replace the values
# assigned when the binder data was prepared.
train_data_nb, test_data_nb, NEW_ALLELES, \
old_train_data_nb, old_test_data_nb, OLD_ALLELES = transform_data_to_properties_and_join_alleles(
    per_allele_per_kfold_per_length_non_binders_train,
    per_allele_per_kfold_per_length_non_binders_test,
    hmm_params.model_training_params
)
# Calculate weights for the training data based on peptide counts/lengths for unmerged data
train_data_weigths_nb = calculate_weights_based_on_length_counts(old_train_data_nb, experiment_params=hmm_params)

HLA-DRB1*03:01
Length 12: HLA-DRB1*03:01 - 63 
Length 13: HLA-DRB1*03:01 - 61656 
Length 14: HLA-DRB1*03:01 - 120 
Length 15: HLA-DRB1*03:01 - 1794 
Length 16: HLA-DRB1*03:01 - 113 
Length 17: HLA-DRB1*03:01 - 75 
Length 18: HLA-DRB1*03:01 - 68 
Length 19: HLA-DRB1*03:01 - 50 
Length 20: HLA-DRB1*03:01 - 505 
HLA-DRB1*07:01
Length 12: HLA-DRB1*07:01 - 55 
Length 13: HLA-DRB1*07:01 - 76 
Length 14: HLA-DRB1*07:01 - 54 
Length 15: HLA-DRB1*07:01 - 8864 
Length 16: HLA-DRB1*07:01 - 126 
Length 17: HLA-DRB1*07:01 - 85 
Length 18: HLA-DRB1*07:01 - 76 
Length 19: HLA-DRB1*07:01 - 31 
Length 20: HLA-DRB1*07:01 - 410 
HLA-DRB1*11:01
Length 12: HLA-DRB1*11:01 - 53 
Length 13: HLA-DRB1*11:01 - 104 
Length 14: HLA-DRB1*11:01 - 51 
Length 15: HLA-DRB1*11:01 - 1871 
Length 16: HLA-DRB1*11:01 - 104 
Length 17: HLA-DRB1*11:01 - 69 
Length 18: HLA-DRB1*11:01 - 65 
Length 19: HLA-DRB1*11:01 - 19 
Length 20: HLA-DRB1*11:01 - 350 
HLA-DRB1*12:01
Length 12: HLA-DRB1*12:01 - 0 
Length 13: HLA-DRB1*12:01 - 

## Prepare models

In [29]:
import random

# Seeds only Python's built-in `random` module. The shuffling in
# process_split_data uses a fresh np.random.default_rng(), which this seed
# does not affect.
random.seed(10)

from hmm_logic_methods import train_model_prepared,  \
    train_model_batched, save_model, \
    add_more_states_and_reset_transitions, \
    build_model_based_on_params, \
    train_multiple_models, \
reorder_models_by_score_and_flatten_to_by_name_list, hierarchically_train_splited_models, load_model
from hmm_visualization_methods import *

Binders

In [30]:
# Untrained models for the binder data, nested as {allele: {split: {run_key: model}}}.
per_allele_per_run_per_split_prepared_models_binders = prepare_multiple_models(hmm_params)

In [31]:
# Root output folder (machine-specific absolute path). train_models_for_alleles
# appends the result subfolder and the allele folder; train_models_for_splits
# appends "s<split>/".
hmm_params.experiment_result_data_path = 'C:\\Projects\\grandmaster\\notebooks\\MHC_predictor\\experiments\\core_identification_simple_model_enrichment\\experiment_results\\mult\\anc7' 

In [32]:
# Train the binder models for every allele in train_data_b; models are saved
# below the "binders" subfolder of the experiment result path.
result_models_b, result_histories_b =  train_models_for_alleles(
    prepared_models=per_allele_per_run_per_split_prepared_models_binders,
    train_data=train_data_b,
    test_data=test_data_b,
    train_data_weights=train_data_weigths_b,
    experiment_params=hmm_params,
    subfolder_to_save_result="binders")

Allele HLA-DRB1*03:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB1*07:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB1*11:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB1*12:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB1*15:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB3*01:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB3*02:02: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB4*01:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  


non binders

In [33]:
# A second, independent set of untrained models built from the same parameters,
# used for the non-binder data.
per_allele_per_run_per_split_prepared_models_nonbinders = prepare_multiple_models(hmm_params)

In [34]:
# Train the non-binder models for every allele in train_data_nb; models are
# saved below the "nonbinders" subfolder of the experiment result path.
result_models_nb, result_histories_nb =  train_models_for_alleles(
    prepared_models=per_allele_per_run_per_split_prepared_models_nonbinders,
    train_data=train_data_nb,
    test_data=test_data_nb,
    train_data_weights=train_data_weigths_nb,
    experiment_params=hmm_params,
    subfolder_to_save_result="nonbinders")

Allele HLA-DRB1*03:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB1*07:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB1*11:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB1*12:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB1*15:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB3*01:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB3*02:02: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
Allele HLA-DRB4*01:01: Split 0 Run root[0000] Run root[0001]  
Split 1 Run root[0000] Run root[0001]  
