<a href="https://colab.research.google.com/github/prakritipaul/mhcglobe/blob/main/prakriti_MHCGlobe_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Adapted MHCGlobe Pipeline

In [None]:
# Fetch the MHCGlobe fork. Later cells use absolute paths under /content/mhcglobe,
# so this assumes the notebook's working directory is /content (e.g. on Colab).
!git clone https://github.com/prakritipaul/mhcglobe.git

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sys
# Put the cloned repository's src/ directory on the import path. The project
# modules imported in later cells (mhc_data, mhcglobe, train_functions,
# sequence_functions) are presumably located there -- TODO confirm.
sys.path.append("/content/mhcglobe/src")

## 1. Build the dataset: a one-hot version and a BERT version of X (the inputs). Y (the outputs) is the same for both.

Starting with one-hot.

### Get data
The data contain both binding-affinity (BA) and eluted-ligand (EL) measurements, for human and non-human alleles.

In [None]:
import mhc_data
# Build the project's pMHC dataset object, asking for all records rather than
# EL-only (only_EL=False) and for duplicate records to be dropped.
# Author's recorded row counts: 729,538/1,229,838 (presumably after/before
# de-duplication -- TODO confirm).
pMHC_true = mhc_data.pMHC_Data(only_EL=False, drop_duplicate_records=True)
# The underlying table; later cells index it like a pandas DataFrame.
pMHC_true_data = pMHC_true.data
# Quick sanity check of the row count: pMHC_true_data.shape

#### Keep only the human data and select the 4 relevant columns

In [None]:
# Keep only the rows whose "allele" string contains "HLA" (the human alleles).
# Author's recorded row count: 678,024 of 729,538.
is_hla_allele = pMHC_true_data["allele"].str.contains("HLA")
human_pMHC_true_data = pMHC_true_data.loc[is_hla_allele]
# Quick sanity check of the row count: human_pMHC_true_data.shape
human_pMHC_true_data

In [None]:
# Narrow the human-only frame down to the four columns of interest.
selected_columns = [
    "allele",
    "peptide",
    "measurement_inequality",
    "measurement_value",
]
human_pMHC_true_data = human_pMHC_true_data.loc[:, selected_columns]
human_pMHC_true_data

### Select 80% for training & 20% for testing

In [None]:
# Fixed seed so that the 80/20 split selects the same rows on every run
# (Restart Kernel -> Run All yields the same train and test sets).
SPLIT_SEED = 42

# Training set: 80% of the rows, sampled at random without replacement.
# Author's recorded size: 542419.
human_pMHC_true_data_train = human_pMHC_true_data.sample(frac=0.8, random_state=SPLIT_SEED)

# Test set: every row whose index label was not drawn for training.
# Author's recorded size: 135605.
human_pMHC_true_data_test = human_pMHC_true_data.drop(index=human_pMHC_true_data_train.index)

# The label-based complement only partitions the data when index labels are
# unique; fail loudly here rather than silently losing test rows.
assert (
    len(human_pMHC_true_data_train) + len(human_pMHC_true_data_test)
    == len(human_pMHC_true_data)
), "train/test split does not partition the data (duplicate index labels?)"

print(human_pMHC_true_data_train.shape, human_pMHC_true_data_test.shape)

### Make X_tr, Y_tr, X_es, Y_es for training

In [None]:
import mhcglobe
# Create the project's ensemble object, configured with train_type="init".
ensemble = mhcglobe.ensemble(train_type="init")
# Turn the training frame into model inputs/targets. The result is unpacked
# into four values below; presumably (X_tr, Y_tr) is the portion used for
# fitting and (X_es, Y_es) a held-out early-stopping portion -- TODO confirm
# against mhcglobe.ensemble.setup_data_training.
setup_data_training = ensemble.setup_data_training(human_pMHC_true_data_train)
X_tr, Y_tr, X_es, Y_es = setup_data_training

## 2. Load one of my base TensorFlow models and train it.

In [None]:
import train_functions as trainf
# Directory of the base model to start from; judging by its name this is the
# one-hot "init" model 14 / fold 13 shipped inside the cloned repository.
model_dir = "/content/mhcglobe/model/mhcglobe/init/model14_fold13_ONE_HOT_init/"
# Load that model through the project's loader.
model_1 = trainf.load_trained_mhcglobe_model(model_path=model_dir)

In [None]:
# Output location and verbosity level handed to the training routine.
new_model_path = "/content/mhcglobe/outputs"
verbose = 0

# Training starts from the base model loaded in the previous cell.
init_model = model_1

new_model = trainf.train_mhcglobe_model(
    init_model,
    X_tr, Y_tr,
    X_es, Y_es,
    new_model_path,
    verbose,
)

## 3. Ensure that the model can predict with one-hot input.

In [None]:
import sequence_functions as seqf
# Can refer to prakriti_MHCglobe_scratch

# Directory of a saved model to reload from disk. It is left empty by default:
# in that case the in-memory `new_model` returned by the training cell is used
# as-is, instead of overwriting it with a load from an empty path.
new_model_dir = ""

if new_model_dir:
    new_model = trainf.load_trained_mhcglobe_model(model_path=new_model_dir)

# Encode the held-out test rows as one-hot inputs only (get_Y=False).
# TODO: confirm that get_XY copes with all four columns of this frame.
X = seqf.get_XY(human_pMHC_true_data_test, encode_type="ONE_HOT", get_Y=False)
new_model.predict(X)