# Setup

In [None]:
# Colab-only helper module for moving files between the browser and the runtime.
from google.colab import files
# Start an upload and keep whatever files.upload() returns in `uploaded`.
# NOTE(review): later cells read /content/mhcflurry_input.csv — presumably the
# file uploaded here; confirm the uploaded file carries that name.
uploaded = files.upload()

This notebook demonstrates how to generate predictions using MHCflurry and how to compare them against the affinity values in an uploaded dataset.

In [1]:
# Install the package and download models
!pip install -q mhcflurry
!mhcflurry-downloads --quiet fetch models_class1_presentation

  Preparing metadata (setup.py) ... [?25l[?25hdone
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/140.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.9/140.9 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/103.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.7/103.7 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for typechecks (setup.py) ... [?25l[?25hdone
135MB [00:06, 19.8MB/s]               
Extracting: 100% 62/62 [00:12<00:00,  4.95it/s]


In [2]:
# Imports
import mhcflurry
# Colab-only module; `files` is used by later cells for files.download(...).
from google.colab import files

# Quiet warnings
import warnings
# Installs a blanket "ignore" filter: warnings raised after this cell runs are
# suppressed for the rest of the session, whichever library emits them.
warnings.filterwarnings('ignore')

In [10]:
# Load the two predictors used in this notebook.
# `predictor` (presentation) is used by the peptide and protein-scan sections;
# `predictor1` (affinity) is used by the dataset benchmark cells.
predictor = mhcflurry.Class1PresentationPredictor.load()
predictor1 = mhcflurry.Class1AffinityPredictor.load()

# Only the final expression of a cell is echoed automatically, so the first
# predictor must be displayed explicitly for both reprs to appear.
display(predictor)
predictor1

<Class1AffinityPredictor at 0x7c5b8ded2890 [mhcflurry 2.1.5] [pan] generated on Thu Jun 11 13:31:45 2020>

In [4]:
import pandas as pd

# Read the uploaded table and preview its first five rows.
input_csv = "/content/mhcflurry_input.csv"
df = pd.read_csv(input_csv)
df.head(5)

Unnamed: 0,peptide,allele,affinity
0,AEMKTDAA,HLA-A01:01,0.047934
1,HGVEFDFI,HLA-A01:01,0.0
2,HHIWQNLL,HLA-A01:01,0.089799
3,KPTGSAVV,HLA-A01:01,0.0
4,LASIDLKY,HLA-A01:01,0.246733


In [16]:


import pandas as pd

# Read the uploaded benchmark table; the "peptide" and "allele" columns are
# the ones used below.
df = pd.read_csv("/content/mhcflurry_input.csv")

# Build the length mask first, then apply it. Only peptides whose length lies
# in [5, 15] (inclusive) are kept, and the subset is copied so the new column
# added below lands on an independent frame.
length_ok = df["peptide"].str.len().between(5, 15)
df = df.loc[length_ok].copy()
print(f"✅ Filtered dataset: {len(df)} peptides within valid range [5, 15]")

# One prediction per (peptide, allele) row, passed as two parallel lists.
peptide_list = df["peptide"].tolist()
allele_list = df["allele"].tolist()
results = predictor1.predict(peptides=peptide_list, alleles=allele_list)

# Attach the predictions to their rows and persist the table (no index column).
df["mhcflurry_pred"] = results
df.to_csv("mhcflurry_affinity_predictions_filtered.csv", index=False)

✅ Filtered dataset: 156741 peptides within valid range [5, 15]


In [17]:
import pandas as pd
import numpy as np

# IC50 value (nM) that maps to a normalized score of 0.
MAX_IC50_NM = 50000.0


def ic50_to_normalized(ic50, max_ic50=MAX_IC50_NM):
    """Convert IC50 values (nM) to a normalized score in [0, 1].

    The score is ``1 - log10(IC50) / log10(max_ic50)``: an IC50 of 1 nM maps
    to 1 and an IC50 of ``max_ic50`` maps to 0. The result is clipped so that
    inputs outside [1, max_ic50] (including 0, whose log is -inf) cannot
    produce scores outside [0, 1]. Inputs inside that range are unaffected by
    the clipping.

    Parameters
    ----------
    ic50 : pandas.Series, numpy.ndarray or float
        IC50 values in nM.
    max_ic50 : float
        IC50 value that corresponds to a score of 0.

    Returns
    -------
    Same kind of object as ``ic50``, holding the clipped scores.
    """
    raw_score = 1.0 - np.log10(ic50) / np.log10(max_ic50)
    return np.clip(raw_score, 0.0, 1.0)


# ⬇️ Step 1: Load the predictions CSV.
# The prediction cell writes this file with a relative path, so it is read
# back with the same relative path instead of assuming the working directory
# is /content.
df = pd.read_csv("mhcflurry_affinity_predictions_filtered.csv")

# ⬇️ Step 2: Convert IC50 nM → normalized score
df["mhcflurry_norm"] = ic50_to_normalized(df["mhcflurry_pred"])

# ⬇️ Step 3: Show the new columns
print(df[["affinity", "mhcflurry_pred", "mhcflurry_norm"]].head())

# ⬇️ Step 4: Save result to new CSV and offer it as a browser download
output_filename = "mhcflurry_with_normalized.csv"
df.to_csv(output_filename, index=False)
files.download(output_filename)

   affinity  mhcflurry_pred  mhcflurry_norm
0  0.047934    33050.977416        0.038261
1  0.000000    31654.189107        0.042252
2  0.089799    32726.647166        0.039172
3  0.000000    33914.989825        0.035876
4  0.246733     1407.537559        0.329968


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error

# Load the table written by the normalization cell. That cell saves the file
# with a relative path, so the same relative path is used here rather than
# assuming the working directory is /content.
df = pd.read_csv("mhcflurry_with_normalized.csv")

# Compare the dataset's "affinity" column with the normalized MHCflurry score.
# Rows missing either value are excluded up front: the metrics below are only
# defined on complete pairs, and the exclusion is reported rather than silent.
scored = df[["affinity", "mhcflurry_norm"]].dropna()
n_dropped = len(df) - len(scored)
if n_dropped:
    print(f"Dropped {n_dropped} rows with missing values before computing metrics")

y_true = scored["affinity"]
y_pred = scored["mhcflurry_norm"]

# Pearson Correlation Coefficient (PCC); the p-value is not used.
pcc, _ = pearsonr(y_true, y_pred)

# Spearman Correlation Coefficient (SCC); the p-value is not used.
scc, _ = spearmanr(y_true, y_pred)

# Mean Squared Error (MSE)
mse = mean_squared_error(y_true, y_pred)

# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Display results
print("📊 Evaluation Metrics:")
print(f"Pearson Correlation Coefficient (PCC): {pcc:.4f}")
print(f"Spearman Correlation Coefficient (SCC): {scc:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

📊 Evaluation Metrics:
Pearson Correlation Coefficient (PCC): 0.7889
Spearman Correlation Coefficient (SCC): 0.7055
Mean Squared Error (MSE): 0.0294
Root Mean Squared Error (RMSE): 0.1716


# Predict for specified peptides

In [8]:
# Peptides to score, written out as an explicit list.
peptides = ["NLVPMVATV", "RANDMPEPTIDE", "SIINFEKL"]

# Alleles passed as one flat list; the output table below labels the
# resulting rows with sample_name "sample1".
alleles = ["A*02:01", "B*27:01", "H2-Kb"]

results1 = predictor.predict(peptides=peptides, alleles=alleles)
results1


Predicting processing.


  0%|          | 0/1 [00:00<?, ?it/s]











100%|██████████| 1/1 [00:04<00:00,  4.25s/it]


Predicting affinities.


  0%|          | 0/3 [00:00<?, ?it/s]



 33%|███▎      | 1/3 [00:01<00:02,  1.10s/it]



100%|██████████| 3/3 [00:01<00:00,  2.33it/s]


Unnamed: 0,peptide,peptide_num,sample_name,affinity,best_allele,processing_score,presentation_score,presentation_percentile
0,NLVPMVATV,0,sample1,16.57014,A*02:01,0.533029,0.97019,0.018723
1,RANDMPEPTIDE,1,sample1,21780.330988,B*27:01,0.008493,0.004732,62.744674
2,SIINFEKL,2,sample1,19.705306,H2-Kb,0.264716,0.914121,0.099511


In [None]:
# Download results
# Write the peptide-level predictions to a CSV in the working directory, then
# hand that file to the browser via the Colab `files` helper.
# NOTE(review): the protein-scan section later writes to this same filename.
results1.to_csv('mhcflurry-results.csv')
files.download('mhcflurry-results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# See help for more options:
# Prints the signature and docstring of the presentation predictor's
# predict() method (sample_names, n_flanks, c_flanks, verbose, ...).
help(predictor.predict)

Help on method predict in module mhcflurry.class1_presentation_predictor:

predict(peptides, alleles, sample_names=None, n_flanks=None, c_flanks=None, include_affinity_percentile=False, verbose=1, throw=True) method of mhcflurry.class1_presentation_predictor.Class1PresentationPredictor instance
    Predict presentation scores across a set of peptides.
    
    Presentation scores combine predictions for MHC I binding affinity
    and antigen processing.
    
    This method returns a pandas.DataFrame giving presentation scores plus
    the binding affinity and processing predictions and other intermediate
    results.
    
    Example:
    
    >>> predictor = Class1PresentationPredictor.load()
    >>> predictor.predict(
    ...    peptides=["SIINFEKL", "PEPTIDE"],
    ...    n_flanks=["NNN", "SNS"],
    ...    c_flanks=["CCC", "CNC"],
    ...    alleles={
    ...        "sample1": ["A0201", "A0301", "B0702"],
    ...        "sample2": ["A0101", "C0202"],
    ...    },
    ...    verbo

# Predict by scanning across protein sequences

In [None]:
# Paste your fasta here
proteins_fasta = """
>tr|A0A6B9WFC7|A0A6B9WFC7_SARS2 Envelope small membrane protein
MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNIVNVSLVKPSFYVYS
RVKNLNSSRVPDLLV
>tr|A0A6B9W0L4|A0A6B9W0L4_SARS2 ORF6 protein
MFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSLTENKYSQLDEEQPMEI
D
>tr|A0A6G7S6S0|A0A6G7S6S0_SARS2 Nonstructural protein NS3
MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWLIVGVALLAVFQSAS
KIITLKKRWQLALSKGVHFVCNLLLLFVTVYSHLLLVAAGLEAPFLYLYALVYFLQSINF
VRIIMRLWLCWKCRSKNPLLYDANYFLCWHTNCYDYCIPYNSVTSSIVITSGDGTTSPIS
EHDYQIGGYTEKWESGVKDCVVLHSYFTSDYYQLYSTQLSTDTGVEHVTFFIYNKIVDEP
EEHVQIHTIDGSSGVVNPVMEPIYDEPTTTTSVPL
>tr|A0A6B9VLF3|A0A6B9VLF3_SARS2 Membrane protein
MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFLYIIKLIFLWLLWPV
TLACFVLAAVYRINWITGGIAIAMACLVGLMWLSYFIASFRLFARTRSMWSFNPETNILL
NVPLHGTILTRPLLESELVIGAVILRGHLRIAGHHLGRCDIKDLPKEITVATSRTLSYYK
LGASQRVAGDSGFAAYSRYRIGNYKLNTDHSSSSDNIALLVQ
"""

import mhcflurry.fasta

# The FASTA reader is handed a file path, so the pasted text is first written
# to a scratch file in the working directory.
fasta_path = "temp.fa"
with open(fasta_path, "w") as fasta_file:
    fasta_file.write(proteins_fasta)

# Parse the file into a DataFrame and key the rows by sequence id.
proteins = mhcflurry.fasta.read_fasta_to_dataframe(fasta_path)
proteins = proteins.set_index("sequence_id")
proteins

Unnamed: 0_level_0,sequence
sequence_id,Unnamed: 1_level_1
tr|A0A6B9WFC7|A0A6B9WFC7_SARS2,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...
tr|A0A6B9W0L4|A0A6B9W0L4_SARS2,MFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSLTE...
tr|A0A6G7S6S0|A0A6G7S6S0_SARS2,MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWL...
tr|A0A6B9VLF3|A0A6B9VLF3_SARS2,MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFL...


In [None]:
# Define alleles for each sample: keys are sample names, values are the list
# of alleles belonging to that sample.
my_sample_alleles = ["A0201", "A0301", "B0702", "C0802"]
alleles = {"my-sample": my_sample_alleles}

In [None]:
# Scan the protein sequences and keep only peptides whose predicted affinity
# is below 500 nM (result="filtered" compared on the "affinity" quantity).
sequences_by_name = proteins["sequence"].to_dict()
results2 = predictor.predict_sequences(
    sequences=sequences_by_name,
    alleles=alleles,
    result="filtered",
    comparison_quantity="affinity",
    filter_value=500,
)
results2

Predicting processing.


100%|██████████| 1/1 [00:13<00:00, 13.82s/it]


Predicting affinities.


100%|██████████| 4/4 [00:06<00:00,  1.62s/it]


Unnamed: 0,sequence_name,pos,peptide,n_flank,c_flank,sample_name,affinity,best_allele,affinity_percentile,processing_score,presentation_score,presentation_percentile
0,tr|A0A6G7S6S0|A0A6G7S6S0_SARS2,138,LLYDANYFL,RSKNP,CWHTN,my-sample,10.659104,A0201,0.003625,0.157175,0.921852,0.088804
1,tr|A0A6G7S6S0|A0A6G7S6S0_SARS2,106,YLYALVYFL,EAPFL,QSINF,my-sample,11.053785,A0201,0.006750,0.014756,0.868851,0.171848
2,tr|A0A6G7S6S0|A0A6G7S6S0_SARS2,71,ALSKGVHFV,KRWQL,CNLLL,my-sample,11.501204,A0201,0.011500,0.676803,0.987502,0.002065
3,tr|A0A6B9WFC7|A0A6B9WFC7_SARS2,49,SLVKPSFYV,NIVNV,YSRVK,my-sample,11.930823,A0201,0.013500,0.091771,0.891807,0.135353
4,tr|A0A6B9WFC7|A0A6B9WFC7_SARS2,19,FLAFVVFLL,NSVLL,VTLAI,my-sample,12.318483,A0201,0.015875,0.007210,0.852791,0.196277
...,...,...,...,...,...,...,...,...,...,...,...,...
188,tr|A0A6B9WFC7|A0A6B9WFC7_SARS2,15,SVLLFLAFVV,TLIVN,FLLVT,my-sample,466.913027,A0201,1.297625,0.010083,0.145307,2.271005
189,tr|A0A6G7S6S0|A0A6G7S6S0_SARS2,57,SASKIITL,LAVFQ,KKRWQ,my-sample,471.300226,C0802,0.774375,0.772850,0.753873,0.351359
190,tr|A0A6G7S6S0|A0A6G7S6S0_SARS2,169,TSGDGTTSPI,SSIVI,SEHDY,my-sample,473.865753,C0802,0.774375,0.000247,0.138992,2.345462
191,tr|A0A6B9VLF3|A0A6B9VLF3_SARS2,71,RINWITGGI,LAAVY,AIAMA,my-sample,475.852826,A0201,1.306500,0.166255,0.232094,1.656413


In [None]:
# Download results
# Write the protein-scan predictions to a CSV in the working directory, then
# hand that file to the browser via the Colab `files` helper.
# NOTE(review): this reuses the filename written by the peptide-prediction
# section, so the earlier file on disk is overwritten.
results2.to_csv('mhcflurry-results.csv')
files.download('mhcflurry-results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# See help for more options:
# Prints the signature and docstring of predict_sequences() (result modes,
# comparison_quantity, filter_value, peptide_lengths, use_flanks, ...).
help(predictor.predict_sequences)

Help on method predict_sequences in module mhcflurry.class1_presentation_predictor:

predict_sequences(sequences, alleles, result='best', comparison_quantity=None, filter_value=None, peptide_lengths=(8, 9, 10, 11), use_flanks=True, include_affinity_percentile=True, verbose=1, throw=True) method of mhcflurry.class1_presentation_predictor.Class1PresentationPredictor instance
    Predict presentation across protein sequences.
    
    Example:
    
    >>> predictor = Class1PresentationPredictor.load()
    >>> predictor.predict_sequences(
    ...    sequences={
    ...        'protein1': "MDSKGSSQKGSRLLLLLVVSNLL",
    ...        'protein2': "SSLPTPEDKEQAQQTHH",
    ...    },
    ...    alleles={
    ...        "sample1": ["A0201", "A0301", "B0702"],
    ...        "sample2": ["A0101", "C0202"],
    ...    },
    ...    result="filtered",
    ...    comparison_quantity="affinity",
    ...    filter_value=500,
    ...    verbose=0)
      sequence_name  pos     peptide n_flank c_flank sample