<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Adaptive-ImmuneCODE:-sars-cov" data-toc-modified-id="Adaptive-ImmuneCODE:-sars-cov-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Adaptive ImmuneCODE: sars-cov</a></span></li><li><span><a href="#check-overlap-mcpas" data-toc-modified-id="check-overlap-mcpas-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>check overlap mcpas</a></span><ul class="toc-item"><li><span><a href="#Different-ways-of-checking-overlap" data-toc-modified-id="Different-ways-of-checking-overlap-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Different ways of checking overlap</a></span></li><li><span><a href="#Find-unique-and-non-unique-CDR3-epitope-pairs-in-the-McPAS-dataset-when-compared-to-VDJdb-via-set-overlap" data-toc-modified-id="Find-unique-and-non-unique-CDR3-epitope-pairs-in-the-McPAS-dataset-when-compared-to-VDJdb-via-set-overlap-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Find unique and non-unique CDR3-epitope pairs in the McPAS dataset when compared to VDJdb via set overlap</a></span></li></ul></li><li><span><a href="#VDJdb-update" data-toc-modified-id="VDJdb-update-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>VDJdb update</a></span><ul class="toc-item"><li><span><a href="#Without-length-restriction" data-toc-modified-id="Without-length-restriction-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Without length restriction</a></span></li><li><span><a href="#With-length-restriction" data-toc-modified-id="With-length-restriction-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>With length restriction</a></span></li><li><span><a href="#10x-data" data-toc-modified-id="10x-data-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>10x data</a></span></li></ul></li></ul></div>

Creation of external validation sets based on mcpas and the held-out 10x genomics data. Based on retrieving all unique epitopes (i.e. those not occurring in the VDJdb pairs used for training).

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.lines as lines
import seaborn as sns
import scipy

from src.scripts.evaluate.visualize import predict_variations
from src.config import PROJECT_ROOT

In [2]:
import json

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from pathlib import Path

from src.config import PROJECT_ROOT
from src.scripts.preprocessing import preprocess_vdjdb

# Adaptive ImmuneCODE: sars-cov

Source: https://immunerace.adaptivebiotech.com/more-data-and-whats-coming-next/

Extracted CDR3 and epitope sequences for all samples with a single epitope.

In [67]:
df = pd.read_csv(PROJECT_ROOT / "data/interim/immunecode-adaptive/adaptive-sars-cov.csv", sep=";")

In [68]:
df.groupby("antigen.epitope").size().sort_values(ascending=False)

antigen.epitope
HTTDPSFLGRY    5397
FVDGVPFVV      2397
FLNGSCGSV      2282
KLSYGIATV      2132
KLPDDFTGCV     1147
               ... 
AEGSRGGSQA        2
RLNEVAKNL         1
IITTDNTFV         1
LAAVYRINWI        1
LMIERFVSL         1
Length: 136, dtype: int64

In [69]:
train_df = pd.read_csv(PROJECT_ROOT / "data/interim/vdjdb-2019-08-08/vdjdb-human-trb-mhci-no10x-size.csv", sep=';')

In [70]:
unique_cov = set(df["antigen.epitope"].unique()) - set(train_df["antigen.epitope"].unique())

In [71]:
set(df["antigen.epitope"].unique()).intersection(set(train_df["antigen.epitope"].unique()))

set()

In [72]:
unique_cov = set(df["antigen.epitope"].unique()) - set(train_df["antigen.epitope"].unique())
len(unique_cov)

136

# check overlap mcpas

In [3]:
# import dataset again to start fresh
vdjdb_normal_path = PROJECT_ROOT / "data/raw/vdjdb/vdjdb-2019-08-08/vdjdb.txt"
df = pd.read_csv(vdjdb_normal_path, sep="\t")

# expand the json/dict-like columns
df = (
    df.join(json_normalize(df["method"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("method."))
    .join(json_normalize(df["meta"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("meta."))
    .join(json_normalize(df["cdr3fix"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("cdr3fix."))
    .drop(["method", "meta", "cdr3fix"], axis=1)
)

columns = ["cdr3", "antigen.epitope"]

# create column based on cdr3 and epitope sequences
df["merged"] = df["cdr3"] + "-" + df["antigen.epitope"]

# create length columns for figures
df["cdr3.len"] = df["cdr3"].str.len()
df["antigen.epitope.len"] = df["antigen.epitope"].str.len()

df_human_no10x = df.loc[(df["species"] == "HomoSapiens") & 
#                                  (df["gene"] == "TRB") & 
                                 (df["cdr3fix.good"]) & 
                                 (df["mhc.class"] == "MHCI") &
                                 ~(df["reference.id"].str.contains("10x"))]

df_human_no10x = df_human_no10x.loc[ (df_human_no10x["antigen.epitope"].str.len() >= 8) &
                                                 (df_human_no10x["antigen.epitope"].str.len() <= 11) &
                                                 (df_human_no10x["cdr3"].str.len() >= 10) &
                                                 (df_human_no10x["cdr3"].str.len() <= 20)]

df_human_no10x = df_human_no10x.drop_duplicates(columns)

print(f"Filtering reduced the number of unique pairs from {df.drop_duplicates(columns).shape[0]} to {df_human_no10x.shape[0]}.")

  import sys
  
  if __name__ == '__main__':


Filtering reduced the number of unique pairs from 61047 to 19842.


In [4]:
df = df_human_no10x

In [5]:
df_mcpas = pd.read_csv(PROJECT_ROOT / "data/raw/mcpas.csv")

In [6]:
df_mcpas["merged"] = df_mcpas["CDR3_beta"] + "-" + df_mcpas["HLA_peptide"]

In [7]:
df_mcpas = df_mcpas.loc[ (df_mcpas["HLA_peptide"].str.len() >= 8) &
                         (df_mcpas["HLA_peptide"].str.len() <= 11) &
                         (df_mcpas["CDR3_beta"].str.len() >= 10) &
                         (df_mcpas["CDR3_beta"].str.len() <= 20)]

In [8]:
df_mcpas = df_mcpas.drop_duplicates(["CDR3_beta","HLA_peptide"])
df_mcpas = df_mcpas.rename(columns={"CDR3_beta": "cdr3", "HLA_peptide": "antigen.epitope"})

In [9]:
# df_mcpas.to_csv(PROJECT_ROOT / "data/interim/mcpas/mcpas-human-trb-mhci-size.csv", index=False, sep=";")

In [10]:
df_mcpas

Unnamed: 0,TRBV_gene,cdr3,TRBJ_gene,antigen.epitope,Epitope species,Reference,Source,merged
0,TRBV27,CASSLGSSYEQYF,TRBJ2-7,YLEPGPVTA,Melanoma,PubMed ID: 8752841,McPas-TCR,CASSLGSSYEQYF-YLEPGPVTA
1,TRBV4,CASLAGQGYNEQF,TRBJ2-1,SAYGEPRKL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASLAGQGYNEQF-SAYGEPRKL
2,TRBV12,CASLGAQNNEQF,TRBJ2-1,AARAVFLAL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASLGAQNNEQF-AARAVFLAL
3,TRBV6,CASRLWFWALEAF,TRBJ1-1,SAYGEPRKL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASRLWFWALEAF-SAYGEPRKL
4,TRBV6,CASSYSTGDEQYF,TRBJ2-7,AARAVFLAL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASSYSTGDEQYF-AARAVFLAL
...,...,...,...,...,...,...,...,...
6680,TRBV19,GASSIGIFGYTF,TRBJ1-2,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,GASSIGIFGYTF-GILGFVFTL
6681,TRBV19,CASRIGIYGYPF,TRBJ1-2,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,CASRIGIYGYPF-GILGFVFTL
6682,TRBV19,CASSTGSYGYTF,TRBJ1-2,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,CASSTGSYGYTF-GILGFVFTL
6684,TRBV19,CASSSRSAIEQFF,TRBJ2-1,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,CASSSRSAIEQFF-GILGFVFTL


In [11]:
merged = pd.concat([df,df_mcpas], axis=1, join='inner')
merged

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,...,cdr3.len,antigen.epitope.len,TRBV_gene,cdr3.1,TRBJ_gene,antigen.epitope.1,Epitope species,Reference,Source,merged
0,0,TRA,CAVTDDKIIF,TRAV12-2*01,TRAJ30*01,HomoSapiens,HLA-A*02,B2M,MHCI,LLWNGPMAV,...,10,9,TRBV27,CASSLGSSYEQYF,TRBJ2-7,YLEPGPVTA,Melanoma,PubMed ID: 8752841,McPas-TCR,CASSLGSSYEQYF-YLEPGPVTA
1,0,TRA,CAVDSGGYQKVTF,TRAV12-2*01,TRAJ13*01,HomoSapiens,HLA-A*02,B2M,MHCI,LLWNGPMAV,...,13,9,TRBV4,CASLAGQGYNEQF,TRBJ2-1,SAYGEPRKL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASLAGQGYNEQF-SAYGEPRKL
2,0,TRA,CAGGDDKIIF,TRAV12-2*01,TRAJ30*01,HomoSapiens,HLA-A*02,B2M,MHCI,LLWNGPMAV,...,10,9,TRBV12,CASLGAQNNEQF,TRBJ2-1,AARAVFLAL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASLGAQNNEQF-AARAVFLAL
3,0,TRA,CAVKDARLMF,TRAV12-2*01,TRAJ31*01,HomoSapiens,HLA-A*02,B2M,MHCI,LLWNGPMAV,...,10,9,TRBV6,CASRLWFWALEAF,TRBJ1-1,SAYGEPRKL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASRLWFWALEAF-SAYGEPRKL
4,0,TRA,CAVGSDKIIF,TRAV12-2*01,TRAJ30*01,HomoSapiens,HLA-A*02,B2M,MHCI,LLWNGPMAV,...,10,9,TRBV6,CASSYSTGDEQYF,TRBJ2-7,AARAVFLAL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASSYSTGDEQYF-AARAVFLAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1560,355,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEMGGL,...,20,8,TRBV27,CASSSGQEAF,TRBJ1-1,TPRVTGGGAM,Cytomegalovirus(CMV),PubMed ID: 25339770,McPas-TCR,CASSSGQEAF-TPRVTGGGAM
1561,0,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEMGGL,...,19,8,TRBV5-07,CASSSGTSGYEQYF,TRBJ2-7,QIKVRVKMV,Cytomegalovirus(CMV),PubMed ID: 25339770,McPas-TCR,CASSSGTSGYEQYF-QIKVRVKMV
1562,356,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEMGGL,...,14,8,TRBV27,CASSSGTSGYYNEQF,TRBJ2-1,GLCTLVAML,EpsteinBarrvirus(EBV),PubMed ID: 25339770,McPas-TCR,CASSSGTSGYYNEQF-GLCTLVAML
1563,356,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEMGGL,...,19,8,TRBV27,CASSSPRESTDTQYF,TRBJ2-3,YSEHPTFTSQY,Cytomegalovirus(CMV),PubMed ID: 25339770,McPas-TCR,CASSSPRESTDTQYF-YSEHPTFTSQY


## Different ways of checking overlap

Dataframe merges

In [60]:
intersection = df[["cdr3","antigen.epitope"]].merge(df_mcpas[["cdr3","antigen.epitope"]]).drop_duplicates()
intersection
# pd.concat([df.merge(intersection), df_mcpas.merge(intersection)])

Unnamed: 0,cdr3,antigen.epitope
0,CASSAGTGAYEQYF,LLWNGPMAV
1,CSARDRTGNGYTF,GLCTLVAML
2,CSVGTGGTNEKLFF,GLCTLVAML
3,CSVGSGGTNEKLFF,GLCTLVAML
4,CSVGAGGTNEKLFF,GLCTLVAML
...,...,...
2511,CASSIISVDGYTF,GILGFVFTL
2512,CASSTRSGTEQYF,GILGFVFTL
2513,CASSATGSTYEQYF,GILGFVFTL
2514,CASSARATDTQYF,GILGFVFTL


In [61]:
in_vdjdb = df_mcpas["merged"].apply(lambda x: df["merged"].str.contains(x).any())

In [62]:
in_vdjdb

0       False
1       False
2       False
3       False
4       False
        ...  
6680    False
6681    False
6682     True
6684    False
6685    False
Name: merged, Length: 6324, dtype: bool

In [63]:
in_vdjdb.sum()

2516

In [64]:
df_mcpas[~in_vdjdb]

Unnamed: 0,TRBV_gene,cdr3,TRBJ_gene,antigen.epitope,Epitope species,Reference,Source,merged
0,TRBV27,CASSLGSSYEQYF,TRBJ2-7,YLEPGPVTA,Melanoma,PubMed ID: 8752841,McPas-TCR,CASSLGSSYEQYF-YLEPGPVTA
1,TRBV4,CASLAGQGYNEQF,TRBJ2-1,SAYGEPRKL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASLAGQGYNEQF-SAYGEPRKL
2,TRBV12,CASLGAQNNEQF,TRBJ2-1,AARAVFLAL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASLGAQNNEQF-AARAVFLAL
3,TRBV6,CASRLWFWALEAF,TRBJ1-1,SAYGEPRKL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASRLWFWALEAF-SAYGEPRKL
4,TRBV6,CASSYSTGDEQYF,TRBJ2-7,AARAVFLAL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASSYSTGDEQYF-AARAVFLAL
...,...,...,...,...,...,...,...,...
6677,TRBV19,CASSKRSNPPQHF,TRBJ1-5,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,CASSKRSNPPQHF-GILGFVFTL
6680,TRBV19,GASSIGIFGYTF,TRBJ1-2,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,GASSIGIFGYTF-GILGFVFTL
6681,TRBV19,CASRIGIYGYPF,TRBJ1-2,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,CASRIGIYGYPF-GILGFVFTL
6684,TRBV19,CASSSRSAIEQFF,TRBJ2-1,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,CASSSRSAIEQFF-GILGFVFTL


In [65]:
df_mcpas.loc[~in_vdjdb,"antigen.epitope"].unique()

array(['YLEPGPVTA', 'SAYGEPRKL', 'AARAVFLAL', 'EAAGIGILTV', 'IVTDFSVIK',
       'AVFDRKSDAK', 'GLCTLVAML', 'RAKFKQLL', 'NLVPMVATV', 'EENLLDFVRF',
       'KAFSPEVIPMF', 'RAFSPEVIPMF', 'RPRGEVRFL', 'ELAGIGILTV',
       'HSKKKCDEL', 'ATDALMTGY', 'TPRVTGGGAM', 'FLRGRAYGL', 'TQGYFPDWQNY',
       'EIYKRWII', 'FLKEKGGL', 'KRWIIMGLNK', 'ISPRTLNAW', 'QASQEVKNW',
       'GLNKIVRMY', 'KRWIILGLNK', 'QYDPVAALF', 'YVLDHLIVV', 'FWIDLFETIG',
       'RYPLTFGWCF', 'RYPLTFGW', 'GILGFVFTL', 'YSEHPTFTSQY', 'QIKVRVKMV',
       'HPVGEADYFEY', 'RPPIFIRRL', 'EPLPQGQLTAY', 'VLEETSVML',
       'ALTPVVVTL', 'FPRPWLHGL', 'IIKDYGKQM', 'LPPIVAKEI', 'HPKVSSEVHI',
       'TPQDLNTML', 'TPGPGVRYPL', 'PQPELPYPQPE', 'LLWNGPMAV', 'VTEHDTLLY',
       'LPRRSGAAGA', 'VLFGLGFAI'], dtype=object)

In [66]:
len(df_mcpas.loc[~in_vdjdb,"antigen.epitope"].unique())

50

In [67]:
df_mcpas.loc[~in_vdjdb,"antigen.epitope"].unique()

array(['YLEPGPVTA', 'SAYGEPRKL', 'AARAVFLAL', 'EAAGIGILTV', 'IVTDFSVIK',
       'AVFDRKSDAK', 'GLCTLVAML', 'RAKFKQLL', 'NLVPMVATV', 'EENLLDFVRF',
       'KAFSPEVIPMF', 'RAFSPEVIPMF', 'RPRGEVRFL', 'ELAGIGILTV',
       'HSKKKCDEL', 'ATDALMTGY', 'TPRVTGGGAM', 'FLRGRAYGL', 'TQGYFPDWQNY',
       'EIYKRWII', 'FLKEKGGL', 'KRWIIMGLNK', 'ISPRTLNAW', 'QASQEVKNW',
       'GLNKIVRMY', 'KRWIILGLNK', 'QYDPVAALF', 'YVLDHLIVV', 'FWIDLFETIG',
       'RYPLTFGWCF', 'RYPLTFGW', 'GILGFVFTL', 'YSEHPTFTSQY', 'QIKVRVKMV',
       'HPVGEADYFEY', 'RPPIFIRRL', 'EPLPQGQLTAY', 'VLEETSVML',
       'ALTPVVVTL', 'FPRPWLHGL', 'IIKDYGKQM', 'LPPIVAKEI', 'HPKVSSEVHI',
       'TPQDLNTML', 'TPGPGVRYPL', 'PQPELPYPQPE', 'LLWNGPMAV', 'VTEHDTLLY',
       'LPRRSGAAGA', 'VLFGLGFAI'], dtype=object)

In [68]:
unique_epitopes = set(df_mcpas.loc[~in_vdjdb,"antigen.epitope"].unique()) - set(df["antigen.epitope"].unique())
unique_epitopes

{'AARAVFLAL',
 'ALTPVVVTL',
 'FWIDLFETIG',
 'PQPELPYPQPE',
 'RAFSPEVIPMF',
 'RYPLTFGW',
 'SAYGEPRKL',
 'VLFGLGFAI',
 'YLEPGPVTA'}

In [69]:
df_mcpas.loc[df_mcpas["antigen.epitope"].isin(unique_epitopes),"antigen.epitope"].shape

(64,)

## Find unique and non-unique CDR3-epitope pairs in the McPAS dataset when compared to VDJdb via set overlap

In [12]:
unique = set(df["antigen.epitope"].unique()) - set(df_mcpas["antigen.epitope"].unique())
unique

{'AAGIGILTV',
 'ALGIGILTV',
 'ALWGPDPAAA',
 'AMFWSVPTV',
 'ARMILMTHF',
 'CINGVCWTV',
 'CLGGLLTMV',
 'CLNEYHLFL',
 'CVETMCNEY',
 'CVNGSCFTV',
 'DEEDAIAAY',
 'EEYLKAWTF',
 'EEYLQAFTY',
 'EFFWDANDIY',
 'ELAAIGILTV',
 'ELAGIGALTV',
 'ELAGIGLTV',
 'ELKRKMIYM',
 'ELRRKMMYM',
 'FLGKIWPSHK',
 'FLKEMGGL',
 'FLKEQGGL',
 'FLKETGGL',
 'FLYNLLTRV',
 'FPQGEAREL',
 'FPTKDVAL',
 'GEIYKRWII',
 'GPGHKARVL',
 'GPGMKARVL',
 'GTSGSPIIDK',
 'GTSGSPIINR',
 'GTSGSPIVNR',
 'HPVGDADYFEY',
 'HPVGQADYFEY',
 'IILVAVPHV',
 'ILAKFLHWL',
 'ILKEPVHGV',
 'IPSINVHHY',
 'KIFGSLAFL',
 'KINAWIKVV',
 'KLMNIQQKL',
 'KLSALGINAV',
 'KLVALGINAV',
 'LGYGFVNYI',
 'LLFGFPVYV',
 'LLFGKPVYV',
 'LLFGPVYV',
 'LLFGYAVYV',
 'LLFGYPRYV',
 'LLFGYPVAV',
 'LLFGYPVYV',
 'LLLGIGILV',
 'MLGEQLFPL',
 'MLNIPSINV',
 'MLWGYLQYV',
 'NEGVKAAW',
 'NLSALGIFST',
 'QIKVRVDMV',
 'QVPLRPMTYK',
 'RFPLTFGWCF',
 'RLRPGGKKK',
 'RLRPGGKKR',
 'RLRPGGRKR',
 'RMFPNAPYL',
 'RPHERNGFTV',
 'RPHERNGFTVL',
 'RYPLTLGWCF',
 'SFHSLHLLF',
 'SLLMWITQC',
 'SLLMWITQV',
 'SLY

Check for sequence pairs where the epitope is unique to McPAS

In [70]:
unique = set(df_mcpas["antigen.epitope"].unique()) - set(df["antigen.epitope"].unique())
unique

{'AARAVFLAL',
 'ALTPVVVTL',
 'FWIDLFETIG',
 'PQPELPYPQPE',
 'RAFSPEVIPMF',
 'RYPLTFGW',
 'SAYGEPRKL',
 'VLFGLGFAI',
 'YLEPGPVTA'}

In [71]:
df_mcpas[df_mcpas["antigen.epitope"].isin(unique)].groupby("antigen.epitope").size().sort_values(ascending=False)

antigen.epitope
VLFGLGFAI      15
FWIDLFETIG     14
PQPELPYPQPE    10
RYPLTFGW        8
RAFSPEVIPMF     6
ALTPVVVTL       6
SAYGEPRKL       2
AARAVFLAL       2
YLEPGPVTA       1
dtype: int64

In [72]:
df_mcpas_unique = df_mcpas[df_mcpas["antigen.epitope"].isin(unique)]

In [73]:
# df_mcpas_unique.to_csv(PROJECT_ROOT / "data/interim/mcpas/mcpas-human-trb-mhci-size-unique.csv", index=False, sep=";")

Check for CDR3-epitope pairs where the epitope occurs in the VDJdb dataset, but the actual sequence pair does not.

In [74]:
shared = set(df_mcpas["antigen.epitope"].unique()).intersection(set(df["antigen.epitope"].unique()))
len(shared)

42

In [75]:
df_mcpas[df_mcpas["antigen.epitope"].isin(shared)].groupby("antigen.epitope").size().sort_values(ascending=False)

antigen.epitope
LPRRSGAAGA     2134
GILGFVFTL       992
GLCTLVAML       791
NLVPMVATV       418
VTEHDTLLY       273
EAAGIGILTV      262
LLWNGPMAV       178
RAKFKQLL        137
KAFSPEVIPMF     114
TPRVTGGGAM      111
KRWIILGLNK       98
FPRPWLHGL        85
ELAGIGILTV       62
HPKVSSEVHI       52
TPQDLNTML        45
YSEHPTFTSQY      45
LPPIVAKEI        44
TPGPGVRYPL       42
EIYKRWII         36
YVLDHLIVV        36
RPRGEVRFL        30
RYPLTFGWCF       30
HSKKKCDEL        29
IIKDYGKQM        25
QYDPVAALF        25
QIKVRVKMV        24
CTELKLSDY        24
ATDALMTGY        24
IVTDFSVIK        19
FLKEKGGL         10
ISPRTLNAW        10
AVFDRKSDAK       10
FLRGRAYGL         8
TQGYFPDWQNY       7
KRWIIMGLNK        7
GLNKIVRMY         5
EPLPQGQLTAY       5
HPVGEADYFEY       4
QASQEVKNW         3
RPPIFIRRL         2
EENLLDFVRF        2
VLEETSVML         2
dtype: int64

In [76]:
df_mcpas_shared = df_mcpas[df_mcpas["antigen.epitope"].isin(shared)]
df_mcpas_shared
# this is not enough, the dataset still contains many duplicate entries that already exist in VDJdb.

Unnamed: 0,TRBV_gene,cdr3,TRBJ_gene,antigen.epitope,Epitope species,Reference,Source,merged
5,TRBV7,CASSQGTSQFNEQF,TRBJ2-1,EAAGIGILTV,Melanoma,PubMed ID: 9103444,McPas-TCR,CASSQGTSQFNEQF-EAAGIGILTV
6,TRBV4,CASSSGQGLNIQYF,TRBJ2-4,EAAGIGILTV,Melanoma,PubMed ID: 9103444,McPas-TCR,CASSSGQGLNIQYF-EAAGIGILTV
7,TRBV2,CASSESAGGYYNEQF,TRBJ2-1,IVTDFSVIK,EpsteinBarrvirus(EBV),PubMed ID: 9207000,McPas-TCR,CASSESAGGYYNEQF-IVTDFSVIK
8,TRBV11-01,CASSFGFGSSYGYTF,TRBJ1-2,IVTDFSVIK,EpsteinBarrvirus(EBV),PubMed ID: 9207000,McPas-TCR,CASSFGFGSSYGYTF-IVTDFSVIK
9,TRBV28,CASSLASATGELF,TRBJ2-2,AVFDRKSDAK,EpsteinBarrvirus(EBV),PubMed ID: 9207000,McPas-TCR,CASSLASATGELF-AVFDRKSDAK
...,...,...,...,...,...,...,...,...
6680,TRBV19,GASSIGIFGYTF,TRBJ1-2,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,GASSIGIFGYTF-GILGFVFTL
6681,TRBV19,CASRIGIYGYPF,TRBJ1-2,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,CASRIGIYGYPF-GILGFVFTL
6682,TRBV19,CASSTGSYGYTF,TRBJ1-2,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,CASSTGSYGYTF-GILGFVFTL
6684,TRBV19,CASSSRSAIEQFF,TRBJ2-1,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,CASSSRSAIEQFF-GILGFVFTL


In [77]:
df_mcpas_shared = df_mcpas_shared[["cdr3", "antigen.epitope"]].merge(df[["cdr3", "antigen.epitope"]],indicator = True, how='left').loc[lambda x : x['_merge']=='left_only']
df_mcpas_shared

Unnamed: 0,cdr3,antigen.epitope,_merge
0,CASSQGTSQFNEQF,EAAGIGILTV,left_only
1,CASSSGQGLNIQYF,EAAGIGILTV,left_only
2,CASSESAGGYYNEQF,IVTDFSVIK,left_only
3,CASSFGFGSSYGYTF,IVTDFSVIK,left_only
4,CASSLASATGELF,AVFDRKSDAK,left_only
...,...,...,...
6254,CASSKRSNPPQHF,GILGFVFTL,left_only
6255,GASSIGIFGYTF,GILGFVFTL,left_only
6256,CASRIGIYGYPF,GILGFVFTL,left_only
6258,CASSSRSAIEQFF,GILGFVFTL,left_only


In [78]:
df_mcpas_shared[["cdr3", "antigen.epitope"]].merge(df[["cdr3", "antigen.epitope"]],indicator = True, how='left').loc[lambda x : x['_merge']!='both']["_merge"].value_counts()

left_only     3744
both             0
right_only       0
Name: _merge, dtype: int64

In [79]:
# df_mcpas_shared.to_csv(PROJECT_ROOT / "data/interim/mcpas/mcpas-human-trb-mhci-size-shared.csv", index=False, sep=";")

# VDJdb update

## Without length restriction

In [36]:
# import dataset again to start fresh
vdjdb_normal_path = PROJECT_ROOT / "data/raw/vdjdb/vdjdb-2020-01-20/vdjdb.txt"
df_2020 = pd.read_csv(vdjdb_normal_path, sep="\t")

# expand the json/dict-like columns
df_2020 = (
    df_2020.join(json_normalize(df_2020["method"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("method."))
    .join(json_normalize(df_2020["meta"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("meta."))
    .join(json_normalize(df_2020["cdr3fix"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("cdr3fix."))
    .drop(["method", "meta", "cdr3fix"], axis=1)
)

columns = ["cdr3", "antigen.epitope"]

# create column based on cdr3 and epitope sequences
df_2020["merged"] = df_2020["cdr3"] + "-" + df_2020["antigen.epitope"]

# create length columns for figures
df_2020["cdr3.len"] = df_2020["cdr3"].str.len()
df_2020["antigen.epitope.len"] = df_2020["antigen.epitope"].str.len()

df_2020_human_no10x = df_2020.loc[(df_2020["species"] == "HomoSapiens") & 
#                                  (df_2020["gene"] == "TRB") & 
                                 (df_2020["cdr3fix.good"]) & 
                                 (df_2020["mhc.class"] == "MHCI") &
                                 ~(df_2020["reference.id"].str.contains("10x"))]

# df_2020_human_no10x = df_2020_human_no10x.loc[ (df_2020_human_no10x["antigen.epitope"].str.len() >= 8) &
#                                                  (df_2020_human_no10x["antigen.epitope"].str.len() <= 11) &
#                                                  (df_2020_human_no10x["cdr3"].str.len() >= 10) &
#                                                  (df_2020_human_no10x["cdr3"].str.len() <= 20)]

df_2020_human_no10x = df_2020_human_no10x.drop_duplicates(columns)

print(f"Filtering reduced the number of unique pairs from {df_2020.drop_duplicates(columns).shape[0]} to {df_2020_human_no10x.shape[0]}.")

  import sys
  
  if __name__ == '__main__':


Filtering reduced the number of unique pairs from 61555 to 20218.


In [38]:
df_2020 = df_2020_human_no10x

In [39]:
unique = set(df_2020_human_no10x["antigen.epitope"].unique()) - set(df["antigen.epitope"].unique())
df_2020_human_no10x[df_2020_human_no10x["antigen.epitope"].isin(unique)].groupby("antigen.epitope").size().sort_values(ascending=False)

antigen.epitope
DATYQRTRALVR       190
LPEPLPQGQLTAY        9
CPSQEPMSIYVY         6
ARNLVPMVATVQGQN      3
LSEFCRVLCCYVLEE      2
LPEPLPQGQLTGY        2
LPEPLPQGQLGAY        2
LPEPLPQGQGTAY        2
LPEPLPQGGLTAY        2
LPEPLPQAQLTAY        2
LPEPLGQGQLTAY        2
LPEGLPQGQLTAY        2
GPEPLPQGQLTAY        2
EDVPSGKLFMHVTLG      1
dtype: int64

In [40]:
df_2020_human_no10x.loc[df_2020_human_no10x["antigen.epitope"].isin(unique), "mhc.class"].nunique()

1

New epitopes all fall outside of length restrictions

In [41]:
columns = ["cdr3", "antigen.epitope"]
df_diff_unique = df_2020_human_no10x.drop_duplicates(columns).fillna("NULL").merge(df[columns].drop_duplicates(columns).fillna("NULL"),indicator = True, how='outer', on=columns).loc[lambda x : x['_merge']!='both']
df_diff_unique["_merge"].value_counts()

left_only     376
both            0
right_only      0
Name: _merge, dtype: int64

In [42]:
df_diff_unique[df_diff_unique["cdr3fix.good"]].drop_duplicates(columns).groupby("antigen.epitope").size().sort_values(ascending=False)

antigen.epitope
DATYQRTRALVR       190
NLVPMVATV           58
GILGFVFTL           44
LLLGIGILV           16
ELAGIGILTV          12
LPEPLPQGQLTAY        9
CPSQEPMSIYVY         6
LLWNGPMAV            3
KRWIILGLNK           3
ARNLVPMVATVQGQN      3
LPEPLPQGGLTAY        2
KLVALGINAV           2
CINGVCWTV            2
QVPLRPMTYK           2
LSEFCRVLCCYVLEE      2
LPEPLPQGQLTGY        2
LPEPLPQAQLTAY        2
GPEPLPQGQLTAY        2
LPEPLPQGQLGAY        2
LPEPLPQGQGTAY        2
LPEGLPQGQLTAY        2
LPEPLGQGQLTAY        2
IVTDFSVIK            1
IPSINVHHY            1
GLCTLVAML            1
SFHSLHLLF            1
EDVPSGKLFMHVTLG      1
NEGVKAAW             1
CLGGLLTMV            1
TPQDLNTML            1
dtype: int64

In [43]:
df_diff_unique[df_diff_unique["cdr3fix.good"]].drop_duplicates(columns).groupby("antigen.epitope").size().sort_values(ascending=False)

antigen.epitope
DATYQRTRALVR       190
NLVPMVATV           58
GILGFVFTL           44
LLLGIGILV           16
ELAGIGILTV          12
LPEPLPQGQLTAY        9
CPSQEPMSIYVY         6
LLWNGPMAV            3
KRWIILGLNK           3
ARNLVPMVATVQGQN      3
LPEPLPQGGLTAY        2
KLVALGINAV           2
CINGVCWTV            2
QVPLRPMTYK           2
LSEFCRVLCCYVLEE      2
LPEPLPQGQLTGY        2
LPEPLPQAQLTAY        2
GPEPLPQGQLTAY        2
LPEPLPQGQLGAY        2
LPEPLPQGQGTAY        2
LPEGLPQGQLTAY        2
LPEPLGQGQLTAY        2
IVTDFSVIK            1
IPSINVHHY            1
GLCTLVAML            1
SFHSLHLLF            1
EDVPSGKLFMHVTLG      1
NEGVKAAW             1
CLGGLLTMV            1
TPQDLNTML            1
dtype: int64

## With length restriction

In [44]:
# import dataset again to start fresh
vdjdb_normal_path = PROJECT_ROOT / "data/raw/vdjdb/vdjdb-2020-01-20/vdjdb.txt"
df = pd.read_csv(vdjdb_normal_path, sep="\t")

# expand the json/dict-like columns
df = (
    df.join(json_normalize(df["method"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("method."))
    .join(json_normalize(df["meta"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("meta."))
    .join(json_normalize(df["cdr3fix"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("cdr3fix."))
    .drop(["method", "meta", "cdr3fix"], axis=1)
)

columns = ["cdr3", "antigen.epitope"]

# create column based on cdr3 and epitope sequences
df["merged"] = df["cdr3"] + "-" + df["antigen.epitope"]

# create length columns for figures
df["cdr3.len"] = df["cdr3"].str.len()
df["antigen.epitope.len"] = df["antigen.epitope"].str.len()

df = df.loc[(df["species"] == "HomoSapiens") & 
#                                  (df_2020["gene"] == "TRB") & 
                                 (df["cdr3fix.good"]) & 
                                 (df["mhc.class"] == "MHCI") &
                                 ~(df["reference.id"].str.contains("10x"))]

df = df.loc[ (df["antigen.epitope"].str.len() >= 8) &
                                                 (df["antigen.epitope"].str.len() <= 11) &
                                                 (df["cdr3"].str.len() >= 10) &
                                                 (df["cdr3"].str.len() <= 20)]

df = df.drop_duplicates(columns)

df_jan = df

###

# import dataset again to start fresh
vdjdb_normal_path = PROJECT_ROOT / "data/raw/vdjdb/vdjdb-2019-08-08/vdjdb.txt"
df = pd.read_csv(vdjdb_normal_path, sep="\t")

# expand the json/dict-like columns
df = (
    df.join(json_normalize(df["method"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("method."))
    .join(json_normalize(df["meta"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("meta."))
    .join(json_normalize(df["cdr3fix"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("cdr3fix."))
    .drop(["method", "meta", "cdr3fix"], axis=1)
)

columns = ["cdr3", "antigen.epitope"]

# create column based on cdr3 and epitope sequences
df["merged"] = df["cdr3"] + "-" + df["antigen.epitope"]

# create length columns for figures
df["cdr3.len"] = df["cdr3"].str.len()
df["antigen.epitope.len"] = df["antigen.epitope"].str.len()


df = df.loc[(df["species"] == "HomoSapiens") & 
#                                  (df_2020["gene"] == "TRB") & 
                                 (df["cdr3fix.good"]) & 
                                 (df["mhc.class"] == "MHCI") &
                                 ~(df["reference.id"].str.contains("10x"))]

df = df.loc[ (df["antigen.epitope"].str.len() >= 8) &
                                                 (df["antigen.epitope"].str.len() <= 11) &
                                                 (df["cdr3"].str.len() >= 10) &
                                                 (df["cdr3"].str.len() <= 20)]

df = df.drop_duplicates(columns)

df_aug = df

  import sys
  
  if __name__ == '__main__':


In [45]:
df_diff_unique = df_jan.drop_duplicates(columns).fillna("NULL").merge(df_aug[columns].drop_duplicates(columns).fillna("NULL"),indicator = True, how='outer', on=columns).loc[lambda x : x['_merge']!='both']

In [46]:
df_diff_unique["_merge"].value_counts()

left_only     1
both          0
right_only    0
Name: _merge, dtype: int64

In [47]:
df_diff_unique[df_diff_unique["cdr3fix.good"]].drop_duplicates(columns).groupby("antigen.epitope").size().sort_values(ascending=False)

antigen.epitope
SFHSLHLLF    1
dtype: int64

## 10x data

Some concerns with this data.

In [48]:
# import dataset again to start fresh
vdjdb_normal_path = PROJECT_ROOT / "data/raw/vdjdb/vdjdb-2019-08-08/vdjdb.txt"
df = pd.read_csv(vdjdb_normal_path, sep="\t")

# expand the json/dict-like columns
df = (
    df.join(json_normalize(df["method"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("method."))
    .join(json_normalize(df["meta"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("meta."))
    .join(json_normalize(df["cdr3fix"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("cdr3fix."))
    .drop(["method", "meta", "cdr3fix"], axis=1)
)

columns = ["cdr3", "antigen.epitope"]

# create column based on cdr3 and epitope sequences
df["merged"] = df["cdr3"] + "-" + df["antigen.epitope"]

# create length columns for figures
df["cdr3.len"] = df["cdr3"].str.len()
df["antigen.epitope.len"] = df["antigen.epitope"].str.len()

df_human_10x = df.loc[(df["species"] == "HomoSapiens") & 
                                 (df["gene"] == "TRB") & 
                                 (df["cdr3fix.good"]) & 
                                 (df["mhc.class"] == "MHCI") &
                                 (df["reference.id"].str.contains("10x"))]

df_human_10x = df_human_10x.loc[ (df_human_10x["antigen.epitope"].str.len() >= 8) &
                                                 (df_human_10x["antigen.epitope"].str.len() <= 11) &
                                                 (df_human_10x["cdr3"].str.len() >= 10) &
                                                 (df_human_10x["cdr3"].str.len() <= 20)]

df_human_10x = df_human_10x.drop_duplicates(columns)

print(f"Filtering reduced the number of unique pairs from {df.drop_duplicates(columns).shape[0]} to {df_human_10x.shape[0]}.")

  import sys
  
  if __name__ == '__main__':


Filtering reduced the number of unique pairs from 61047 to 17200.


In [49]:
# import dataset again to start fresh
vdjdb_normal_path = PROJECT_ROOT / "data/raw/vdjdb/vdjdb-2019-08-08/vdjdb.txt"
df = pd.read_csv(vdjdb_normal_path, sep="\t")

# expand the json/dict-like columns
df = (
    df.join(json_normalize(df["method"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("method."))
    .join(json_normalize(df["meta"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("meta."))
    .join(json_normalize(df["cdr3fix"].apply(lambda x: json.loads(r'{}'.format(x)))).add_prefix("cdr3fix."))
    .drop(["method", "meta", "cdr3fix"], axis=1)
)

columns = ["cdr3", "antigen.epitope"]

# create column based on cdr3 and epitope sequences
df["merged"] = df["cdr3"] + "-" + df["antigen.epitope"]

# create length columns for figures
df["cdr3.len"] = df["cdr3"].str.len()
df["antigen.epitope.len"] = df["antigen.epitope"].str.len()

df_human_no10x = df.loc[(df["species"] == "HomoSapiens") & 
#                                  (df["gene"] == "TRB") & 
                                 (df["cdr3fix.good"]) & 
                                 (df["mhc.class"] == "MHCI") &
                                 ~(df["reference.id"].str.contains("10x"))]

df_human_no10x = df_human_no10x.loc[ (df_human_no10x["antigen.epitope"].str.len() >= 8) &
                                                 (df_human_no10x["antigen.epitope"].str.len() <= 11) &
                                                 (df_human_no10x["cdr3"].str.len() >= 10) &
                                                 (df_human_no10x["cdr3"].str.len() <= 20)]

df_human_no10x = df_human_no10x.drop_duplicates(columns)

print(f"Filtering reduced the number of unique pairs from {df.drop_duplicates(columns).shape[0]} to {df_human_no10x.shape[0]}.")

  import sys
  
  if __name__ == '__main__':


Filtering reduced the number of unique pairs from 61047 to 19842.


In [50]:
unique_10x = set(df_human_10x["antigen.epitope"].unique()) - set(df_human_no10x["antigen.epitope"].unique())
df_human_10x[df_human_10x["antigen.epitope"].isin(unique_10x)].groupby("antigen.epitope").size().sort_values(ascending=False)

antigen.epitope
KLGGALQAK     12548
RLRAEAQVK       407
RTLNAWVKV        46
AYAQKIFKI        39
SLFNTVATLY       38
FLASKIGRLV       34
FLYALALLL        28
LLDFVRFMGV       21
KTWGQYWQV        16
CYTWNQMNL        15
MLDLQPETT        14
YLLEMLWRL        13
IMDQVPFSV        12
KVLEYVIKV         8
CLLWSFQTSA        7
YLNDHLEPWI        7
QPRAPIRPI         5
SLFNTVATL         5
CLLGTYTQDV        3
SLYNTVATLY        2
KLQCVDLHV         1
KVAELVHFL         1
RIAAWMATY         1
dtype: int64

In [52]:
df_human_10x[df_human_10x["antigen.epitope"].isin(unique_10x)].to_csv(PROJECT_ROOT / "data/interim/vdjdb-2019-08-08/vdjdb-human-trb-mhci-10xunique-size.csv", index=False, sep=";")


In [53]:
df_human_10x.loc[df_human_10x["antigen.epitope"].isin(unique_10x), "antigen.epitope"].value_counts()

KLGGALQAK     12548
RLRAEAQVK       407
RTLNAWVKV        46
AYAQKIFKI        39
SLFNTVATLY       38
FLASKIGRLV       34
FLYALALLL        28
LLDFVRFMGV       21
KTWGQYWQV        16
CYTWNQMNL        15
MLDLQPETT        14
YLLEMLWRL        13
IMDQVPFSV        12
KVLEYVIKV         8
CLLWSFQTSA        7
YLNDHLEPWI        7
QPRAPIRPI         5
SLFNTVATL         5
CLLGTYTQDV        3
SLYNTVATLY        2
RIAAWMATY         1
KVAELVHFL         1
KLQCVDLHV         1
Name: antigen.epitope, dtype: int64

In [54]:
sample_indices = df_human_10x[df_human_10x["antigen.epitope"] == "KLGGALQAK"].sample(n=400, random_state=42).index
drop_indices = df_human_10x[df_human_10x["antigen.epitope"] == "KLGGALQAK"].index.difference(sample_indices)

In [55]:
df_human_10x[df_human_10x["antigen.epitope"].isin(unique_10x)].drop(drop_indices).to_csv(PROJECT_ROOT / "data/interim/vdjdb-2019-08-08/vdjdb-human-trb-mhci-10xuniquedown-size.csv", index=False, sep=";")