<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#check-overlap-mcpas" data-toc-modified-id="check-overlap-mcpas-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>check overlap mcpas</a></span><ul class="toc-item"><li><span><a href="#VDJdb-update" data-toc-modified-id="VDJdb-update-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>VDJdb update</a></span></li><li><span><a href="#10x-data" data-toc-modified-id="10x-data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>10x data</a></span></li></ul></li></ul></div>

Creation of external validation sets from McPAS-TCR and the held-out 10x Genomics data. The sets are built by retrieving all unique epitopes (i.e. those that do not occur in the VDJdb pairs used for training).

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.lines as lines
import seaborn as sns
import scipy

from src.scripts.evaluate.visualize import predict_variations
from src.config import PROJECT_ROOT

In [2]:
import json

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from pathlib import Path

from src.config import PROJECT_ROOT
from src.scripts.preprocessing import preprocess_vdjdb

# check overlap mcpas

In [3]:
# Import the dataset again to start fresh (VDJdb release 2019-08-08).
vdjdb_normal_path = PROJECT_ROOT / "data/raw/vdjdb/vdjdb-2019-08-08/vdjdb.txt"
df = pd.read_csv(vdjdb_normal_path, sep="\t")

# Expand the JSON-encoded columns into flat "<column>.<key>" columns.
# pd.json_normalize is the supported entry point; the pandas.io.json import used
# previously is deprecated and emits a FutureWarning on every call.
# json.loads is applied directly: wrapping each value in r'{}'.format(x) was a no-op for strings.
json_columns = ["method", "meta", "cdr3fix"]
for json_column in json_columns:
    df = df.join(
        pd.json_normalize(df[json_column].apply(json.loads)).add_prefix(f"{json_column}.")
    )
df = df.drop(json_columns, axis=1)

# Columns that identify a unique CDR3-epitope pair.
columns = ["cdr3", "antigen.epitope"]

# Key column built from the CDR3 and epitope sequences.
df["merged"] = df["cdr3"] + "-" + df["antigen.epitope"]

# Length columns (also used for figures).
df["cdr3.len"] = df["cdr3"].str.len()
df["antigen.epitope.len"] = df["antigen.epitope"].str.len()

# Keep human MHC class I rows where cdr3fix.good is true and whose reference id
# does not mention "10x". No filter on "gene" is applied, so TRA and TRB rows both remain.
keep_record = (
    (df["species"] == "HomoSapiens")
    & (df["cdr3fix.good"])
    & (df["mhc.class"] == "MHCI")
    & ~(df["reference.id"].str.contains("10x"))
)
df_human_no10x = df.loc[keep_record]

# Length restrictions: epitope 8-11 and CDR3 10-20 residues (both bounds inclusive).
# Rows with a missing length compare as False and are dropped, as before.
df_human_no10x = df_human_no10x.loc[
    df_human_no10x["antigen.epitope.len"].between(8, 11)
    & df_human_no10x["cdr3.len"].between(10, 20)
]

df_human_no10x = df_human_no10x.drop_duplicates(columns)

print(f"Filtering reduced the number of unique pairs from {df.drop_duplicates(columns).shape[0]} to {df_human_no10x.shape[0]}.")

  import sys
  
  if __name__ == '__main__':


Filtering reduced the number of unique pairs from 61047 to 19842.


In [4]:
# From here on `df` refers to the filtered, de-duplicated non-10x frame built above;
# the unfiltered frame is no longer reachable under this name.
df = df_human_no10x

In [5]:
# Load the McPAS export relative to the project root rather than from a
# machine-specific absolute path, so the notebook runs on other checkouts.
# NOTE(review): assumes PROJECT_ROOT is the repository root that contains data/raw/mcpas.csv
# (the former path ended in .../deepTCR/data/raw/mcpas.csv) — confirm.
df_mcpas = pd.read_csv(PROJECT_ROOT / "data/raw/mcpas.csv")

In [6]:
# Add a "<CDR3>-<epitope>" key column, built the same way as the "merged" column on the VDJdb frame.
df_mcpas = df_mcpas.assign(
    merged=lambda frame: frame["CDR3_beta"] + "-" + frame["HLA_peptide"]
)

In [7]:
# Apply the same length restrictions as for VDJdb: epitope 8-11, CDR3 10-20 (inclusive).
# Missing sequences yield NaN lengths, which fail both range checks and are dropped.
peptide_len = df_mcpas["HLA_peptide"].str.len()
cdr3_beta_len = df_mcpas["CDR3_beta"].str.len()
df_mcpas = df_mcpas.loc[peptide_len.between(8, 11) & cdr3_beta_len.between(10, 20)]

In [8]:
# Keep the first occurrence of every (CDR3, epitope) pair.
df_mcpas = df_mcpas.drop_duplicates(subset=["CDR3_beta", "HLA_peptide"], keep="first")

In [9]:
# NOTE(review): pd.concat(axis=1, join='inner') lines rows up by index label, not by
# CDR3/epitope sequence, so each displayed row puts an unrelated VDJdb record next to a
# McPAS record. The sequence-level overlap is computed in the cells that follow;
# the variable `merged` is not used again in the visible part of the notebook.
merged = pd.concat([df,df_mcpas], axis=1, join='inner')
merged

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,...,cdr3.len,antigen.epitope.len,TRBV_gene,CDR3_beta,TRBJ_gene,HLA_peptide,Epitope species,Reference,Source,merged
0,0,TRA,CAVTDDKIIF,TRAV12-2*01,TRAJ30*01,HomoSapiens,HLA-A*02,B2M,MHCI,LLWNGPMAV,...,10,9,TRBV27,CASSLGSSYEQYF,TRBJ2-7,YLEPGPVTA,Melanoma,PubMed ID: 8752841,McPas-TCR,CASSLGSSYEQYF-YLEPGPVTA
1,0,TRA,CAVDSGGYQKVTF,TRAV12-2*01,TRAJ13*01,HomoSapiens,HLA-A*02,B2M,MHCI,LLWNGPMAV,...,13,9,TRBV4,CASLAGQGYNEQF,TRBJ2-1,SAYGEPRKL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASLAGQGYNEQF-SAYGEPRKL
2,0,TRA,CAGGDDKIIF,TRAV12-2*01,TRAJ30*01,HomoSapiens,HLA-A*02,B2M,MHCI,LLWNGPMAV,...,10,9,TRBV12,CASLGAQNNEQF,TRBJ2-1,AARAVFLAL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASLGAQNNEQF-AARAVFLAL
3,0,TRA,CAVKDARLMF,TRAV12-2*01,TRAJ31*01,HomoSapiens,HLA-A*02,B2M,MHCI,LLWNGPMAV,...,10,9,TRBV6,CASRLWFWALEAF,TRBJ1-1,SAYGEPRKL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASRLWFWALEAF-SAYGEPRKL
4,0,TRA,CAVGSDKIIF,TRAV12-2*01,TRAJ30*01,HomoSapiens,HLA-A*02,B2M,MHCI,LLWNGPMAV,...,10,9,TRBV6,CASSYSTGDEQYF,TRBJ2-7,AARAVFLAL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASSYSTGDEQYF-AARAVFLAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1560,355,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEMGGL,...,20,8,TRBV27,CASSSGQEAF,TRBJ1-1,TPRVTGGGAM,Cytomegalovirus(CMV),PubMed ID: 25339770,McPas-TCR,CASSSGQEAF-TPRVTGGGAM
1561,0,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEMGGL,...,19,8,TRBV5-07,CASSSGTSGYEQYF,TRBJ2-7,QIKVRVKMV,Cytomegalovirus(CMV),PubMed ID: 25339770,McPas-TCR,CASSSGTSGYEQYF-QIKVRVKMV
1562,356,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEMGGL,...,14,8,TRBV27,CASSSGTSGYYNEQF,TRBJ2-1,GLCTLVAML,EpsteinBarrvirus(EBV),PubMed ID: 25339770,McPas-TCR,CASSSGTSGYYNEQF-GLCTLVAML
1563,356,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEMGGL,...,19,8,TRBV27,CASSSPRESTDTQYF,TRBJ2-3,YSEHPTFTSQY,Cytomegalovirus(CMV),PubMed ID: 25339770,McPas-TCR,CASSSPRESTDTQYF-YSEHPTFTSQY


In [10]:
# Align the McPAS column names with the VDJdb naming so both frames can be merged on them.
mcpas_to_vdjdb_names = {"CDR3_beta": "cdr3", "HLA_peptide": "antigen.epitope"}
df_mcpas = df_mcpas.rename(mcpas_to_vdjdb_names, axis="columns")

In [11]:
# Exact (CDR3, epitope) pairs present in both VDJdb and McPAS: inner merge on the shared columns.
pair_columns = ["cdr3", "antigen.epitope"]
intersection = pd.merge(df[pair_columns], df_mcpas[pair_columns]).drop_duplicates()
intersection
# pd.concat([df.merge(intersection), df_mcpas.merge(intersection)])

Unnamed: 0,cdr3,antigen.epitope
0,CASSAGTGAYEQYF,LLWNGPMAV
1,CSARDRTGNGYTF,GLCTLVAML
2,CSVGTGGTNEKLFF,GLCTLVAML
3,CSVGSGGTNEKLFF,GLCTLVAML
4,CSVGAGGTNEKLFF,GLCTLVAML
...,...,...
2511,CASSIISVDGYTF,GILGFVFTL
2512,CASSTRSGTEQYF,GILGFVFTL
2513,CASSATGSTYEQYF,GILGFVFTL
2514,CASSARATDTQYF,GILGFVFTL


In [12]:
# Flag every McPAS pair whose "<CDR3>-<epitope>" key also occurs in the VDJdb frame.
# Exact membership (isin) replaces a row-wise str.contains scan, which interpreted each
# key as a regular-expression pattern, matched it as a *substring* of the VDJdb keys
# (so a shorter key could hit a longer one), and compared every McPAS row against
# every VDJdb row. The result is still a boolean Series aligned to df_mcpas.
in_vdjdb = df_mcpas["merged"].isin(df["merged"])

In [13]:
# Display the boolean mask (True = McPAS pair also found in VDJdb).
in_vdjdb

0       False
1       False
2       False
3       False
4       False
        ...  
6680    False
6681    False
6682     True
6684    False
6685    False
Name: merged, Length: 6324, dtype: bool

In [14]:
# Number of McPAS pairs flagged as present in VDJdb.
in_vdjdb.sum()

2516

In [15]:
# McPAS rows whose pair was not flagged as present in VDJdb.
df_mcpas.loc[~in_vdjdb]

Unnamed: 0,TRBV_gene,cdr3,TRBJ_gene,antigen.epitope,Epitope species,Reference,Source,merged
0,TRBV27,CASSLGSSYEQYF,TRBJ2-7,YLEPGPVTA,Melanoma,PubMed ID: 8752841,McPas-TCR,CASSLGSSYEQYF-YLEPGPVTA
1,TRBV4,CASLAGQGYNEQF,TRBJ2-1,SAYGEPRKL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASLAGQGYNEQF-SAYGEPRKL
2,TRBV12,CASLGAQNNEQF,TRBJ2-1,AARAVFLAL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASLGAQNNEQF-AARAVFLAL
3,TRBV6,CASRLWFWALEAF,TRBJ1-1,SAYGEPRKL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASRLWFWALEAF-SAYGEPRKL
4,TRBV6,CASSYSTGDEQYF,TRBJ2-7,AARAVFLAL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASSYSTGDEQYF-AARAVFLAL
...,...,...,...,...,...,...,...,...
6677,TRBV19,CASSKRSNPPQHF,TRBJ1-5,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,CASSKRSNPPQHF-GILGFVFTL
6680,TRBV19,GASSIGIFGYTF,TRBJ1-2,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,GASSIGIFGYTF-GILGFVFTL
6681,TRBV19,CASRIGIYGYPF,TRBJ1-2,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,CASRIGIYGYPF-GILGFVFTL
6684,TRBV19,CASSSRSAIEQFF,TRBJ2-1,GILGFVFTL,Influenza,PubMed ID: 28300170,McPas-TCR,CASSSRSAIEQFF-GILGFVFTL


In [16]:
# Distinct epitopes among the McPAS pairs that are not in VDJdb. An epitope listed here
# can still occur in VDJdb with other CDR3s (compare with the set difference computed later).
df_mcpas.loc[~in_vdjdb,"antigen.epitope"].unique()

array(['YLEPGPVTA', 'SAYGEPRKL', 'AARAVFLAL', 'EAAGIGILTV', 'IVTDFSVIK',
       'AVFDRKSDAK', 'GLCTLVAML', 'RAKFKQLL', 'NLVPMVATV', 'EENLLDFVRF',
       'KAFSPEVIPMF', 'RAFSPEVIPMF', 'RPRGEVRFL', 'ELAGIGILTV',
       'HSKKKCDEL', 'ATDALMTGY', 'TPRVTGGGAM', 'FLRGRAYGL', 'TQGYFPDWQNY',
       'EIYKRWII', 'FLKEKGGL', 'KRWIIMGLNK', 'ISPRTLNAW', 'QASQEVKNW',
       'GLNKIVRMY', 'KRWIILGLNK', 'QYDPVAALF', 'YVLDHLIVV', 'FWIDLFETIG',
       'RYPLTFGWCF', 'RYPLTFGW', 'GILGFVFTL', 'YSEHPTFTSQY', 'QIKVRVKMV',
       'HPVGEADYFEY', 'RPPIFIRRL', 'EPLPQGQLTAY', 'VLEETSVML',
       'ALTPVVVTL', 'FPRPWLHGL', 'IIKDYGKQM', 'LPPIVAKEI', 'HPKVSSEVHI',
       'TPQDLNTML', 'TPGPGVRYPL', 'PQPELPYPQPE', 'LLWNGPMAV', 'VTEHDTLLY',
       'LPRRSGAAGA', 'VLFGLGFAI'], dtype=object)

In [17]:
# How many distinct epitopes occur among the unmatched McPAS pairs.
len(df_mcpas.loc[~in_vdjdb,"antigen.epitope"].unique())

50

In [18]:
# NOTE(review): same expression as two cells earlier (distinct epitopes of the unmatched
# McPAS pairs); this cell only repeats that display.
df_mcpas.loc[~in_vdjdb,"antigen.epitope"].unique()

array(['YLEPGPVTA', 'SAYGEPRKL', 'AARAVFLAL', 'EAAGIGILTV', 'IVTDFSVIK',
       'AVFDRKSDAK', 'GLCTLVAML', 'RAKFKQLL', 'NLVPMVATV', 'EENLLDFVRF',
       'KAFSPEVIPMF', 'RAFSPEVIPMF', 'RPRGEVRFL', 'ELAGIGILTV',
       'HSKKKCDEL', 'ATDALMTGY', 'TPRVTGGGAM', 'FLRGRAYGL', 'TQGYFPDWQNY',
       'EIYKRWII', 'FLKEKGGL', 'KRWIIMGLNK', 'ISPRTLNAW', 'QASQEVKNW',
       'GLNKIVRMY', 'KRWIILGLNK', 'QYDPVAALF', 'YVLDHLIVV', 'FWIDLFETIG',
       'RYPLTFGWCF', 'RYPLTFGW', 'GILGFVFTL', 'YSEHPTFTSQY', 'QIKVRVKMV',
       'HPVGEADYFEY', 'RPPIFIRRL', 'EPLPQGQLTAY', 'VLEETSVML',
       'ALTPVVVTL', 'FPRPWLHGL', 'IIKDYGKQM', 'LPPIVAKEI', 'HPKVSSEVHI',
       'TPQDLNTML', 'TPGPGVRYPL', 'PQPELPYPQPE', 'LLWNGPMAV', 'VTEHDTLLY',
       'LPRRSGAAGA', 'VLFGLGFAI'], dtype=object)

In [19]:
# Epitopes of unmatched McPAS pairs that never occur as an epitope in the VDJdb frame.
mcpas_unmatched_epitopes = set(df_mcpas.loc[~in_vdjdb, "antigen.epitope"].unique())
vdjdb_epitopes = set(df["antigen.epitope"].unique())
unique_epitopes = mcpas_unmatched_epitopes.difference(vdjdb_epitopes)
unique_epitopes

{'AARAVFLAL',
 'ALTPVVVTL',
 'FWIDLFETIG',
 'PQPELPYPQPE',
 'RAFSPEVIPMF',
 'RYPLTFGW',
 'SAYGEPRKL',
 'VLFGLGFAI',
 'YLEPGPVTA'}

In [20]:
# Number of McPAS rows (pairs) that carry one of the McPAS-only epitopes.
df_mcpas.loc[df_mcpas["antigen.epitope"].isin(unique_epitopes),"antigen.epitope"].shape

(64,)

In [21]:
# Same set difference, this time over *all* McPAS epitopes rather than only those of
# unmatched pairs: epitopes that occur in McPAS but not in the VDJdb frame.
all_mcpas_epitopes = set(df_mcpas["antigen.epitope"].unique())
all_vdjdb_epitopes = set(df["antigen.epitope"].unique())
unique = all_mcpas_epitopes.difference(all_vdjdb_epitopes)
unique

{'AARAVFLAL',
 'ALTPVVVTL',
 'FWIDLFETIG',
 'PQPELPYPQPE',
 'RAFSPEVIPMF',
 'RYPLTFGW',
 'SAYGEPRKL',
 'VLFGLGFAI',
 'YLEPGPVTA'}

In [22]:
# Preview the McPAS rows whose epitope is absent from the VDJdb frame.
df_mcpas[df_mcpas["antigen.epitope"].isin(unique)]

Unnamed: 0,TRBV_gene,cdr3,TRBJ_gene,antigen.epitope,Epitope species,Reference,Source,merged
0,TRBV27,CASSLGSSYEQYF,TRBJ2-7,YLEPGPVTA,Melanoma,PubMed ID: 8752841,McPas-TCR,CASSLGSSYEQYF-YLEPGPVTA
1,TRBV4,CASLAGQGYNEQF,TRBJ2-1,SAYGEPRKL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASLAGQGYNEQF-SAYGEPRKL
2,TRBV12,CASLGAQNNEQF,TRBJ2-1,AARAVFLAL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASLGAQNNEQF-AARAVFLAL
3,TRBV6,CASRLWFWALEAF,TRBJ1-1,SAYGEPRKL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASRLWFWALEAF-SAYGEPRKL
4,TRBV6,CASSYSTGDEQYF,TRBJ2-7,AARAVFLAL,Melanoma,PubMed ID: 8921424,McPas-TCR,CASSYSTGDEQYF-AARAVFLAL
...,...,...,...,...,...,...,...,...
6528,TRBV28,CASSSFVIAGGDYEQNF,TRBJ2-7,VLFGLGFAI,DiabetesType1,PubMed ID: 28300170,McPas-TCR,CASSSFVIAGGDYEQNF-VLFGLGFAI
6529,TRBV3-1,CASRQDRWEKMGQNIRRF,TRBJ2-4,VLFGLGFAI,DiabetesType1,PubMed ID: 28300170,McPas-TCR,CASRQDRWEKMGQNIRRF-VLFGLGFAI
6530,TRBV6-5,CASSPVPGLDEQFF,TRBJ2-1,VLFGLGFAI,DiabetesType1,PubMed ID: 28300170,McPas-TCR,CASSPVPGLDEQFF-VLFGLGFAI
6531,TRBV24-1,CATSDLPYIGTRLNEQFF,TRBJ2-1,VLFGLGFAI,DiabetesType1,PubMed ID: 28300170,McPas-TCR,CATSDLPYIGTRLNEQFF-VLFGLGFAI


In [23]:
# External validation candidates: McPAS rows restricted to the McPAS-only epitopes.
has_unique_epitope = df_mcpas["antigen.epitope"].isin(unique)
df_mcpas_unique = df_mcpas.loc[has_unique_epitope]

In [24]:
# Write the McPAS validation set as a semicolon-separated file without the index column.
mcpas_unique_path = PROJECT_ROOT / "data/interim/vdjdb-2019-08-08/mcpas-human-trb-mhci-size.csv"
df_mcpas_unique.to_csv(mcpas_unique_path, sep=";", index=False)

## VDJdb update

In [25]:
# Import the newer VDJdb release (2020-01-20) from scratch.
vdjdb_normal_path = PROJECT_ROOT / "data/raw/vdjdb/vdjdb-2020-01-20/vdjdb.txt"
df_2020 = pd.read_csv(vdjdb_normal_path, sep="\t")

# Expand the JSON-encoded columns into flat "<column>.<key>" columns.
# pd.json_normalize is the supported entry point; the pandas.io.json import used
# previously is deprecated and emits a FutureWarning on every call.
# json.loads is applied directly: wrapping each value in r'{}'.format(x) was a no-op for strings.
json_columns = ["method", "meta", "cdr3fix"]
for json_column in json_columns:
    df_2020 = df_2020.join(
        pd.json_normalize(df_2020[json_column].apply(json.loads)).add_prefix(f"{json_column}.")
    )
df_2020 = df_2020.drop(json_columns, axis=1)

# Columns that identify a unique CDR3-epitope pair.
columns = ["cdr3", "antigen.epitope"]

# Key column built from the CDR3 and epitope sequences.
df_2020["merged"] = df_2020["cdr3"] + "-" + df_2020["antigen.epitope"]

# Length columns (also used for figures).
df_2020["cdr3.len"] = df_2020["cdr3"].str.len()
df_2020["antigen.epitope.len"] = df_2020["antigen.epitope"].str.len()

# Keep human MHC class I rows where cdr3fix.good is true and whose reference id
# does not mention "10x". No filter on "gene" is applied, so TRA and TRB rows both remain.
keep_record = (
    (df_2020["species"] == "HomoSapiens")
    & (df_2020["cdr3fix.good"])
    & (df_2020["mhc.class"] == "MHCI")
    & ~(df_2020["reference.id"].str.contains("10x"))
)
df_2020_human_no10x = df_2020.loc[keep_record]

# The epitope (8-11) and CDR3 (10-20) length restrictions are NOT applied in this cell;
# the original filter is kept below, disabled, for reference.
# df_2020_human_no10x = df_2020_human_no10x.loc[ (df_2020_human_no10x["antigen.epitope"].str.len() >= 8) &
#                                                  (df_2020_human_no10x["antigen.epitope"].str.len() <= 11) &
#                                                  (df_2020_human_no10x["cdr3"].str.len() >= 10) &
#                                                  (df_2020_human_no10x["cdr3"].str.len() <= 20)]

df_2020_human_no10x = df_2020_human_no10x.drop_duplicates(columns)

print(f"Filtering reduced the number of unique pairs from {df_2020.drop_duplicates(columns).shape[0]} to {df_2020_human_no10x.shape[0]}.")

  import sys
  
  if __name__ == '__main__':


Filtering reduced the number of unique pairs from 61555 to 20218.


In [26]:
# Rebind `df_2020` to the filtered, de-duplicated frame; the unfiltered 2020 frame is
# no longer reachable under this name afterwards.
df_2020 = df_2020_human_no10x

In [27]:
# Epitopes present in the 2020 frame but absent from `df`, followed by the rows carrying them.
# Note that this rebinds `unique`, which previously held the McPAS-only epitopes.
epitopes_2020 = set(df_2020_human_no10x["antigen.epitope"].unique())
epitopes_in_df = set(df["antigen.epitope"].unique())
unique = epitopes_2020.difference(epitopes_in_df)
df_2020_human_no10x.loc[df_2020_human_no10x["antigen.epitope"].isin(unique)]

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,...,cdr3fix.vId,cdr3fix.oldVEnd,cdr3fix.oldVFixType,cdr3fix.oldVId,cdr3fix.oldJFixType,cdr3fix.oldJStart,cdr3fix.oldJId,merged,cdr3.len,antigen.epitope.len
53237,20790,TRA,CALSGFYNTDKLIF,TRAV19*01,TRAJ34*01,HomoSapiens,HLA-B*35:08,B2M,MHCI,LPEPLPQGQLTAY,...,TRAV19*01,,,,,,,CALSGFYNTDKLIF-LPEPLPQGQLTAY,14,13
53238,20790,TRB,CASSIGTGGSQPQHF,TRBV7-2*01,TRBJ1-5*01,HomoSapiens,HLA-B*35:08,B2M,MHCI,LPEPLPQGQLTAY,...,TRBV7-2*01,,,,,,,CASSIGTGGSQPQHF-LPEPLPQGQLTAY,15,13
53240,20791,TRB,CASSKLGTSEETQYF,TRBV5-6*01,TRBJ2-5*01,HomoSapiens,HLA-B*35:08,B2M,MHCI,LPEPLPQGQLTAY,...,TRBV5-6*01,,,,,,,CASSKLGTSEETQYF-LPEPLPQGQLTAY,15,13
53242,20792,TRB,CASPGLAGEYEQYF,TRBV6-1*01,TRBJ2-7*01,HomoSapiens,HLA-B*35:08,B2M,MHCI,LPEPLPQGQLTAY,...,TRBV6-1*01,,,,,,,CASPGLAGEYEQYF-LPEPLPQGQLTAY,14,13
53244,20793,TRB,CASPGETEAFF,TRBV6-1*01,TRBJ1-1*01,HomoSapiens,HLA-B*35:08,B2M,MHCI,LPEPLPQGQLTAY,...,TRBV6-1*01,,,,,,,CASPGETEAFF-LPEPLPQGQLTAY,11,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76146,24634,TRA,CLVGDLPSGGYQKVTF,TRAV4*01,TRAJ13*01,HomoSapiens,HLA-A*68:01,B2M,MHCI,DATYQRTRALVR,...,TRAV4*01,,,,,,,CLVGDLPSGGYQKVTF-DATYQRTRALVR,16,12
76147,24634,TRB,CSARDRGREKLFF,TRBV20-1*01,TRBJ1-4*01,HomoSapiens,HLA-A*68:01,B2M,MHCI,DATYQRTRALVR,...,TRBV20-1*01,,,,,,,CSARDRGREKLFF-DATYQRTRALVR,13,12
76148,24635,TRA,CAVNNNNDMRF,TRAV12-2*01,TRAJ43*01,HomoSapiens,HLA-A*68:01,B2M,MHCI,DATYQRTRALVR,...,TRAV12-2*01,,,,,,,CAVNNNNDMRF-DATYQRTRALVR,11,12
76149,24635,TRB,CSGSQDPYEQYF,TRBV20-1*01,TRBJ2-7*01,HomoSapiens,HLA-A*68:01,B2M,MHCI,DATYQRTRALVR,...,TRBV20-1*01,,,,,,,CSGSQDPYEQYF-DATYQRTRALVR,12,12


In [28]:
# Count the distinct MHC classes among rows with a new epitope. The frame was already
# restricted to mhc.class == "MHCI" when it was built, so a single class is expected.
df_2020_human_no10x.loc[df_2020_human_no10x["antigen.epitope"].isin(unique), "mhc.class"].nunique()

1

All new epitopes fall outside the length restrictions applied to the training data (epitope length 8–11).

In [29]:
# Pairs that are not shared between the 2020 frame and `df`: outer merge on the pair
# columns with an indicator column, then drop every row found in both.
# NaN is replaced by the string "NULL" on both sides before merging.
columns = ["cdr3", "antigen.epitope"]
pairs_2020 = df_2020_human_no10x.drop_duplicates(columns).fillna("NULL")
pairs_in_df = df[columns].drop_duplicates(columns).fillna("NULL")
df_diff_unique = (
    pairs_2020
    .merge(pairs_in_df, indicator=True, how="outer", on=columns)
    .loc[lambda x: x["_merge"] != "both"]
)
df_diff_unique["_merge"].value_counts()

left_only     376
both            0
right_only      0
Name: _merge, dtype: int64

In [30]:
# Number of differing pairs per epitope (rows where cdr3fix.good is true), largest first.
good_diff_pairs = df_diff_unique[df_diff_unique["cdr3fix.good"]].drop_duplicates(columns)
pairs_per_epitope = good_diff_pairs.groupby("antigen.epitope").size()
pairs_per_epitope.sort_values(ascending=False)

antigen.epitope
DATYQRTRALVR       190
NLVPMVATV           58
GILGFVFTL           44
LLLGIGILV           16
ELAGIGILTV          12
LPEPLPQGQLTAY        9
CPSQEPMSIYVY         6
LLWNGPMAV            3
KRWIILGLNK           3
ARNLVPMVATVQGQN      3
LPEPLPQGGLTAY        2
KLVALGINAV           2
CINGVCWTV            2
QVPLRPMTYK           2
LSEFCRVLCCYVLEE      2
LPEPLPQGQLTGY        2
LPEPLPQAQLTAY        2
GPEPLPQGQLTAY        2
LPEPLPQGQLGAY        2
LPEPLPQGQGTAY        2
LPEGLPQGQLTAY        2
LPEPLGQGQLTAY        2
IVTDFSVIK            1
IPSINVHHY            1
GLCTLVAML            1
SFHSLHLLF            1
EDVPSGKLFMHVTLG      1
NEGVKAAW             1
CLGGLLTMV            1
TPQDLNTML            1
dtype: int64

In [31]:
# NOTE(review): identical to the previous cell (differing pairs per epitope, largest first);
# this cell only repeats that display.
df_diff_unique[df_diff_unique["cdr3fix.good"]].drop_duplicates(columns).groupby("antigen.epitope").size().sort_values(ascending=False)

antigen.epitope
DATYQRTRALVR       190
NLVPMVATV           58
GILGFVFTL           44
LLLGIGILV           16
ELAGIGILTV          12
LPEPLPQGQLTAY        9
CPSQEPMSIYVY         6
LLWNGPMAV            3
KRWIILGLNK           3
ARNLVPMVATVQGQN      3
LPEPLPQGGLTAY        2
KLVALGINAV           2
CINGVCWTV            2
QVPLRPMTYK           2
LSEFCRVLCCYVLEE      2
LPEPLPQGQLTGY        2
LPEPLPQAQLTAY        2
GPEPLPQGQLTAY        2
LPEPLPQGQLGAY        2
LPEPLPQGQGTAY        2
LPEGLPQGQLTAY        2
LPEPLGQGQLTAY        2
IVTDFSVIK            1
IPSINVHHY            1
GLCTLVAML            1
SFHSLHLLF            1
EDVPSGKLFMHVTLG      1
NEGVKAAW             1
CLGGLLTMV            1
TPQDLNTML            1
dtype: int64

In [32]:
def _load_filtered_vdjdb(path, pair_columns):
    """Load one VDJdb export and apply this notebook's filters, length limits included.

    The tab-separated file is read, its JSON-encoded ``method``, ``meta`` and
    ``cdr3fix`` columns are expanded into prefixed flat columns, and the helper
    columns ``merged``, ``cdr3.len`` and ``antigen.epitope.len`` are added. Rows are
    then restricted to species "HomoSapiens", ``mhc.class`` "MHCI", a true
    ``cdr3fix.good`` and a ``reference.id`` that does not contain "10x"; epitope
    length must be 8-11 and CDR3 length 10-20 (inclusive). No filter on ``gene``
    is applied.

    Parameters
    ----------
    path : path-like
        Location of the ``vdjdb.txt`` export to read.
    pair_columns : list of str
        Columns identifying a pair; used to drop duplicate rows.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with duplicates on ``pair_columns`` removed.
    """
    frame = pd.read_csv(path, sep="\t")

    # Expand the json/dict-like columns. pd.json_normalize replaces the deprecated
    # pandas.io.json.json_normalize import, which emits a FutureWarning.
    json_columns = ["method", "meta", "cdr3fix"]
    for json_column in json_columns:
        frame = frame.join(
            pd.json_normalize(frame[json_column].apply(json.loads)).add_prefix(f"{json_column}.")
        )
    frame = frame.drop(json_columns, axis=1)

    # create column based on cdr3 and epitope sequences
    frame["merged"] = frame["cdr3"] + "-" + frame["antigen.epitope"]

    # create length columns for figures
    frame["cdr3.len"] = frame["cdr3"].str.len()
    frame["antigen.epitope.len"] = frame["antigen.epitope"].str.len()

    frame = frame.loc[
        (frame["species"] == "HomoSapiens")
        & (frame["cdr3fix.good"])
        & (frame["mhc.class"] == "MHCI")
        & ~(frame["reference.id"].str.contains("10x"))
    ]

    frame = frame.loc[
        (frame["antigen.epitope"].str.len() >= 8)
        & (frame["antigen.epitope"].str.len() <= 11)
        & (frame["cdr3"].str.len() >= 10)
        & (frame["cdr3"].str.len() <= 20)
    ]

    return frame.drop_duplicates(pair_columns)


columns = ["cdr3", "antigen.epitope"]

# January 2020 release, filtered identically to the training data.
vdjdb_jan_path = PROJECT_ROOT / "data/raw/vdjdb/vdjdb-2020-01-20/vdjdb.txt"
df_jan = _load_filtered_vdjdb(vdjdb_jan_path, columns)

###

# August 2019 release; `vdjdb_normal_path` ends up pointing here, as it did before.
vdjdb_normal_path = PROJECT_ROOT / "data/raw/vdjdb/vdjdb-2019-08-08/vdjdb.txt"
df_aug = _load_filtered_vdjdb(vdjdb_normal_path, columns)

# The original cell left `df` bound to the August frame; keep that for later cells.
df = df_aug

  import sys
  
  if __name__ == '__main__':


In [33]:
# df_diff = df_aug.fillna("NULL").merge(df_jan.fillna("NULL"),indicator = True, how='outer').loc[lambda x : x['_merge']!='both']

In [34]:
# Pairs that differ between the January and August frames once both are filtered
# identically: outer merge with an indicator column, keeping rows not found in both.
pairs_jan = df_jan.drop_duplicates(columns).fillna("NULL")
pairs_aug = df_aug[columns].drop_duplicates(columns).fillna("NULL")
df_diff_unique = (
    pairs_jan
    .merge(pairs_aug, indicator=True, how="outer", on=columns)
    .loc[lambda x: x["_merge"] != "both"]
)

In [35]:
# Count the differing pairs by origin: left_only = January frame only, right_only = August frame only.
df_diff_unique["_merge"].value_counts()

left_only     1
both          0
right_only    0
Name: _merge, dtype: int64

In [36]:
# Differing pairs per epitope (rows where cdr3fix.good is true), largest first.
df_diff_unique[df_diff_unique["cdr3fix.good"]].drop_duplicates(columns).groupby("antigen.epitope").size().sort_values(ascending=False)

antigen.epitope
SFHSLHLLF    1
dtype: int64

## 10x data

In [37]:
# Import the dataset again to start fresh (VDJdb release 2019-08-08).
vdjdb_normal_path = PROJECT_ROOT / "data/raw/vdjdb/vdjdb-2019-08-08/vdjdb.txt"
df = pd.read_csv(vdjdb_normal_path, sep="\t")

# Expand the JSON-encoded columns into flat "<column>.<key>" columns.
# pd.json_normalize is the supported entry point; the pandas.io.json import used
# previously is deprecated and emits a FutureWarning on every call.
# json.loads is applied directly: wrapping each value in r'{}'.format(x) was a no-op for strings.
json_columns = ["method", "meta", "cdr3fix"]
for json_column in json_columns:
    df = df.join(
        pd.json_normalize(df[json_column].apply(json.loads)).add_prefix(f"{json_column}.")
    )
df = df.drop(json_columns, axis=1)

# Columns that identify a unique CDR3-epitope pair.
columns = ["cdr3", "antigen.epitope"]

# Key column built from the CDR3 and epitope sequences.
df["merged"] = df["cdr3"] + "-" + df["antigen.epitope"]

# Length columns (also used for figures).
df["cdr3.len"] = df["cdr3"].str.len()
df["antigen.epitope.len"] = df["antigen.epitope"].str.len()

# Held-out 10x subset: human, TRB gene, MHC class I rows where cdr3fix.good is true
# and whose reference id contains "10x" (the complement of the non-10x reference filter).
keep_record = (
    (df["species"] == "HomoSapiens")
    & (df["gene"] == "TRB")
    & (df["cdr3fix.good"])
    & (df["mhc.class"] == "MHCI")
    & (df["reference.id"].str.contains("10x"))
)
df_human_10x = df.loc[keep_record]

# Length restrictions: epitope 8-11 and CDR3 10-20 residues (both bounds inclusive).
# Rows with a missing length compare as False and are dropped, as before.
df_human_10x = df_human_10x.loc[
    df_human_10x["antigen.epitope.len"].between(8, 11)
    & df_human_10x["cdr3.len"].between(10, 20)
]

df_human_10x = df_human_10x.drop_duplicates(columns)

print(f"Filtering reduced the number of unique pairs from {df.drop_duplicates(columns).shape[0]} to {df_human_10x.shape[0]}.")

  import sys
  
  if __name__ == '__main__':


Filtering reduced the number of unique pairs from 61047 to 17200.


In [38]:
# Rebuild the non-10x frame from the 2019-08-08 release: `df` and `df_human_no10x`
# were rebound by earlier cells, so they are recreated here from the raw file.
vdjdb_normal_path = PROJECT_ROOT / "data/raw/vdjdb/vdjdb-2019-08-08/vdjdb.txt"
df = pd.read_csv(vdjdb_normal_path, sep="\t")

# Expand the JSON-encoded columns into flat "<column>.<key>" columns.
# pd.json_normalize is the supported entry point; the pandas.io.json import used
# previously is deprecated and emits a FutureWarning on every call.
# json.loads is applied directly: wrapping each value in r'{}'.format(x) was a no-op for strings.
json_columns = ["method", "meta", "cdr3fix"]
for json_column in json_columns:
    df = df.join(
        pd.json_normalize(df[json_column].apply(json.loads)).add_prefix(f"{json_column}.")
    )
df = df.drop(json_columns, axis=1)

# Columns that identify a unique CDR3-epitope pair.
columns = ["cdr3", "antigen.epitope"]

# Key column built from the CDR3 and epitope sequences.
df["merged"] = df["cdr3"] + "-" + df["antigen.epitope"]

# Length columns (also used for figures).
df["cdr3.len"] = df["cdr3"].str.len()
df["antigen.epitope.len"] = df["antigen.epitope"].str.len()

# Keep human MHC class I rows where cdr3fix.good is true and whose reference id
# does not mention "10x". No filter on "gene" is applied, so TRA and TRB rows both remain.
keep_record = (
    (df["species"] == "HomoSapiens")
    & (df["cdr3fix.good"])
    & (df["mhc.class"] == "MHCI")
    & ~(df["reference.id"].str.contains("10x"))
)
df_human_no10x = df.loc[keep_record]

# Length restrictions: epitope 8-11 and CDR3 10-20 residues (both bounds inclusive).
# Rows with a missing length compare as False and are dropped, as before.
df_human_no10x = df_human_no10x.loc[
    df_human_no10x["antigen.epitope.len"].between(8, 11)
    & df_human_no10x["cdr3.len"].between(10, 20)
]

df_human_no10x = df_human_no10x.drop_duplicates(columns)

print(f"Filtering reduced the number of unique pairs from {df.drop_duplicates(columns).shape[0]} to {df_human_no10x.shape[0]}.")

  import sys
  
  if __name__ == '__main__':


Filtering reduced the number of unique pairs from 61047 to 19842.


In [39]:
# Epitopes that occur only in the 10x subset, followed by the 10x rows carrying them.
epitopes_10x = set(df_human_10x["antigen.epitope"].unique())
epitopes_no10x = set(df_human_no10x["antigen.epitope"].unique())
unique_10x = epitopes_10x.difference(epitopes_no10x)
df_human_10x.loc[df_human_10x["antigen.epitope"].isin(unique_10x)]

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,...,cdr3fix.vId,cdr3fix.oldVEnd,cdr3fix.oldVFixType,cdr3fix.oldVId,cdr3fix.oldJFixType,cdr3fix.oldJId,cdr3fix.oldJStart,merged,cdr3.len,antigen.epitope.len
1567,358,TRB,CASSEGWHSYEQYF,TRBV6-1*01,TRBJ2-7*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV6-1*01,,,,,,,CASSEGWHSYEQYF-KLGGALQAK,14,9
1569,359,TRB,CASGLNIDGDEQFF,TRBV12-5*01,TRBJ2-1*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV12-5*01,,,,,,,CASGLNIDGDEQFF-KLGGALQAK,14,9
1575,362,TRB,CASSVMLDSPLHF,TRBV9*01,TRBJ1-6*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV9*01,,,,,,,CASSVMLDSPLHF-KLGGALQAK,13,9
1577,363,TRB,CASSLESGFLSGYTF,TRBV7-9*01,TRBJ1-2*01,HomoSapiens,HLA-A*24:02,B2M,MHCI,CYTWNQMNL,...,TRBV7-9*01,,,,,,,CASSLESGFLSGYTF-CYTWNQMNL,15,9
1579,364,TRB,CASSFTYRDTQYF,TRBV28*01,TRBJ2-3*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV28*01,,,,,,,CASSFTYRDTQYF-KLGGALQAK,13,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42267,20708,TRB,CASSLGGGRPQHF,TRBV7-8*01,TRBJ1-5*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV7-8*01,,,,,,,CASSLGGGRPQHF-KLGGALQAK,13,9
42271,20710,TRB,CASSHTSADEQFF,TRBV4-3*01,TRBJ2-1*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,RLRAEAQVK,...,TRBV4-3*01,,,,,,,CASSHTSADEQFF-RLRAEAQVK,13,9
42275,20712,TRB,CASSWRTRDNQPQHF,TRBV28*01,TRBJ1-5*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV28*01,,,,,,,CASSWRTRDNQPQHF-KLGGALQAK,15,9
42279,20714,TRB,CASSQDVRGSNSPLHF,TRBV4-3*01,TRBJ1-6*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV4-3*01,,,,,,,CASSQDVRGSNSPLHF-KLGGALQAK,16,9


In [40]:
# Display the set of epitopes found only in the 10x subset.
unique_10x

{'AYAQKIFKI',
 'CLLGTYTQDV',
 'CLLWSFQTSA',
 'CYTWNQMNL',
 'FLASKIGRLV',
 'FLYALALLL',
 'IMDQVPFSV',
 'KLGGALQAK',
 'KLQCVDLHV',
 'KTWGQYWQV',
 'KVAELVHFL',
 'KVLEYVIKV',
 'LLDFVRFMGV',
 'MLDLQPETT',
 'QPRAPIRPI',
 'RIAAWMATY',
 'RLRAEAQVK',
 'RTLNAWVKV',
 'SLFNTVATL',
 'SLFNTVATLY',
 'SLYNTVATLY',
 'YLLEMLWRL',
 'YLNDHLEPWI'}

In [41]:
# The same 10x-only epitopes, listed in order of first appearance in the 10x frame.
df_human_10x.loc[df_human_10x["antigen.epitope"].isin(unique_10x), "antigen.epitope"].unique()

array(['KLGGALQAK', 'CYTWNQMNL', 'RIAAWMATY', 'SLFNTVATLY', 'RLRAEAQVK',
       'LLDFVRFMGV', 'AYAQKIFKI', 'FLYALALLL', 'FLASKIGRLV', 'MLDLQPETT',
       'IMDQVPFSV', 'YLLEMLWRL', 'KVLEYVIKV', 'RTLNAWVKV', 'KTWGQYWQV',
       'YLNDHLEPWI', 'QPRAPIRPI', 'SLFNTVATL', 'CLLWSFQTSA', 'CLLGTYTQDV',
       'KVAELVHFL', 'SLYNTVATLY', 'KLQCVDLHV'], dtype=object)

In [42]:
# Spot check: a 10x-only epitope must not appear in the non-10x frame, so this selection should be empty.
df_human_no10x[df_human_no10x["antigen.epitope"] == "MLDLQPETT"]

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,...,cdr3fix.vId,cdr3fix.oldVEnd,cdr3fix.oldVFixType,cdr3fix.oldVId,cdr3fix.oldJFixType,cdr3fix.oldJId,cdr3fix.oldJStart,merged,cdr3.len,antigen.epitope.len


In [43]:
# Number of 10x pairs per 10x-only epitope, most frequent first.
df_human_10x.loc[df_human_10x["antigen.epitope"].isin(unique_10x), "antigen.epitope"].value_counts()

KLGGALQAK     12548
RLRAEAQVK       407
RTLNAWVKV        46
AYAQKIFKI        39
SLFNTVATLY       38
FLASKIGRLV       34
FLYALALLL        28
LLDFVRFMGV       21
KTWGQYWQV        16
CYTWNQMNL        15
MLDLQPETT        14
YLLEMLWRL        13
IMDQVPFSV        12
KVLEYVIKV         8
YLNDHLEPWI        7
CLLWSFQTSA        7
QPRAPIRPI         5
SLFNTVATL         5
CLLGTYTQDV        3
SLYNTVATLY        2
KVAELVHFL         1
KLQCVDLHV         1
RIAAWMATY         1
Name: antigen.epitope, dtype: int64

In [44]:
# Preview the 10x rows restricted to the 10x-only epitopes (the selection that is written to disk next).
df_human_10x[df_human_10x["antigen.epitope"].isin(unique_10x)]

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,...,cdr3fix.vId,cdr3fix.oldVEnd,cdr3fix.oldVFixType,cdr3fix.oldVId,cdr3fix.oldJFixType,cdr3fix.oldJId,cdr3fix.oldJStart,merged,cdr3.len,antigen.epitope.len
1567,358,TRB,CASSEGWHSYEQYF,TRBV6-1*01,TRBJ2-7*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV6-1*01,,,,,,,CASSEGWHSYEQYF-KLGGALQAK,14,9
1569,359,TRB,CASGLNIDGDEQFF,TRBV12-5*01,TRBJ2-1*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV12-5*01,,,,,,,CASGLNIDGDEQFF-KLGGALQAK,14,9
1575,362,TRB,CASSVMLDSPLHF,TRBV9*01,TRBJ1-6*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV9*01,,,,,,,CASSVMLDSPLHF-KLGGALQAK,13,9
1577,363,TRB,CASSLESGFLSGYTF,TRBV7-9*01,TRBJ1-2*01,HomoSapiens,HLA-A*24:02,B2M,MHCI,CYTWNQMNL,...,TRBV7-9*01,,,,,,,CASSLESGFLSGYTF-CYTWNQMNL,15,9
1579,364,TRB,CASSFTYRDTQYF,TRBV28*01,TRBJ2-3*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV28*01,,,,,,,CASSFTYRDTQYF-KLGGALQAK,13,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42267,20708,TRB,CASSLGGGRPQHF,TRBV7-8*01,TRBJ1-5*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV7-8*01,,,,,,,CASSLGGGRPQHF-KLGGALQAK,13,9
42271,20710,TRB,CASSHTSADEQFF,TRBV4-3*01,TRBJ2-1*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,RLRAEAQVK,...,TRBV4-3*01,,,,,,,CASSHTSADEQFF-RLRAEAQVK,13,9
42275,20712,TRB,CASSWRTRDNQPQHF,TRBV28*01,TRBJ1-5*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV28*01,,,,,,,CASSWRTRDNQPQHF-KLGGALQAK,15,9
42279,20714,TRB,CASSQDVRGSNSPLHF,TRBV4-3*01,TRBJ1-6*01,HomoSapiens,HLA-A*03:01,B2M,MHCI,KLGGALQAK,...,TRBV4-3*01,,,,,,,CASSQDVRGSNSPLHF-KLGGALQAK,16,9


In [45]:
# Write all 10x rows with a 10x-only epitope as a semicolon-separated file without the index column.
unique_10x_rows = df_human_10x.loc[df_human_10x["antigen.epitope"].isin(unique_10x)]
unique_10x_path = PROJECT_ROOT / "data/interim/vdjdb-2019-08-08/vdjdb-human-trb-mhci-10xunique-size.csv"
unique_10x_rows.to_csv(unique_10x_path, sep=";", index=False)


In [46]:
# NOTE(review): same per-epitope count as shown a few cells earlier; repeated here ahead of the
# down-sampling of the dominant epitope below.
df_human_10x.loc[df_human_10x["antigen.epitope"].isin(unique_10x), "antigen.epitope"].value_counts()

KLGGALQAK     12548
RLRAEAQVK       407
RTLNAWVKV        46
AYAQKIFKI        39
SLFNTVATLY       38
FLASKIGRLV       34
FLYALALLL        28
LLDFVRFMGV       21
KTWGQYWQV        16
CYTWNQMNL        15
MLDLQPETT        14
YLLEMLWRL        13
IMDQVPFSV        12
KVLEYVIKV         8
YLNDHLEPWI        7
CLLWSFQTSA        7
QPRAPIRPI         5
SLFNTVATL         5
CLLGTYTQDV        3
SLYNTVATLY        2
KVAELVHFL         1
KLQCVDLHV         1
RIAAWMATY         1
Name: antigen.epitope, dtype: int64

In [47]:
# Down-sample the dominant epitope KLGGALQAK: draw 400 of its rows with a fixed seed and
# collect the index labels of all remaining KLGGALQAK rows so they can be dropped.
klggalqak_rows = df_human_10x.loc[df_human_10x["antigen.epitope"] == "KLGGALQAK"]
sample_indices = klggalqak_rows.sample(n=400, random_state=42).index
drop_indices = klggalqak_rows.index.difference(sample_indices)

In [48]:
# Write the 10x-only validation set after removing the KLGGALQAK rows that were not sampled.
downsampled_10x_rows = (
    df_human_10x.loc[df_human_10x["antigen.epitope"].isin(unique_10x)].drop(drop_indices)
)
downsampled_10x_path = PROJECT_ROOT / "data/interim/vdjdb-2019-08-08/vdjdb-human-trb-mhci-10xuniquedown-size.csv"
downsampled_10x_rows.to_csv(downsampled_10x_path, sep=";", index=False)