In [116]:
import pandas as pd
import glob
import numpy as np

In [26]:
def get_junction_coordinates(df, coordinates_col, sep=':', shift_to_1based=True):
    df['strand'] = None
    df['junction_coordinate'] = None

    for idx, row in df.iterrows():
        kmer_coordinates = [int(x) for x in row[coordinates_col].split(sep) if x !='None']

        if kmer_coordinates[1] < kmer_coordinates[2]: # order strand +

            df.loc[idx, 'strand'] = '+'
            if len(kmer_coordinates) == 4:  # 2 exons
                df.loc[idx, 'junction_coordinate'] = ':'.join([str(x) for x in kmer_coordinates[1:3]])
            elif len(kmer_coordinates) == 6:
                df.loc[idx, 'junction_coordinate'] = ':'.join([str(x) for x in kmer_coordinates[1:5]])
        else: # order strand +
            df.loc[idx, 'strand'] = '-'
            if len(kmer_coordinates) == 4:  # 2 exons
                df.loc[idx, 'junction_coordinate'] = ':'.join([str(x) for x in [kmer_coordinates[3],
                                                                                kmer_coordinates[0]]])
            elif len(kmer_coordinates) == 6:
                df.loc[idx, 'junction_coordinate'] = ':'.join([str(x) for x in [kmer_coordinates[3],
                                                                                kmer_coordinates[0],
                                                                                kmer_coordinates[2],
                                                                                kmer_coordinates[5]
                                                                               ]])
    return df




### Load ETH

In [133]:
batch = 'tmp_out_ref_batch_601'
kmer_files = glob.glob(f'/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/cohort_mutNone/{batch}/ref_graph_kmer_JuncExpr/part*')
kmer_df = pd.concat([ pd.read_csv(kmer_file, sep = '\t') for kmer_file in kmer_files], axis = 0) 
protein_meta = pd.read_csv(f'/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/cohort_mutNone/{batch}/ref_sample_peptides_meta.gz', sep = '\t')

kmer_df = get_junction_coordinates(kmer_df, 'coord')


In [134]:
kmer_df.head()

Unnamed: 0,kmer,coord,isCrossJunction,junctionAnnotated,readFrameAnnotated,TCGA3CAAAU01A11RA41B07all,TCGA3CAALI01A11RA41B07all,TCGA3CAALJ01A31RA41B07all,TCGA3CAALK01A11RA41B07all,TCGA4HAAAK01A12RA41B07all,...,TCGAV7A7HQ01A11RA33J07all,TCGAW8A86G01A21RA36F07all,TCGAWTAB4101A11RA41B07all,TCGAWTAB4401A11RA41B07all,TCGAXXA89901A11RA36F07all,TCGAXXA89A01A11RA36F07all,TCGAZ7A8R501A42RA41B07all,TCGAZ7A8R601A11RA41B07all,strand,junction_coordinate
0,VFHICDRVS,17267946:17267962:17269071:17269082:None:None,True,False,False,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,+,17267962:17269071
1,PSCSRGLLM,17276852:17276876:17280349:17280352:None:None,True,True,False,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,1.0,0.0,0.0,+,17276876:17280349
2,GSRMRWSWA,17271170:17271178:17273339:17273358:None:None,True,True,False,0.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,4.0,0.0,1.0,+,17271178:17273339
3,DFPYKRILV,17273423:17273447:17276518:17276521:None:None,True,False,False,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,+,17273447:17276518
4,SWSSGRRVT,17273442:17273447:17276518:17276540:None:None,True,False,False,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,+,17273447:17276518


In [127]:
protein_meta.head(3)



Unnamed: 0,peptide,id,readFrame,readFrameAnnotated,geneName,geneChr,geneStrand,mutationMode,junctionAnnotated,hasStopCodon,isInJunctionList,isIsolated,variantComb,variantSegExpr,modifiedExonsCoord,originalExonsCoord,vertexIdx,junctionExpr,segmentExpr,kmerType
0,VIASLQQQVDFQETQLRKINTENETLQKELRERRQQLQAMTDKFSN...,ENSG00000162592.10:15_16:0:3763707:2-exons,0,False,ENSG00000162592.10,chr1,+,ref,0,,0,,,3763707;3763836;3766534;3766612,3763705;3763836;3766534;3766612,15;16,2-exons,,,
1,QVSELERKLTKRDCVISELDTKVSQLQEQVELDQNHLQRWKQLQED...,ENSG00000162592.10:18_19:0:3767232:2-exons,0,False,ENSG00000162592.10,chr1,+,ref,0,,0,,,3767232;3767445;3769782;3769887,3767232;3767445;3769782;3769887,18;19,2-exons,,,
2,LERLRNKIIQATFSISGTKSLANEISDNDILEALQRIISERSDYYN...,ENSG00000162592.10:19_20:0:3769782:2-exons,0,False,ENSG00000162592.10,chr1,+,ref,1,,0,,,3769782;3769887;3771400;3771643,3769782;3769887;3771400;3771645,19;20,2-exons,,,


In [None]:
protein_meta = get_junction_coordinates(protein_meta, 'modifiedExonsCoord', sep=';')

### Load OHSU

In [252]:
ohsu_df = pd.read_csv('/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/OHSU_Feb2023_complete-annotated-shortlist.tsv.gz', sep = '\t')


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [187]:
ohsu_df.columns

Index(['jx', 'TCGA-24-1431-01A-01R-1566-13', 'TCGA-24-2298-01A-01R-1569-13',
       'TCGA-25-1313-01A-01R-1565-13', 'TCGA-25-1319-01A-01R-1565-13',
       'TCGA-61-2008-01A-02R-1568-13', 'TCGA-A2-A0D2-01A-21R-A034-07',
       'TCGA-A2-A0SX-01A-12R-A084-07', 'TCGA-AO-A0JM-01A-21R-A056-07',
       'TCGA-BH-A18V-01A-11R-A12D-07', 'TCGA-C8-A12P-01A-11R-A115-07',
       'GTEx_brain_testis_over0', 'GTEx_brain_testis_over1',
       'GTEx_brain_testis_over10', 'GTEx_brain_testis_over2',
       'GTEx_brain_testis_over3', 'GTEx_brain_testis_over5',
       'GTEx_breast_over0', 'GTEx_breast_over1', 'GTEx_breast_over10',
       'GTEx_breast_over2', 'GTEx_breast_over3', 'GTEx_breast_over5',
       'GTEx_core_over0', 'GTEx_core_over1', 'GTEx_core_over10',
       'GTEx_core_over2', 'GTEx_core_over3', 'GTEx_core_over5',
       'GTEx_ovary_over0', 'GTEx_ovary_over1', 'GTEx_ovary_over10',
       'GTEx_ovary_over2', 'GTEx_ovary_over3', 'GTEx_ovary_over5',
       'all_TCGA_over0', 'all_TCGA_over1', 'all_TC

In [45]:
ohsu_df['in-frame_neoepitopes'] = ohsu_df['in-frame_neoepitopes'].str.split(';')
ohsu_df = ohsu_df.explode("in-frame_neoepitopes")

### Compare

In [173]:
kmer = 'INPRQATNP' #'NMASWGRAT' in batch 101 -   #HTSDNQCQE, NQTTSPAPF , QTTSPAPFV in batch 100 +
search = True

In [135]:
if search:
    for kmer in kmer_df['kmer']:
        foo = ohsu_df.loc[ohsu_df['in-frame_neoepitopes'] == kmer]
        if foo.shape[0] and kmer_df.loc[kmer_df['kmer'] == kmer, 'junctionAnnotated'].values[0]:
            display(foo)


In [174]:
ohsu_df.loc[ohsu_df['in-frame_neoepitopes'] == kmer]

Unnamed: 0,jx,TCGA-24-1431-01A-01R-1566-13,TCGA-24-2298-01A-01R-1569-13,TCGA-25-1313-01A-01R-1565-13,TCGA-25-1319-01A-01R-1565-13,TCGA-61-2008-01A-02R-1568-13,TCGA-A2-A0D2-01A-21R-A034-07,TCGA-A2-A0SX-01A-12R-A084-07,TCGA-AO-A0JM-01A-21R-A056-07,TCGA-BH-A18V-01A-11R-A12D-07,...,modified_upstream_txs,in-frame_all-transcript_biexons,in-frame_nonhanging-tx_biexons,in-frame_peptide_sequence,hanging_txs_included_inframe_pepseqs,prefiltered_in-frame_epitopes,prefiltered_inframe_epitope_count,in-frame_neoepitopes,in-frame_neoepitope_count,frame-agnostic_all-transcript_biexons


In [123]:
kmer_df.loc[kmer_df['kmer'] == kmer]

Unnamed: 0,kmer,coord,isCrossJunction,junctionAnnotated,readFrameAnnotated,TCGA3CAAAU01A11RA41B07all,TCGA3CAALI01A11RA41B07all,TCGA3CAALJ01A31RA41B07all,TCGA3CAALK01A11RA41B07all,TCGA4HAAAK01A12RA41B07all,...,TCGAV7A7HQ01A11RA33J07all,TCGAW8A86G01A21RA36F07all,TCGAWTAB4101A11RA41B07all,TCGAWTAB4401A11RA41B07all,TCGAXXA89901A11RA36F07all,TCGAXXA89A01A11RA36F07all,TCGAZ7A8R501A42RA41B07all,TCGAZ7A8R601A11RA41B07all,strand,junction_coordinate
219,GVSRVKGAK,1337976:1337994:1336506:1336515:None:None,True,True,False,200.0,44.0,73.0,158.0,124.0,...,73.0,117.0,17.0,31.0,104.0,76.0,58.0,131.0,-,1336515:1337976


### Use ETH FULL TABLE

In [137]:
eth_df = pd.read_csv('/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_intermediate/complete_cancer_candidates_.tsv.gz', sep = '\t')



In [138]:
eth_df.head()

Unnamed: 0,kmer,gtexCohortfilter >0.0,gtexCohortfilter >=1.0,gtexCohortfilter >=2.0,gtexCohortfilter >=3.0,gtexCohortfilter >=5.0,gtexCohortfilter >=10.0,coord,junctionAnnotated,readFrameAnnotated,...,cancerCohortfilter >=2.0,cancerCohortfilter >=3.0,cancerCohortfilter >=5.0,cancerCohortfilter >=10.0,TCGAC8A12P01A11RA11507all,TCGAAOA0JM01A21RA05607all,TCGABHA18V01A11RA12D07all,TCGAA2A0D201A21RA03407all,TCGAA2A0SX01A12RA08407all,isAnnotated
0,WYITRSGIA,3290.0,1969.0,861.0,394.0,89.0,4.0,92347505:92347506:92349915:92349941:None:None,False,False,...,138,82,26,5,0.0,0.0,0.0,4.102634,0.0,
1,WYITRSGIA,572.0,545.0,243.0,130.0,27.0,2.0,92347505:92347506:92349915:92349941:None:None,False,False,...,138,82,26,5,0.0,0.0,0.0,4.102634,0.0,
2,ISSQSRVEK,278.0,261.0,82.0,29.0,7.0,0.0,92379851:92379859:92493866:92493885:None:None,False,False,...,10,2,0,0,0.0,0.0,2.474321,0.0,0.0,
3,RSGDEEKYP,7350.0,4622.0,2653.0,1745.0,1126.0,734.0,92600493:92600508:92611313:92611325:None:None,True,True,...,641,520,348,170,2.922641,2.102386,1.237161,0.0,0.0,1.0
4,HLKMKMFQI,146.0,136.0,42.0,20.0,3.0,1.0,92379850:92379859:92496416:92496434:None:None,False,False,...,16,2,0,0,2.922641,0.0,0.0,0.0,0.0,


In [160]:
eth_df = get_junction_coordinates(eth_df, 'coord')


KeyboardInterrupt: 

In [168]:
foo = eth_df.loc[(eth_df['junctionAnnotated'] == True) ]
foo = get_junction_coordinates(foo, 'coord')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['strand'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['junction_coordinate'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


KeyboardInterrupt: 

In [177]:
kmers_eth = set(foo.loc[foo['strand'] == '-', 'kmer'])

In [178]:
kmers_ohsu = set(ohsu_df['in-frame_neoepitopes'])

In [179]:
res = kmers_ohsu.intersection(kmers_eth)

In [180]:
res

set()

### Use old data

In [184]:
eth_old = pd.read_parquet('/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/ARCHIV_keep_runs/commit_v3_TEST_merged3_372a147_medium_run_conf2_annotFrame_cap0_runs/TCGA_Breast_1102/cohort_mutNone/tmp_out_ref_batch_101/ref_graph_kmer_JuncExpr.pq')

In [185]:
eth_old.head()


Unnamed: 0,kmer,TCGA3CAAAU01A11RA41B07all,TCGA3CAALI01A11RA41B07all,TCGA3CAALJ01A31RA41B07all,TCGA3CAALK01A11RA41B07all,TCGA4HAAAK01A12RA41B07all,TCGA5LAAT001A12RA41B07all,TCGA5LAAT101A12RA41B07all,TCGA5TA9QA01A11RA41B07all,TCGAA1A0SB01A11RA14407all,...,TCGAW8A86G01A21RA36F07all,TCGAWTAB4101A11RA41B07all,TCGAWTAB4401A11RA41B07all,TCGAXXA89901A11RA36F07all,TCGAXXA89A01A11RA36F07all,TCGAZ7A8R501A42RA41B07all,TCGAZ7A8R601A11RA41B07all,isCrossJunction,junctionAnnotated,readFrameAnnotated
0,KSMDQDFGW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,1.0
1,SMDQDFGWS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,1.0
2,MDQDFGWSW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,1.0
3,DQDFGWSWL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,1.0
4,QDFGWSWLR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,1.0


In [186]:
eth_old.loc[eth_old['junctionAnnotated'] == 1]

Unnamed: 0,kmer,TCGA3CAAAU01A11RA41B07all,TCGA3CAALI01A11RA41B07all,TCGA3CAALJ01A31RA41B07all,TCGA3CAALK01A11RA41B07all,TCGA4HAAAK01A12RA41B07all,TCGA5LAAT001A12RA41B07all,TCGA5LAAT101A12RA41B07all,TCGA5TA9QA01A11RA41B07all,TCGAA1A0SB01A11RA14407all,...,TCGAW8A86G01A21RA36F07all,TCGAWTAB4101A11RA41B07all,TCGAWTAB4401A11RA41B07all,TCGAXXA89901A11RA36F07all,TCGAXXA89A01A11RA36F07all,TCGAZ7A8R501A42RA41B07all,TCGAZ7A8R601A11RA41B07all,isCrossJunction,junctionAnnotated,readFrameAnnotated
7,GWSWLRVLT,198.0,47.0,55.0,143.0,124.0,77.0,40.0,32.0,96.0,...,95.0,5.0,8.0,125.0,56.0,17.0,126.0,True,1.0,1.0
205,PGWSWLRVL,198.0,47.0,55.0,143.0,124.0,77.0,40.0,32.0,96.0,...,95.0,5.0,8.0,125.0,56.0,17.0,126.0,True,1.0,0.0
206,ASTAAWSPG,198.0,47.0,55.0,143.0,124.0,77.0,40.0,32.0,96.0,...,95.0,5.0,8.0,125.0,56.0,17.0,126.0,True,1.0,0.0
207,STAAWSPGW,198.0,47.0,55.0,143.0,124.0,77.0,40.0,32.0,96.0,...,95.0,5.0,8.0,125.0,56.0,17.0,126.0,True,1.0,0.0
208,TAAWSPGWS,198.0,47.0,55.0,143.0,124.0,77.0,40.0,32.0,96.0,...,95.0,5.0,8.0,125.0,56.0,17.0,126.0,True,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1300,SRVKGAKAV,200.0,44.0,73.0,158.0,124.0,59.0,41.0,37.0,65.0,...,117.0,17.0,31.0,104.0,76.0,58.0,131.0,True,1.0,0.0
1301,RVKGAKAVG,200.0,44.0,73.0,158.0,124.0,59.0,41.0,37.0,65.0,...,117.0,17.0,31.0,104.0,76.0,58.0,131.0,True,1.0,0.0
1302,VKGAKAVGP,200.0,44.0,73.0,158.0,124.0,59.0,41.0,37.0,65.0,...,117.0,17.0,31.0,104.0,76.0,58.0,131.0,True,1.0,0.0
1303,KGAKAVGPP,200.0,44.0,73.0,158.0,124.0,59.0,41.0,37.0,65.0,...,117.0,17.0,31.0,104.0,76.0,58.0,131.0,True,1.0,0.0


In [188]:
ohsu_old = pd.read_csv('/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/OHSU_Feb2023_complete-annotated-shortlist.tsv.gz', sep = '\t')


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [190]:
ep_col = 'prefiltered_in-frame_epitopes'
ohsu_old[ep_col] = ohsu_old[ep_col].str.split(';')
ohsu_old = ohsu_old.explode(ep_col)

In [195]:
ohsu_old

Unnamed: 0,jx,TCGA-24-1431-01A-01R-1566-13,TCGA-24-2298-01A-01R-1569-13,TCGA-25-1313-01A-01R-1565-13,TCGA-25-1319-01A-01R-1565-13,TCGA-61-2008-01A-02R-1568-13,TCGA-A2-A0D2-01A-21R-A034-07,TCGA-A2-A0SX-01A-12R-A084-07,TCGA-AO-A0JM-01A-21R-A056-07,TCGA-BH-A18V-01A-11R-A12D-07,...,modified_upstream_txs,in-frame_all-transcript_biexons,in-frame_nonhanging-tx_biexons,in-frame_peptide_sequence,hanging_txs_included_inframe_pepseqs,prefiltered_in-frame_epitopes,prefiltered_inframe_epitope_count,in-frame_neoepitopes,in-frame_neoepitope_count,frame-agnostic_all-transcript_biexons
0,chr10;48726;48803;-,0.000000,2.176140,0.842293,2.517914,0.961344,0.000000,5.700962,2.102386,1.237161,...,['ENST00000561967.1.MOD.CHR10.48726.48803.MINU...,LTQIGQCGNQIGAKFWEVISDEHAIDSAGTYHGDSHLQLERINVYY...,MNMPSTPLAPTTGTATCSWSASTCTTTRPAVAGQCGAGNNWAKGHY...,NVYYNEASGGRYVPRAV;TCTTTRPAVAGQCGAG,NVYYNEASGGRYVPRAV;TCTTTRPAVAGQCGAG,NVYYNEASG,17.0,TTRPAVAGQ;TRPAVAGQC;RPAVAGQCG;PAVAGQCGA;AVAGQCGAG,5.0,LTQIGQCGNQIGAKFWEVISDEHAIDSAGTYHGDSHLQLERINVYY...
0,chr10;48726;48803;-,0.000000,2.176140,0.842293,2.517914,0.961344,0.000000,5.700962,2.102386,1.237161,...,['ENST00000561967.1.MOD.CHR10.48726.48803.MINU...,LTQIGQCGNQIGAKFWEVISDEHAIDSAGTYHGDSHLQLERINVYY...,MNMPSTPLAPTTGTATCSWSASTCTTTRPAVAGQCGAGNNWAKGHY...,NVYYNEASGGRYVPRAV;TCTTTRPAVAGQCGAG,NVYYNEASGGRYVPRAV;TCTTTRPAVAGQCGAG,VYYNEASGG,17.0,TTRPAVAGQ;TRPAVAGQC;RPAVAGQCG;PAVAGQCGA;AVAGQCGAG,5.0,LTQIGQCGNQIGAKFWEVISDEHAIDSAGTYHGDSHLQLERINVYY...
0,chr10;48726;48803;-,0.000000,2.176140,0.842293,2.517914,0.961344,0.000000,5.700962,2.102386,1.237161,...,['ENST00000561967.1.MOD.CHR10.48726.48803.MINU...,LTQIGQCGNQIGAKFWEVISDEHAIDSAGTYHGDSHLQLERINVYY...,MNMPSTPLAPTTGTATCSWSASTCTTTRPAVAGQCGAGNNWAKGHY...,NVYYNEASGGRYVPRAV;TCTTTRPAVAGQCGAG,NVYYNEASGGRYVPRAV;TCTTTRPAVAGQCGAG,YYNEASGGR,17.0,TTRPAVAGQ;TRPAVAGQC;RPAVAGQCG;PAVAGQCGA;AVAGQCGAG,5.0,LTQIGQCGNQIGAKFWEVISDEHAIDSAGTYHGDSHLQLERINVYY...
0,chr10;48726;48803;-,0.000000,2.176140,0.842293,2.517914,0.961344,0.000000,5.700962,2.102386,1.237161,...,['ENST00000561967.1.MOD.CHR10.48726.48803.MINU...,LTQIGQCGNQIGAKFWEVISDEHAIDSAGTYHGDSHLQLERINVYY...,MNMPSTPLAPTTGTATCSWSASTCTTTRPAVAGQCGAGNNWAKGHY...,NVYYNEASGGRYVPRAV;TCTTTRPAVAGQCGAG,NVYYNEASGGRYVPRAV;TCTTTRPAVAGQCGAG,YNEASGGRY,17.0,TTRPAVAGQ;TRPAVAGQC;RPAVAGQCG;PAVAGQCGA;AVAGQCGAG,5.0,LTQIGQCGNQIGAKFWEVISDEHAIDSAGTYHGDSHLQLERINVYY...
0,chr10;48726;48803;-,0.000000,2.176140,0.842293,2.517914,0.961344,0.000000,5.700962,2.102386,1.237161,...,['ENST00000561967.1.MOD.CHR10.48726.48803.MINU...,LTQIGQCGNQIGAKFWEVISDEHAIDSAGTYHGDSHLQLERINVYY...,MNMPSTPLAPTTGTATCSWSASTCTTTRPAVAGQCGAGNNWAKGHY...,NVYYNEASGGRYVPRAV;TCTTTRPAVAGQCGAG,NVYYNEASGGRYVPRAV;TCTTTRPAVAGQCGAG,NEASGGRYV,17.0,TTRPAVAGQ;TRPAVAGQC;RPAVAGQCG;PAVAGQCGA;AVAGQCGAG,5.0,LTQIGQCGNQIGAKFWEVISDEHAIDSAGTYHGDSHLQLERINVYY...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1346995,chr2;109934895;109947363;+,14.921219,3.626901,175.196991,87.287682,1.922689,0.000000,1.140192,2.102386,0.000000,...,,,,,,,,,,
1346996,chr2;109987191;109987560;+,34.433582,53.678131,138.136089,42.804536,61.526038,21.880716,92.355580,8.409545,44.537781,...,,,,,,,,,,
1346997,chr2;109987191;109993691;+,0.000000,1.450760,1.684586,1.678609,0.000000,0.000000,1.140192,2.102386,0.000000,...,,,,,,,,,,
1346998,chr2;109987191;109994400;+,16.069005,5.077661,5.053759,10.071656,8.652099,9.572813,4.560769,10.511931,8.660124,...,,,,,,,,,,


In [194]:
kmer = 'APGVSRVKP'
ohsu_old.loc[ohsu_old[ep_col] == kmer]

Unnamed: 0,jx,TCGA-24-1431-01A-01R-1566-13,TCGA-24-2298-01A-01R-1569-13,TCGA-25-1313-01A-01R-1565-13,TCGA-25-1319-01A-01R-1565-13,TCGA-61-2008-01A-02R-1568-13,TCGA-A2-A0D2-01A-21R-A034-07,TCGA-A2-A0SX-01A-12R-A084-07,TCGA-AO-A0JM-01A-21R-A056-07,TCGA-BH-A18V-01A-11R-A12D-07,...,modified_upstream_txs,in-frame_all-transcript_biexons,in-frame_nonhanging-tx_biexons,in-frame_peptide_sequence,hanging_txs_included_inframe_pepseqs,prefiltered_in-frame_epitopes,prefiltered_inframe_epitope_count,in-frame_neoepitopes,in-frame_neoepitope_count,frame-agnostic_all-transcript_biexons


In [197]:
gene_id = 'ENSG00000107404.20'
ohsu_old.loc[ohsu_old['gene_id'] == gene_id]

Unnamed: 0,jx,TCGA-24-1431-01A-01R-1566-13,TCGA-24-2298-01A-01R-1569-13,TCGA-25-1313-01A-01R-1565-13,TCGA-25-1319-01A-01R-1565-13,TCGA-61-2008-01A-02R-1568-13,TCGA-A2-A0D2-01A-21R-A034-07,TCGA-A2-A0SX-01A-12R-A084-07,TCGA-AO-A0JM-01A-21R-A056-07,TCGA-BH-A18V-01A-11R-A12D-07,...,modified_upstream_txs,in-frame_all-transcript_biexons,in-frame_nonhanging-tx_biexons,in-frame_peptide_sequence,hanging_txs_included_inframe_pepseqs,prefiltered_in-frame_epitopes,prefiltered_inframe_epitope_count,in-frame_neoepitopes,in-frame_neoepitope_count,frame-agnostic_all-transcript_biexons
135280,chr1;1336516;1337017;-,3.443358,6.528421,3.369173,3.357219,7.690755,0.000000,1.140192,0.0,1.237161,...,['ENST00000632445.1.MOD.CHR1.1336516.1337017.M...,,,,,,0.0,,0.0,
135281,chr1;1338159;1338268;-,4.591144,4.352281,0.000000,4.196523,0.961344,1.367545,5.700962,0.0,2.474321,...,['ENST00000631679.1.MOD.CHR1.1338159.1338268.M...,ADVVDWLYTHVEGFKERREARKYASSLLKHGFLRHTVNKITFSEQC...,ADVVDWLYTHVEGFKERREARKYASSLLKHGFLRHTVNKITFSEQC...,YVFGDLCSMAPVGLRIR,YVFGDLCSMAPVGLRIR,YVFGDLCSM,9.0,YVFGDLCSM;VFGDLCSMA;FGDLCSMAP;GDLCSMAPV;DLCSMA...,9.0,ADVVDWLYTHVEGFKERREARKYASSLLKHGFLRHTVNKITFSEQC...
135281,chr1;1338159;1338268;-,4.591144,4.352281,0.000000,4.196523,0.961344,1.367545,5.700962,0.0,2.474321,...,['ENST00000631679.1.MOD.CHR1.1338159.1338268.M...,ADVVDWLYTHVEGFKERREARKYASSLLKHGFLRHTVNKITFSEQC...,ADVVDWLYTHVEGFKERREARKYASSLLKHGFLRHTVNKITFSEQC...,YVFGDLCSMAPVGLRIR,YVFGDLCSMAPVGLRIR,VFGDLCSMA,9.0,YVFGDLCSM;VFGDLCSMA;FGDLCSMAP;GDLCSMAPV;DLCSMA...,9.0,ADVVDWLYTHVEGFKERREARKYASSLLKHGFLRHTVNKITFSEQC...
135281,chr1;1338159;1338268;-,4.591144,4.352281,0.000000,4.196523,0.961344,1.367545,5.700962,0.0,2.474321,...,['ENST00000631679.1.MOD.CHR1.1338159.1338268.M...,ADVVDWLYTHVEGFKERREARKYASSLLKHGFLRHTVNKITFSEQC...,ADVVDWLYTHVEGFKERREARKYASSLLKHGFLRHTVNKITFSEQC...,YVFGDLCSMAPVGLRIR,YVFGDLCSMAPVGLRIR,FGDLCSMAP,9.0,YVFGDLCSM;VFGDLCSMA;FGDLCSMAP;GDLCSMAPV;DLCSMA...,9.0,ADVVDWLYTHVEGFKERREARKYASSLLKHGFLRHTVNKITFSEQC...
135281,chr1;1338159;1338268;-,4.591144,4.352281,0.000000,4.196523,0.961344,1.367545,5.700962,0.0,2.474321,...,['ENST00000631679.1.MOD.CHR1.1338159.1338268.M...,ADVVDWLYTHVEGFKERREARKYASSLLKHGFLRHTVNKITFSEQC...,ADVVDWLYTHVEGFKERREARKYASSLLKHGFLRHTVNKITFSEQC...,YVFGDLCSMAPVGLRIR,YVFGDLCSMAPVGLRIR,GDLCSMAPV,9.0,YVFGDLCSM;VFGDLCSMA;FGDLCSMAP;GDLCSMAPV;DLCSMA...,9.0,ADVVDWLYTHVEGFKERREARKYASSLLKHGFLRHTVNKITFSEQC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1216474,chr1;1342139;1342362;-,0.000000,0.725380,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,['ENST00000378888.10.MOD.CHR1.1342139.1342362....,PCFNGRVVSWLVLAEGAHSDAGSQGTDSHTDLPPPLERTGGIGDSR...,PCFNGRVVSWLVLAEGAHSDAGSQGTDSHTDLPPPLERTGGIGDSR...,GDSRPPSFHRDGMDNET,GDSRPPSFHRDGMDNET,FHRDGMDNE,9.0,DSRPPSFHR;SRPPSFHRD;RPPSFHRDG;PPSFHRDGM;PSFHRD...,8.0,FPASTAAWSPGWSWLRVLTRMRGPRARTATQTCPRLLSGQAASGTP...
1216474,chr1;1342139;1342362;-,0.000000,0.725380,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,['ENST00000378888.10.MOD.CHR1.1342139.1342362....,PCFNGRVVSWLVLAEGAHSDAGSQGTDSHTDLPPPLERTGGIGDSR...,PCFNGRVVSWLVLAEGAHSDAGSQGTDSHTDLPPPLERTGGIGDSR...,GDSRPPSFHRDGMDNET,GDSRPPSFHRDGMDNET,HRDGMDNET,9.0,DSRPPSFHR;SRPPSFHRD;RPPSFHRDG;PPSFHRDGM;PSFHRD...,8.0,FPASTAAWSPGWSWLRVLTRMRGPRARTATQTCPRLLSGQAASGTP...
1220095,chr1;1338654;1340397;-,0.000000,0.725380,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,,,,,,,,,,
1220096,chr1;1339396;1339484;-,0.000000,0.725380,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,,,,,,,,,,


In [198]:
old_ohsu_set = set(ohsu_old.loc[ohsu_old['gene_id'] == gene_id, ep_col])

In [199]:
old_eth_set = set(eth_old.loc[eth_old['junctionAnnotated'] == 1, 'kmer'])

In [200]:
old_eth_set.intersection(old_ohsu_set)

{'ALPRYELEE',
 'DSSGSTPAA',
 'ELEEAPLTV',
 'GALPRYELE',
 'GSQQSEALD',
 'GTPGPPPST',
 'GVSRVKGAK',
 'IRKYASSLL',
 'LIRKYASSL',
 'LPRYELEEA',
 'LTGALPRYE',
 'PGVSRVKGA',
 'PRYELEEAP',
 'RLIRKYASS',
 'RYELEEAPL',
 'SRLIRKYAS',
 'SSRLIRKYA',
 'STGSQQSEA',
 'TGALPRYEL',
 'TGSQQSEAL',
 'TSSRLIRKY',
 'YELEEAPLT'}

In [205]:
kmer = 'ALPRYELEE'

In [206]:

ohsu_old.loc[ohsu_old[ep_col] == kmer]

Unnamed: 0,jx,TCGA-24-1431-01A-01R-1566-13,TCGA-24-2298-01A-01R-1569-13,TCGA-25-1313-01A-01R-1565-13,TCGA-25-1319-01A-01R-1565-13,TCGA-61-2008-01A-02R-1568-13,TCGA-A2-A0D2-01A-21R-A034-07,TCGA-A2-A0SX-01A-12R-A084-07,TCGA-AO-A0JM-01A-21R-A056-07,TCGA-BH-A18V-01A-11R-A12D-07,...,modified_upstream_txs,in-frame_all-transcript_biexons,in-frame_nonhanging-tx_biexons,in-frame_peptide_sequence,hanging_txs_included_inframe_pepseqs,prefiltered_in-frame_epitopes,prefiltered_inframe_epitope_count,in-frame_neoepitopes,in-frame_neoepitope_count,frame-agnostic_all-transcript_biexons
895337,chr1;1338654;1339361;-,270.877511,144.350649,67.383458,80.573245,241.29743,30.085984,103.757504,39.945338,45.774942,...,['ENST00000631679.1.MOD.CHR1.1338654.1339361.M...,IDPAAWLSHTAALTGALPRYELEEAPLTVKSDMSAVVRVMQLPDSG...,TGPISLTVAKCWDPTPRSYFTVPRADPVRPIDPAAWLSHTAALTGA...,LTGALPRYELEEAPLTV,LTGALPRYELEEAPLTV,ALPRYELEE,9.0,,0.0,IDPAAWLSHTAALTGALPRYELEEAPLTVKSDMSAVVRVMQLPDSG...


In [204]:
eth_old.loc[eth_old['kmer'] == kmer, 'coord']

KeyError: 'coord'

In [None]:
(pep3) prelotla@login-biomed-01:.../tmp_out_ref_batch_101$ parquet-tools csv *meta* | grep ALPRYELEE
IDPAAWLSHTAALTGALPRYELEEAPLTVKSDMSAVVRVMQLPDSGLEIRDRMWLKITIANAVI,ENSG00000107404.20:33_29:0:1339422:2-exons,1,1,ENSG00000107404.20,chr1,-,ref,1,0,nan,0,nan,nan,1339361;1339422;1338522;1338653,1339361;1339439;1338521;1338653,33;29,[[39. 20. 15. ... 35. 20. 48.]],nan,2-exons
DPVRPIDPAAWLSHTAALTGALPRYELEEAPLTVKSDMSAVVRVMQLPDSGLEIRDRMWLKITIANAVI,ENSG00000107404.20:33_29:0:1339437:2-exons,1,0,ENSG00000107404.20,chr1,-,ref,1,0,nan,0,nan,nan,1339361;1339437;1338522;1338653,1339361;1339439;1338521;1338653,33;29,[[39. 20. 15. ... 35. 20. 48.]],nan,2-exons
IDPAAWLSHTAALTGALPRYELEEAPLTVKSDMSAVVRVMQLPDSGLEIRDRMWLKITIANAVI,ENSG00000107404.20:34_29:0:1339422:2-exons,1,1,ENSG00000107404.20,chr1,-,ref,1,0,nan,0,nan,nan,1339361;1339422;1338522;1338653,1339361;1339649;1338521;1338653,34;29,[[39. 20. 15. ... 35. 20. 48.]],nan,2-exons

In [None]:
Coordinates 
1339361;1339422;1338522;1338653 
=> 1338653, 1339361

In [None]:
ANNOTATION FINDINGS 
1338653, 1339362
NOT FOUND 
1338654 1339361

In [207]:
1338654 < 1339361

True

In [208]:
1339439 > 1339362

True

In [209]:
1338522 < 1338653

True

In [210]:
1338653 < 1339362

True

In [211]:
1338522 < 1338653 < 1339362 < 1339439

True

In [230]:
tmp_junction

Unnamed: 0,0,1,2,3
0,chr10,48726,48803,-
0,chr10,48726,48803,-
0,chr10,48726,48803,-
0,chr10,48726,48803,-
0,chr10,48726,48803,-
...,...,...,...,...
1346995,chr2,109934895,109947363,+
1346996,chr2,109987191,109987560,+
1346997,chr2,109987191,109993691,+
1346998,chr2,109987191,109994400,+


0              48725
0              48725
0              48725
0              48725
0              48725
             ...    
1346995    109934894
1346996    109987190
1346997    109987190
1346998    109987190
1346999    109987190
Name: 1, Length: 4157968, dtype: object

In [253]:
def ohsu_to_eth_coord(df, col = 'jx', sep = ';'):
    tmp_jx = df[col].str.split(sep,  expand = True)
    df[col] = tmp_jx[0] + sep + (tmp_jx[1].astype(int) - 1).astype(str) + sep + tmp_jx[2] + sep + tmp_jx[3]
    return df

ohsu_df = ohsu_to_eth_coord(ohsu_df)

In [255]:
ohsu_df.head()

Unnamed: 0,jx,TCGA-24-1431-01A-01R-1566-13,TCGA-24-2298-01A-01R-1569-13,TCGA-25-1313-01A-01R-1565-13,TCGA-25-1319-01A-01R-1565-13,TCGA-61-2008-01A-02R-1568-13,TCGA-A2-A0D2-01A-21R-A034-07,TCGA-A2-A0SX-01A-12R-A084-07,TCGA-AO-A0JM-01A-21R-A056-07,TCGA-BH-A18V-01A-11R-A12D-07,...,modified_upstream_txs,in-frame_all-transcript_biexons,in-frame_nonhanging-tx_biexons,in-frame_peptide_sequence,hanging_txs_included_inframe_pepseqs,prefiltered_in-frame_epitopes,prefiltered_inframe_epitope_count,in-frame_neoepitopes,in-frame_neoepitope_count,frame-agnostic_all-transcript_biexons
0,chr10;48725;48803;-,0.0,2.17614,0.842293,2.517914,0.961344,0.0,5.700962,2.102386,1.237161,...,['ENST00000561967.1.MOD.CHR10.48726.48803.MINU...,LTQIGQCGNQIGAKFWEVISDEHAIDSAGTYHGDSHLQLERINVYY...,MNMPSTPLAPTTGTATCSWSASTCTTTRPAVAGQCGAGNNWAKGHY...,NVYYNEASGGRYVPRAV;TCTTTRPAVAGQCGAG,NVYYNEASGGRYVPRAV;TCTTTRPAVAGQCGAG,NVYYNEASG;VYYNEASGG;YYNEASGGR;YNEASGGRY;NEASGG...,17.0,TTRPAVAGQ;TRPAVAGQC;RPAVAGQCG;PAVAGQCGA;AVAGQCGAG,5.0,LTQIGQCGNQIGAKFWEVISDEHAIDSAGTYHGDSHLQLERINVYY...
1,chr10;277577;281199;-,2.295572,30.465966,16.003571,10.071656,26.917642,51.9667,53.58904,31.535793,50.723584,...,['ENST00000280886.12.MOD.CHR10.277578.281199.M...,RTELTDANGERHDALYVVGALDEAMELRGMRYHPIDIETSVIRAHK...,RTELTDANGERHDALYVVGALDEAMELRGMRYHPIDIETSVIRAHK...,RAHKSVTECAVFTWTNL,RAHKSVTECAVFTWTNL,RAHKSVTEC;AHKSVTECA;HKSVTECAV;KSVTECAVF;SVTECA...,9.0,,0.0,RTELTDANGERHDALYVVGALDEAMELRGMRYHPIDIETSVIRAHK...
2,chr10;280260;281199;-,0.0,0.0,0.0,0.0,0.961344,0.0,0.0,6.307159,2.474321,...,['ENST00000280886.12.MOD.CHR10.280261.281199.M...,RTELTDANGERHDALYVVGALDEAMELRGMRYHPIDIETSVIRAHK...,RTELTDANGERHDALYVVGALDEAMELRGMRYHPIDIETSVIRAHK...,RAHKSVTEC,RAHKSVTEC,RAHKSVTEC,1.0,,0.0,RTELTDANGERHDALYVVGALDEAMELRGMRYHPIDIETSVIRAHK...
3,chr10;281323;283271;-,3.443358,23.212165,18.530451,15.107483,20.188231,32.821074,33.065578,44.15011,40.8263,...,['ENST00000280886.12.MOD.CHR10.281324.283271.M...,IWVHSAHNASGYFTIYGDESLQSDHFNSRLSFGDTQTIWARTGYLG...,IWVHSAHNASGYFTIYGDESLQSDHFNSRLSFGDTQTIWARTGYLG...,TELTDANGERHDALYVV,TELTDANGERHDALYVV,TELTDANGE;ELTDANGER;LTDANGERH;TDANGERHD;DANGER...,9.0,,0.0,IWVHSAHNASGYFTIYGDESLQSDHFNSRLSFGDTQTIWARTGYLG...
4,chr10;283446;286272;-,1.147786,41.346668,19.372744,6.714437,28.84033,61.539513,62.710579,37.842952,92.787044,...,['ENST00000280886.12.MOD.CHR10.283447.286272.M...,ALRHDRVRLVERGSPHSLPLMESGKILPGVRIIIANPETKGPLGDS...,ALRHDRVRLVERGSPHSLPLMESGKILPGVRIIIANPETKGPLGDS...,LGDSHLGEIWVHSAHN,LGDSHLGEIWVHSAHN,LGDSHLGEI;GDSHLGEIW;DSHLGEIWV;SHLGEIWVH;HLGEIW...,8.0,,0.0,ALRHDRVRLVERGSPHSLPLMESGKILPGVRIIIANPETKGPLGDS...


In [237]:
ohsu_df['jx'].head(2)

0    chr10;48726;48803;-
0    chr10;48726;48803;-
Name: jx, dtype: object