In [58]:
import os 
import pandas as pd 
import numpy as np 
import tarfile
import glob
from collections import defaultdict

In [32]:
# --- Input locations -------------------------------------------------------
# Tryptic-digest FASTAs: root directory, the sub-directory read in this
# notebook, and the per-sample file name.
path_tryptic_pep = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/tryptic_peptide_fastas'
path_eth = os.path.join(path_tryptic_pep, 'G_tryptic_peptides')
file_name = 'peptide-extracted-filter.fasta'

# Filtered query peptides: cohort directory and the commit sub-directory of interest.
path_filtered_pep = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102'
commit_interest = 'commit_d4aee54_GTEXcore'  # hardcoded cohort

In [26]:
# TCGA sample identifiers available for this analysis (one per line).
samples = [
    'TCGA-BH-A18V-01A-11R-A12D-07',
    'TCGA-C8-A12P-01A-11R-A115-07',
    'TCGA-A2-A0D2-01A-21R-A034-07',
    'TCGA-A2-A0SX-01A-12R-A084-07',
    'TCGA-AO-A0JM-01A-21R-A056-07',
]

In [28]:
# Work on a single sample: the last entry of `samples`.
sample = samples[-1]

In [61]:
# Tryptic digestion products from Andy Lin
# Parse the per-sample FASTA into one row per record:
#   id          -- header line, kept verbatim (it is the key used for merging)
#   counter     -- 1-based rank of the record within the file
#   tryptic_seq -- sequence line(s) that follow the header
tryptic_rows = []
counter = 0
with open(os.path.join(path_eth, 'G_' + sample, file_name), 'r') as f:
    # Iterate over the handle directly instead of loading every line with readlines().
    for line in f:
        line = line.rstrip('\n')
        if not line.strip():
            # A blank line used to append extra fields to the previous record,
            # which broke the three-column layout assigned below.
            continue
        if 'pepID' in line:
            counter += 1
            tryptic_rows.append([line, counter, None])
        elif not tryptic_rows:
            raise ValueError(f'sequence line found before the first pepID header: {line!r}')
        else:
            # Join wrapped sequence lines instead of adding one column per line.
            record = tryptic_rows[-1]
            record[2] = line if record[2] is None else record[2] + line

# Naming the columns at construction keeps an empty file from failing on a
# column-count mismatch.
data_tryptic = pd.DataFrame(tryptic_rows, columns=['id', 'counter', 'tryptic_seq'])

In [77]:
# Each FASTA header must occur exactly once; duplicated ids would multiply rows
# when the table is merged on 'id'.
assert data_tryptic['id'].is_unique, 'duplicate pepID headers in the tryptic FASTA'

In [78]:
# (rows, columns) of the parsed tryptic table.
data_tryptic.shape

(24764, 3)

In [79]:
# Number of distinct FASTA headers in the tryptic table.
data_tryptic['id'].unique().shape[0]

24764

In [83]:
# Number of distinct tryptic sequences, shown as a one-element shape tuple.
data_tryptic['tryptic_seq'].unique().shape

(2757,)

In [84]:
# Filtered peptides to query (longlist for sample)
# One row per FASTA record: the ';'-separated header fields followed by the
# peptide sequence.
meta_columns = ['id', 'jx_pos', 'between_codons', 'include5', 'include3', 'gene', 'coord', 'seq']
query_fasta = os.path.join(
    path_filtered_pep,
    f'filter_{sample}.all',
    commit_interest,
    f'G_{sample}_query_peptides.fa',
)
meta_rows = []
with open(query_fasta, 'r') as f:
    # Iterate over the handle directly instead of loading every line with readlines().
    for line in f:
        line = line.rstrip('\n')
        if not line.strip():
            # Blank lines used to append an extra (empty) field to the previous record.
            continue
        if 'pepID' in line:
            fields = line.split(';')
            if len(fields) != len(meta_columns) - 1:
                # Fewer fields would silently shift the sequence into the wrong column.
                raise ValueError(
                    f'expected {len(meta_columns) - 1} header fields, got {len(fields)}: {line!r}'
                )
            meta_rows.append(fields + [None])
        elif not meta_rows:
            raise ValueError(f'sequence line found before the first pepID header: {line!r}')
        else:
            # Join wrapped sequence lines instead of adding one column per line.
            record = meta_rows[-1]
            record[-1] = line if record[-1] is None else record[-1] + line

# Naming the columns at construction keeps an empty file from failing on a
# column-count mismatch.
data_meta = pd.DataFrame(meta_rows, columns=meta_columns)

In [86]:
# Preview the first three parsed metadata records.
data_meta.head(3)

Unnamed: 0,id,jx_pos,between_codons,include5,include3,gene,coord,seq
0,>pepID-1,jx_pos-12,between_codons-1,includes_5'-0,includes_3'-1,gene-ENSG00000119333.11,jx_coord-128640690_128636438,WLRGELDRAAADGWTMGTGARLSPSCVPGTWTGETCVPSSRRPWWR...
1,>pepID-2,jx_pos-27,between_codons-1,includes_5'-0,includes_3'-1,gene-ENSG00000086758.16,jx_coord-53550665_53549505,ATVGSSILALLCREVAPSRWGVAAAITGLLAVM
2,>pepID-3,jx_pos-6,between_codons-1,includes_5'-0,includes_3'-1,gene-ENSG00000120149.9,jx_coord-174725038_174729158,PIFAAAKTYEPYHLHPEETQDQSEAAHALYHIPAPRPGAQVPSETV...


In [87]:
# Outer join of peptide metadata with the tryptic products on the FASTA header:
# rows present in only one of the two tables are kept.
df = data_meta.merge(
    data_tryptic,
    how='outer',
    on='id',
)

In [89]:
# Display the merged table; rows without a tryptic match show NaN in counter / tryptic_seq.
df

Unnamed: 0,id,jx_pos,between_codons,include5,include3,gene,coord,seq,counter,tryptic_seq
0,>pepID-1,jx_pos-12,between_codons-1,includes_5'-0,includes_3'-1,gene-ENSG00000119333.11,jx_coord-128640690_128636438,WLRGELDRAAADGWTMGTGARLSPSCVPGTWTGETCVPSSRRPWWR...,1.0,AAADGWTMGTGAR
1,>pepID-2,jx_pos-27,between_codons-1,includes_5'-0,includes_3'-1,gene-ENSG00000086758.16,jx_coord-53550665_53549505,ATVGSSILALLCREVAPSRWGVAAAITGLLAVM,2.0,WGVAAAITGLLAVM
2,>pepID-3,jx_pos-6,between_codons-1,includes_5'-0,includes_3'-1,gene-ENSG00000120149.9,jx_coord-174725038_174729158,PIFAAAKTYEPYHLHPEETQDQSEAAHALYHIPAPRPGAQVPSETV...,,
3,>pepID-4,jx_pos-6,between_codons-1,includes_5'-0,includes_3'-1,gene-ENSG00000120149.9,jx_coord-174725038_174729158,PIFAAAKTYEPYHLHPEETQDQSEAAHALYHIPAPRPGAQVPSETV...,,
4,>pepID-5,jx_pos-90,between_codons-0,includes_5'-1,includes_3'-1,gene-ENSG00000120149.9,jx_coord-174724853_174725017,MASPSKGNDLFSPDEEGPAVVAGPGPGPGGAEGAAEERRVKVSSLP...,3.0,TYEPYHLHPEETQDQSEAAHALYHIPAPR
...,...,...,...,...,...,...,...,...,...,...
40137,>pepID-40138,jx_pos-24,between_codons-0,includes_5'-0,includes_3'-1,gene-ENSG00000134419.15,jx_coord-18783640_18783102,NLYPPNYASGALVESILTSWLLYYSFIVLTIFTNKMPHGLWCFHLV...,,
40138,>pepID-40139,jx_pos-24,between_codons-0,includes_5'-0,includes_3'-1,gene-ENSG00000134419.15,jx_coord-18783640_18783102,NLYPPNYASGALVESILTSWLLYYSFIVQIKCLMDSGASTWSF,,
40139,>pepID-40140,jx_pos-24,between_codons-0,includes_5'-0,includes_3'-1,gene-ENSG00000134419.15,jx_coord-18783640_18783102,NLYPPNYASGALVESILTSWLLYYSFIVQIKCLMDSGASTWSF,,
40140,>pepID-40141,jx_pos-24,between_codons-0,includes_5'-0,includes_3'-1,gene-ENSG00000134419.15,jx_coord-18783640_18783102,NLYPPNYASGALVESILTSWLLYYSFIVQIKCLMDSGASTWSF,,


In [132]:
# Junction coordinate referenced by the commented-out spot checks further down.
jct_test = 'jx_coord-99971826_99971771'

In [153]:
# Step 1: collapse to the distinct (coord, tryptic_seq) pairs.
distinct_jx_trypt = (
    df[['coord', 'tryptic_seq']]
    .groupby(['coord', 'tryptic_seq'])
    .count()
    .reset_index()
)

# Step 2: count those pairs per junction, i.e. distinct tryptic peptides per coord.
grp_trypt = distinct_jx_trypt.groupby(['coord']).count().reset_index()

In [158]:
# grp_trypt.loc[grp_trypt['coord'] == jct_test]

# df.loc[df['coord'] == jct_test ]['tryptic_seq'].unique()

In [160]:
# Number of peptide ids per junction coordinate
grp_peptides = df.groupby('coord')['id'].count().reset_index()

In [161]:
# Per-junction table: tryptic-peptide count (tryptic_seq) next to peptide count (id).
df_plot = grp_trypt.merge(
    grp_peptides,
    how='outer',
    on='coord',
)

# Checks from Cancer Cell Paper 

In [119]:
# Tab-separated k-mer annotation table for TCGA-AO-A0JM.
asn_exp_tsv = '/cluster/work/grlab/projects/TCGA/PanCanAtlas/peptides_neoantigen/analysis_pancan/ccell_rerun_2018/output/expression_distribution/asns/TCGA-AO-A0JM.asns_kmers_annotated.gtex.tsv'
asn_exp = pd.read_csv(asn_exp_tsv, sep='\t')

In [120]:
# Rows of asn_exp with JUNC_COUNT > 0 and IN_CPTAC == 1.
# The mask is built from `asn_exp` itself: it used to reference `foo`, a name
# that none of the visible cells defines, so the cell failed on a fresh kernel.
asn_exp.loc[
    # Optional extra filter, currently disabled:
    # (asn_exp['PEPTIDE_TYPE'] == 'reference') &
    (asn_exp['JUNC_COUNT'] > 0) & (asn_exp['IN_CPTAC'] == 1),
    :
]

Unnamed: 0,SAMPLE_ID,PEPTIDE_TYPE,KMER_TYPE,PROTEIN_NUMBER,CJ_ID,KMER_SEQ,LIBRARY_SIZE,JUNC_COUNT,IN_CPTAC
44,TCGA-AO-A0JM,germline_somatic,cj,30-46.26030,ENSG00000133392.12_15820705_15820818_15818745_...,LQNEAESVT,3262.5,14.0,1
45,TCGA-AO-A0JM,germline_somatic,cj,30-46.26030,ENSG00000133392.12_15820705_15820818_15818745_...,QNEAESVTG,3262.5,14.0,1
241,TCGA-AO-A0JM,germline_somatic,cj,17-33.12694,ENSG00000120733.9_137717205_137717279_13772171...,DNSTPQSEG,3262.5,22.0,1
242,TCGA-AO-A0JM,germline_somatic,cj,17-33.12694,ENSG00000120733.9_137717205_137717279_13772171...,NSTPQSEGG,3262.5,22.0,1
243,TCGA-AO-A0JM,germline_somatic,cj,17-33.12694,ENSG00000120733.9_137717205_137717279_13772171...,STPQSEGGT,3262.5,22.0,1
...,...,...,...,...,...,...,...,...,...
56157,TCGA-AO-A0JM,reference,cj,17-32.210546,ENSG00000129354.7_10685093_10685168_10683745_1...,ITQSGDYQL,3262.5,376.0,1
56158,TCGA-AO-A0JM,reference,cj,17-32.210546,ENSG00000129354.7_10685093_10685168_10683745_1...,TQSGDYQLR,3262.5,376.0,1
77811,TCGA-AO-A0JM,reference,cj,75-92.163039,ENSG00000118523.5_132271432_132271681_13227108...,TVVGPALAA,3262.5,2044.0,1
77812,TCGA-AO-A0JM,reference,cj,75-92.163039,ENSG00000118523.5_132271432_132271681_13227108...,VVGPALAAY,3262.5,2044.0,1


In [117]:
# Positional row indices of `df` whose 'gene' field contains this gene ID.
# NOTE(review): the ID carries a version suffix; an empty result may only mean
# that `df` uses a different annotation version -- confirm.
kmer = 'ENSG00000133392.12'
# Literal (non-regex) substring match, so '.' is not a wildcard; na=False skips
# rows without gene metadata (possible after the outer merge) instead of raising.
gene_hits = df['gene'].str.contains(kmer, regex=False, na=False)
[pos for pos, hit in enumerate(gene_hits) if hit]

[]

In [118]:
# NOTE(review): hard-coded positional rows. The recorded output shows
# gene-ENSG00000129354.11 for all of them, so they presumably came from a gene
# search like the one above run with that gene ID -- confirm.
df.iloc[[14250, 24616, 24617, 24822, 24823, 32986, 32987, 36317, 36318]]

Unnamed: 0,id,jx_pos,between_codons,include5,include3,gene,coord,seq,counter,tryptic_seq
14250,>pepID-14251,jx_pos-23,between_codons-1,includes_5'-0,includes_3'-1,gene-ENSG00000129354.11,jx_coord-10578891_10577356,VKPLIWIESVIEKFSHSRVEIMVKAKGQFKKHMCRRETS,,
24616,>pepID-24617,jx_pos-11,between_codons-0,includes_5'-0,includes_3'-1,gene-ENSG00000129354.11,jx_coord-10574903_10574492,LRSPTSPSLGSRSDT,15161.0,SPTSPSLGSR
24617,>pepID-24618,jx_pos-11,between_codons-0,includes_5'-0,includes_3'-1,gene-ENSG00000129354.11,jx_coord-10574903_10574492,LRSPTSPSLGSRSDT,15162.0,SPTSPSLGSR
24822,>pepID-24823,jx_pos-11,between_codons-0,includes_5'-0,includes_3'-1,gene-ENSG00000129354.11,jx_coord-10574903_10574492,LRSPTSPSLGSRSDT,15297.0,SPTSPSLGSR
24823,>pepID-24824,jx_pos-11,between_codons-0,includes_5'-0,includes_3'-1,gene-ENSG00000129354.11,jx_coord-10574903_10574492,LRSPTSPSLGSRSDT,15298.0,SPTSPSLGSR
32986,>pepID-32987,jx_pos-11,between_codons-0,includes_5'-0,includes_3'-1,gene-ENSG00000129354.11,jx_coord-10574903_10574492,LRSPTSPSLGSRSDT,20253.0,SPTSPSLGSR
32987,>pepID-32988,jx_pos-11,between_codons-0,includes_5'-0,includes_3'-1,gene-ENSG00000129354.11,jx_coord-10574903_10574492,LRSPTSPSLGSRSDT,20254.0,SPTSPSLGSR
36317,>pepID-36318,jx_pos-11,between_codons-0,includes_5'-0,includes_3'-1,gene-ENSG00000129354.11,jx_coord-10574903_10574492,LRSPTSPSLGSRSDT,22377.0,SPTSPSLGSR
36318,>pepID-36319,jx_pos-11,between_codons-0,includes_5'-0,includes_3'-1,gene-ENSG00000129354.11,jx_coord-10574903_10574492,LRSPTSPSLGSRSDT,22378.0,SPTSPSLGSR


In [125]:
# NOTE(review): the last recorded run raised FileNotFoundError for this path, and
# TCGA-A2-A0YM is not one of the entries in `samples` -- confirm the intended sample.
# `sep` is passed by keyword: positional use is deprecated since pandas 1.3 and
# rejected by pandas 2.x.
asn_final = pd.read_csv('/cluster/work/grlab/projects/TCGA/PanCanAtlas/peptides_neoantigen/analysis_pancan/ccell_rerun_2018/output/cptac_mhc/filtered/filtered_for_new_junctions/sites/expression_filtered/TCGA-A2-A0YM.gtex.unique_novel_junction_binders.2.0.annotated.sites.expressed.txt', sep='\t')

FileNotFoundError: [Errno 2] No such file or directory: '/cluster/work/grlab/projects/TCGA/PanCanAtlas/peptides_neoantigen/analysis_pancan/ccell_rerun_2018/output/cptac_mhc/filtered/filtered_for_new_junctions/sites/expression_filtered/TCGA-A2-A0YM.gtex.unique_novel_junction_binders.2.0.annotated.sites.expressed.txt'

In [123]:
# First rows of asn_final.
# NOTE(review): the recorded load of asn_final failed with FileNotFoundError, so
# this output presumably comes from an earlier load -- confirm.
asn_final.head()

Unnamed: 0,#junction,Nof_peptides,Peptides,Binding_ranks,Sources


In [None]:
# Present one sample in the PowerPoint = the A0 -- plot from yesterday also in A0