In [1]:
import tarfile
import pandas as pd
import glob
import os
import numpy as np
from collections import defaultdict
import gzip

# Load OHSU 

In [9]:
path = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/OHSU_may17-2023_experimentID-update.tar.gz'

In [10]:
with tarfile.open(path, 'r') as tar:
    names = tar.getnames()
    

In [11]:
[ i for i in names if 'per_peptide' in i]


['OHSU-may17-experimentID-update/expts_per_peptide',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-AO-A0JM-01A-21R-A056-07_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-24-1431-01A-01R-1566-13_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-A2-A0SX-01A-12R-A084-07_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-61-2008-01A-02R-1568-13_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-24-2298-01A-01R-1569-13_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-A2-A0D2-01A-21R-A034-07_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-25-1313-01A-01R-1565-13_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-25-1319-01A-01R-1565-13_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/

In [12]:
# for i in names:
#     print(i)

In [13]:
f1 = 'OHSU-may17-experimentID-update/experiment_maps/J_TCGA-BH-A18V-01A-11R-A12D-07_experiment_map.tsv'
f2 = 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-BH-A18V-01A-11R-A12D-07_experiments_per_peptide.tsv'


In [14]:
with tarfile.open(path, 'r') as tar:
    d1 = pd.read_csv(tar.extractfile(f1), sep = '\t')
    d2 = pd.read_csv(tar.extractfile(f2), sep = '\t')

In [15]:
d1.head()

Unnamed: 0,id,motif_filter,min_sample_reads,#_of_cohort_samples,reads_per_cohort_sample,normal_cohort_id,#_normal_samples_allowed,reads_per_normal_sample
0,0AN00PC,1,0,0,0,paired,0,0
1,0AN11PC,1,0,0,0,paired,1,1
2,0AN31PC,1,0,0,0,paired,1,3
3,0ANA1PC,1,0,0,0,paired,1,-1
4,0AN12PC,1,0,0,0,paired,2,1


In [16]:
d2.head()

Unnamed: 0,peptide_id,peptide_sequence,experiment_ids
0,100001,MEDYTKIEKIGEVFRMCLCRIPGYISSLSFFPWI,0AN11PC;0AN31PC;0ANA1PC;0AN12PC;0AN32PC;0ANA2P...
1,100002,IEEHNIKIYHLPDAESDEDEDFKEQTRLLKASIPFSVVGSNQLIEA...,0AN11PC;0AN31PC;0ANA1PC;0AN12PC;0AN32PC;0ANA2P...
2,100003,QPANCKVESLAMFLGELSLIDADPYLKYLPSVIAGAAFHLALYTVT...,0AN11PC;0AN31PC;0ANA1PC;0AN12PC;0AN32PC;0ANA2P...
3,100004,MWYRLRLLKPQPNIIPTVKDGVSPCSPGWSRTLGLKWSACLGLQKW,0AN11PC;0AN31PC;0ANA1PC;0AN12PC;0AN32PC;0ANA2P...
4,100005,LIVIPATYYLWPRDQNAEQIRLKNIRKVYGRCMWYRLRLLKPQPNI...,0AN11PC;0AN31PC;0ANA1PC;0AN12PC;0AN32PC;0ANA2P...


In [153]:
len(d2['peptide_id'].unique())

13705

In [156]:
d2.shape


(13705, 3)

# Load ETH 

In [2]:
def path_to_expID(path:str):
    ID = {}
    filter_background = os.path.basename(path).split('_')[-1].replace('.tsv.gz', '')
    filter_foreground = os.path.basename(path).split('_')[-2]
    #print(filter_background, filter_foreground)
    # Extract Values
    ID['filter_background_reads'] = filter_background.split('lim')[-1].split('Across')[0]
    ID['filter_background_samples'] = filter_background.split('Across')[-1]
    ID['filter_background_cohort'] = filter_background.split('Normals')[1].split('Cohort')[0]
    ID['filter_foreground_reads'] = filter_foreground.split('Lim')[-1].split('Across')[0]
    ID['filter_foreground_samples'] = filter_foreground.split('Across')[-1]
    ID['filter_foreground_target'] = (filter_foreground.split('Lim')[1].replace('SampleLim', '').replace('Cohort', ''))
    return ID
    

In [3]:
def expID_to_block(ID_split: dict):
    ID_dict = {'Any': 'A', 'None': 'N', '10': 'X', 'paired': 'P', 'Gtex': 'G', 'Core_GTEx': 'R', 'Full': 'F'}
    motif = 'A' # Any
    for k, v in ID_split.items():
        try:
            assert(v != '10') #replace roman letter
            ID_split[k] = str(int(float(v)))
        except:
            ID_split[k] = ID_dict[v]

    return ID_split['filter_foreground_target'] + \
    ID_split['filter_foreground_reads'] + \
    ID_split['filter_foreground_samples'] + \
    ID_split['filter_background_reads'] + \
    ID_split['filter_background_samples'] + \
    ID_split['filter_background_cohort'] + \
    motif 
    

In [4]:
def preprocess_fasta(fasta):
    '''fasta: str. Path fasta file'''
    print(f'Load {fasta} fasta')
    # Extract the peptides from the 1 target fasta from sample
    peptides_IDS = []
    peptides_sequences = []
    pep_position = []
    between_codons = []
    with gzip.open(fasta, 'rt') as f:
        for line in f.readlines():
            if '>' in line:
                pep_position.append(int(line.split(';')[1].split('-')[-1]))
                between_codons.append(int(line.split(';')[2].split('-')[-1]))
                peptides_IDS.append(line.split(';')[0].split('-')[-1])
            else:
                peptides_sequences.append(line.replace('\n',''))




    # Cut the peptides around the junction 
    peptides_IDS_expand = []
    cuts_peptides = []
    kmer_len = 9 
    for idx in np.arange(len(peptides_IDS)):
        for k in np.arange(kmer_len - between_codons[idx] ): 
            peptides_IDS_expand.append(peptides_IDS[idx])
            cut_kmer = peptides_sequences[idx][pep_position[idx] - k : pep_position[idx] + (kmer_len - k)] 
            cuts_peptides.append(cut_kmer)
    return peptides_IDS, peptides_sequences, peptides_IDS_expand, cuts_peptides

In [5]:
def experiments_preprocess(path_filtered_sample):
    '''path_filtered_sample: str. path for all experiements of sample'''
    # Extract the occurences of the kmers in the experimental files. 


    kmer_to_filesID = defaultdict(list)
    for filtered in path_filtered_sample: # All experiments for given sample
        # GET filtered file ID
        ID_split = path_to_expID(filtered)
        ID_EXPERIMENT = expID_to_block(ID_split)
        # Read filtered file 
        df_filt = pd.read_csv(filtered, sep = '\t')
        for km in set(df_filt['kmer']):
            kmer_to_filesID[km].append(ID_EXPERIMENT)

    kmer_to_filesID_ = []
    for k, v in kmer_to_filesID.items():  # Collapse 
        kmer_to_filesID_.append((k,';'.join(np.unique(v))))
    return kmer_to_filesID_

In [6]:
### Inputs ###
fasta_paths = glob.glob('/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast*/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/*fa*')

path_filtered = glob.glob('/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast*/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/*Filt*')

In [64]:
### Main ###
for fasta in fasta_paths:
    sample = os.path.basename(fasta).split('_')[1]
    path_filtered_sample = [path for path in path_filtered if sample in path]
    print(len(path_filtered_sample))

    # Fasta
    peptides_IDS, peptides_sequences, peptides_IDS_expand, cuts_peptides = preprocess_fasta(fasta)
    fa = pd.DataFrame({'peptide_id': peptides_IDS_expand, \
    'kmers': cuts_peptides}) 
    fa = fa.drop_duplicates()
    display(fa.head())
    print(fa.shape)
    

    # Experiments
    kmer_to_filesID_ = experiments_preprocess(path_filtered_sample)
    exp = pd.DataFrame(kmer_to_filesID_, columns = ['kmers', 'experiment_ids'])
    display(exp.head())
    print(exp.shape)

    # Result: Merge fasta on experiments
    res = exp.merge(fa, on = 'kmers', how = 'inner')
    display(res.head())
    print(res.shape)
    res = res.drop('kmers', axis = 1).drop_duplicates()
    print(res.shape)
    # Result: Collapse to get unique peptide IDs
    collapse_res = defaultdict(set)
    for pep_id, exp_id in zip(res['peptide_id'], res['experiment_ids']):
        collapse_res[pep_id].update(exp_id.split(';'))
    collapse_res_df = []
    for k, v in collapse_res.items():  # Collapse 
        collapse_res_df.append((k,';'.join(v)))
    collapse_res_df = pd.DataFrame(collapse_res_df, columns = ['peptide_id', 'experiment_ids'])
    display(collapse_res_df.head())
    print(collapse_res_df.shape)
    
    # Result: add peptide sequences
    peptide_sequences_ = pd.DataFrame({'peptide_id': peptides_IDS, 'peptide_sequence':peptides_sequences})
    collapse_res_df = collapse_res_df.merge(peptide_sequences_, on = 'peptide_id', how = 'inner')
    display(collapse_res_df.head())
    print(collapse_res_df.shape)
    
    print('Number of peptide IDS')
    print(len(fa['peptide_id'].unique()))
    print(len(res['peptide_id'].unique()))
    print(len(collapse_res_df['peptide_id'].unique()))
    print('----- \n')
#     display(res.loc[res['peptide_id'] == '1'])

65
Load /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/G_TCGA-BH-A18V-01A-11R-A12D-07_pool_kmer.fa.gz fasta


Unnamed: 0,peptide_id,kmers
0,1,EAVRSLKET
1,1,SEAVRSLKE
2,1,ISEAVRSLK
3,1,AISEAVRSL
4,1,AAISEAVRS


(4944, 2)


Unnamed: 0,kmers,experiment_ids
0,FSESRKTFK,0213AGA;0213XGA;021AAGA;0A13AGA;0A13XGA;0A1AAG...
1,SCWSRVTPF,0AN01GA;0AN02GA;0AN0AGA;0AN0XGA;0AN11GA;0AN12G...
2,PVWKTLVVW,0AN3AGA;0AN3XGA;0ANAAGA
3,GNTQTQLAR,0A13AGA;0A13XGA;0A1AAGA;0AN3AGA;0AN3XGA;0ANAAGA
4,QAQPHQKMG,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...


(1470, 2)


Unnamed: 0,kmers,experiment_ids,peptide_id
0,FSESRKTFK,0213AGA;0213XGA;021AAGA;0A13AGA;0A13XGA;0A1AAG...,471
1,SCWSRVTPF,0AN01GA;0AN02GA;0AN0AGA;0AN0XGA;0AN11GA;0AN12G...,278
2,SCWSRVTPF,0AN01GA;0AN02GA;0AN0AGA;0AN0XGA;0AN11GA;0AN12G...,279
3,PVWKTLVVW,0AN3AGA;0AN3XGA;0ANAAGA,561
4,GNTQTQLAR,0A13AGA;0A13XGA;0A1AAGA;0AN3AGA;0AN3XGA;0ANAAGA,431


(3264, 3)
(602, 2)


Unnamed: 0,peptide_id,experiment_ids
0,471,0AN3AGA;0213XGA;0A1AAGA;0A13AGA;0213AGA;0AN3XG...
1,278,0AN02GA;0AN12GA;0AN32GA;0AN3AGA;0AN0AGA;0AN31G...
2,279,0AN02GA;0AN12GA;0AN32GA;0AN3AGA;0AN0AGA;0AN31G...
3,561,0ANAAGA;0AN3XGA;0AN3AGA
4,431,0AN3AGA;0A1AAGA;0A13AGA;0AN3XGA;0ANAAGA;0A13XGA


(590, 2)


Unnamed: 0,peptide_id,experiment_ids,peptide_sequence
0,471,0AN3AGA;0213XGA;0A1AAGA;0A13AGA;0213AGA;0AN3XG...,EQRWLDEQQQIMESLNVLHSELKNKVETFSESRKTFKNHL
1,278,0AN02GA;0AN12GA;0AN32GA;0AN3AGA;0AN0AGA;0AN31G...,CCSCTRSGTFDCFHVGICPSSRAKANVGHAPYSCWSRVTPF
2,279,0AN02GA;0AN12GA;0AN32GA;0AN3AGA;0AN0AGA;0AN31G...,HAPYSCWSRVTPF
3,561,0ANAAGA;0AN3XGA;0AN3AGA,NQCTPLLLAATSGALDTIQYLFSIGANWRKTDIKGNNIIHLSVLTF...
4,431,0AN3AGA;0A1AAGA;0A13AGA;0AN3XGA;0ANAAGA;0A13XGA,SFSGQDSDKMGISMSDIQCLLDKEGASELVIDVIVNTKNDRIFSEG...


(590, 3)
Number of peptide IDS
590
590
590
----- 

65
Load /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/G_TCGA-A2-A0SX-01A-12R-A084-07_pool_kmer.fa.gz fasta


Unnamed: 0,peptide_id,kmers
0,1,GDSPIPGAG
1,1,AGDSPIPGA
2,1,AAGDSPIPG
3,1,IAAGDSPIP
4,1,HIAAGDSPI


(2221, 2)


Unnamed: 0,kmers,experiment_ids
0,SEKHTGEWS,0AN01GA;0AN02GA;0AN0AGA;0AN0XGA;0AN11GA;0AN12G...
1,RGQMAEKNR,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...
2,LDEEEGSPS,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...
3,KKQVWRNFH,0A101GA;0A102GA;0A10AGA;0A10XGA;0A111GA;0A112G...
4,FAESPASAV,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...


(1079, 2)


Unnamed: 0,kmers,experiment_ids,peptide_id
0,SEKHTGEWS,0AN01GA;0AN02GA;0AN0AGA;0AN0XGA;0AN11GA;0AN12G...,183
1,SEKHTGEWS,0AN01GA;0AN02GA;0AN0AGA;0AN0XGA;0AN11GA;0AN12G...,184
2,RGQMAEKNR,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,47
3,RGQMAEKNR,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,48
4,RGQMAEKNR,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,49


(1592, 3)
(281, 2)


Unnamed: 0,peptide_id,experiment_ids
0,183,0AN02GA;0AN12GA;0AN32GA;0AN3AGA;0AN0AGA;0AN31G...
1,184,0AN02GA;0AN12GA;0AN32GA;0AN3AGA;0AN0AGA;0AN31G...
2,47,0A10AGA;02112GA;0A111GA;0AN0AGA;02102GA;0A1AAG...
3,48,0A10AGA;02112GA;0A111GA;0AN0AGA;02102GA;0A1AAG...
4,49,0A10AGA;02112GA;0A111GA;0AN0AGA;02102GA;0A1AAG...


(263, 2)


Unnamed: 0,peptide_id,experiment_ids,peptide_sequence
0,183,0AN02GA;0AN12GA;0AN32GA;0AN3AGA;0AN0AGA;0AN31G...,GPVGPSGPPGKDGTSGHPGPIGPPGPRGNRGERGSEKHTGEWSKTV...
1,184,0AN02GA;0AN12GA;0AN32GA;0AN3AGA;0AN0AGA;0AN31G...,PPGKDGTSGHPGPIGPPGPRGNRGERGSEKHTGEWSKTVFEYRTRK...
2,47,0A10AGA;02112GA;0A111GA;0AN0AGA;02102GA;0A1AAG...,MQIENLKEELAYLKKNHEEMNALRGQMAEKNRKDAEDWFFSK
3,48,0A10AGA;02112GA;0A111GA;0AN0AGA;02102GA;0A1AAG...,FETEQALRLSVEADINGLRRVLDELTLARADLEMQIENLKEELAYL...
4,49,0A10AGA;02112GA;0A111GA;0AN0AGA;02102GA;0A1AAG...,MNALRGQMAEKNRKDAEDWFFSK


(263, 3)
Number of peptide IDS
263
263
263
----- 

65
Load /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/G_TCGA-A2-A0D2-01A-21R-A034-07_pool_kmer.fa.gz fasta


Unnamed: 0,peptide_id,kmers
0,1,DDISAQLEE
1,1,ADDISAQLE
2,1,AADDISAQL
3,1,GAADDISAQ
4,1,HGAADDISA


(1997, 2)


Unnamed: 0,kmers,experiment_ids
0,HLDSKPSRT,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...
1,SEGREFRIR,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...
2,KQLVAREQL,0AN01GA;0AN02GA;0AN0AGA;0AN0XGA;0AN11GA;0AN12G...
3,NAAQKDLMK,0AN01GA;0AN02GA;0AN0AGA;0AN0XGA;0AN11GA;0AN12G...
4,GDEEKDKDR,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...


(1019, 2)


Unnamed: 0,kmers,experiment_ids,peptide_id
0,HLDSKPSRT,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,176
1,SEGREFRIR,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,158
2,KQLVAREQL,0AN01GA;0AN02GA;0AN0AGA;0AN0XGA;0AN11GA;0AN12G...,217
3,NAAQKDLMK,0AN01GA;0AN02GA;0AN0AGA;0AN0XGA;0AN11GA;0AN12G...,3
4,GDEEKDKDR,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,109


(1416, 3)
(238, 2)


Unnamed: 0,peptide_id,experiment_ids
0,176,0A10AGA;02112GA;0A111GA;0AN0AGA;02102GA;0A1AAG...
1,158,025AAGA;0A53AGA;0A511GA;02501GA;02112GA;0A10AG...
2,217,0AN02GA;0AN12GA;0AN32GA;0AN3AGA;0AN0AGA;0AN31G...
3,3,0AN02GA;0AN12GA;0AN32GA;0AN3AGA;0AN0AGA;0AN31G...
4,109,0A10AGA;02112GA;0A111GA;0AN0AGA;02102GA;0A1AAG...


(235, 2)


Unnamed: 0,peptide_id,experiment_ids,peptide_sequence
0,176,0A10AGA;02112GA;0A111GA;0AN0AGA;02102GA;0A1AAG...,PTLGHLDSKPSRT
1,158,025AAGA;0A53AGA;0A511GA;02501GA;02112GA;0A10AG...,MNIQEQGFPLDLGASFTEDAPRPPVPGEEGELVSTDPRPASYSFCS...
2,217,0AN02GA;0AN12GA;0AN32GA;0AN3AGA;0AN0AGA;0AN31G...,EMAAEKAKAAAGEAKVKKQLVAREQLQGK
3,3,0AN02GA;0AN12GA;0AN32GA;0AN3AGA;0AN0AGA;0AN31G...,FHEERQMAWDQREVDLERQLDIFDRQQNEILNAAQKDLMKQSPTPV...
4,109,0A10AGA;02112GA;0A111GA;0AN0AGA;02102GA;0A1AAG...,GWTSRWIESKHKSDFGKFVLSSGKFYGDEEKDKDRHARRLRIQHHV


(235, 3)
Number of peptide IDS
235
235
235
----- 

65
Load /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/G_TCGA-AO-A0JM-01A-21R-A056-07_pool_kmer.fa.gz fasta


Unnamed: 0,peptide_id,kmers
0,1,QDDTSPRD
1,1,SQDDTSPRD
2,1,KSQDDTSPR
3,1,MKSQDDTSP
4,1,QMKSQDDTS


(2741, 2)


Unnamed: 0,kmers,experiment_ids
0,KLHNTHVLP,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...
1,LLQASQYTC,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...
2,PGQRSGHEH,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...
3,SGSFQGRGV,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...
4,FPGQRSGHE,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...


(1197, 2)


Unnamed: 0,kmers,experiment_ids,peptide_id
0,KLHNTHVLP,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...,275
1,KLHNTHVLP,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...,276
2,LLQASQYTC,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...,62
3,LLQASQYTC,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...,63
4,PGQRSGHEH,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...,233


(1926, 3)
(344, 2)


Unnamed: 0,peptide_id,experiment_ids
0,275,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...
1,276,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...
2,62,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...
3,63,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...
4,233,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...


(329, 2)


Unnamed: 0,peptide_id,experiment_ids,peptide_sequence
0,275,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...,SAARRAGTSCANCQTTTTTLWRRNANGDPVCNACGLYYKLHNTHVL...
1,276,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...,RRAGTSCANCQTTTTTLWRRNANGDPVCNACGLYYKLHNTHVLPEP...
2,62,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...,QESRDVLCELSDHHNHTLEEECQWGPCLQCLWALLQASQYTCPP
3,63,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...,CSQESRDVLCELSDHHNHTLEEECQWGPCLQCLWALLQASQYTCPP
4,233,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...,GNLMVPVFIGCPPGKRLAFDITYTLEYSRLKNKHYFDCVNVNPEMP...


(329, 3)
Number of peptide IDS
329
329
329
----- 

65
Load /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/G_TCGA-C8-A12P-01A-11R-A115-07_pool_kmer.fa.gz fasta


Unnamed: 0,peptide_id,kmers
0,1,MDGKRMFGT
1,1,SMDGKRMFG
2,1,FSMDGKRMF
3,1,SFSMDGKRM
4,1,ASFSMDGKR


(2399, 2)


Unnamed: 0,kmers,experiment_ids
0,FIENRVLFC,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...
1,KQFIENRVL,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...
2,RKFKLPAGH,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...
3,LVNVVQLGP,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...
4,REQGAFPTT,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...


(907, 2)


Unnamed: 0,kmers,experiment_ids,peptide_id
0,FIENRVLFC,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,133
1,FIENRVLFC,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,134
2,FIENRVLFC,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,135
3,KQFIENRVL,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,133
4,KQFIENRVL,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,134


(1481, 3)
(297, 2)


Unnamed: 0,peptide_id,experiment_ids
0,133,025AAGA;0A53AGA;0A511GA;02501GA;02112GA;0A10AG...
1,134,025AAGA;0A53AGA;0A511GA;02501GA;02112GA;0A10AG...
2,135,025AAGA;0A53AGA;0A511GA;02501GA;02112GA;0A10AG...
3,193,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...
4,194,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...


(276, 2)


Unnamed: 0,peptide_id,experiment_ids,peptide_sequence
0,133,025AAGA;0A53AGA;0A511GA;02501GA;02112GA;0A10AG...,QALAKAGKGMHGGVPGGKQFIENRVLFCRPHWSSTAQSRLTATSAS...
1,134,025AAGA;0A53AGA;0A511GA;02501GA;02112GA;0A10AG...,SIVEKSILLTEQALAKAGKGMHGGVPGGKQFIENRVLFCRPHWSST...
2,135,025AAGA;0A53AGA;0A511GA;02501GA;02112GA;0A10AG...,EKSILLTEQALAKAGKGMHGGVPGGKQFIENRVLFCRPHWSSTAQS...
3,193,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...,TSPCKSRKRNARRRARWKTIHRKFKLPAGHGGSRL
4,194,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...,IHSGEKYLTNRTSPCKSRKRNARRRARWKTIHRKFKLPAGHGGSRL


(276, 3)
Number of peptide IDS
281
276
276
----- 



In [75]:
test =  exp.merge(fa, on = 'kmers', how = 'right')

In [76]:
test

Unnamed: 0,kmers,experiment_ids,peptide_id
0,MDGKRMFGT,,1
1,SMDGKRMFG,021AAGA;0A1AAGA;0A5AAGA;0ANAAGA,1
2,FSMDGKRMF,021AAGA;0A1AAGA;0A5AAGA;0ANAAGA,1
3,SFSMDGKRM,021AAGA;0A1AAGA;0A5AAGA;0ANAAGA,1
4,ASFSMDGKR,021AAGA;0A1AAGA;0A5AAGA;0ANAAGA,1
...,...,...,...
2394,SSNPRHH,,292
2395,NSSNPRHH,,292
2396,WNSSNPRHH,02112GA;0211AGA;0211XGA;02132GA;0213AGA;0213XG...,292
2397,YWNSSNPRH,,292


In [79]:
foo = test.groupby('peptide_id').sum()

In [88]:
for idx, i in enumerate(foo['experiment_ids']):
    print(len(i))

217
4191
2488
1866
90
138
497
497
329
568
568
217
4191
217
138
42
568
217
60
60
60
60
138
4191
497
855
2488
1244
206
78
760
984
711
760
234
1866
1272
2595
3752
3752
3752
1272
1272
1272
1272
248
1272
497
355
933
933
933
933
31
159
248
312
665
138
105
105
234
568
31
31
721
721
312
184
355
355
355
355
355
1660
933
760
824
312
570
105
155
475
475
195
195
195
195
195
273
195
195
195
159
159
1866
426
312
721
355
2488
355
156
138
1272
1272
1272
4152
4152
4152
4152
217
2488
4152
265
69
69
69
952
47
47
39
273
2488
39
159
284
284
284
284
519
117
117
2488
117
117
2488
312
380
380
380
380
380
155
155
28
28
2488
78
46
46
46
46
119
159
254


TypeError: object of type 'int' has no len()

In [91]:
foo.iloc[idx]

kmers             ANTSTQTMGVANTSTQTMRVANTSTQTQRVANTSTQTQRVANTSTS...
experiment_ids                                                    0
Name: 248, dtype: object

In [96]:
peptide_sequences_.loc[peptide_sequences_['peptide_id'] == '248', 'peptide_sequence'].values

array(['ASSQVPRVMSTQRVANTSTQTMGYNAT'], dtype=object)

In [92]:
test.loc[test['peptide_id'] == '248']

Unnamed: 0,kmers,experiment_ids,peptide_id
2016,ANTSTQTMG,,248
2017,VANTSTQTM,,248
2018,RVANTSTQT,,248
2019,QRVANTSTQ,,248
2020,TQRVANTST,,248
2021,STQRVANTS,,248
2022,MSTQRVANT,,248
2023,VMSTQRVAN,,248
2024,RVMSTQRVA,,248


In [None]:
array(['ASSQVP RVMSTQRVANTSTQTMG YNAT'], dtype=object)

In [74]:
test.loc[test['peptide_id'].isna()]

Unnamed: 0,kmers,experiment_ids,peptide_id
729,ISFGVACFH,0213AGA;0A13AGA;0AN3AGA,


In [61]:
np.sort(collapse_res_df['peptide_id'].astype(int))


array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [20]:
res.head()

Unnamed: 0,kmers,experiment_ids,peptide_id
0,FIENRVLFC,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,133
1,FIENRVLFC,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,134
2,FIENRVLFC,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,135
3,KQFIENRVL,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,133
4,KQFIENRVL,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,134


In [21]:
res = res.drop('kmers', axis = 1).drop_duplicates()

In [52]:
res.shape


(297, 2)

In [53]:
collapse_res_df.shape

(276, 2)

In [49]:
collapse_res_df

Unnamed: 0,0,1
0,133,025AAGA;0A53AGA;0A511GA;02501GA;02112GA;0A10AG...
1,134,025AAGA;0A53AGA;0A511GA;02501GA;02112GA;0A10AG...
2,135,025AAGA;0A53AGA;0A511GA;02501GA;02112GA;0A10AG...
3,193,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...
4,194,025AAGA;0A53AGA;0A1AAGA;0213AGA;0A13XGA;0A132G...
...,...,...
271,66,0ANAAGA
272,82,0ANAAGA
273,238,0ANAAGA
274,239,0ANAAGA


In [42]:
pep_id

'112'

In [43]:
exp_id.split(';')

['0ANAAGA']

In [44]:
collapse_res

defaultdict(set,
            {'133': {'02101GA',
              '02102GA',
              '0210AGA',
              '0210XGA',
              '02111GA',
              '02112GA',
              '0211AGA',
              '0211XGA',
              '02131GA',
              '02132GA',
              '0213AGA',
              '0213XGA',
              '021AAGA',
              '02501GA',
              '02502GA',
              '0250AGA',
              '0250XGA',
              '02511GA',
              '02512GA',
              '0251AGA',
              '0251XGA',
              '02531GA',
              '02532GA',
              '0253AGA',
              '0253XGA',
              '025AAGA',
              '0A101GA',
              '0A102GA',
              '0A10AGA',
              '0A10XGA',
              '0A111GA',
              '0A112GA',
              '0A11AGA',
              '0A11XGA',
              '0A131GA',
              '0A132GA',
              '0A13AGA',
              '0A13XGA',
              '0A1AAGA',
 

In [22]:
res

Unnamed: 0,experiment_ids,peptide_id
0,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,133
1,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,134
2,02101GA;02102GA;0210AGA;0210XGA;02111GA;02112G...,135
6,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...,193
7,02132GA;0213AGA;0213XGA;021AAGA;02532GA;0253AG...,194
...,...,...
1452,0ANAAGA,66
1453,0ANAAGA,82
1454,0ANAAGA,238
1455,0ANAAGA,239


# TESTS

In [29]:
# GENERATE IDS # Test version #TODO ===> NEED to Generate the parameter file 
collect_IDS = []
for filt_file in path_filtered_sample:
    ID_split = path_to_expID(filt_file)
    ID = expID_to_block(ID_split)
    collect_IDS.append(ID)
    


In [60]:
# TEST ETH 
for pos in np.arange(6):
    res =  np.unique([i[pos] for i in np.unique(collect_IDS)])
    print(f'ID position {pos} unique values {res}')

ID position 0 unique values ['0']
ID position 1 unique values ['2' 'A']
ID position 2 unique values ['1' '5' 'N']
ID position 3 unique values ['0' '1' '3' 'A']
ID position 4 unique values ['1' '2' 'A' 'X']
ID position 5 unique values ['G']


In [61]:
# TEST OHSU 
for pos in np.arange(6):
    res =  np.unique([i[pos] for i in d1['id'].unique()])
    print(f'ID position {pos} unique values {res}')

ID position 0 unique values ['0']
ID position 1 unique values ['2' 'A']
ID position 2 unique values ['1' '5' 'N']
ID position 3 unique values ['0' '1' '3' 'A']
ID position 4 unique values ['0' '1' '2' 'A' 'X']
ID position 5 unique values ['G' 'P']


In [None]:
SLACK
# BACKGROUND cohorts we do (cohort_reads, sample_number)- KEEP pipeline as such
# cohort_reads=[0,1,3, Any]
# sample_number=[1,2,10, Any]
# except Any, Any
# FOREGROUND  (cohort_reads, sample_number) means
# sample_reads=[0]
# cohort_reads=[0,2]
# sample_number(rest of cohort) =[1, 5]

In [26]:
print(d1['id'].unique())

['0NN00PC' '0NNN1PC' '0NN11PC' '0NN31PC' '0NNA1PC' '0NNN2PC' '0NN12PC'
 '0NN32PC' '0NNA2PC' '0NNNXPC' '0NN1XPC' '0NN3XPC' '0NNAXPC' '0NNNAPC'
 '0NN1APC' '0NN3APC' '0N100PC' '0N1N1PC' '0N111PC' '0N131PC' '0N1A1PC'
 '0N1N2PC' '0N112PC' '0N132PC' '0N1A2PC' '0N1NXPC' '0N11XPC' '0N13XPC'
 '0N1AXPC' '0N1NAPC' '0N11APC' '0N13APC' '02100PC' '021N1PC' '02111PC'
 '02131PC' '021A1PC' '021N2PC' '02112PC' '02132PC' '021A2PC' '021NXPC'
 '0211XPC' '0213XPC' '021AXPC' '021NAPC' '0211APC' '0213APC' '0N500PC'
 '0N5N1PC' '0N511PC' '0N531PC' '0N5A1PC' '0N5N2PC' '0N512PC' '0N532PC'
 '0N5A2PC' '0N5NXPC' '0N51XPC' '0N53XPC' '0N5AXPC' '0N5NAPC' '0N51APC'
 '0N53APC' '02500PC' '025N1PC' '02511PC' '02531PC' '025A1PC' '025N2PC'
 '02512PC' '02532PC' '025A2PC' '025NXPC' '0251XPC' '0253XPC' '025AXPC'
 '025NAPC' '0251APC' '0253APC' '0NN00PA' '0NNN1PA' '0NN11PA' '0NN31PA'
 '0NNA1PA' '0NNN2PA' '0NN12PA' '0NN32PA' '0NNA2PA' '0NNNXPA' '0NN1XPA'
 '0NN3XPA' '0NNAXPA' '0NNNAPA' '0NN1APA' '0NN3APA' '0N100PA' '0N1N1PA'
 '0N11

In [106]:
for i in np.unique(collect_IDS):
    if i[3] == '0':
        print(i)

00101GA
00102GA
0010AGA
0010XGA
00501GA
00502GA
0050AGA
0050XGA
02101GA
02102GA
0210AGA
0210XGA
02501GA
02502GA
0250AGA
0250XGA
0NN01GA
0NN02GA
0NN0AGA
0NN0XGA


In [108]:
for i in d1['id'].unique():
    if i[3] == '0'  and i[6] == 'A' and i[5] == 'G':
        print(i)

0NN00GA
0N100GA
02100GA
0N500GA
02500GA


In [107]:
for i in d1['id'].unique():
    if i[3] == 'N' and i[6] == 'A' and i[5] == 'G':
        print(i)

0NNN1GA
0NNN2GA
0NNNXGA
0NNNAGA
0N1N1GA
0N1N2GA
0N1NXGA
0N1NAGA
021N1GA
021N2GA
021NXGA
021NAGA
0N5N1GA
0N5N2GA
0N5NXGA
0N5NAGA
025N1GA
025N2GA
025NXGA
025NAGA


In [102]:
d1.loc[d1['id'] == '02100GC']

Unnamed: 0,id,motif_filter,min_sample_reads,#_of_cohort_samples,reads_per_cohort_sample,normal_cohort_id,#_normal_samples_allowed,reads_per_normal_sample
192,02100GC,1,0,1,2,GTEx,0,0


In [100]:
d1.loc[d1['id'] == '021N2PA']

Unnamed: 0,id,motif_filter,min_sample_reads,#_of_cohort_samples,reads_per_cohort_sample,normal_cohort_id,#_normal_samples_allowed,reads_per_normal_sample
117,021N2PA,0,0,1,2,paired,2,0


In [101]:
d1.loc[d1['id'] == '02102PA']

Unnamed: 0,id,motif_filter,min_sample_reads,#_of_cohort_samples,reads_per_cohort_sample,normal_cohort_id,#_normal_samples_allowed,reads_per_normal_sample
