In [1]:
import tarfile
import pandas as pd
import glob
import os
import numpy as np
from collections import defaultdict
import gzip
from helpers_map import *

# MAIN

In [2]:
### Inputs ###

basepath = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374' #TCGA_Breast_1102'
filtering_run = 'filtering_samples/filters_19May_order_5ge_wAnnot_GPstar'
fasta_paths = glob.glob(os.path.join(basepath, filtering_run, '*fa*'))
path_filtered = glob.glob(os.path.join(basepath, filtering_run, '*Filt*'))

In [None]:
### Main ###
for fasta in fasta_paths:
    sample = os.path.basename(fasta).split('_')[1]
    path_filtered_sample = [path for path in path_filtered if sample in path]
    print(len(path_filtered_sample))

    # Fasta
    peptides_IDS, peptides_sequences, peptides_IDS_expand, cuts_peptides = preprocess_fasta(fasta)
    fa = pd.DataFrame({'peptide_id': peptides_IDS_expand, \
    'kmers': cuts_peptides}) 
    fa = fa.drop_duplicates()
    display(fa.head())
    print(fa.shape)
    

    # Experiments
    kmer_to_filesID_ = experiments_preprocess(path_filtered_sample)
    exp = pd.DataFrame(kmer_to_filesID_, columns = ['kmers', 'experiment_ids'])
    display(exp.head())
    print(exp.shape)

    # Result: Merge fasta on experiments
    res = exp.merge(fa, on = 'kmers', how = 'inner')
    display(res.head())
    print(res.shape)
    res = res.drop('kmers', axis = 1).drop_duplicates()
    print(res.shape)
    # Result: Collapse to get unique peptide IDs
    collapse_res = defaultdict(set)
    for pep_id, exp_id in zip(res['peptide_id'], res['experiment_ids']):
        collapse_res[pep_id].update(exp_id.split(';'))
    collapse_res_df = []
    for k, v in collapse_res.items():  # Collapse 
        collapse_res_df.append((k,';'.join(v)))
    collapse_res_df = pd.DataFrame(collapse_res_df, columns = ['peptide_id', 'experiment_ids'])
    display(collapse_res_df.head())
    print(collapse_res_df.shape)
    
    # Result: add peptide sequences
    peptide_sequences_ = pd.DataFrame({'peptide_id': peptides_IDS, 'peptide_sequence':peptides_sequences})
    collapse_res_df = collapse_res_df.merge(peptide_sequences_, on = 'peptide_id', how = 'inner')
    display(collapse_res_df.head())
    print(collapse_res_df.shape)
    
    print('Number of peptide IDS')
    print(len(fa['peptide_id'].unique()))
    print(len(res['peptide_id'].unique()))
    print(len(collapse_res_df['peptide_id'].unique()))
    
    save_name = os.path.join(basepath, filtering_run, f'G_{sample}_experiments_per_peptide.tsv.gz')
    collapse_res_df.to_csv(save_name, index = None, compression = 'gzip', sep = '\t')
    print(f'Save to {save_name}')
    print('----- \n')
    

#     display(res.loc[res['peptide_id'] == '1'])

65
Load /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/G_TCGA-61-2008-01A-02R-1568-13_pool_kmer.fa.gz fasta


Unnamed: 0,peptide_id,kmers
0,1,PAPAPAAPP
1,1,DPAPAPAAP
2,1,ADPAPAPAA
3,1,AADPAPAPA
4,1,AAADPAPAP


(731592, 2)


# Debug

In [75]:
test =  exp.merge(fa, on = 'kmers', how = 'right')

In [74]:
test.loc[test['peptide_id'].isna()]

Unnamed: 0,kmers,experiment_ids,peptide_id
729,ISFGVACFH,0213AGA;0A13AGA;0AN3AGA,


In [79]:
foo = test.groupby('peptide_id').sum()

In [105]:
foo[foo['experiment_ids'] == 0]

Unnamed: 0_level_0,kmers,experiment_ids
peptide_id,Unnamed: 1_level_1,Unnamed: 2_level_1
248,ANTSTQTMGVANTSTQTMRVANTSTQTQRVANTSTQTQRVANTSTS...,0
249,ANTSTQTMGVANTSTQTMRVANTSTQTQRVANTSTQTQRVANTSTS...,0
251,ANTSTQTMGVANTSTQTMRVANTSTQTQRVANTSTQTQRVANTSTS...,0
252,PNTSTQTMGHPNTSTQTMPHPNTSTQTRPHPNTSTQARPHPNTSTG...,0
253,ANTSTQTMGVANTSTQTMRVANTSTQTQRVANTSTQTQRVANTSTS...,0


# TEST Load OHSU 

In [9]:
path = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/OHSU_may17-2023_experimentID-update.tar.gz'

In [10]:
with tarfile.open(path, 'r') as tar:
    names = tar.getnames()
    

In [11]:
[ i for i in names if 'per_peptide' in i]


['OHSU-may17-experimentID-update/expts_per_peptide',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-AO-A0JM-01A-21R-A056-07_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-24-1431-01A-01R-1566-13_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-A2-A0SX-01A-12R-A084-07_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-61-2008-01A-02R-1568-13_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-24-2298-01A-01R-1569-13_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-A2-A0D2-01A-21R-A034-07_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-25-1313-01A-01R-1565-13_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-25-1319-01A-01R-1565-13_experiments_per_peptide.tsv',
 'OHSU-may17-experimentID-update/expts_per_peptide/

In [12]:
# for i in names:
#     print(i)

In [13]:
f1 = 'OHSU-may17-experimentID-update/experiment_maps/J_TCGA-BH-A18V-01A-11R-A12D-07_experiment_map.tsv'
f2 = 'OHSU-may17-experimentID-update/expts_per_peptide/J_TCGA-BH-A18V-01A-11R-A12D-07_experiments_per_peptide.tsv'


In [14]:
with tarfile.open(path, 'r') as tar:
    d1 = pd.read_csv(tar.extractfile(f1), sep = '\t')
    d2 = pd.read_csv(tar.extractfile(f2), sep = '\t')

In [15]:
d1.head()

Unnamed: 0,id,motif_filter,min_sample_reads,#_of_cohort_samples,reads_per_cohort_sample,normal_cohort_id,#_normal_samples_allowed,reads_per_normal_sample
0,0AN00PC,1,0,0,0,paired,0,0
1,0AN11PC,1,0,0,0,paired,1,1
2,0AN31PC,1,0,0,0,paired,1,3
3,0ANA1PC,1,0,0,0,paired,1,-1
4,0AN12PC,1,0,0,0,paired,2,1


In [16]:
d2.head()

Unnamed: 0,peptide_id,peptide_sequence,experiment_ids
0,100001,MEDYTKIEKIGEVFRMCLCRIPGYISSLSFFPWI,0AN11PC;0AN31PC;0ANA1PC;0AN12PC;0AN32PC;0ANA2P...
1,100002,IEEHNIKIYHLPDAESDEDEDFKEQTRLLKASIPFSVVGSNQLIEA...,0AN11PC;0AN31PC;0ANA1PC;0AN12PC;0AN32PC;0ANA2P...
2,100003,QPANCKVESLAMFLGELSLIDADPYLKYLPSVIAGAAFHLALYTVT...,0AN11PC;0AN31PC;0ANA1PC;0AN12PC;0AN32PC;0ANA2P...
3,100004,MWYRLRLLKPQPNIIPTVKDGVSPCSPGWSRTLGLKWSACLGLQKW,0AN11PC;0AN31PC;0ANA1PC;0AN12PC;0AN32PC;0ANA2P...
4,100005,LIVIPATYYLWPRDQNAEQIRLKNIRKVYGRCMWYRLRLLKPQPNI...,0AN11PC;0AN31PC;0ANA1PC;0AN12PC;0AN32PC;0ANA2P...


In [153]:
len(d2['peptide_id'].unique())

13705

In [156]:
d2.shape


(13705, 3)

# TESTS Experiment ID Matching

In [29]:
# GENERATE IDS # Test version #TODO ===> NEED to Generate the parameter file 
collect_IDS = []
for filt_file in path_filtered_sample:
    ID_split = path_to_expID(filt_file)
    ID = expID_to_block(ID_split)
    collect_IDS.append(ID)
    


In [60]:
# TEST ETH 
for pos in np.arange(6):
    res =  np.unique([i[pos] for i in np.unique(collect_IDS)])
    print(f'ID position {pos} unique values {res}')

ID position 0 unique values ['0']
ID position 1 unique values ['2' 'A']
ID position 2 unique values ['1' '5' 'N']
ID position 3 unique values ['0' '1' '3' 'A']
ID position 4 unique values ['1' '2' 'A' 'X']
ID position 5 unique values ['G']


In [61]:
# TEST OHSU 
for pos in np.arange(6):
    res =  np.unique([i[pos] for i in d1['id'].unique()])
    print(f'ID position {pos} unique values {res}')

ID position 0 unique values ['0']
ID position 1 unique values ['2' 'A']
ID position 2 unique values ['1' '5' 'N']
ID position 3 unique values ['0' '1' '3' 'A']
ID position 4 unique values ['0' '1' '2' 'A' 'X']
ID position 5 unique values ['G' 'P']


In [None]:
SLACK
# BACKGROUND cohorts we do (cohort_reads, sample_number)- KEEP pipeline as such
# cohort_reads=[0,1,3, Any]
# sample_number=[1,2,10, Any]
# except Any, Any
# FOREGROUND  (cohort_reads, sample_number) means
# sample_reads=[0]
# cohort_reads=[0,2]
# sample_number(rest of cohort) =[1, 5]

In [26]:
print(d1['id'].unique())

['0NN00PC' '0NNN1PC' '0NN11PC' '0NN31PC' '0NNA1PC' '0NNN2PC' '0NN12PC'
 '0NN32PC' '0NNA2PC' '0NNNXPC' '0NN1XPC' '0NN3XPC' '0NNAXPC' '0NNNAPC'
 '0NN1APC' '0NN3APC' '0N100PC' '0N1N1PC' '0N111PC' '0N131PC' '0N1A1PC'
 '0N1N2PC' '0N112PC' '0N132PC' '0N1A2PC' '0N1NXPC' '0N11XPC' '0N13XPC'
 '0N1AXPC' '0N1NAPC' '0N11APC' '0N13APC' '02100PC' '021N1PC' '02111PC'
 '02131PC' '021A1PC' '021N2PC' '02112PC' '02132PC' '021A2PC' '021NXPC'
 '0211XPC' '0213XPC' '021AXPC' '021NAPC' '0211APC' '0213APC' '0N500PC'
 '0N5N1PC' '0N511PC' '0N531PC' '0N5A1PC' '0N5N2PC' '0N512PC' '0N532PC'
 '0N5A2PC' '0N5NXPC' '0N51XPC' '0N53XPC' '0N5AXPC' '0N5NAPC' '0N51APC'
 '0N53APC' '02500PC' '025N1PC' '02511PC' '02531PC' '025A1PC' '025N2PC'
 '02512PC' '02532PC' '025A2PC' '025NXPC' '0251XPC' '0253XPC' '025AXPC'
 '025NAPC' '0251APC' '0253APC' '0NN00PA' '0NNN1PA' '0NN11PA' '0NN31PA'
 '0NNA1PA' '0NNN2PA' '0NN12PA' '0NN32PA' '0NNA2PA' '0NNNXPA' '0NN1XPA'
 '0NN3XPA' '0NNAXPA' '0NNNAPA' '0NN1APA' '0NN3APA' '0N100PA' '0N1N1PA'
 '0N11