In [2]:
import glob 
import os 
import numpy as np 
from collections import defaultdict
import tarfile
import pandas as pd
#import ipynb

In [17]:
# Tar archive with the OHSU kmer lists; it is opened with tarfile.open() further down.
tar_file_OHSU = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/Aug21_graph_data_updatedfilters/OHSU_kmer_lists_Dec15.tar.gz'
# Root folder of the ETH results; get_eth_path() looks for 'filter_<sample>' sub-folders below it.
base_folder_ETH = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102'
# Glob pattern (note the trailing '*') for the commit-specific sub-folder; expanded by glob.glob().
commit_folder_ETH = 'commit_d4aee54*'

In [18]:
# Substrings an OHSU file name must contain to be kept (restricts to one cohort background)
pattern_restrict = ['NormalCohortGtexcore_']
# Substrings identifying the samples to keep; a file is kept when its name contains one of them
samples = ['TCGA-BH-A18V-01A-11R-A12D-07.all','TCGA-C8-A12P-01A-11R-A115-07.all',
           'TCGA-A2-A0D2-01A-21R-A034-07.all','TCGA-A2-A0SX-01A-12R-A084-07.all',
           'TCGA-AO-A0JM-01A-21R-A056-07.all']

# Helper functions: map OHSU file names to their ETH equivalents

In [19]:
def ohsu_to_eth(path):
    '''Translate an OHSU file name into the equivalent ETH file name.

    The OHSU name carries the background cohort as a separate
    'NormalCohort<X>' field, whereas the ETH name embeds a short cohort label
    right after 'FiltNormals'. Example:

        J_<sample>_ref_NormalCohortGtexcore_SampleLim0CohortLim0.0Across10_FiltNormalsCohortlim0.0Across0_FiltUniprot.tsv
     -> G_<sample>_ref_SampleLim0.0CohortLim0.0Across10_FiltNormalsGtexcoreCohortlim0.0Across0_FiltUniprot.tsv

    Parameters
    ----------
    path : str
        OHSU file name.

    Returns
    -------
    tuple
        (eth_name, sample) where sample is the second '_'-separated field of
        the name, or (None, None) when no known cohort token occurs in `path`.
    '''
    cohort = {'NormalCohortGtexcore_': 'Gtexcore',
              'NormalCohortMatched': 'Matched',
              'NormalCohortGtexTcga': 'Alls'}

    matching_keys = [k for k in cohort if k in path]
    if not matching_keys:
        return None, None
    key = matching_keys[0]

    # Drop the cohort field together with its '_' separator. Only the Gtexcore
    # key includes that separator, so for the other keys removing the bare key
    # would leave a double underscore ('ref__SampleLim...').
    # NOTE(review): assumes Matched/GtexTcga names use the same
    # '_ref_<token>_SampleLim' layout as Gtexcore - confirm against the archive.
    token = key if key.endswith('_') else key + '_'
    if token not in path:
        # Token not followed by '_': fall back to removing the bare key.
        token = key
    path = path.replace(token, '')

    # Re-insert the cohort label at its ETH position, right after 'FiltNormals'
    path = path.replace('FiltNormalsC', 'FiltNormals{}C'.format(cohort[key]))
    # Prefix: 'J_' in OHSU names becomes 'G_' in ETH names
    path = path.replace('J_', 'G_')
    # The integer preceding 'CohortLim' (the SampleLim value) is written as a float
    path = path.replace('CohortLim', '.0CohortLim')
    sample = path.split('_')[1]
    # 'Any' in OHSU names corresponds to 'None' in ETH names
    path = path.replace('Any', 'None')
    return path, sample

In [20]:
def get_eth_path(base_folder_ETH, commit_folder_ETH, file_name_eth=None, sample=None):
    '''Resolve an ETH file name to the path of its 'part*' file on disk.

    The searched location is
    <base_folder_ETH>/filter_<sample>/<commit_folder_ETH>/<file_name_eth>/part*
    where any component may contain glob wildcards.

    Parameters
    ----------
    base_folder_ETH : str
        Root folder of the ETH results.
    commit_folder_ETH : str
        Name or glob pattern of the commit sub-folder.
    file_name_eth : str, optional
        ETH file name (a directory holding the part files).
    sample : str, optional
        Sample identifier used to build the 'filter_<sample>' folder name.

    Returns
    -------
    str or None
        The first match in sorted order, or None when `file_name_eth` or
        `sample` is missing or nothing matches.
    '''
    # Without both pieces no path can be built ('filter_' + None would raise).
    if file_name_eth is None or sample is None:
        return None
    pattern = os.path.join(base_folder_ETH, 'filter_' + sample, commit_folder_ETH,
                           file_name_eth, 'part*')
    # glob returns matches in arbitrary order; sort so the chosen file is reproducible.
    matches = sorted(glob.glob(pattern))
    return matches[0] if matches else None

In [21]:
def get_file_pairs(tar_file_OHSU, base_folder_ETH, commit_folder_ETH, pattern_restrict, samples):
    '''Pair OHSU file names from the tar archive with existing ETH file paths.

    Parameters
    ----------
    tar_file_OHSU : str
        Path to the tar archive listing the OHSU files.
    base_folder_ETH, commit_folder_ETH : str
        Passed through to get_eth_path() to locate the ETH counterpart.
    pattern_restrict : list of str
        An OHSU name is kept only if it contains at least one of these substrings.
    samples : list of str
        Of those, a name is kept only if it contains at least one of these substrings.

    Returns
    -------
    dict
        {'eth': [paths], 'ohsu': [names]} with one entry per matched pair,
        aligned by index. A report is printed for every candidate.
    '''
    def _keep_matching(names, needles):
        # Keep names containing any needle, ordered by needle first and archive
        # order second. Each name is kept once even if several needles match,
        # otherwise it would be paired (and counted) repeatedly.
        seen = set()
        kept = []
        for needle in needles:
            for name in names:
                if needle in name and name not in seen:
                    seen.add(name)
                    kept.append(name)
        return kept

    # Get the file name pairs
    file_pair = {'eth': [], 'ohsu': []}
    true_pairs = 0
    with tarfile.open(tar_file_OHSU, "r:*") as tar:
        file_names_OHSU = tar.getnames()

    # restrict to cohort background, then to samples
    file_names_OHSU_filtered = _keep_matching(file_names_OHSU, pattern_restrict)
    file_names_OHSU_filtered = _keep_matching(file_names_OHSU_filtered, samples)
    total_pairs = len(file_names_OHSU_filtered)

    for name_ohsu in file_names_OHSU_filtered:
        # Convert name of the file
        name_eth, sample = ohsu_to_eth(name_ohsu)
        # Get the full path
        eth_path = get_eth_path(base_folder_ETH, commit_folder_ETH, name_eth, sample)
        if (eth_path is not None) and os.path.isfile(eth_path):
            print('\n TRUE PAIR:')
            true_pairs += 1
            file_pair['eth'].append(eth_path)
            file_pair['ohsu'].append(name_ohsu)
        else:
            print('\n FALSE PAIR:')
        print('Name OHSU is : {}'.format(name_ohsu))
        print('Equivalent Name ETH is : {}'.format(name_eth))
        print('Path to check is : {}'.format(eth_path))
        print('\n ')

    print('{}/{} selected OHSU files to pair have found a match'.format(true_pairs, total_pairs))
    return file_pair

### Apply: get the file pair equivalence

In [22]:
# Build the OHSU-name / ETH-path pairs for the configured cohort pattern and samples;
# a TRUE/FALSE PAIR report is printed for every candidate file.
file_pair = get_file_pairs(tar_file_OHSU, base_folder_ETH, commit_folder_ETH, pattern_restrict, samples)


 TRUE PAIR:
Name OHSU is : J_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_NormalCohortGtexcore_SampleLim0CohortLim0.0Across10_FiltNormalsCohortlim0.0Across0_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleLim0.0CohortLim0.0Across10_FiltNormalsGtexcoreCohortlim0.0Across0_FiltUniprot.tsv
Path to check is : /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-BH-A18V-01A-11R-A12D-07.all/commit_d4aee54_GTEXcore/G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleLim0.0CohortLim0.0Across10_FiltNormalsGtexcoreCohortlim0.0Across0_FiltUniprot.tsv/part-00000-ca83b692-a64d-4b9f-b27c-762748c7fcce-c000.csv

 

 TRUE PAIR:
Name OHSU is : J_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_NormalCohortGtexcore_SampleLim0CohortLim0.0Across10_FiltNormalsCohortlim10.0Across2_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleLim0.0CohortLim0.0Across10_FiltNormalsGt


 TRUE PAIR:
Name OHSU is : J_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_NormalCohortGtexcore_SampleLim2CohortLim1.0Across2_FiltNormalsCohortlim3.0AcrossAny_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleLim2.0CohortLim1.0Across2_FiltNormalsGtexcoreCohortlim3.0AcrossNone_FiltUniprot.tsv
Path to check is : /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-BH-A18V-01A-11R-A12D-07.all/commit_d4aee54_GTEXcore/G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleLim2.0CohortLim1.0Across2_FiltNormalsGtexcoreCohortlim3.0AcrossNone_FiltUniprot.tsv/part-00000-52841081-1351-4106-b492-b52b7a9f0c61-c000.csv

 

 TRUE PAIR:
Name OHSU is : J_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_NormalCohortGtexcore_SampleLim2CohortLim1.0Across2_FiltNormalsCohortlimAnyAcross2_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleLim2.0CohortLim1.0Across2_FiltNormals


 TRUE PAIR:
Name OHSU is : J_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_NormalCohortGtexcore_SampleLim2CohortLim5.0Across2_FiltNormalsCohortlim0.0Across0_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleLim2.0CohortLim5.0Across2_FiltNormalsGtexcoreCohortlim0.0Across0_FiltUniprot.tsv
Path to check is : /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-BH-A18V-01A-11R-A12D-07.all/commit_d4aee54_GTEXcore/G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleLim2.0CohortLim5.0Across2_FiltNormalsGtexcoreCohortlim0.0Across0_FiltUniprot.tsv/part-00000-e396d39c-55b7-4df4-aa1e-6025a8066b1f-c000.csv

 

 TRUE PAIR:
Name OHSU is : J_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_NormalCohortGtexcore_SampleLim2CohortLim5.0Across2_FiltNormalsCohortlim10.0Across2_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleLim2.0CohortLim5.0Across2_FiltNormalsGtexcor


 TRUE PAIR:
Name OHSU is : J_TCGA-C8-A12P-01A-11R-A115-07.all_ref_NormalCohortGtexcore_SampleLim2CohortLim0.0Across1_FiltNormalsCohortlim3.0AcrossAny_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-C8-A12P-01A-11R-A115-07.all_ref_SampleLim2.0CohortLim0.0Across1_FiltNormalsGtexcoreCohortlim3.0AcrossNone_FiltUniprot.tsv
Path to check is : /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-C8-A12P-01A-11R-A115-07.all/commit_d4aee54_GTEXcore/G_TCGA-C8-A12P-01A-11R-A115-07.all_ref_SampleLim2.0CohortLim0.0Across1_FiltNormalsGtexcoreCohortlim3.0AcrossNone_FiltUniprot.tsv/part-00000-301e3bc8-52c7-4760-bae3-45f197760794-c000.csv

 

 TRUE PAIR:
Name OHSU is : J_TCGA-C8-A12P-01A-11R-A115-07.all_ref_NormalCohortGtexcore_SampleLim2CohortLim0.0Across1_FiltNormalsCohortlimAnyAcross2_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-C8-A12P-01A-11R-A115-07.all_ref_SampleLim2.0CohortLim0.0Across1_FiltNormals


 TRUE PAIR:
Name OHSU is : J_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_NormalCohortGtexcore_SampleLim0CohortLimNoneAcrossNone_FiltNormalsCohortlim0.0Across0_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexcoreCohortlim0.0Across0_FiltUniprot.tsv
Path to check is : /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-A2-A0D2-01A-21R-A034-07.all/commit_d4aee54_GTEXcore/G_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexcoreCohortlim0.0Across0_FiltUniprot.tsv/part-00000-7f7f4b26-5fa8-49cd-bc35-d201cb69a9f7-c000.csv

 

 TRUE PAIR:
Name OHSU is : J_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_NormalCohortGtexcore_SampleLim0CohortLimNoneAcrossNone_FiltNormalsCohortlim10.0Across2_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_SampleLim0.0CohortLimNoneAcrossNon


 TRUE PAIR:
Name OHSU is : J_TCGA-A2-A0SX-01A-12R-A084-07.all_ref_NormalCohortGtexcore_SampleLim0CohortLim1.0Across2_FiltNormalsCohortlimAnyAcross2_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-A2-A0SX-01A-12R-A084-07.all_ref_SampleLim0.0CohortLim1.0Across2_FiltNormalsGtexcoreCohortlimNoneAcross2_FiltUniprot.tsv
Path to check is : /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-A2-A0SX-01A-12R-A084-07.all/commit_d4aee54_GTEXcore/G_TCGA-A2-A0SX-01A-12R-A084-07.all_ref_SampleLim0.0CohortLim1.0Across2_FiltNormalsGtexcoreCohortlimNoneAcross2_FiltUniprot.tsv/part-00000-e839bb39-6b77-4ac6-95e8-798ccc7d4528-c000.csv

 

 TRUE PAIR:
Name OHSU is : J_TCGA-A2-A0SX-01A-12R-A084-07.all_ref_NormalCohortGtexcore_SampleLim0CohortLim5.0Across10_FiltNormalsCohortlim0.0Across0_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-A2-A0SX-01A-12R-A084-07.all_ref_SampleLim0.0CohortLim5.0Across10_FiltNormalsGtex


 TRUE PAIR:
Name OHSU is : J_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_NormalCohortGtexcore_SampleLim0CohortLim0.0Across1_FiltNormalsCohortlim3.0Across10_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_SampleLim0.0CohortLim0.0Across1_FiltNormalsGtexcoreCohortlim3.0Across10_FiltUniprot.tsv
Path to check is : /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-AO-A0JM-01A-21R-A056-07.all/commit_d4aee54_GTEXcore/G_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_SampleLim0.0CohortLim0.0Across1_FiltNormalsGtexcoreCohortlim3.0Across10_FiltUniprot.tsv/part-00000-9ca3603b-1394-43fe-a8d9-5bce771a75da-c000.csv

 

 TRUE PAIR:
Name OHSU is : J_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_NormalCohortGtexcore_SampleLim0CohortLim0.0Across1_FiltNormalsCohortlim3.0Across2_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_SampleLim0.0CohortLim0.0Across1_FiltNormalsGtexc


 TRUE PAIR:
Name OHSU is : J_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_NormalCohortGtexcore_SampleLim2CohortLim5.0Across2_FiltNormalsCohortlim0.0Across0_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_SampleLim2.0CohortLim5.0Across2_FiltNormalsGtexcoreCohortlim0.0Across0_FiltUniprot.tsv
Path to check is : /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-AO-A0JM-01A-21R-A056-07.all/commit_d4aee54_GTEXcore/G_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_SampleLim2.0CohortLim5.0Across2_FiltNormalsGtexcoreCohortlim0.0Across0_FiltUniprot.tsv/part-00000-65626542-4882-48e0-9892-cfc35687d7ea-c000.csv

 

 TRUE PAIR:
Name OHSU is : J_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_NormalCohortGtexcore_SampleLim2CohortLim5.0Across2_FiltNormalsCohortlim10.0Across2_FiltUniprot.tsv
Equivalent Name ETH is : G_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_SampleLim2.0CohortLim5.0Across2_FiltNormalsGtexcor

# Example: load the kmers of a file pair


In [40]:
# Indices into file_pair selecting which pairs to load
valid_idx = [0]
# Maps a file label to the array of unique kmers found in that file
simple_content = {}
with tarfile.open(tar_file_OHSU, "r:*") as tar:
    for idx in valid_idx:
        # Despite the df_ prefix, both variables are plain lists of the 'kmer' column
        df_eth = pd.read_csv(file_pair['eth'][idx], sep="\t", usecols = ['kmer'])['kmer'].tolist()
        # The OHSU file is read directly from the archive member
        df_ohsu = pd.read_csv(tar.extractfile(file_pair['ohsu'][idx]), sep="\t", usecols = ['kmer'])['kmer'].tolist()
        # ETH label: name of the directory that contains the part file
        label_eth = file_pair['eth'][idx].split('/')[-2]
        label_ohsu = file_pair['ohsu'][idx]
        simple_content[label_ohsu] = np.unique(df_ohsu)
        simple_content[label_eth] = np.unique(df_eth)

print('done')

done


In [41]:
# Display the loaded kmer arrays, keyed by file label
simple_content

{'J_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_NormalCohortGtexcore_SampleLim0CohortLim0.0Across10_FiltNormalsCohortlim0.0Across0_FiltUniprot.tsv': array(['AARHSKFFP', 'AFFPRGEGW', 'AGGGVPLVG', 'AGTGRRGKG', 'AIFIGWGGG',
        'ALPAWQVYL', 'ALPSLLAGG', 'AQPHPSTPQ', 'AQPHTSTPQ', 'ATAQPHPST',
        'ATAQPHTST', 'ATFTFMLVR', 'ATTAGTGRR', 'CHLNGWGRG', 'CIKSAARHS',
        'DCIKSAARH', 'EALPAWQVY', 'EATFTFMLV', 'EATTAGTGR', 'EEKVSQPEA',
        'EKVSQPEAL', 'FGGVGWLWV', 'FIGWGGGAV', 'FMLVRSRDG', 'FPFGGVGWL',
        'FTFMLVRSR', 'GGGAVGGRD', 'GGGVPLVGE', 'GGVGWLWVG', 'GGVPLVGET',
        'GLLLAFFPR', 'GPAIFIGWG', 'GRGSFLLLV', 'GTGRRGKGA', 'GVGWLWVGF',
        'GWGGGAVGG', 'GWGRGSFLL', 'HLNGWGRGS', 'HPSTPQPVL', 'HRYTEDGCP',
        'HTSTPQPVL', 'IFIGWGGGA', 'IGWGGGAVG', 'IKSAARHSK', 'IMVGLLLAF',
        'KMEATTAGT', 'KSAARHSKF', 'KVSQPEALP', 'LAFFPRGEG', 'LAGGGVPLV',
        'LEATFTFML', 'LIMVGLLLA', 'LLAFFPRGE', 'LLAGGGVPL', 'LLLAFFPRG',
        'LNGWGRGSF', 'LPSLLAGGG', 'LSRSHRYTE', 'MEATTAGT

# Bonus helper functions: convert a path into a condition vector

In [38]:
def path_to_condition(all_paths, sort_legend=False):
    '''Parse ETH file names into a table of experimental conditions.

    Every 'None' in a name is first replaced by '0' so that the thresholds can
    be parsed as numbers. Only the basename of each path is parsed. Example:
    '..._ref_SampleLim2.0CohortLim1.0Across2_FiltNormalsGtexcoreCohortlim3.0AcrossNone_FiltUniprot.tsv'
    gives sample_expr=2.0, foreground_cohort_expr=1.0, foreground_cohort_samples=2,
    background_cohort_expr=3.0, background_cohort_samples=0.

    Parameters
    ----------
    all_paths : list of str
        ETH file names or paths.
    sort_legend : bool, optional
        If True, sort the rows in descending order by sample_expr, then
        foreground_cohort_expr, foreground_cohort_samples,
        background_cohort_expr and background_cohort_samples. Default False
        (input order is kept).

    Returns
    -------
    pandas.DataFrame
        One row per input, with the five numeric condition columns,
        'background_cohort_id' (the text between 'Normals' and 'lim', e.g.
        'GtexcoreCohort') and 'original_name'. Note that 'original_name' holds
        the name AFTER the 'None' -> '0' substitution.
    '''
    # None handling: make 'None' thresholds parseable as numbers
    all_paths = [path.replace('None', '0') for path in all_paths]
    names = [os.path.basename(path) for path in all_paths]

    # Built-in float/int replace np.float/np.int, which were removed in NumPy 1.24.
    sample_expr = [float(name.split('ref_SampleLim')[1].split('Cohort')[0])
                   for name in names]

    foreground_cohort_expr = [float(name.split('CohortLim')[1].split('Across')[0])
                              for name in names]

    # First 'Across' belongs to the foreground filter, second to the background filter
    foreground_cohort_samples = [int(name.split('Across')[1].split('_Filt')[0])
                                 for name in names]

    # Lower-case 'Cohortlim' marks the background threshold
    background_cohort_expr = [float(name.split('Cohortlim')[1].split('Across')[0])
                              for name in names]

    background_cohort_samples = [int(name.split('Across')[2].split('_FiltUn')[0])
                                 for name in names]

    background_cohort_id = [name.split('Normals')[1].split('lim')[0] for name in names]

    legend_quant = pd.DataFrame({'sample_expr': sample_expr,
                                 'foreground_cohort_expr': foreground_cohort_expr,
                                 'foreground_cohort_samples': foreground_cohort_samples,
                                 'background_cohort_expr': background_cohort_expr,
                                 'background_cohort_samples': background_cohort_samples,
                                 'background_cohort_id': background_cohort_id,
                                 'original_name': all_paths})

    if sort_legend:
        # A single multi-key sort gives the hierarchical order; chaining
        # single-key sorts would only keep the last key.
        legend_quant = legend_quant.sort_values(
            ['sample_expr',
             'foreground_cohort_expr',
             'foreground_cohort_samples',
             'background_cohort_expr',
             'background_cohort_samples'],
            ascending=False)
    return legend_quant

### Apply: to convert a path into a condition vector 

In [39]:
# The ETH file name is the directory holding the part file, i.e. the parent folder of each stored path
eth_names = [os.path.basename(os.path.dirname(path)) for path in file_pair['eth']]
# Parse every name into its filtering conditions and display the resulting table
path_to_condition(eth_names)

Unnamed: 0,sample_expr,foreground_cohort_expr,foreground_cohort_samples,background_cohort_expr,background_cohort_samples,background_cohort_id,original_name
0,0.0,0.0,10,0.0,0,GtexcoreCohort,G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleL...
1,0.0,0.0,10,10.0,2,GtexcoreCohort,G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleL...
2,0.0,0.0,10,3.0,10,GtexcoreCohort,G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleL...
3,0.0,0.0,10,3.0,2,GtexcoreCohort,G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleL...
4,0.0,0.0,10,3.0,0,GtexcoreCohort,G_TCGA-BH-A18V-01A-11R-A12D-07.all_ref_SampleL...
...,...,...,...,...,...,...,...
595,2.0,0.0,0,10.0,2,GtexcoreCohort,G_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_SampleL...
596,2.0,0.0,0,3.0,10,GtexcoreCohort,G_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_SampleL...
597,2.0,0.0,0,3.0,2,GtexcoreCohort,G_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_SampleL...
598,2.0,0.0,0,3.0,0,GtexcoreCohort,G_TCGA-AO-A0JM-01A-21R-A056-07.all_ref_SampleL...
