In [1]:
import os 
import pandas as pd
from collections import defaultdict
import numpy as np 
import glob
import gzip
import pickle
from helper_modular import *

# Previous steps


### Step 1: 


Get a list of kmers passing the filtering.
This can also be a kmer "longlist", i.e. a pool of kmers from multiple filter experiments. The peptide-to-experiment correspondence will then be stored in a table.


### Step 2

Given a list of kmers, grep the corresponding metadata information from the ImmunoPepper metadata file (very big, no kmer information) 

See /GitHub/projects2020_ohsu/eth/peptide_search_format/p20220115_format_peptides_longlist_prior.sh

### Step 3. 



This notebook performs the following tasks, given a kmer_metadata file:

1. Generate a fasta file with the metadata relevant for the tryptic digestion.
2. Generate a table containing the peptide IDs per sample.


# Define Samples


In [2]:
### Paths ###
# Inputs
# Cohort selector: 'brca' or 'ov'. It decides which sample list and which
# cohort base folder are used by every cell below.
run_type = 'ov'

if run_type == 'brca':
    samples = ['TCGA-C8-A12P-01A-11R-A115-07',
               'TCGA-AO-A0JM-01A-21R-A056-07',
               'TCGA-BH-A18V-01A-11R-A12D-07',
               'TCGA-A2-A0D2-01A-21R-A034-07',
               'TCGA-A2-A0SX-01A-12R-A084-07']
    basefolder = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102'
elif run_type == 'ov':
    samples = ['TCGA-25-1319-01A-01R-1565-13',
               'TCGA-25-1313-01A-01R-1565-13',
               'TCGA-61-2008-01A-02R-1568-13',
               'TCGA-24-1431-01A-01R-1566-13',
               'TCGA-24-2298-01A-01R-1569-13']
    basefolder = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374'
else:
    # Fail here rather than with a NameError on `samples`/`basefolder` later.
    raise ValueError(f"Unknown run_type {run_type!r}: expected 'brca' or 'ov'")

# Outputs
filter_folder = 'filtering_samples/filters_19May_order_5ge_wAnnot_GPstar'
experiment_folder = os.path.join(basefolder, filter_folder)

# OPTIONAL Inputs
# experiment_folder already starts with basefolder, so it is used directly
# (os.path.join drops earlier components when a later one is absolute).
metadata_pooled = os.path.join(experiment_folder, 'peptides.tsv.gz')
# glob returns matches in arbitrary order; sort so that downstream
# "first match" lookups are reproducible between runs.
longlists = sorted(glob.glob(os.path.join(experiment_folder, '*longlist*')))
print(experiment_folder)
# No kmer whitelist by default; a later cell may replace this value.
whitelist_kmer = None


/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar


# Preprocess the annotation 
### Input: Annotation

In [3]:
### Annotation coordinates ###

# GENCODE v32 annotation (GTF) used to derive the CDS lookups below.
ann_path = '/cluster/work/grlab/projects/projects2020_OHSU/annotation/gencode.v32.annotation.gtf'

# Parse the annotation into per-transcript / per-gene dictionaries.
# NOTE(review): the exact content of each dictionary is defined by
# helper_modular.preprocess_annot_custom -- confirm there.
(transcript_to_gene_dict,
 gene_to_transcript_dict,
 gene_cds_begin_dict,
 transcript_to_cds_dict,
 transcript_cds_begin_dict,
 transcript_to_strand) = preprocess_annot_custom(ann_path)

# Derive the two per-gene CDS collections that the main loop indexes by gene name.
gene_cds_begin_dict_bis, gene_cds_end_dict_bis = my_CDS_collection(
    transcript_to_gene_dict,
    gene_cds_begin_dict,
    transcript_to_cds_dict,
    transcript_cds_begin_dict,
    transcript_to_strand,
)

# Step 4. Format the peptide raw file  
### Input:  file meta with kmers and bi-exons matching the longlist of the samples 

In [4]:
def extract_peptide_fields(pep):
    """Map the positional fields of one metadata record to named columns.

    Parameters
    ----------
    pep : sequence
        Either the record itself (an iterable of field values) or a
        two-element wrapper whose second element is the record, e.g. an
        ``(index, row)`` pair.

    Returns
    -------
    dict
        ``{column_name: value}`` for as many leading columns as the record
        has fields; a short record yields a correspondingly shorter dict.

    Raises
    ------
    IndexError
        If the record has more fields than there are known column names.
    """
    cols_correct = ['kmer', 'kmer_coord', 'peptide', 'id', 'readFrame', 'readFrameAnnotated',
                    'geneName', 'geneChr', 'geneStrand',
                    'mutationMode', 'hasStopCodon', 'isInJunctionList',
                    'isIsolated', 'variantComb', 'variantSegExpr', 'modifiedExonsCoord',
                    'originalExonsCoord',
                    'vertexIdx', 'kmerType', 'dummy1', 'dummy2', 'dummy3']

    # Unwrap an (index, record) pair. A genuine two-field record has a plain
    # string as its second element and must NOT be unwrapped, otherwise the
    # characters of that string would be treated as the fields.
    if len(pep) == 2:
        candidate = pep[1]
        if hasattr(candidate, '__iter__') and not isinstance(candidate, (str, bytes)):
            pep = candidate

    res = {}
    for idx, value in enumerate(pep):
        if idx >= len(cols_correct):
            raise IndexError(
                f'record has more than the {len(cols_correct)} expected fields')
        res[cols_correct[idx]] = value
    return res

In [33]:
# Select the sample processed by the cells below; index 4 is the last of
# the five entries in `samples`. Change the index to process another sample.
sample = samples[4]
print(sample)

TCGA-24-2298-01A-01R-1569-13


In [34]:
### Files ###

# Meta Input File
# The per-sample raw grep metadata file
# (G_<sample>_grep_metadata_raw.tsv.gz inside experiment_folder) is not used
# in this run, so no file path is handed to the reader in the main cell.
file_meta = None

# Fasta output File
# NOTE(review): the output name is the same with and without a kmer
# whitelist, so a whitelist run overwrites the unrestricted fasta --
# confirm whether a distinct file name is wanted for whitelist runs.
file_save = os.path.join(experiment_folder, f'G_{sample}_pool_kmer.fa')



In [35]:
### OPTIONAL: Restrict the fasta to a whitelist ###
# Switch: set to True to load the pickled whitelist. Once loaded, the same
# name holds the whitelist object itself and is used for membership tests
# in the main cell.
whitelist_kmer = False
if whitelist_kmer:
    # pickle.load can execute arbitrary code -- only load trusted files.
    whitelist_path = os.path.join(experiment_folder, 'G_TCGA_Allsamples_intersect.pickle')
    with open(whitelist_path, 'rb') as handle:
        whitelist_kmer = pickle.load(handle)
    print(len(whitelist_kmer))


In [36]:
### OPTIONAL: Pooled kmer-metadata all filtering output ###
if metadata_pooled:
    # Read the pooled metadata table (tab-separated); the shape is reported
    # before exact duplicate rows are dropped.
    meta_pooled = pd.read_csv(metadata_pooled, sep='\t')
    print(meta_pooled.shape)
    meta_pooled = meta_pooled.drop_duplicates()

    # Find the longlist file of the current sample (first path containing
    # the sample name) and fail with an explicit message when there is none.
    sample_longlists = [path for path in longlists if sample in path]
    if not sample_longlists:
        raise FileNotFoundError(
            f'No longlist file found for sample {sample} '
            f'among {len(longlists)} candidate file(s)')
    longlist = sample_longlists[0]
    print(f'read {longlist}')
    kmers_sample_long = pd.read_csv(longlist, sep='\t', header=None)
    kmers_sample_long.columns = ['kmer', 'coord']
    print(kmers_sample_long.shape)

    # Keep only the pooled metadata rows whose (kmer, coord) pair is in the
    # sample longlist.
    raw_metadata = meta_pooled.merge(kmers_sample_long, how='inner', on=['kmer', 'coord'])

    # Align with the previous pipeline: drop the extra columns and swap the
    # first two columns.
    # NOTE(review): presumably this puts the kmer first and its coordinate
    # second, as extract_peptide_fields expects -- confirm against the
    # column order of peptides.tsv.gz.
    raw_metadata = raw_metadata.drop(['strand', 'keep',
                                      'junction_coordinate1', 'junction_coordinate2'], axis=1)
    column_order = [1, 0] + list(np.arange(2, len(raw_metadata.columns)))
    raw_metadata = raw_metadata.iloc[:, column_order]
    print(raw_metadata.shape)
else:
    meta_pooled = None
    # Define raw_metadata as well, so the main cell can reference it without
    # a NameError when no pooled metadata is configured.
    raw_metadata = None

(3160507, 26)
read /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/G_TCGA-24-2298-01A-01R-1569-13_kmer_longlist.tsv.gz
(145355, 2)
(1152479, 22)


In [37]:
# # Small Test 
# print(file_meta)
# ! zcat {file_meta} | head -2 
# lines = readlines_custom(file_meta)
# for line in lines[0:3]:
#     res = extract_peptide_fields(line)
#     print(res)

In [38]:
# Display the metadata input path; it was set to None earlier in this
# notebook, so the cell shows no output.
file_meta

In [39]:
### Main ###
# For every metadata record that passes the filters below, compute the
# junction annotations and append one entry to the fasta file.

# Run switches
subset_run_test = False
write_ = True
print_ = False
test = 0

pep_idx = 0                 # running index of the fasta entries written
peptide_junctions = set()   # (junction coordinates + peptide) keys already seen

with open(file_save, 'w') as sp:  # Fasta file to save

    ### Iterate over peptides
    lines = readlines_custom(file_=file_meta, pandas_df=raw_metadata)
    for line in lines:
        res = extract_peptide_fields(line)

        # Optional whitelist restriction
        if whitelist_kmer and res['kmer'] not in whitelist_kmer:
            continue

        # some peptides do not contain a junction
        if int(res['isIsolated']):
            continue

        # Genomic coordinates : get include 3' or 5' flag
        res['originalExonsCoord'] = split_coord(res['originalExonsCoord'])
        start_cds = [exon[0] for exon in gene_cds_begin_dict_bis[res['geneName']]]
        end_cds = [exon[1] for exon in gene_cds_end_dict_bis[res['geneName']]]
        pep_start, pep_end = extract_end_starts(res['originalExonsCoord'],
                                                res['geneStrand'])
        pep_5include, pep_3include = get_include_flag(start_cds, end_cds,
                                                      pep_start, pep_end,
                                                      res['hasStopCodon'])

        # Modified genomic coordinates: records with at most two coordinate
        # entries are skipped
        res['modifiedExonsCoord'] = split_coord(res['modifiedExonsCoord'])
        if len(res['modifiedExonsCoord']) <= 2:
            continue

        jx_list, shift_list, jx_list_ori = get_nt_len_with_aa_shift(res['modifiedExonsCoord'])

        # get "junction position" flag
        (aa_junction_pos, between_codons,
         aa_junction_pos1, between_codons1,
         aa_junction_pos1_from_start) = get_aaPos_betweenFlag(shift_list, jx_list)

        # get "genomic coordinates" flag
        genome_junction_pos, genome_junction_pos1 = get_genomic_coordinates(
            res['modifiedExonsCoord'], res['geneStrand'])

#             print_stats(print_, res['kmer'], res['peptide'], res['geneStrand'], res['originalExonsCoord'],
#                         res['modifiedExonsCoord'], jx_list, jx_list_ori, genome_junction_pos,
#                         genome_junction_pos1, aa_junction_pos, aa_junction_pos1,
#                         between_codons, between_codons1)

        # Get the 2 peptides sequences
        jx_pep1, jx_pep2 = cut_peptides(res['peptide'], jx_list, between_codons,
                                        between_codons1, aa_junction_pos,
                                        aa_junction_pos1,
                                        aa_junction_pos1_from_start,
                                        print_=print_)

        # skip duplicates (Issue 3-exons?): same junction coordinates and
        # same peptide were already written
        junction_key = str(genome_junction_pos) + str(genome_junction_pos1) + res['peptide']
        if junction_key in peptide_junctions:
            continue
        peptide_junctions.add(junction_key)
        pep_idx += 1

        # write fasta file
        write_fasta(write_, sp, res['peptide'], pep_idx, aa_junction_pos,
                    aa_junction_pos1_from_start, between_codons, between_codons1,
                    pep_5include, pep_3include, res['geneName'],
                    genome_junction_pos, genome_junction_pos1,
                    res['kmer'], jx_pep1, jx_pep2, res['readFrameAnnotated'],
                    res['kmer_coord'], res['kmerType'], res['geneStrand'], do_write=True)


if write_:
    print('written to:')
    print(file_save)


written to:
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/G_TCGA-24-2298-01A-01R-1569-13_pool_kmer.fa
