In [1]:
%load_ext autoreload
%autoreload 2
import os 
import pandas as pd
from collections import defaultdict
import numpy as np 
import glob
import gzip
import pickle
from helper_longlist import *

# Previous steps


### Step 1: 


Get a list of kmers passing the filtering.
Can also be a kmer "longlist" which is a pool of kmers from multiple filter experiments. The peptide to experiment correspondance will then be stored in a table.


### Step 2

Given a list of kmers, grep the corresponding metadata information from the ImmunoPepper metadata file (very big, no kmer information) 

See /GitHub/projects2020_ohsu/eth/peptide_search_format/p20220115_format_peptides_longlist_pr
ior.sh

### Step 3. 



This notebook performs the following tasks * Given a kmer_metadata file, generate a fasta file with metadata relevant for the tryptic digestion 2. A table containing the peptide IDs per sample 


# Define Samples


In [2]:
### Paths INPUTS ###
run_type = 'ov'
filter_folder = 'filtering_samples/filters_19May_order_5ge_wAnnot_GPstar'
sample_idx = 0

if run_type == 'brca':
    basefolder = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102'
    samples = ['TCGA-C8-A12P-01A-11R-A115-07',
                      'TCGA-AO-A0JM-01A-21R-A056-07',
                      'TCGA-BH-A18V-01A-11R-A12D-07',
                      'TCGA-A2-A0D2-01A-21R-A034-07',
                      'TCGA-A2-A0SX-01A-12R-A084-07']
    sample = samples[sample_idx]
    print(sample)  
    
    experiment_folder = os.path.join(basefolder, filter_folder)
    file_meta = os.path.join(experiment_folder, f'G_{sample}_grep_metadata_raw.tsv.gz') # Generated with Grep
    metadata_pooled = None

    
    
elif run_type == 'ov':
    basefolder = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374'
    samples = ['TCGA-25-1319-01A-01R-1565-13',
                      'TCGA-25-1313-01A-01R-1565-13',
                      'TCGA-61-2008-01A-02R-1568-13',
                      'TCGA-24-1431-01A-01R-1566-13',
                      'TCGA-24-2298-01A-01R-1569-13']
    sample = samples[sample_idx]
    print(sample)  
    
    experiment_folder = os.path.join(basefolder, filter_folder)    
    file_meta = None
    metadata_pooled = os.path.join(basefolder, experiment_folder, 'peptides_25012024.tsv.gz') # Generated with a notebook

    


# OPTIONAL Inputs
longlists = glob.glob(os.path.join(basefolder, experiment_folder, '*longlist*'))
print(experiment_folder)
whitelist_kmer = None


TCGA-25-1319-01A-01R-1565-13
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar


In [3]:
### Files OUTPUT ###

# Fasta output File
if not whitelist_kmer:
    file_save = os.path.join(experiment_folder, f'G_{sample}_pool_kmer_25012024.fa')
else:
    file_save = os.path.join(experiment_folder, f'G_{sample}_pool_kmer_25012024.fa')


# Preprocess the annotation 
### Input: Annotation

In [4]:
### Annotation coordinates ###

ann_path = '/cluster/work/grlab/projects/projects2020_OHSU/annotation/gencode.v32.annotation.gtf'

transcript_to_gene_dict, gene_to_transcript_dict, \
gene_cds_begin_dict, transcript_to_cds_dict, \
transcript_cds_begin_dict, transcript_to_strand = preprocess_annot_custom(ann_path)

gene_cds_begin_dict_bis, gene_cds_end_dict_bis = my_CDS_collection( \
                                                 transcript_to_gene_dict, gene_cds_begin_dict, \
                                                 transcript_to_cds_dict, \
                                                 transcript_cds_begin_dict, transcript_to_strand)

# Step 4. Format the peptide raw file  
### Input:  file meta with kmers and bi-exons matching the longlist of the samples 

In [5]:
def extract_peptide_fields(pep):
    cols_correct = ['kmer', 'kmer_coord', 'peptide','id','readFrame','readFrameAnnotated','geneName','geneChr','geneStrand',
    'mutationMode','hasStopCodon','isInJunctionList',
    'isIsolated','variantComb','variantSegExpr','modifiedExonsCoord',
    'originalExonsCoord',
    'vertexIdx','kmerType', 'dummy1', 'dummy2', 'dummy3']
    res = {}

    if len(pep) == 2: # Case where we have a tuple
        pep = pep[1]
        
    for idx, p in enumerate(pep):
        res[cols_correct[idx]] = p
    return res 

In [6]:
### OPTIONAL: Restrict the fasta to a whitelist ###
whitelist_kmer = False  
if whitelist_kmer:
    with open(os.path.join(experiment_folder, 'G_TCGA_Allsamples_intersect.pickle'), 'rb') as handle:
        whitelist_kmer = pickle.load(handle)
    print(len(whitelist_kmer))


In [7]:
### OPTIONAL: Pooled kmer-metadata all filtering output ###
### Filtered candidates are merged to the metadata from immunopepper

if metadata_pooled:
    # reads pooled metadata
    meta_pooled = pd.read_csv(metadata_pooled, sep = '\t')
    #display(meta_pooled.head())
    print(meta_pooled.shape)
    meta_pooled = meta_pooled.drop_duplicates()

    # get longlist fr sample
    longlist = [i for i in longlists if sample in i][0]
    print(f'read {longlist}')
    kmers_sample_long = pd.read_csv(longlist, sep = '\t', header = None)
    kmers_sample_long.columns = ['kmer', 'coord']
    #display(kmers_sample_long.head())
    print(kmers_sample_long.shape)

    # extract raw metadata from longlist
    raw_metadata = meta_pooled.merge(kmers_sample_long, how = 'inner', on = ['kmer', 'coord'])

    # some data wrangling to align with previous pipeline
    raw_metadata = raw_metadata.drop(['strand', 'keep', 
                                    'junction_coordinate1', 'junction_coordinate2'], axis = 1)
    raw_metadata = raw_metadata.iloc[:, [1, 0] + list(np.arange(2, len(raw_metadata.columns))) ]
    print(raw_metadata.shape)
    #display(meta_pooled.head())
else:
    raw_metadata = None

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(7538815, 31)
read /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/G_TCGA-25-1319-01A-01R-1565-13_kmer_longlist.tsv.gz
(100978, 2)
(2668517, 27)


In [8]:
# # Small Test 
# print(file_meta)
# ! zcat {file_meta} | head -2 
# lines = readlines_custom(file_meta)
# for line in lines[0:3]:
#     res = extract_peptide_fields(line)
#     print(res)

In [9]:
file_meta

In [10]:
### Main ###

subset_run_test = False
write_ = True
print_= False
test = 0 

pep_idx = 0 
peptide_junctions = set()

with open(file_save, 'w') as sp: # Fasta file to save
    
    ### Iterate over peptides 
    lines = readlines_custom(file_=file_meta, pandas_df=raw_metadata)
    for line in lines:
        res = extract_peptide_fields(line)
        
        if whitelist_kmer and res['kmer'] not in whitelist_kmer:
            continue
            

        # some peptides do not contain a junction
        if int(res['isIsolated']):
            continue 
            
        # Genomic coordinates : get include 3' or 5' flag 
        res['originalExonsCoord'] = split_coord(res['originalExonsCoord'])
        start_cds = [ first_exon[0] for first_exon in gene_cds_begin_dict_bis[res['geneName']] ] 
        end_cds = [ last_exon[1] for last_exon in gene_cds_end_dict_bis[res['geneName']] ] 
        pep_start, pep_end = extract_end_starts(res['originalExonsCoord'], res['geneStrand'])
        pep_5include, pep_3include = get_include_flag(start_cds, end_cds, 
                                                      pep_start, pep_end, 
                                                      res['hasStopCodon'] )

        # Modified genomic coordinates: 
        res['modifiedExonsCoord'] = split_coord(res['modifiedExonsCoord'])
        if len(res['modifiedExonsCoord']) <=2:
            continue
            
            
        jx_list, shift_list, jx_list_ori = get_nt_len_with_aa_shift(res['modifiedExonsCoord'])
        # get "junction position" flag
        aa_junction_pos, between_codons, \
        aa_junction_pos1, between_codons1, \
        aa_junction_pos1_from_start = get_aaPos_betweenFlag(shift_list, jx_list)
        #get "genomic coordinates" flag 
        genome_junction_pos, \
        genome_junction_pos1 = get_genomic_coordinates(res['modifiedExonsCoord'], res['geneStrand'])


#             print_stats(print_, res['kmer'], res['peptide'], res['geneStrand'], res['originalExonsCoord'], 
#                         res['modifiedExonsCoord'], jx_list, jx_list_ori, genome_junction_pos,
#                         genome_junction_pos1, aa_junction_pos, aa_junction_pos1, 
#                         between_codons, between_codons1)
        
        # Get the 2 peptides sequences 
        jx_pep1, jx_pep2 = cut_peptides(res['peptide'], jx_list, between_codons,
                                            between_codons1, aa_junction_pos, 
                                            aa_junction_pos1, 
                                            aa_junction_pos1_from_start, 
                                            print_=print_)

        # skip duplicates (Issue 3-exons?)
        if (str(genome_junction_pos) + str(genome_junction_pos1) + res['peptide'] in peptide_junctions):
            continue
        else:
            peptide_junctions.add(str(genome_junction_pos) + str(genome_junction_pos1) + res['peptide'] )
            pep_idx +=1


        # write fasta file 
        write_fasta(write_, sp, res['peptide'], pep_idx, aa_junction_pos, 
                    aa_junction_pos1_from_start, between_codons, between_codons1,
                    pep_5include, pep_3include, res['geneName'], 
                    genome_junction_pos, genome_junction_pos1, 
                    res['kmer'], jx_pep1, jx_pep2, res['readFrameAnnotated'], \
                    res['kmer_coord'], res['kmerType'], res['geneStrand'], do_write=write_)
    

if write_:
    print('written to:')
    print(file_save)


IndexError: list index out of range

In [11]:
file_meta

In [12]:
raw_metadata

Unnamed: 0,coord,index,kmer,junction_coordinate2_x,peptide,id,readFrame,readFrameAnnotated,geneName,geneChr,...,variantSegExpr,modifiedExonsCoord,originalExonsCoord,vertexIdx,junctionExpr,segmentExpr,kmerType,junction_coordinate2_y,junction_coordinate1_x,junction_coordinate1_y
0,6415714:6415723:6415603:6415621:None:None,0,AAAPPAQGE,,AGGPPPHQYPPQGWGNTYPQWQPPAPHDPSKAAAAAPPAQGEPPQP...,ENSG00000088247.17:707_697_557:0:6415895:3-exo...,1,False,ENSG00000088247.17,chr19,...,6415807;6415895;6415714;6415734;6415534;6415621,6415807;6415896;6415714;6415734;6415533;6415621,707;697;557,3-exons_9-mer,,,,6415621:6415714,6415621:6415714,6415734:6415807
1,6415714:6415723:6415603:6415621:None:None,1,AAAPPAQGE,,QPPAPHDPSKAAAAAPPAQGEPPQPPPTGQSDYTKAWEEYYKKIGE...,ENSG00000088247.17:703_697_521:0:6415832:3-exo...,1,False,ENSG00000088247.17,chr19,...,6415807;6415832;6415714;6415734;6415381;6415621,6415807;6415834;6415714;6415734;6415379;6415621,703;697;521,3-exons_9-mer,,,,6415621:6415714,6415621:6415714,6415734:6415807
2,6415714:6415723:6415603:6415621:None:None,4,AAAPPAQGE,,QPPAPHDPSKAAAAAPPAQGEPPQPPPTGQSDYTKAWEEYYKKI,ENSG00000088247.17:703_697_557:0:6415832:3-exo...,1,False,ENSG00000088247.17,chr19,...,6415807;6415832;6415714;6415734;6415534;6415621,6415807;6415834;6415714;6415734;6415533;6415621,703;697;557,3-exons_9-mer,,,,6415621:6415714,6415621:6415714,6415734:6415807
3,6415714:6415723:6415603:6415621:None:None,5,AAAPPAQGE,,AGGPPPHQYPPQGWGNTYPQWQPPAPHDPSKAAAAAPPAQGEPPQP...,ENSG00000088247.17:707_697_521:0:6415895:3-exo...,1,False,ENSG00000088247.17,chr19,...,6415807;6415895;6415714;6415734;6415381;6415621,6415807;6415896;6415714;6415734;6415379;6415621,707;697;521,3-exons_9-mer,,,,6415621:6415714,6415621:6415714,6415734:6415807
4,6415714:6415723:6415603:6415621:None:None,48,AAAPPAQGE,,KAAAAAPPAQGEPPQPPPTGQSDYTKAWEEYYKKIGESAGWRGGSS...,ENSG00000088247.17:697_521:0:6415732:2-exons,0,False,ENSG00000088247.17,chr19,...,6415714;6415732;6415381;6415621,6415714;6415734;6415379;6415621,697;521,2-exons,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2668512,133400548:133400572:133404852:133404855:None:None,0,SWASSCQQL,,MSERRVVVDLPTSASSSMPLQRRRASFRGPRSSSSLESPPASRTNA...,ENSG00000170819.5:1_3:0:133400083:2-exons,0,True,ENSG00000170819.5,chr3,...,133400083;133400572;133404852;133404873,133400073;133400572;133404852;133404874,1;3,2-exons,,,,,,
2668513,133400551:133400572:133404852:133404858:None:None,2,WASSCQQLC,,MSERRVVVDLPTSASSSMPLQRRRASFRGPRSSSSLESPPASRTNA...,ENSG00000170819.5:1_3:0:133400083:2-exons,0,True,ENSG00000170819.5,chr3,...,133400083;133400572;133404852;133404873,133400073;133400572;133404852;133404874,1;3,2-exons,,,,,,
2668514,66592439:66592463:66581963:66581966:None:None,0,VRRKRIPSV,,QPFIDEDPDKEKKIKELEMLLMSAENEVRRKRIPSV,ENSG00000185697.16:32_30:0:66592544:2-exons,0,False,ENSG00000185697.16,chr8,...,66592439;66592544;66581768;66581966,66592439;66592544;66581766;66581966,32;30,2-exons,,,,,,
2668515,12718516:12718529:12718600:12718614:None:None,0,YKSKPFISA,,YSLPSRKLVALQLRSIFIKYKSKPFISA,ENSG00000128789.21:66_86_91:0:12712703:3-exons...,0,False,ENSG00000128789.21,chr18,...,12712703;12712760;12718516;12718529;12718600;1...,12712701;12712760;12718516;12718529;12718600;1...,66;86;91,3-exons_9-mer,,,,12718529:12718600,12718529:12718600,12712760:12718516
