In [1]:
import os 
import pandas as pd
from collections import defaultdict
import numpy as np 
import glob

In [2]:
samples = ['TCGA-BH-A18V-01A-11R-A12D-07.all', 
        'TCGA-C8-A12P-01A-11R-A115-07.all', 
        'TCGA-AO-A0JM-01A-21R-A056-07.all', 
        'TCGA-A2-A0SX-01A-12R-A084-07.all',
        'TCGA-A2-A0D2-01A-21R-A034-07.all']

In [3]:
# Select the sample to process by its position in `samples`.
sample = samples[4]
# Label of the background-filter run; it is interpolated into folder and file names below.
background = 'commit_d4aee54_GTEXcore'
print(sample)

TCGA-A2-A0D2-01A-21R-A034-07.all


In [4]:
basefolder = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102'

In [5]:
experiment_folder = os.path.join(basefolder, 'filter_{}/{}'.format(sample, background))

In [6]:
experiment_folder

'/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-A2-A0D2-01A-21R-A034-07.all/commit_d4aee54_GTEXcore'

In [7]:
# # Metadata file output of ImmunoPepper
# file_biexon_generated = os.path.join(basefolder, 'cohort_mutNone', 'meta_peptide_pooled_pq.tsv')

In [8]:
# file_longlist  = os.path.join(experiment_folder, 
#                          '{}_{}_{}'.format(sample, 
#                                       background.replace('commit_', ''),
#                                       'kmer_longlist.tsv'))

In [9]:
# file_meta = os.path.join(experiment_folder, 
#                          '{}_{}_{}'.format(sample, 
#                                       background.replace('commit_', ''),
#                                       'kmer_peptides_raw333.tsv'))


# Previous steps


### Step 1: 


Get a list of kmers passing the filtering.
This can also be a kmer "longlist", i.e. a pool of kmers from multiple filter experiments. The peptide-to-experiment correspondence will then be stored in a table.


### Step 2

Given a list of kmers, grep the corresponding metadata information from the ImmunoPepper metadata file (very big, no kmer information) 

See /GitHub/projects2020_ohsu/eth/peptide_search_format/p20220115_format_peptides_longlist_prior.sh

### Step 3. 



This notebook performs the following tasks. Given a kmer_metadata file, it generates:

1. A fasta file with metadata relevant for the tryptic digestion.
2. A table containing the peptide IDs per sample.


# Preprocess the annotation 
### Input: Annotation

In [10]:
ann_path = '/cluster/work/grlab/projects/projects2020_OHSU/annotation/gencode.v32.annotation.gtf'

In [11]:
from immunopepper.preprocess import attribute_item_to_dict
from immunopepper.preprocess import leq_strand

In [12]:
# Partial copy From Immunopepper preprocess.py 
# Parses the annotation at `ann_path` into lookup tables keyed by transcript / gene id.
transcript_to_gene_dict = {}    # transcript -> gene id


gene_to_transcript_dict = {}    # gene_id -> list of transcripts
gene_cds_begin_dict = {}        # gene -> list of first CDS exons

transcript_to_cds_dict = {}     # transcript -> list of CDS exons
transcript_cds_begin_dict = {}  # transcript -> first exon of the CDS
transcript_to_strand = {}       # transcript -> strand symbol read from its CDS lines

file_type = ann_path.split('.')[-1]
chromesome_set = set()
# collect information from annotation file
# (context manager so the handle is closed even if parsing raises)
with open(ann_path, 'r') as ann_handle:
    for line in ann_handle:
        # skip header/comment lines and blank lines (a blank line has no feature column)
        if not line.strip() or line[0] == '#':
            continue
        item = line.strip().split('\t')
        chromesome_set.add(item[0])
        feature_type = item[2]
        attribute_item = item[-1]
        attribute_dict = attribute_item_to_dict(attribute_item, file_type, feature_type)
        # store relationship between gene ID and its transcript IDs
        if feature_type in ['transcript', 'mRNA']:
            gene_id = attribute_dict['gene_id']
            transcript_id = attribute_dict['transcript_id']
            # only protein-coding transcripts of protein-coding genes are kept
            if attribute_dict['gene_type'] != 'protein_coding' or attribute_dict['transcript_type']  != 'protein_coding':
                continue
            assert (transcript_id not in transcript_to_gene_dict)
            transcript_to_gene_dict[transcript_id] = gene_id
            # append without ever resetting an existing list (the previous if/else
            # replaced the list whenever its first condition was false)
            transcripts_of_gene = gene_to_transcript_dict.setdefault(gene_id, [])
            if transcript_id not in transcripts_of_gene:
                transcripts_of_gene.append(transcript_id)
            # Todo python is 0-based while gene annotation file(.gtf, .vcf, .maf) is one based
        elif feature_type == "CDS":
            parent_ts = attribute_dict['transcript_id']
            strand_mode = item[6]
            cds_left = int(item[3])-1   # shift the left coordinate by one (see Todo above)
            cds_right = int(item[4])
            frameshift = int(item[7])
            transcript_to_strand[parent_ts] = strand_mode
            transcript_to_cds_dict.setdefault(parent_ts, []).append((cds_left, cds_right, frameshift))
            if strand_mode == "+" :
                cds_start, cds_stop = cds_left, cds_right
            else:
                # any non-'+' strand: start/stop are swapped
                cds_start, cds_stop = cds_right, cds_left

            # we only consider the start of the whole CoDing Segment
            if parent_ts not in transcript_cds_begin_dict or \
               leq_strand(cds_start, transcript_cds_begin_dict[parent_ts][0], strand_mode):
                transcript_cds_begin_dict[parent_ts] = (cds_start, cds_stop, item)

# collect first CDS exons for all transcripts of a gene
for ts_key, target_gene in transcript_to_gene_dict.items():
    # every retained gene gets an entry, even when none of its transcripts has a CDS
    first_cds_of_gene = gene_cds_begin_dict.setdefault(target_gene, [])
    if ts_key in transcript_cds_begin_dict:
        first_cds_of_gene.append(transcript_cds_begin_dict[ts_key])


In [13]:
# Custom collection of CDS 
# For every transcript, keep the first and the last CDS tuple stored in
# transcript_to_cds_dict as (start, stop, strand); for non-'+' strands the two
# coordinates of each tuple are swapped.
transcript_cds_begin_dict_bis = {}
transcript_cds_end_dict_bis = {}

gene_cds_begin_dict_bis = defaultdict(list)
gene_cds_end_dict_bis = defaultdict(list)

# will be in reading order 
# (assumes the CDS lines of a transcript were appended in reading order; the
#  asserts below cross-check the first exon against transcript_cds_begin_dict)
for ts_key, cds_exons in transcript_to_cds_dict.items():
    first_cds, last_cds = cds_exons[0], cds_exons[-1]
    if transcript_to_strand[ts_key] == '+': # '+'
        cds_begin = (first_cds[0], first_cds[1], '+')
        cds_end = (last_cds[0], last_cds[1], '+')
    else: 
        cds_begin = (first_cds[1], first_cds[0], '-')
        cds_end = (last_cds[1], last_cds[0], '-')
    transcript_cds_begin_dict_bis[ts_key] = cds_begin
    transcript_cds_end_dict_bis[ts_key] = cds_end

    assert(cds_begin[0] == transcript_cds_begin_dict[ts_key][0])
    assert(cds_begin[1] == transcript_cds_begin_dict[ts_key][1])

# collect first, last CDS exons for all transcripts of a gene
for ts_key, target_gene in transcript_to_gene_dict.items():
    # touch both entries so every retained gene is present, even without any CDS
    begins_of_gene = gene_cds_begin_dict_bis[target_gene]
    ends_of_gene = gene_cds_end_dict_bis[target_gene]
    # a retained transcript without CDS lines has nothing to contribute
    # (previously this raised KeyError)
    if ts_key not in transcript_cds_begin_dict_bis:
        continue
    begins_of_gene.append(transcript_cds_begin_dict_bis[ts_key])
    ends_of_gene.append(transcript_cds_end_dict_bis[ts_key])

# Step 3. Get the experiment / kmer map

## Step a. Get the experiment IDS
### Input: J_experiment_map file

In [14]:
# Every entry of the experiment folder whose name matches '*Uniprot*tsv'.
experiement_files = glob.glob(experiment_folder + '/*Uniprot*tsv')

In [15]:
folder_OHSU = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/Aug21_graph_data_updatedfilters'

In [16]:
exp_map = pd.read_csv(os.path.join(folder_OHSU, 'J_experiment_map.tsv'), sep = '\t')

In [17]:
exp_map.head()

Unnamed: 0,id,motif_filter,min_sample_reads,#_of_cohort_samples,reads_per_cohort_sample,normal_cohort_id,#_normal_samples_allowed,reads_per_normal_sample
0,1,1,0,0,0,paired,0,0
1,2,1,0,0,0,paired,2,3
2,3,1,0,0,0,paired,2,10
3,4,1,0,0,0,paired,2,-1
4,5,1,0,0,0,paired,10,3


In [18]:
def path_to_condition(all_paths):
    '''Converts ETH Names into experimental conditions.

    Each condition is the piece of the file's basename found between two
    fixed markers; values are returned as the raw strings found in the name
    (including the literal 'None').

    all_paths: list of paths of filter-experiment outputs.
    Returns a DataFrame with one row per path and the columns
    min_sample_reads, reads_per_cohort_sample, #_of_cohort_samples,
    reads_per_normal_sample, #_normal_samples_allowed, normal_cohort_id,
    original_name (the untouched input path).
    '''
    def _between(name, opening, closing, occurrence=1):
        # text after the `occurrence`-th `opening` marker, up to the next `closing`
        return name.split(opening)[occurrence].split(closing)[0]

    names = [os.path.basename(one_path) for one_path in all_paths]

    legend_quant = pd.DataFrame({
        'min_sample_reads':
            [_between(name, 'ref_SampleLim', 'Cohort') for name in names],
        'reads_per_cohort_sample':
            [_between(name, 'CohortLim', 'Across') for name in names],
        '#_of_cohort_samples':
            [_between(name, 'Across', '_Filt') for name in names],
        'reads_per_normal_sample':
            [_between(name, 'Cohortlim', 'Across') for name in names],
        # second 'Across' in the name belongs to the normal (background) cohort
        '#_normal_samples_allowed':
            [_between(name, 'Across', '_FiltUn', occurrence=2) for name in names],
        'normal_cohort_id':
            [_between(name, 'Normals', 'lim') for name in names],
        'original_name': all_paths})

    return legend_quant

In [19]:
experiement_files

['/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-A2-A0D2-01A-21R-A034-07.all/commit_d4aee54_GTEXcore/G_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_SampleLim2.0CohortLim5.0Across10_FiltNormalsGtexcoreCohortlim0.0Across0_FiltUniprot.tsv',
 '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-A2-A0D2-01A-21R-A034-07.all/commit_d4aee54_GTEXcore/G_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_SampleLim0.0CohortLim0.0Across2_FiltNormalsGtexcoreCohortlimNoneAcross2_FiltUniprot.tsv',
 '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-A2-A0D2-01A-21R-A034-07.all/commit_d4aee54_GTEXcore/G_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_SampleLim0.0CohortLim5.0Across1_FiltNormalsGtexcoreCohortlim3.0Across2_FiltUniprot.tsv',
 '/cluster/

In [20]:
path_to_table = path_to_condition(experiement_files)


In [21]:
# Numeric stand-in written wherever a filename carried the literal 'None':
# foreground thresholds become 0, background (normal) thresholds become -1.
none_stand_ins = {
    'min_sample_reads': 0,
    'reads_per_cohort_sample': 0,
    '#_of_cohort_samples': 0,
    'reads_per_normal_sample': -1,
    '#_normal_samples_allowed': -1,
}
for column_name, stand_in in none_stand_ins.items():
    is_none = path_to_table[column_name] == 'None'
    path_to_table.loc[is_none, column_name] = stand_in

In [22]:
path_to_table.head()

Unnamed: 0,min_sample_reads,reads_per_cohort_sample,#_of_cohort_samples,reads_per_normal_sample,#_normal_samples_allowed,normal_cohort_id,original_name
0,2.0,5.0,10,0.0,0,GtexcoreCohort,/cluster/work/grlab/projects/projects2020_OHSU...
1,0.0,0.0,2,-1.0,2,GtexcoreCohort,/cluster/work/grlab/projects/projects2020_OHSU...
2,0.0,5.0,1,3.0,2,GtexcoreCohort,/cluster/work/grlab/projects/projects2020_OHSU...
3,0.0,1.0,10,-1.0,2,GtexcoreCohort,/cluster/work/grlab/projects/projects2020_OHSU...
4,0.0,1.0,10,0.0,0,GtexcoreCohort,/cluster/work/grlab/projects/projects2020_OHSU...


In [23]:
# Rewrite the 'core_GTEx' cohort label of exp_map as 'GtexcoreCohort', the label
# path_to_table carries in normal_cohort_id, so both tables can be merged on that column.
exp_map.loc[exp_map['normal_cohort_id'] == 'core_GTEx', 'normal_cohort_id'] = 'GtexcoreCohort'

In [24]:
# All filename-derived conditions are tagged motif_filter = 0; the column is one of the
# keys of the merge against exp_map.
path_to_table['motif_filter'] = 0 

In [25]:
# Give every condition column of path_to_table the dtype of its exp_map counterpart.
for col in path_to_table.columns: 
    if col == 'original_name':
        # the path column has no counterpart in exp_map
        continue
    wanted_dtype = exp_map[col].dtypes
    if wanted_dtype == path_to_table[col].dtypes:
        continue
    if wanted_dtype == 'int64':
        # go through float64 first, then down to the integer dtype
        path_to_table[col] = path_to_table[col].astype( 'float64' )
    path_to_table[col] = path_to_table[col].astype( wanted_dtype ) 
#         print('\n')

In [26]:
exp_map.columns

Index(['id', 'motif_filter', 'min_sample_reads', '#_of_cohort_samples',
       'reads_per_cohort_sample', 'normal_cohort_id',
       '#_normal_samples_allowed', 'reads_per_normal_sample'],
      dtype='object')

In [27]:
path_to_table.columns

Index(['min_sample_reads', 'reads_per_cohort_sample', '#_of_cohort_samples',
       'reads_per_normal_sample', '#_normal_samples_allowed',
       'normal_cohort_id', 'original_name', 'motif_filter'],
      dtype='object')

In [28]:
# Attach the experiment `id` of exp_map to every filename-derived condition.
# Inner join: a condition with no exact match on all seven keys is dropped from id_mapped.
id_mapped = path_to_table.merge(exp_map, on = ['min_sample_reads', 'reads_per_cohort_sample', \
                                   '#_of_cohort_samples', 'reads_per_normal_sample', \
                                   '#_normal_samples_allowed',
                                   'normal_cohort_id', 'motif_filter'], how = 'inner')

In [29]:
id_mapped['filename'] = [os.path.basename(name) for name in id_mapped['original_name']]

In [30]:
id_mapped.shape

(121, 10)

In [31]:
id_mapped.head()

Unnamed: 0,min_sample_reads,reads_per_cohort_sample,#_of_cohort_samples,reads_per_normal_sample,#_normal_samples_allowed,normal_cohort_id,original_name,motif_filter,id,filename
0,2,5,10,0,0,GtexcoreCohort,/cluster/work/grlab/projects/projects2020_OHSU...,0,475,G_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_SampleL...
1,0,0,2,-1,2,GtexcoreCohort,/cluster/work/grlab/projects/projects2020_OHSU...,0,388,G_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_SampleL...
2,0,5,1,3,2,GtexcoreCohort,/cluster/work/grlab/projects/projects2020_OHSU...,0,380,G_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_SampleL...
3,0,1,10,-1,2,GtexcoreCohort,/cluster/work/grlab/projects/projects2020_OHSU...,0,412,G_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_SampleL...
4,0,1,10,0,0,GtexcoreCohort,/cluster/work/grlab/projects/projects2020_OHSU...,0,409,G_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_SampleL...


In [32]:
# path_to_table.loc[(path_to_table['min_sample_reads'] == 0) & \
#             (path_to_table['reads_per_cohort_sample'] == 0) & \
#            (path_to_table['#_of_cohort_samples'] == 2) & \
#            (path_to_table['reads_per_normal_sample'] == -1) & \
#            (path_to_table['#_normal_samples_allowed'] == 2) & \
#            (path_to_table['normal_cohort_id'] == 'GtexcoreCohort') ]

## Step b. Get the longlist of peptides
### Input: Peptide files from ETH filtering

In [33]:
# Changed to support file that are the result of parameters not included 
# kmer -> list of experiment ids (as strings) whose filter output contains the kmer
kmer_file_idx = defaultdict(list) 
for eth_path0 in experiement_files:
    # each experiment entry is expected to be a folder holding one or more '*part*' files;
    # sorted so the processing order does not depend on the file system
    part_files = sorted(glob.glob(eth_path0 + '/*part*'))
    if not part_files:
        print(eth_path0, 'not found')
        continue

    base_name = os.path.basename(eth_path0)
    idx_exp = id_mapped.loc[id_mapped['filename'] == base_name, 'id']
    if not len(idx_exp):
        # experiment built from parameters absent from the experiment map: ignored on
        # purpose; checked before reading so no file is parsed for nothing
        continue
    idx_exp = str(idx_exp.values[0])

    # pool the kmers of ALL part files (previously only the first glob hit was read)
    kmers_of_experiment = set()
    for part_file in part_files:
        try:
            part_kmers = pd.read_csv(part_file, sep="\t", usecols = ['kmer'])
        except pd.errors.EmptyDataError:
            # a zero-byte part file holds no kmer
            continue
        kmers_of_experiment.update(part_kmers['kmer'])

    for kmer in kmers_of_experiment:
        kmer_file_idx[kmer].append(idx_exp)

/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-A2-A0D2-01A-21R-A034-07.all/commit_d4aee54_GTEXcore/G_TCGA-A2-A0D2-01A-21R-A034-07.all_ref_SampleLim2.0CohortLim0.0Across1_FiltNormalsGtexcoreCohortlim0.0Across0_FiltUniprot_FiltMHC.tsv not found


# Step 4. Format the peptide raw file  
### Input:  file meta with kmers and bi-exons matching the longlist of the samples 

In [34]:
# Input metadata table: <experiment_folder>/<sample>_<background without 'commit_'>_kmer_peptides_raw_flag.tsv
file_meta = os.path.join(experiment_folder,
                         '{}_{}_{}'.format(sample, 
                                      background.replace('commit_', ''),
                                     'kmer_peptides_raw_flag.tsv'))

In [35]:
# Output fasta; the sample name is cut at its first '.' before being used in the file name.
file_save = os.path.join(experiment_folder,
                         'G_{}_{}'.format(sample.split('.')[0], 
                                      'query_peptides_kmers_flag_jx.fa'))

In [36]:
# Output table of experiment ids per peptide; same naming scheme as the fasta output.
file_save_experiement= os.path.join(experiment_folder,
                         'G_{}_{}'.format(sample.split('.')[0], 
                                      'experiments_per_peptide_flag.tsv'))

In [37]:
fields_meta_peptide_dict = ['peptide', 'id', 'readFrame', 'geneName', 'geneChr', 'geneStrand',
                                'mutationMode',
                                'junctionAnnotated', 'hasStopCodon', 'isInJunctionList',
                                'isIsolated', 'variantComb', 'variantSegExpr', 'modifiedExonsCoord',
                                'originalExonsCoord', 'vertexIdx',
                                'kmerType']



In [38]:
def extract_peptide_fields(pep):
    '''Split one comma-separated peptide record and pick the fields used downstream.

    Column indices read from the record:
        0  peptide               3  readFrameAnnotated    4  geneName
        6  geneStrand            8  junctionAnnotated     9  hasStopCodon
        14 modifiedExonsCoord    15 originalExonsCoord    last: kmerType

    Returns, in this order: the full list of columns, peptide sequence, gene,
    original coordinates (split on '/'), modified coordinates (split on '/'),
    strand, hasStopCodon, readFrameAnnotated, junctionAnnotated and the
    kmerType with every '-' and newline removed.
    '''
    columns = pep.split(',')

    original_coord_sets = columns[15].split('/')
    modified_coord_sets = columns[14].split('/')
    # last column: drop dashes and the trailing newline
    origin_label = columns[-1].replace('-', '').replace('\n', '')

    return (columns, columns[0], columns[4], original_coord_sets, modified_coord_sets,
            columns[6], columns[9], columns[3], columns[8], origin_label)

In [39]:
def extract_end_starts(pep_orig_coord, strand=None):
    '''Get peptide start and end coordinates.

    pep_orig_coord: list of coordinates (strings or ints) of one coordinate set.
    strand: '+' or any other value for the reverse case. When omitted, the
        module-level variable `strand` is used, which is what existing callers
        rely on.
    Returns (pep_start, pep_end) as Python ints:
        '+'      -> first and last element of the list
        otherwise -> second and second-to-last element of the list
    '''
    if strand is None:
        # legacy call style: the strand of the current peptide lives in a global
        strand = globals()['strand']

    # builtin int instead of np.int (alias removed from NumPy; same result)
    if strand == '+':
        pep_start = int(pep_orig_coord[0])
        pep_end = int(pep_orig_coord[-1])
    else: 
        pep_start = int(pep_orig_coord[1])
        pep_end = int(pep_orig_coord[-2])
    return pep_start, pep_end

In [40]:
def get_include_flag(start_cds, end_cds, pep_start, pep_end, has_stop_codon ):
    '''Derive the 5' / 3' include flags from the peptide start and end coordinates.

    pep_5include is 1 when pep_start is a member of start_cds, else 0
    (starts that are not in start_cds are therefore never flagged).
    pep_3include is 1 when pep_end is a member of end_cds or when
    has_stop_codon equals the string '1', else 0.
    Returns (pep_5include, pep_3include).
    '''
    starts_at_cds = pep_start in start_cds
    ends_at_cds_or_stop = (pep_end in end_cds) or (has_stop_codon == '1')
    return (1 if starts_at_cds else 0), (1 if ends_at_cds_or_stop else 0)

In [41]:
def get_nt_len_with_aa_shift(pep_modi_coord):
    '''Get nt length of each exon involved -> jx_list, shift_list, jx_list_ori.

    pep_modi_coord is read as consecutive (left, right) pairs. For each pair:
      * jx_list_ori gets right - left,
      * the leftover of the previous pair is added, the new leftover
        (length % 3) goes to shift_list,
      * jx_list gets that length minus its leftover (a multiple of 3).
    '''
    codon_aligned_lengths = []
    leftovers = []
    raw_lengths = []

    carried_nt = 0
    for left in range(0, len(pep_modi_coord), 2):
        raw_length = int(pep_modi_coord[left + 1]) - int(pep_modi_coord[left])  # 0 based, open right
        raw_lengths.append(raw_length)

        length_with_carry = raw_length + carried_nt
        carried_nt = length_with_carry % 3
        codon_aligned_lengths.append(length_with_carry - carried_nt)
        leftovers.append(carried_nt)

    return codon_aligned_lengths, leftovers, raw_lengths

In [42]:
def get_aaPos_betweenFlag(shift_list, jx_list):
    '''Get the amino-acid position of each junction and its "between codons" flag.

    A junction position is the 0-based index, within the peptide, of the amino
    acid that overlaps the junction (junction inside a codon, flag 0) or of the
    amino acid immediately before it (junction between two codons, flag 1).

    Returns (pos0, between0, pos1, between1, pos1_from_start); the last three
    are None unless shift_list has more than two entries. pos1 is relative to
    the second entry of jx_list, pos1_from_start = pos1 + pos0 + 1.
    '''
    def _junction_aa(leftover_nt, aligned_nt_len):
        if leftover_nt:
            # junction is inside an amino acid
            return int((aligned_nt_len / 3)), 0
        # junction is between amino acids; -1 because 0 based
        return int((aligned_nt_len / 3) - 1), 1

    aa_junction_pos0, between_codons0 = _junction_aa(shift_list[0], jx_list[0])

    if len(shift_list) <= 2:
        # no third exon, hence no second junction
        return aa_junction_pos0, between_codons0, None, None, None

    aa_junction_pos1, between_codons1 = _junction_aa(shift_list[1], jx_list[1])
    # NOTE(review): when the first junction is inside a codon, jx_list[1] looks like it
    # already counts that shared codon, so the "+ 1" may land one amino acid too far.
    # Confirm with a worked example before relying on it; cut_peptides asserts this
    # same formula, so both must change together.
    aa_junction_pos1_from_start = aa_junction_pos1 + aa_junction_pos0 + 1

    return aa_junction_pos0, between_codons0, aa_junction_pos1, between_codons1, \
           aa_junction_pos1_from_start

In [43]:
def get_genomic_coordinates(pep_modi_coord, strand=None):
    '''Build the 'left_right' label of the first and, if present, second junction.

    We have in + case: exon1_start, exon 1_stop, exon2_start, exon2_stop, exon3_start, exon3_stop
    In the - case: exon1_stop, exon 1_start, exon 2_stop, exon2_start, exon3_stop, exon3_start

    Elements used, by index:
        '+'       -> junction 1: [1]_[2], junction 2: [3]_[4]
        otherwise -> junction 1: [0]_[3], junction 2: [2]_[5]

    pep_modi_coord: list of coordinates of one coordinate set.
    strand: '+' or any other value for the reverse case. When omitted, the
        module-level variable `strand` is used, which is what existing callers
        rely on.
    Returns (genome_junction_pos0, genome_junction_pos1); the second is None
    when the list has four elements or fewer.
    '''
    if strand is None:
        # legacy call style: the strand of the current peptide lives in a global
        strand = globals()['strand']

    if strand == '+':
        first_jx, second_jx = (1, 2), (3, 4)
    else:
        first_jx, second_jx = (0, 3), (2, 5)

    genome_junction_pos0 = '{}_{}'.format(pep_modi_coord[first_jx[0]],
                                          pep_modi_coord[first_jx[1]])
    genome_junction_pos1 = None
    if len(pep_modi_coord) > 4:
        genome_junction_pos1 = '{}_{}'.format(pep_modi_coord[second_jx[0]],
                                              pep_modi_coord[second_jx[1]])
    return genome_junction_pos0, genome_junction_pos1

In [44]:
def split_coord(pep_coord):
    '''Split a ';'-separated coordinate string, dropping the 'None' and 'nan' entries.'''
    placeholders = ('None', 'nan')
    return [token for token in pep_coord.split(';') if token not in placeholders]

In [45]:
def preprocess_line(line):
    '''Separate the kmer from the peptide records found on one metadata line.

    The space following a '3-exons_9-mer' or '2-exons' tag is turned into '@',
    which is then used as the record separator. The kmer is the text before
    the first comma; everything after it is split on '@'.
    Returns (line with the '@' separators, kmer, list of peptide records).
    '''
    for tag in ('3-exons_9-mer', '2-exons'):
        line = line.replace(tag + ' ', tag + '@')
    kmer, _, records = line.partition(',')
    return line, kmer, records.split('@')

In [46]:
def write_peptide_to_experiment(filepointer, pep_idx=None, pep_seq=None,\
                                idx=None, header=False):
    '''Write one line of the peptide -> experiments table to `filepointer`.

    With header=True only the column header is written, whatever the other
    arguments are. Otherwise a tab-separated row (pep_idx, pep_seq, idx) is
    written, unless pep_idx is None, in which case nothing is written.
    '''
    if header:
        filepointer.write('peptide_id\tpeptide_sequence\texperiment_ids\n')
        return
    if pep_idx is None:
        return
    filepointer.write('{}\t{}\t{}\n'.format(pep_idx, pep_seq, idx))


In [47]:
def write_fasta(filepointer, pep_idx, aa_junction_pos, 
                aa_junction_pos1_from_start, between_codons, between_codons1,
                pep_5include, pep_3include, pep_gene, 
                genome_junction_pos, genome_junction_pos1, 
                kmer, jx_pep1, jx_pep2, readFrameAnnotated, \
                junctionAnnotated, kmer_type, do_write=True, pep_seq=None):
    '''Write the fasta entry (or entries) of one peptide and advance the counter.

    One header is built per junction; both carry the incoming `pep_idx` as pepID:
      * kmer found in jx_pep1            -> first-junction entry only
      * else kmer found in jx_pep2       -> second-junction entry only
      * kmer found in neither            -> both entries
    Each entry is the header line followed by the peptide sequence.

    filepointer: writable text handle receiving the entries (the global `sp`
        is no longer used).
    do_write: when False nothing is written, but the counter still advances.
    pep_seq: peptide sequence; when omitted the module-level variable
        `pep_seq` is used, which is what existing callers rely on.
    Returns pep_idx increased by the number of entries selected.
    '''
    entry_id = pep_idx
    header_template = ('>pepID-{};jx_pos-{};between_codons-{};includes_5\'-{};'
                       'includes_3\'-{};gene-{};jx_coord-{};kmer-{};'
                       'readFrameAnnotated-{};junctionAnnotated-{};origin-{}')

    def _header(jx_pos, is_between, jx_coord):
        # only the junction-specific fields differ between the two headers
        return header_template.format(entry_id, jx_pos, is_between, pep_5include,
                                      pep_3include, pep_gene, jx_coord, kmer,
                                      readFrameAnnotated, junctionAnnotated, kmer_type)

    first_header = _header(aa_junction_pos, between_codons, genome_junction_pos)
    second_header = _header(aa_junction_pos1_from_start, between_codons1,
                            genome_junction_pos1)

    if kmer in jx_pep1:
        headers = [first_header]
    elif kmer in jx_pep2:
        headers = [second_header]
    else:
        headers = [first_header, second_header]

    if do_write:
        if pep_seq is None:
            # legacy call style: the current peptide sequence lives in a global
            pep_seq = globals()['pep_seq']
        for header in headers:
            filepointer.write(header + '\n')
            filepointer.write(pep_seq + '\n')

    return pep_idx + len(headers)

In [48]:
def cut_peptides(jx_list, between_codons, between_codons1, aa_junction_pos, 
                 aa_junction_pos1, aa_junction_pos1_from_start, 
                 print_ = False, pep_seq=None):
    '''Cut the peptide at its junction(s) and return the two junction peptides.

    The peptide is split into exon1 / aa_jx1 / exon2 / aa_jx2 / exon3, where
    aa_jxN is the single amino acid overlapping junction N (empty when the
    junction falls between two codons, i.e. its between_codons flag is truthy).

    jx_list: one entry per exon; only lengths 2 and 3 are cut, any other
        length leaves every piece empty.
    aa_junction_pos: 0-based amino-acid position of the first junction.
    aa_junction_pos1: position of the second junction relative to the first.
    aa_junction_pos1_from_start: must equal aa_junction_pos1 + aa_junction_pos + 1
        (asserted for three exons).
    print_: when True, print the pieces and junction positions.
    pep_seq: peptide sequence; when omitted the module-level variable
        `pep_seq` is used, which is what existing callers rely on.
    Returns (exon1 + aa_jx1 + exon2, exon2 + aa_jx2 + exon3).
    '''
    if pep_seq is None:
        # legacy call style: the current peptide sequence lives in a global
        pep_seq = globals()['pep_seq']

    after_jx1 = aa_junction_pos + 1
    exon1, aa_jx1, exon2, aa_jx2, exon3 = '', '', '', '', ''
    n_exons = len(jx_list)

    if n_exons == 3:
        jx2 = aa_junction_pos1 + aa_junction_pos + 1
        after_jx2 = jx2 + 1
        assert(aa_junction_pos1_from_start == jx2)

    if n_exons in (2, 3):
        # first junction: identical handling for two and three exons
        if between_codons:
            exon1 = pep_seq[:after_jx1]
        else:
            exon1 = pep_seq[:aa_junction_pos]
            aa_jx1 = pep_seq[aa_junction_pos:after_jx1]

    if n_exons == 2:
        exon2 = pep_seq[after_jx1:]
    elif n_exons == 3:
        if between_codons1:
            # the amino acid at jx2 is the last one of exon2
            exon2 = pep_seq[after_jx1:after_jx2]
        else:
            # the amino acid at jx2 overlaps the junction
            exon2 = pep_seq[after_jx1:jx2]
            aa_jx2 = pep_seq[jx2:after_jx2]
        exon3 = pep_seq[after_jx2:]

    if print_:
        print(f'exon1:{exon1}, aa_containing_jx1:{aa_jx1}, exon2:{exon2}, aa_containing_jx2:{aa_jx2}, exon3:{exon3}')
        print(f'junction positions jx1: {aa_junction_pos}, jx2:{aa_junction_pos1_from_start}')
        print(f'is junction between a codon jx1: {between_codons}, jx2: {between_codons1}')
        print('\n')
    return exon1 + aa_jx1 + exon2, exon2 + aa_jx2 + exon3 

In [49]:
def print_stats(print_, kmer, pep_seq, strand, pep_orig_coord, pep_modi_coord, jx_list,
               jx_list_ori, genome_junction_pos, genome_junction_pos1,
               aa_junction_pos, aa_junction_pos1, between_codons, between_codons1):
    '''Debug helper: when print_ is truthy, print a summary of one peptide instance.

    The summary holds the kmer, sequence, strand, both coordinate lists
    (joined with ';'), both junction lists, both junction coordinate labels
    and the peptide length. The four trailing junction-position arguments are
    accepted but not printed.
    '''
    if not print_:
        return

    p_ori_coord = ';'.join(pep_orig_coord)
    p_modif_coord = ';'.join(pep_modi_coord)
    print(f'INSTANCE: \n kmer {kmer}/ sequence {pep_seq}/ strand {strand} / \n original coordinates {p_ori_coord} / \n modif coordinates {p_modif_coord} /  \n junction list origin {jx_list_ori}/ junction list {jx_list} / \n junction coordinates 1 {genome_junction_pos} / junction coordinates 2 {genome_junction_pos1}')
    print('peptide length', len(pep_seq))


In [50]:
# Note: corrected the inversion of coordinates, extracted the correct matching coordinate position for 3-exon peptides, and removed the `has_stop_codon == 0` break.

In [51]:

## Format and create flags 
# Reads the kmer/peptide metadata table line by line and writes
#   * one fasta entry (or two) per peptide coordinate set, with its junction flags,
#   * one row per peptide coordinate set in the peptide -> experiments table.
pep_idx = 0 
kmer_len = 9 

print_ = False
test_query = 'VGEHRQRRA'
subset_run_test = False
write_= True

# NOTE: helpers defined earlier in this notebook (e.g. extract_end_starts, cut_peptides)
# may read `strand`, `pep_seq` and `sp` from the notebook's global namespace, so these
# three names must not be renamed here.
# The input is opened first so that no output file is created when it is missing.
with open(file_meta, 'r') as fp, \
     open(file_save_experiement, 'w') as ep, \
     open(file_save, 'w') as sp:
    # fp: metadata from immunopepper collected previously
    # ep: experiment file to save / sp: fasta file to save
    print("open {}".format(file_meta))
    write_peptide_to_experiment(ep, header=True)

    ### Iterate over kmers (streamed, the file is never fully loaded in memory)
    for line in fp:
        if 'modifiedExonsCoord' in line: # Skip header
            continue

        # --- TEST ---
        if subset_run_test and (test_query in line):
            print(f'\n METADATA (ImmunoPepper derived): \n {line}')
        # --- END TEST ---

        line, kmer, peptides = preprocess_line(line)

        #Iterate over kmer-associated peptides
        for pep in peptides:
            pep, pep_seq, pep_gene, pep_orig_coordS, pep_modif_coordS, \
            strand, has_stop_codon, readFrameAnnotated, \
            junctionAnnotated, kmer_type = extract_peptide_fields(pep)

            # CDS boundaries of the gene only depend on the peptide, not on the
            # coordinate set: build them once per peptide
            start_cds = [ first_exon[0] for first_exon in 
                         gene_cds_begin_dict_bis[pep_gene] ] 
            end_cds = [ last_exon[1] for last_exon in 
                       gene_cds_end_dict_bis[pep_gene] ] 

            # Iterate over original genomic coordinates : get include 3' or 5' flag
            # NOTE(review): the flags are overwritten at every iteration, so only those
            # of the last original coordinate set reach the fasta headers below.
            for pep_orig_coord in pep_orig_coordS:
                pep_orig_coord = split_coord(pep_orig_coord)
                pep_start, pep_end = extract_end_starts(pep_orig_coord)
                pep_5include, pep_3include = get_include_flag(start_cds, end_cds, 
                                                              pep_start, pep_end, 
                                                              has_stop_codon )

            # Iterate over modified genomic coordinates: get 
            # "junction position" flag and "between codon" flag
            #"and genomic coordinates"
            for pep_modi_coord in pep_modif_coordS:
                pep_modi_coord = split_coord(pep_modi_coord)
                if len(pep_modi_coord) <=2:
                    # a single exon has no junction
                    continue
                jx_list, shift_list, \
                jx_list_ori = get_nt_len_with_aa_shift(pep_modi_coord)

                aa_junction_pos, between_codons, \
                aa_junction_pos1, between_codons1, \
                aa_junction_pos1_from_start = get_aaPos_betweenFlag(shift_list,
                                                                          jx_list)

                genome_junction_pos, \
                genome_junction_pos1 = get_genomic_coordinates(pep_modi_coord)

                # --- TEST ---
                if subset_run_test and (test_query not in pep_seq): 
                    continue
                # --- END TEST ---

                print_stats(print_, kmer, pep_seq, strand, pep_orig_coord, 
                            pep_modi_coord, jx_list, jx_list_ori, genome_junction_pos,
                            genome_junction_pos1, aa_junction_pos, aa_junction_pos1, 
                            between_codons, between_codons1)

                jx_pep1, jx_pep2 = cut_peptides(jx_list, between_codons,
                                                between_codons1, aa_junction_pos, 
                                                aa_junction_pos1, 
                                                aa_junction_pos1_from_start, 
                                                print_=print_)

                # write fasta file 
                # write_fasta stamps its header(s) with the index it RECEIVES and returns
                # the advanced counter; remember the stamped value for the experiment row
                fasta_pep_id = pep_idx
                pep_idx = write_fasta(sp, pep_idx, aa_junction_pos,
                            aa_junction_pos1_from_start, between_codons, 
                            between_codons1, pep_5include, pep_3include, pep_gene, 
                            genome_junction_pos, genome_junction_pos1, 
                            kmer, jx_pep1, jx_pep2, 
                            readFrameAnnotated, \
                            junctionAnnotated, \
                            kmer_type, \
                            do_write=write_)
                # write experiment file 
                # same id as the pepID of the fasta header(s) just written (the advanced
                # counter used before never matched any header of this peptide)
                write_peptide_to_experiment(ep, fasta_pep_id, pep_seq,';'.join(
                    kmer_file_idx[kmer])) 


if write_:
    print('written to:')
    print(file_save_experiement)
    print(file_save)


open /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-A2-A0D2-01A-21R-A034-07.all/commit_d4aee54_GTEXcore/TCGA-A2-A0D2-01A-21R-A034-07.all_d4aee54_GTEXcore_kmer_peptides_raw_flag.tsv
written to:
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-A2-A0D2-01A-21R-A034-07.all/commit_d4aee54_GTEXcore/G_TCGA-A2-A0D2-01A-21R-A034-07_experiments_per_peptide_flag.tsv
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/v2_v2.5f0752a_conf2_annotFrame_cap0_runs_pya0.17.1/TCGA_Breast_1102/filter_TCGA-A2-A0D2-01A-21R-A034-07.all/commit_d4aee54_GTEXcore/G_TCGA-A2-A0D2-01A-21R-A034-07_query_peptides_kmers_flag_jx.fa
