In [1]:
import os 
import pandas as pd
from collections import defaultdict
import numpy as np 
import glob
import gzip
import pickle

# Previous steps


### Step 1: 


Get a list of kmers passing the filtering.
This can also be a kmer "longlist", i.e. a pool of kmers from multiple filter experiments. The peptide-to-experiment correspondence will then be stored in a table.


### Step 2

Given a list of kmers, grep the corresponding metadata information from the ImmunoPepper metadata file (very big, no kmer information) 

See /GitHub/projects2020_ohsu/eth/peptide_search_format/p20220115_format_peptides_longlist_prior.sh

### Step 3. 



This notebook performs the following tasks, given a kmer_metadata file:

1. Generate a fasta file with metadata relevant for the tryptic digestion.
2. Generate a table containing the peptide IDs per sample.


# Define Samples


In [2]:
# Sample identifiers handled by this notebook (one is picked further down).
samples = [
    'TCGA-BH-A18V-01A-11R-A12D-07',
    'TCGA-C8-A12P-01A-11R-A115-07',
    'TCGA-AO-A0JM-01A-21R-A056-07',
    'TCGA-A2-A0SX-01A-12R-A084-07',
    'TCGA-A2-A0D2-01A-21R-A034-07',
]

# Root folder of the run, and the filter experiment sub-folder inside it.
basefolder = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102'
filter_folder = 'filtering_samples/filters_19May_order_5ge_wAnnot_GPstar'

# All inputs and outputs of the later cells are resolved relative to this folder.
experiment_folder = os.path.join(basefolder, filter_folder)
print(experiment_folder)

/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar


# Preprocess the annotation 
### Input: Annotation

In [3]:
# Path of the gene annotation file (GTF) that is parsed line by line below.
ann_path = '/cluster/work/grlab/projects/projects2020_OHSU/annotation/gencode.v32.annotation.gtf'

In [4]:
from immunopepper.preprocess import attribute_item_to_dict
from immunopepper.preprocess import leq_strand

In [5]:
# Partial copy From Immunopepper preprocess.py 
# One pass over the annotation file fills the lookup tables below; a second
# loop then groups the first CDS exon of every kept transcript by gene.
transcript_to_gene_dict = {}    # transcript -> gene id


gene_to_transcript_dict = {}    # gene_id -> list of transcripts
gene_cds_begin_dict = {}        # gene -> list of first CDS exons

transcript_to_cds_dict = {}     # transcript -> list of CDS exons
transcript_cds_begin_dict = {}  # transcript -> first exon of the CDS
transcript_to_strand = {}       # transcript -> strand column of its CDS records

file_type = ann_path.split('.')[-1]
chromesome_set = set()
# collect information from annotation file
# (context manager: the handle is closed even if parsing raises)
with open(ann_path, 'r') as ann_file:
    for line in ann_file:
        if line[0] == '#':
            continue
        item = line.strip().split('\t')
        chromesome_set.add(item[0])
        feature_type = item[2]
        attribute_item = item[-1]
        attribute_dict = attribute_item_to_dict(attribute_item, file_type, feature_type)
        # store relationship between gene ID and its transcript IDs
        if feature_type in ['transcript', 'mRNA']:
            gene_id = attribute_dict['gene_id']
            transcript_id = attribute_dict['transcript_id']
            # only protein-coding transcripts of protein-coding genes are kept
            if attribute_dict['gene_type'] != 'protein_coding' or attribute_dict['transcript_type']  != 'protein_coding':
                continue
            assert (transcript_id not in transcript_to_gene_dict)
            transcript_to_gene_dict[transcript_id] = gene_id
            # Append to the gene's list without ever resetting it: a repeated
            # transcript id must not wipe the transcripts collected so far.
            gene_transcripts = gene_to_transcript_dict.setdefault(gene_id, [])
            if transcript_id not in gene_transcripts:
                gene_transcripts.append(transcript_id)
            # Todo python is 0-based while gene annotation file(.gtf, .vcf, .maf) is one based
        elif feature_type == "CDS":
            parent_ts = attribute_dict['transcript_id']
            strand_mode = item[6]
            # convert the 1-based closed interval of the file to 0-based, open right
            cds_left = int(item[3])-1
            cds_right = int(item[4])
            frameshift = int(item[7])
            transcript_to_strand[parent_ts] = strand_mode
            transcript_to_cds_dict.setdefault(parent_ts, []).append((cds_left, cds_right, frameshift))
            # orient the exon so that cds_start is where reading begins on this strand
            if strand_mode == "+" :
                cds_start, cds_stop = cds_left, cds_right
            else:
                cds_start, cds_stop = cds_right, cds_left

            # we only consider the start of the whole CoDing Segment
            if parent_ts not in transcript_cds_begin_dict or \
               leq_strand(cds_start, transcript_cds_begin_dict[parent_ts][0], strand_mode):
                transcript_cds_begin_dict[parent_ts] = (cds_start, cds_stop, item)

# collect first CDS exons for all transcripts of a gene
for ts_key in transcript_to_gene_dict:
    target_gene = transcript_to_gene_dict[ts_key]
    if target_gene not in gene_cds_begin_dict:
        gene_cds_begin_dict[target_gene] = []
    # transcripts without any CDS record contribute nothing
    if ts_key in transcript_cds_begin_dict:
        gene_cds_begin_dict[target_gene].append(transcript_cds_begin_dict[ts_key])


In [6]:
# Custom collection of CDS 
# For every transcript with CDS records, keep its first and last listed CDS
# exon as (reading start, reading stop, strand), then group both by gene.
transcript_cds_begin_dict_bis = {}
transcript_cds_end_dict_bis = {}

gene_cds_begin_dict_bis = defaultdict(list)
gene_cds_end_dict_bis = defaultdict(list)

# will be in reading order 
# (the asserts below check this against the strand-aware first exon found
#  while parsing the annotation)
for ts_key, cds_exons in transcript_to_cds_dict.items():
    first_left, first_right = cds_exons[0][0], cds_exons[0][1]
    last_left, last_right = cds_exons[-1][0], cds_exons[-1][1]
    if transcript_to_strand[ts_key] == '+': # '+'
        transcript_cds_begin_dict_bis[ts_key] = (first_left, first_right, '+')
        transcript_cds_end_dict_bis[ts_key] = (last_left, last_right, '+')
    else:
        # any other strand value: coordinates are swapped so that index 0 is the reading start
        transcript_cds_begin_dict_bis[ts_key] = (first_right, first_left, '-')
        transcript_cds_end_dict_bis[ts_key] = (last_right, last_left, '-')

    assert(transcript_cds_begin_dict_bis[ts_key][0] == transcript_cds_begin_dict[ts_key][0])
    assert(transcript_cds_begin_dict_bis[ts_key][1] == transcript_cds_begin_dict[ts_key][1])

# collect first, last CDS exons for all transcripts of a gene
for ts_key, target_gene in transcript_to_gene_dict.items():
    # A kept transcript that has no CDS record has no begin/end entry; skip it
    # (same guard as the first-CDS collection in the parsing cell) instead of
    # failing with KeyError.
    if ts_key not in transcript_cds_begin_dict_bis:
        continue
    gene_cds_begin_dict_bis[target_gene].append(transcript_cds_begin_dict_bis[ts_key])
    gene_cds_end_dict_bis[target_gene].append(transcript_cds_end_dict_bis[ts_key])

# Step 4. Format the peptide raw file  
### Input:  file meta with kmers and bi-exons matching the longlist of the samples 

In [7]:
### Restrict the fasta to a whitelist
# Switch for the optional whitelist. When set to a truthy value, the name is
# rebound to the object unpickled below, and later cells test kmers for
# membership in it; when falsy, no whitelist filtering is applied.
whitelist_kmer = False  
if whitelist_kmer:
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(os.path.join(experiment_folder, 'G_TCGA_Allsamples_intersect.pickle'), 'rb') as handle:
        whitelist_kmer = pickle.load(handle)
    print(len(whitelist_kmer))

In [24]:
# Sample processed by the cells below; change the index to process another entry of `samples`.
sample = samples[4]

In [25]:
# Meta Input File
# Gzipped, tab-separated metadata of the selected sample (read by readlines_custom).
file_meta = os.path.join(experiment_folder, f'G_{sample}_grep_metadata_raw.tsv.gz')

# Experiment table: opened for writing by the main cell when not None;
# None disables that output.
file_save_experiement= None

# Fasta output File
# NOTE(review): both branches build the same path, so a whitelisted run
# overwrites the FASTA of a non-whitelisted run -- confirm whether a distinct
# file name was intended for the whitelist case.
if not whitelist_kmer:
    file_save = os.path.join(experiment_folder, f'G_{sample}_pool_kmer.fa')
else:
    file_save = os.path.join(experiment_folder, f'G_{sample}_pool_kmer.fa')



In [26]:
def extract_peptide_fields(pep):
    '''Label the positional fields of one metadata row.

    pep: sequence of field values in file order.
    Returns a dict mapping column name -> value for the fields present
    (a shorter row gives a shorter dict). A row with more fields than
    known columns raises IndexError.
    '''
    column_names = (
        'kmer', 'kmer_coord', 'peptide', 'id', 'readFrame', 'readFrameAnnotated',
        'geneName', 'geneChr', 'geneStrand',
        'mutationMode', 'hasStopCodon', 'isInJunctionList',
        'isIsolated', 'variantComb', 'variantSegExpr', 'modifiedExonsCoord',
        'originalExonsCoord',
        'vertexIdx', 'kmerType', 'dummy1', 'dummy2', 'dummy3',
    )
    return {column_names[position]: value for position, value in enumerate(pep)}


def extract_end_starts(pep_orig_coord, strand):
    '''Return (pep_start, pep_end) as ints taken from the coordinate list.

    On the '+' strand the first and last entries are used; for any other
    strand value the second and second-to-last entries are used.
    '''
    if strand == '+':
        start_at, end_at = 0, -1
    else:
        start_at, end_at = 1, -2
    return int(pep_orig_coord[start_at]), int(pep_orig_coord[end_at])


def get_include_flag(start_cds, end_cds, pep_start, pep_end, has_stop_codon ):
    '''Derive the 5' / 3' include flags from the peptide boundaries.

    pep_5include is 1 when pep_start is one of start_cds, else 0
    (starts that only exist in the graph are therefore always missed).
    pep_3include is 1 when pep_end is one of end_cds or when
    has_stop_codon equals the string '1', else 0.
    Returns (pep_5include, pep_3include).
    '''
    pep_5include = 1 if pep_start in start_cds else 0
    reaches_end = pep_end in end_cds or has_stop_codon == '1'
    pep_3include = 1 if reaches_end else 0
    return pep_5include, pep_3include


def get_nt_len_with_aa_shift(pep_modi_coord):
    '''Get nt length of each exon involved -> jx_list, shift_list

    pep_modi_coord holds consecutive coordinate pairs, one pair per exon;
    an exon length is second minus first (0 based, open right).
    For every exon, the nucleotides left over from the previous exon are
    added to its length; the part that fills whole codons goes to jx_list
    and the remainder (length % 3) goes to shift_list and is carried on.
    Returns (jx_list, shift_list, jx_list_ori), where jx_list_ori holds the
    raw exon lengths.
    '''
    jx_list, shift_list, jx_list_ori = [], [], []
    carry = 0
    for left_idx in range(0, len(pep_modi_coord), 2):
        exon_nt = int(pep_modi_coord[left_idx + 1]) - int(pep_modi_coord[left_idx])
        jx_list_ori.append(exon_nt)
        framed_nt = exon_nt + carry
        carry = framed_nt % 3
        jx_list.append(framed_nt - carry)
        shift_list.append(carry)

    return jx_list, shift_list, jx_list_ori


def get_aaPos_betweenFlag(shift_list, jx_list):
    '''Get aa position of the junction
    the junction coordinate jx_pos is the 0-based position in the peptide 
    of the amino acid that either overlaps the junction (if the junction is 
    in the middle of a codon), or is immediately before it if the junction 
    occurs between codons

    shift_list / jx_list are the per-exon remainders and whole-codon lengths.
    Returns (aa_junction_pos0, between_codons0, aa_junction_pos1,
    between_codons1, aa_junction_pos1_from_start); the last three are None
    unless there are more than two exons.
    '''
    def _junction_aa(whole_codon_nt, leftover_nt):
        # -> (0-based aa position, between_codons flag) for one junction
        if leftover_nt:  # junction is inside an amino acid
            return int((whole_codon_nt / 3)), 0
        # junction is between amino acids
        return int((whole_codon_nt / 3) - 1), 1

    aa_junction_pos0, between_codons0 = _junction_aa(jx_list[0], shift_list[0])

    if len(shift_list) <= 2:
        # no third exon, hence no second junction
        return aa_junction_pos0, between_codons0, None, None, None

    aa_junction_pos1, between_codons1 = _junction_aa(jx_list[1], shift_list[1])
    # NOTE(review): when the first junction falls inside a codon,
    # aa_junction_pos0 is already the count of whole codons before it, so this
    # "+ 1" may place the second junction one residue too far -- confirm.
    # cut_peptides asserts this exact formula, so change both together.
    aa_junction_pos1_from_start = aa_junction_pos1 + aa_junction_pos0 + 1
    return aa_junction_pos0, between_codons0, aa_junction_pos1, between_codons1, \
           aa_junction_pos1_from_start


def get_genomic_coordinates(pep_modi_coord, strand):
    '''Build the "<a>_<b>" coordinate strings of the first and second junction.

    We have in + case: exon1_start, exon 1_stop, exon2_start, exon2_stop, exon3_start, exon3_stop
    In the - case: exon1_stop, exon 1_start, exon 2_stop, exon2_start, exon3_stop, exon3_start

    Returns (genome_junction_pos0, genome_junction_pos1); the second is None
    when the list has four entries or fewer.
    '''
    # positions in pep_modi_coord of the two values flanking each junction
    if strand == '+':
        first_jx, second_jx = (1, 2), (3, 4)
    else:
        first_jx, second_jx = (0, 3), (2, 5)

    genome_junction_pos0 = '{}_{}'.format(pep_modi_coord[first_jx[0]],
                                          pep_modi_coord[first_jx[1]])
    genome_junction_pos1 = None
    if len(pep_modi_coord) > 4:
        genome_junction_pos1 = '{}_{}'.format(pep_modi_coord[second_jx[0]],
                                              pep_modi_coord[second_jx[1]])
    return genome_junction_pos0, genome_junction_pos1


def split_coord(pep_coord):
    '''Split a ';'-separated coordinate string, dropping 'None' and 'nan' entries.'''
    placeholders = ('None', 'nan')
    return [token for token in pep_coord.split(';') if token not in placeholders]


def write_peptide_to_experiment(filepointer, pep_idx=None, pep_seq=None,
                                idx=None, header=False):
    '''Write one line of the peptide/experiment table to filepointer.

    header=True writes the column header and nothing else. Otherwise a
    tab-separated row (pep_idx, pep_seq, idx) is written, unless pep_idx is
    None, in which case nothing is written.
    '''
    if header:
        filepointer.write('peptide_id\tpeptide_sequence\texperiment_ids\n')
        return
    if pep_idx is None:
        return
    filepointer.write('{}\t{}\t{}\n'.format(pep_idx, pep_seq, idx))

            
def write_fasta(write_, filepointer, pep_seq, pep_idx, aa_junction_pos, 
                aa_junction_pos1_from_start, between_codons, between_codons1,
                pep_5include, pep_3include, pep_gene, 
                genome_junction_pos, genome_junction_pos1, 
                kmer, jx_pep1, jx_pep2, readFrameAnnotated, \
                kmer_coord, kmer_type, strand, do_write=True):
    '''Write one FASTA record (header line + peptide sequence) to filepointer.

    The header describes the junction that the kmer covers: the first
    junction when kmer is a substring of jx_pep1, otherwise the second
    junction when it is a substring of jx_pep2. When kmer is in neither,
    nothing is written.

    write_: when falsy the function does nothing and returns None.
    filepointer: writable text handle receiving the record. The record is
        written to this handle (the previous version wrote to a global
        named `sp` and ignored this argument).
    do_write: unused; kept so existing calls stay valid.

    Returns pep_idx + 1 when a record was written, pep_idx when write_ is
    truthy but the kmer matched neither sub-peptide, None when write_ is falsy.
    The header itself always carries the pep_idx that was passed in.
    '''
    if not write_:
        return None

    def _header(jx_pos, jx_between_codons, jx_coord):
        # One header layout for both junctions; only these three fields differ.
        return (f'>pepID-{pep_idx};jx_pos-{jx_pos};between_codons-{jx_between_codons}'
                f';includes_5\'-{pep_5include};includes_3\'-{pep_3include};gene-{pep_gene};'
                f'jx_coord-{jx_coord};kmer-{kmer};readFrameAnnotated-{readFrameAnnotated};'
                f'kmer_coord-{kmer_coord};origin-{kmer_type};strand{strand}')

    if kmer in jx_pep1:
        header_line = _header(aa_junction_pos, between_codons, genome_junction_pos)
    elif kmer in jx_pep2:
        header_line = _header(aa_junction_pos1_from_start, between_codons1,
                              genome_junction_pos1)
    else:
        return pep_idx

    filepointer.write(header_line + '\n')
    filepointer.write(pep_seq + '\n')
    return pep_idx + 1



def cut_peptides(pep_seq, jx_list, between_codons, between_codons1, aa_junction_pos, 
                 aa_junction_pos1, aa_junction_pos1_from_start, 
                 print_ = False):
    '''Cut the peptide into exon pieces and return the two junction sub-peptides.

    The peptide is split into exon1, the residue containing junction 1 (only
    when that junction is inside a codon), exon2, the residue containing
    junction 2 (same rule) and exon3. jx_list decides the layout: two entries
    mean one junction, three entries mean two junctions; any other length
    leaves every piece empty.

    Returns (exon1 + aa_jx1 + exon2, exon2 + aa_jx2 + exon3).
    With print_ set, the pieces and junction positions are printed.
    '''
    exon1 = aa_jx1 = exon2 = aa_jx2 = exon3 = ''
    after_jx1 = aa_junction_pos + 1
    n_exons = len(jx_list)

    if n_exons == 3:
        jx2_at = aa_junction_pos1 + aa_junction_pos + 1
        after_jx2 = aa_junction_pos1 + aa_junction_pos + 2
        assert(aa_junction_pos1_from_start == jx2_at)

    if n_exons in (2, 3):
        # Left side of junction 1: a junction-spanning residue is kept apart.
        if between_codons:
            exon1 = pep_seq[:after_jx1]
        else:
            exon1 = pep_seq[:aa_junction_pos]
            aa_jx1 = pep_seq[aa_junction_pos:after_jx1]

    if n_exons == 2:
        exon2 = pep_seq[after_jx1:]
    elif n_exons == 3:
        # Middle exon and junction 2, decided independently of junction 1.
        if between_codons1:
            exon2 = pep_seq[after_jx1:after_jx2]
        else:
            exon2 = pep_seq[after_jx1:jx2_at]
            aa_jx2 = pep_seq[jx2_at:after_jx2]
        exon3 = pep_seq[after_jx2:]

    if print_:
        print(f'exon1:{exon1}, aa_containing_jx1:{aa_jx1}, exon2:{exon2}, aa_containing_jx2:{aa_jx2}, exon3:{exon3}')
        print(f'junction positions jx1: {aa_junction_pos}, jx2:{aa_junction_pos1_from_start}')
        print(f'is junction between a codon jx1: {between_codons}, jx2: {between_codons1}')
        print('\n')
    return exon1 + aa_jx1 + exon2, exon2 + aa_jx2 + exon3 

def print_stats(print_, kmer, pep_seq, strand, pep_orig_coord, pep_modi_coord, jx_list,
               jx_list_ori, genome_junction_pos, genome_junction_pos1,
               aa_junction_pos, aa_junction_pos1, between_codons, between_codons1):
    '''Debug helper: print the junction bookkeeping of one peptide instance.

    Nothing happens unless print_ is truthy. The coordinate lists are joined
    with ';' for display. The aa_junction_pos* and between_codons* arguments
    are accepted but not printed.
    '''
    if not print_:
        return

    p_ori_coord = ';'.join(pep_orig_coord)
    p_modif_coord = ';'.join(pep_modi_coord)
    print(f'INSTANCE: \n kmer {kmer}/ sequence {pep_seq}/ strand {strand} / \n original coordinates {p_ori_coord} / \n modif coordinates {p_modif_coord} /  \n junction list origin {jx_list_ori}/ junction list {jx_list} / \n junction coordinates 1 {genome_junction_pos} / junction coordinates 2 {genome_junction_pos1}')
    print('peptide length', len(pep_seq))


def readlines_custom(file_):
    '''Read the gzipped, tab-separated metadata file into a list of field lists.

    A row whose first tab-separated field contains a literal backslash-t
    (the two characters "\\" and "t") starts a new kmer: that field is split
    into kmer, coordinate and peptide. Every other row belongs to the most
    recent kmer and gets that kmer and coordinate prepended.

    file_: path of the gzip file.
    Returns a list with one list of strings per row.
    Raises ValueError when the very first row does not carry a kmer (there
    is nothing to inherit from), or when the embedded field does not split
    into exactly three parts.
    '''
    kmer = coord = None
    rows = []
    with gzip.open(file_, 'rb') as fp:
        # stream the file instead of holding every raw line in memory first
        for line_no, raw_line in enumerate(fp, start=1):
            fields = raw_line.decode().replace('\n', '').split('\t')
            if '\\t' in fields[0]:
                # correction temp: kmer, coordinate and peptide arrive glued
                # together with a literal "\t" instead of a real tab
                kmer, coord, peptide = fields[0].split('\\t')
                fields = [kmer, coord, peptide] + fields[1:]
            elif kmer is None:
                raise ValueError(
                    f'{file_}: row {line_no} has no kmer and no previous row to inherit one from')
            else:
                fields = [kmer, coord] + fields  # Use previous kmer instance
            rows.append(fields)
    return rows

In [27]:
subset_run_test = False
write_ = True
print_= False
test = 0 

### Main 
# Stream the sample's metadata rows, keep the junction-spanning peptides,
# annotate each with its junction position / 5'-3' include flags and write
# one FASTA record per distinct (junction coordinates, peptide) combination.
pep_idx = 0 
peptide_junctions = set()   # dedup keys of the peptides already handled

# Optional experiment table. The handle is closed in the `finally` below, so
# it is released whether or not write_ is set and even if the loop raises.
ep = None
if file_save_experiement is not None: 
    ep = open(file_save_experiement, 'w') # Experiment file to save
    write_peptide_to_experiment(ep, header=True)

try:
    with open(file_save, 'w') as sp: # Fasta file to save

        ### Iterate over peptides 
        lines = readlines_custom(file_meta)
        for line in lines:
            res = extract_peptide_fields(line)

            if whitelist_kmer and res['kmer'] not in whitelist_kmer:
                continue

            # some peptides do not contain a junction
            if int(res['isIsolated']):
                continue 

            # Genomic coordinates : get include 3' or 5' flag 
            res['originalExonsCoord'] = split_coord(res['originalExonsCoord'])
            start_cds = [ first_exon[0] for first_exon in gene_cds_begin_dict_bis[res['geneName']] ] 
            end_cds = [ last_exon[1] for last_exon in gene_cds_end_dict_bis[res['geneName']] ] 
            pep_start, pep_end = extract_end_starts(res['originalExonsCoord'], res['geneStrand'])
            pep_5include, pep_3include = get_include_flag(start_cds, end_cds, 
                                                          pep_start, pep_end, 
                                                          res['hasStopCodon'] )

            # Modified genomic coordinates: 
            res['modifiedExonsCoord'] = split_coord(res['modifiedExonsCoord'])
            # a single exon (one coordinate pair or less) has no junction to describe
            if len(res['modifiedExonsCoord']) <=2:
                continue

            jx_list, shift_list, jx_list_ori = get_nt_len_with_aa_shift(res['modifiedExonsCoord'])
            # get "junction position" flag
            aa_junction_pos, between_codons, \
            aa_junction_pos1, between_codons1, \
            aa_junction_pos1_from_start = get_aaPos_betweenFlag(shift_list, jx_list)
            #get "genomic coordinates" flag 
            genome_junction_pos, \
            genome_junction_pos1 = get_genomic_coordinates(res['modifiedExonsCoord'], res['geneStrand'])

#             print_stats(print_, res['kmer'], res['peptide'], res['geneStrand'], res['originalExonsCoord'], 
#                         res['modifiedExonsCoord'], jx_list, jx_list_ori, genome_junction_pos,
#                         genome_junction_pos1, aa_junction_pos, aa_junction_pos1, 
#                         between_codons, between_codons1)

            # Get the 2 peptides sequences 
            jx_pep1, jx_pep2 = cut_peptides(res['peptide'], jx_list, between_codons,
                                                between_codons1, aa_junction_pos, 
                                                aa_junction_pos1, 
                                                aa_junction_pos1_from_start, 
                                                print_=print_)

            # skip duplicates (Issue 3-exons?)
            # NOTE(review): the key is registered and pep_idx advanced before
            # write_fasta decides whether the kmer matches a sub-peptide, so a
            # combination can be marked as seen without ever being written --
            # confirm this is intended.
            junction_key = str(genome_junction_pos) + str(genome_junction_pos1) + res['peptide']
            if junction_key in peptide_junctions:
                continue
            peptide_junctions.add(junction_key)
            pep_idx +=1

            # write fasta file 
            # (the return value is not used; pep_idx is advanced above)
            write_fasta(write_, sp, res['peptide'], pep_idx, aa_junction_pos, 
                        aa_junction_pos1_from_start, between_codons, between_codons1,
                        pep_5include, pep_3include, res['geneName'], 
                        genome_junction_pos, genome_junction_pos1, 
                        res['kmer'], jx_pep1, jx_pep2, res['readFrameAnnotated'], \
                        res['kmer_coord'], res['kmerType'], res['geneStrand'], do_write=True)

#TODO 
#             if file_save_experiement is not None: 
#                 write_peptide_to_experiment(ep, pep_idx, 
#                                             res['peptide'],';'.join(kmer_file_idx[kmer]), 
#                                             peptide_junctions) 
finally:
    if ep is not None:
        ep.close()


if write_:
    print('written to:')
    print(file_save_experiement)
    print(file_save)


written to:
None
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/G_TCGA-A2-A0D2-01A-21R-A034-07_pool_kmer.fa
