In [5]:
import os 
import pandas as pd
from collections import defaultdict
import numpy as np 
import glob
import gzip

# Previous steps


### Step 1: 


Get a list of kmers passing the filtering.
This can also be a kmer "longlist", i.e. a pool of kmers from multiple filter experiments. The peptide-to-experiment correspondence will then be stored in a table.


### Step 2

Given a list of kmers, grep the corresponding metadata information from the ImmunoPepper metadata file (very big, no kmer information) 

See /GitHub/projects2020_ohsu/eth/peptide_search_format/p20220115_format_peptides_longlist_prior.sh

### Step 3. 



This notebook performs the following tasks:

1. Given a kmer_metadata file, generate a fasta file with metadata relevant for the tryptic digestion.
2. Generate a table containing the peptide IDs per sample.


# Define Samples


In [18]:
# Sample identifiers available to this notebook; one of them is selected by
# index in the following cell.
samples = ['TCGA-BH-A18V-01A-11R-A12D-07.all', 
        'TCGA-C8-A12P-01A-11R-A115-07.all', 
        'TCGA-AO-A0JM-01A-21R-A056-07.all', 
        'TCGA-A2-A0SX-01A-12R-A084-07.all',
        'TCGA-A2-A0D2-01A-21R-A034-07.all']

In [19]:
# Select the sample to work on (last entry of `samples`) and echo it.
sample = samples[4]
print(sample)

TCGA-A2-A0D2-01A-21R-A034-07.all


In [20]:
# Root folder of the peptide-generation run and the relative sub-folder of the
# filtering experiment; the two are joined into `experiment_folder`.
basefolder = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102'
filter_folder = 'filtering_samples/filters_22March_order_wany_wAnnot'

In [21]:

# Folder holding the input metadata file and receiving the output files.
experiment_folder = os.path.join(basefolder, filter_folder)

In [22]:
experiment_folder

'/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/filters_22March_order_wany_wAnnot'

# Preprocess the annotation 
### Input: Annotation

In [23]:
# Annotation file parsed below; its extension is used as the file type passed
# to the attribute parser.
ann_path = '/cluster/work/grlab/projects/projects2020_OHSU/annotation/gencode.v32.annotation.gtf'

In [24]:
from immunopepper.preprocess import attribute_item_to_dict
from immunopepper.preprocess import leq_strand

In [25]:
# Partial copy From Immunopepper preprocess.py 
# Parses the annotation file into transcript / gene / CDS lookup tables.
transcript_to_gene_dict = {}    # transcript -> gene id


gene_to_transcript_dict = {}    # gene_id -> list of transcripts
gene_cds_begin_dict = {}        # gene -> list of first CDS exons

transcript_to_cds_dict = {}     # transcript -> list of CDS exons
transcript_cds_begin_dict = {}  # transcript -> first exon of the CDS
transcript_to_strand = {}       # transcript -> strand read from its CDS records

file_type = ann_path.split('.')[-1]
chromesome_set = set()
# collect information from annotation file
# (context manager so the file handle is closed once parsing is done)
with open(ann_path, 'r') as ann_file:
    for line in ann_file:
        # skip header/comment lines and blank lines (the latter would
        # otherwise fail on item[2] below)
        if line[0] == '#' or not line.strip():
            continue
        item = line.strip().split('\t')
        chromesome_set.add(item[0])
        feature_type = item[2]
        attribute_item = item[-1]
        attribute_dict = attribute_item_to_dict(attribute_item, file_type, feature_type)
        # store relationship between gene ID and its transcript IDs
        if feature_type in ['transcript', 'mRNA']:
            gene_id = attribute_dict['gene_id']
            transcript_id = attribute_dict['transcript_id']
            # only protein-coding transcripts of protein-coding genes are kept
            if attribute_dict['gene_type'] != 'protein_coding' or attribute_dict['transcript_type']  != 'protein_coding':
                continue
            assert (transcript_id not in transcript_to_gene_dict)
            transcript_to_gene_dict[transcript_id] = gene_id
            # append to the gene's list without ever discarding transcripts
            # that were registered earlier
            gene_transcripts = gene_to_transcript_dict.setdefault(gene_id, [])
            if transcript_id not in gene_transcripts:
                gene_transcripts.append(transcript_id)
            # Todo python is 0-based while gene annotation file(.gtf, .vcf, .maf) is one based
        elif feature_type == "CDS":
            parent_ts = attribute_dict['transcript_id']
            strand_mode = item[6]
            # convert the 1-based inclusive interval to 0-based, open right
            cds_left = int(item[3])-1
            cds_right = int(item[4])
            frameshift = int(item[7])
            transcript_to_strand[parent_ts] = strand_mode
            transcript_to_cds_dict.setdefault(parent_ts, []).append((cds_left, cds_right, frameshift))
            if strand_mode == "+" :
                cds_start, cds_stop = cds_left, cds_right
            else:
                cds_start, cds_stop = cds_right, cds_left

            # we only consider the start of the whole CoDing Segment
            if parent_ts not in transcript_cds_begin_dict or \
               leq_strand(cds_start, transcript_cds_begin_dict[parent_ts][0], strand_mode):
                transcript_cds_begin_dict[parent_ts] = (cds_start, cds_stop, item)

# collect first CDS exons for all transcripts of a gene
for ts_key in transcript_to_gene_dict:
    target_gene = transcript_to_gene_dict[ts_key]
    if target_gene not in gene_cds_begin_dict:
        gene_cds_begin_dict[target_gene] = []
    if ts_key in transcript_cds_begin_dict:
        gene_cds_begin_dict[target_gene].append(transcript_cds_begin_dict[ts_key])


In [26]:
# Custom collection of CDS 
# First and last CDS exon of every transcript, stored as
# (start, stop, strand) with start/stop swapped on the '-' strand.
transcript_cds_begin_dict_bis = {}
transcript_cds_end_dict_bis = {}

gene_cds_begin_dict_bis = defaultdict(list)
gene_cds_end_dict_bis = defaultdict(list)

# will be in reading order 
# (the first/last list entries are taken as first/last CDS exon; the asserts
# below cross-check the first exon against the strand-aware dictionary)
for ts_key in transcript_to_cds_dict:
    cds_exons = transcript_to_cds_dict[ts_key]
    if transcript_to_strand[ts_key] == '+': # '+'
        transcript_cds_begin_dict_bis[ts_key] = (cds_exons[0][0], cds_exons[0][1], '+')
        transcript_cds_end_dict_bis[ts_key] = (cds_exons[-1][0], cds_exons[-1][1], '+')
    else: 
        transcript_cds_begin_dict_bis[ts_key] = (cds_exons[0][1], cds_exons[0][0], '-')
        transcript_cds_end_dict_bis[ts_key] = (cds_exons[-1][1], cds_exons[-1][0], '-')
    
    assert(transcript_cds_begin_dict_bis[ts_key][0] == transcript_cds_begin_dict[ts_key][0])
    assert(transcript_cds_begin_dict_bis[ts_key][1] == transcript_cds_begin_dict[ts_key][1])

# collect first, last CDS exons for all transcripts of a gene
for ts_key in transcript_to_gene_dict:
    # a transcript without any CDS record has no entry to collect; skip it
    # instead of raising KeyError (same guard as for gene_cds_begin_dict)
    if ts_key not in transcript_cds_begin_dict_bis:
        continue
    target_gene = transcript_to_gene_dict[ts_key]
    gene_cds_begin_dict_bis[target_gene].append(transcript_cds_begin_dict_bis[ts_key])
    gene_cds_end_dict_bis[target_gene].append(transcript_cds_end_dict_bis[ts_key])

# Step 4. Format the peptide raw file  
### Input:  file meta with kmers and bi-exons matching the longlist of the samples 

In [27]:
# Gzipped, tab-separated kmer metadata file read by the processing cell.
file_meta = os.path.join(experiment_folder, 'G_TCGA_Allsamples_all_exp_kmer_metadata_raw.tsv.gz')

In [30]:
# FASTA output path; opened for writing (and therefore truncated) by the
# processing cell.
file_save = os.path.join(experiment_folder, 'G_TCGA_Allsamples_all_exp_kmer.fa')

In [34]:
# Optional output path for the peptide-to-experiment table; when None the
# processing cell does not open that file.
file_save_experiement= None

In [35]:
# Column names of the peptide metadata.
# NOTE(review): this list is not referenced by any other visible cell; the
# column order actually applied to rows is `cols_correct` inside
# extract_peptide_fields, which differs from this one -- confirm whether this
# list is still needed.
fields_meta_peptide_dict = ['peptide', 'id', 'readFrame', 'geneName', 'geneChr', 'geneStrand',
                                'mutationMode',
                                'junctionAnnotated', 'hasStopCodon', 'isInJunctionList',
                                'isIsolated', 'variantComb', 'variantSegExpr', 'modifiedExonsCoord',
                                'originalExonsCoord', 'vertexIdx',
                                'kmerType']



In [164]:
import os 
import pandas as pd
from collections import defaultdict
import numpy as np 
import glob


def extract_peptide_fields(pep):
    '''Label the positional fields of one metadata row.

    pep: sequence of field values, ordered like the column names below.
    Returns a dict {column name: value} for the fields that are present.
    A row with more values than known columns raises IndexError.
    '''
    cols_correct = (
        'kmer', 'kmer_coord', 'peptide', 'id', 'readFrame',
        'readFrameAnnotated', 'geneName', 'geneChr', 'geneStrand',
        'mutationMode', 'hasStopCodon', 'isInJunctionList',
        'isIsolated', 'variantComb', 'variantSegExpr',
        'modifiedExonsCoord', 'originalExonsCoord',
        'vertexIdx', 'kmerType', 'dummy1', 'dummy2', 'dummy3',
    )
    return {cols_correct[position]: value for position, value in enumerate(pep)}


def extract_end_starts(pep_orig_coord, strand):
    '''Get peptide start and end coordinates.

    pep_orig_coord: sequence of exon coordinates (strings or numbers).
    strand: '+' selects the first and last entry; any other value selects
        the second and second-to-last entry.
    Returns (pep_start, pep_end) as Python ints.
    '''
    # The builtin int replaces np.int, which was only an alias of it and no
    # longer exists in NumPy >= 1.24.
    if strand == '+':
        pep_start = int(pep_orig_coord[0])
        pep_end = int(pep_orig_coord[-1])
    else:
        pep_start = int(pep_orig_coord[1])
        pep_end = int(pep_orig_coord[-2])
    return pep_start, pep_end

def get_include_flag(start_cds, end_cds, pep_start, pep_end, has_stop_codon ):
    '''Derive the 5' / 3' include flags from the peptide boundaries.

    The 5' flag is 1 when pep_start is a member of start_cds (starts that
    are absent from start_cds are therefore never flagged). The 3' flag is 1
    when pep_end is a member of end_cds or has_stop_codon equals the string
    '1'. Returns (pep_5include, pep_3include).
    '''
    pep_5include = 1 if pep_start in start_cds else 0
    pep_3include = 1 if ((pep_end in end_cds) or has_stop_codon == '1') else 0
    return pep_5include, pep_3include

def get_nt_len_with_aa_shift(pep_modi_coord):
    '''Get nt length of each exon involved.

    pep_modi_coord holds consecutive (left, right) coordinate pairs, 0 based
    and open on the right. Nucleotides left over after the last complete
    codon of an exon are carried into the next exon.
    Returns (jx_list, shift_list, jx_list_ori):
      jx_list     - per exon, length including the carry, rounded down to a
                    multiple of 3
      shift_list  - per exon, number of nt carried over to the next exon
      jx_list_ori - per exon, plain length (right - left)
    '''
    jx_list, shift_list, jx_list_ori = [], [], []
    carried = 0
    for left in range(0, len(pep_modi_coord), 2):
        exon_len = int(pep_modi_coord[left + 1]) - int(pep_modi_coord[left])
        jx_list_ori.append(exon_len)
        with_carry = exon_len + carried
        carried = with_carry % 3
        jx_list.append(with_carry - carried)
        shift_list.append(carried)

    return jx_list, shift_list, jx_list_ori

def get_aaPos_betweenFlag(shift_list, jx_list):
    '''Get aa position of the junction(s).

    The junction coordinate is the 0-based position in the peptide of the
    amino acid that either overlaps the junction (junction in the middle of
    a codon) or is immediately before it (junction between two codons).

    Returns (aa_junction_pos0, between_codons0, aa_junction_pos1,
    between_codons1, aa_junction_pos1_from_start). The last three values
    are None unless shift_list has more than two entries (third exon).

    NOTE(review): when the first junction lies inside a codon,
    aa_junction_pos1_from_start looks one position too far; cut_peptides
    asserts the same arithmetic, so confirm both together before changing.
    '''
    def _locate(shift, nt_len):
        # a non-zero shift means the junction falls inside an amino acid
        if shift:
            return int((nt_len / 3)), 0
        # otherwise the junction sits between two amino acids
        return int((nt_len / 3) - 1), 1

    aa_junction_pos0, between_codons0 = _locate(shift_list[0], jx_list[0])

    aa_junction_pos1 = None
    between_codons1 = None
    aa_junction_pos1_from_start = None
    if len(shift_list) > 2:  # third exon
        aa_junction_pos1, between_codons1 = _locate(shift_list[1], jx_list[1])
        aa_junction_pos1_from_start = aa_junction_pos1 + aa_junction_pos0 + 1

    return (aa_junction_pos0, between_codons0, aa_junction_pos1,
            between_codons1, aa_junction_pos1_from_start)

def get_genomic_coordinates(pep_modi_coord, strand):
    '''Build the 'left_right' coordinate string of each junction.

    We have in + case: exon1_start, exon1_stop, exon2_start, exon2_stop,
    exon3_start, exon3_stop.
    In the - case: exon1_stop, exon1_start, exon2_stop, exon2_start,
    exon3_stop, exon3_start.

    Returns (genome_junction_pos0, genome_junction_pos1); the second value
    is None when there are no more than four coordinates.
    '''
    # index pairs that delimit the first and the second junction
    if strand == '+':
        first_jx, second_jx = (1, 2), (3, 4)
    else:
        first_jx, second_jx = (0, 3), (2, 5)

    genome_junction_pos0 = '{}_{}'.format(pep_modi_coord[first_jx[0]],
                                          pep_modi_coord[first_jx[1]])
    genome_junction_pos1 = None
    if len(pep_modi_coord) > 4:
        genome_junction_pos1 = '{}_{}'.format(pep_modi_coord[second_jx[0]],
                                              pep_modi_coord[second_jx[1]])
    return genome_junction_pos0, genome_junction_pos1

def split_coord(pep_coord):
    '''Split a ';'-separated coordinate string and drop the 'None' / 'nan'
    placeholder entries.'''
    placeholders = ('None', 'nan')
    return [piece for piece in pep_coord.split(';') if piece not in placeholders]

def preprocess_line(line):
    '''Disabled parser: the leading assertion fails unconditionally, so the
    function raises AssertionError before doing any work.

    The statements after the assertion would mark the end of each peptide
    record with '@', take the text before the first comma as the kmer and
    split the remainder into peptide records.
    '''
    assert False
    line = line.replace('3-exons_9-mer ', '3-exons_9-mer@').replace('2-exons ', '2-exons@')
    kmer, _, remainder = line.partition(',')
    peptides = remainder.split('@')
    return line, kmer, peptides

def write_peptide_to_experiment(filepointer, pep_idx=None, pep_seq=None,\
                                idx=None, header=False):
    '''Write one line of the peptide-to-experiment table.

    With a truthy header only the column header line is written. Otherwise
    a tab-separated row (pep_idx, pep_seq, idx) is written, provided pep_idx
    is not None; with pep_idx None nothing is written.
    '''
    if header:
        filepointer.write('peptide_id\tpeptide_sequence\texperiment_ids\n')
        return
    if pep_idx is None:
        return
    filepointer.write('{}\t{}\t{}\n'.format(pep_idx, pep_seq, idx))


def write_fasta_option_MI(sp, pep_seq, pep_idx, aa_junction_pos, 
                aa_junction_pos1_from_start, between_codons, between_codons1,
                pep_5include, pep_3include, pep_gene, gene_strand,
                genome_junction_pos, genome_junction_pos1, 
                kmer, jx_pep1, jx_pep2, readFrameAnnotated, \
                junctionAnnotated, kmer_type, minorIntron, junctionExpr, sample, 
                jx_to_expression, do_write=True):
    '''Write the FASTA entries of one peptide, one per selected junction.

    minorIntron selects the junctions: 1 and 3 handle the first junction on
    the '+' strand (the second one otherwise); 2 and 3 handle the second
    junction on the '+' strand (the first one otherwise). A junction is only
    written when its expression, taken from the ';'-separated junctionExpr,
    is greater than 0. When aa_junction_pos1_from_start is falsy the second
    junction is a copy of the first one.

    Each written entry increments pep_idx and appends to jx_to_expression:
    [0] junction coordinate, [1] peptide sequence, [2] sample,
    [3] expression. The header is formatted before the increment, so it
    carries the incoming pep_idx.

    jx_pep1, jx_pep2 and do_write are accepted but not used.
    Returns (pep_idx, jx_to_expression).
    '''
    handle_template = '>pepID-{};jx_pos-{};between_codons-{};includes_5\'-{};includes_3\'-{};gene-{};jx_coord-{};kmer-{};readFrameAnnotated-{};junctionAnnotated-{};origin-{}'
    expr_val = junctionExpr.split(';')

    header1 = handle_template.format(
        pep_idx, aa_junction_pos, between_codons, pep_5include,
        pep_3include, pep_gene, genome_junction_pos, kmer, readFrameAnnotated,
        junctionAnnotated[0], kmer_type)
    expr1 = float(expr_val[0])

    if aa_junction_pos1_from_start:
        header2 = handle_template.format(
            pep_idx, aa_junction_pos1_from_start, between_codons1, pep_5include,
            pep_3include, pep_gene, genome_junction_pos1, kmer, readFrameAnnotated,
            junctionAnnotated[1], kmer_type)
        expr2 = float(expr_val[1])
        coord2 = genome_junction_pos1
    else:
        # no second junction: it mirrors the first one
        header2, expr2, coord2 = header1, expr1, genome_junction_pos

    first_jx = (header1, expr1, genome_junction_pos)
    second_jx = (header2, expr2, coord2)

    def _emit(junction):
        # Write one entry if it is expressed; return how many were written.
        header, expr, coord = junction
        if not expr > 0:
            return 0
        sp.write(header + '\n')
        sp.write(pep_seq + '\n')
        jx_to_expression[0].append(coord)
        jx_to_expression[3].append(expr)
        jx_to_expression[1].append(pep_seq)
        jx_to_expression[2].append(sample)
        return 1

    # Whether to write the first, the second junction or both
    forward = gene_strand == '+'
    if minorIntron in (1, 3):
        pep_idx += _emit(first_jx if forward else second_jx)
    if minorIntron in (2, 3):
        pep_idx += _emit(second_jx if forward else first_jx)
    return pep_idx, jx_to_expression



def cut_peptides(pep_seq, jx_list, between_codons, between_codons1, aa_junction_pos, 
                 aa_junction_pos1, aa_junction_pos1_from_start, 
                 print_ = False):
    '''Cut the peptide into exon pieces and junction-overlapping amino acids.

    jx_list with two entries is treated as a 2-exon peptide, with three
    entries as a 3-exon peptide; any other length leaves all pieces empty.
    A junction flagged as between codons has no junction amino acid of its
    own; otherwise the amino acid at the junction position is kept apart as
    aa_jx1 / aa_jx2.

    Returns (exon1 + aa_jx1 + exon2, exon2 + aa_jx2 + exon3).
    With print_ set, the pieces and junction positions are printed.
    '''
    after_jx1 = aa_junction_pos + 1
    exon1, aa_jx1, exon2, aa_jx2, exon3 = '', '', '', '', ''
    n_exons = len(jx_list)

    if n_exons == 3:
        at_jx2 = aa_junction_pos1 + aa_junction_pos + 1
        after_jx2 = aa_junction_pos1 + aa_junction_pos + 2
        assert(aa_junction_pos1_from_start == at_jx2)

    if n_exons in (2, 3):
        # pieces around the first junction
        if between_codons:
            exon1 = pep_seq[:after_jx1]
        else:
            exon1 = pep_seq[:aa_junction_pos]
            aa_jx1 = pep_seq[aa_junction_pos:after_jx1]

        if n_exons == 2:
            exon2 = pep_seq[after_jx1:]
        else:
            # pieces around the second junction
            if between_codons1:
                exon2 = pep_seq[after_jx1:after_jx2]
            else:
                exon2 = pep_seq[after_jx1:at_jx2]
                aa_jx2 = pep_seq[at_jx2:after_jx2]
            exon3 = pep_seq[after_jx2:]

    if print_:
        print(f'exon1:{exon1}, aa_containing_jx1:{aa_jx1}, exon2:{exon2}, aa_containing_jx2:{aa_jx2}, exon3:{exon3}')
        print(f'junction positions jx1: {aa_junction_pos}, jx2:{aa_junction_pos1_from_start}')
        print(f'is junction between a codon jx1: {between_codons}, jx2: {between_codons1}')
        print('\n')
    return exon1 + aa_jx1 + exon2, exon2 + aa_jx2 + exon3 

def print_stats(print_, kmer, pep_seq, strand, pep_orig_coord, pep_modi_coord, jx_list,
               jx_list_ori, genome_junction_pos, genome_junction_pos1,
               aa_junction_pos, aa_junction_pos1, between_codons, between_codons1):
    '''Print a debugging summary of one peptide instance when print_ is set.

    The summary lists kmer, sequence, strand, the ';'-joined original and
    modified coordinates, both junction lists and both junction coordinates,
    followed by the peptide length. The aa_junction_pos* and between_codons*
    arguments are accepted but not printed.
    '''
    if not print_:
        return

    p_ori_coord = ';'.join(pep_orig_coord)
    p_modif_coord = ';'.join(pep_modi_coord)
    print(f'INSTANCE: \n kmer {kmer}/ sequence {pep_seq}/ strand {strand} / \n original coordinates {p_ori_coord} / \n modif coordinates {p_modif_coord} /  \n junction list origin {jx_list_ori}/ junction list {jx_list} / \n junction coordinates 1 {genome_junction_pos} / junction coordinates 2 {genome_junction_pos1}')
    print('peptide length', len(pep_seq))


In [165]:
def readlines_custom(file_):
    '''Read the gzipped kmer metadata file into a list of field lists.

    Every line is split on tab characters. When the first field contains a
    literal backslash-t sequence (two characters, not a tab) it is split
    further into kmer, coordinates and peptide. Lines without that sequence
    are continuation rows: the kmer and coordinates of the most recent kmer
    row are prepended to them.

    file_: path of the gzip-compressed file.
    Returns a list with one list of strings per line.
    Raises ValueError if the file starts with a continuation row, or if a
    kmer field does not split into exactly three parts.
    '''
    rows = []
    # kmer/coordinates of the latest kmer row; None until one has been seen
    kmer = coord = None
    with gzip.open(file_, 'rb') as fp: 
        # iterate lazily so raw bytes and parsed rows are not both held in
        # memory for the whole file
        for raw_line in fp:
            fields = raw_line.decode().replace('\n', '').split('\t')
            if '\\t' in fields[0]:
                splitted_p = fields[0].split('\\t') #correction temp
                kmer, coord, peptide = splitted_p
                fields = [kmer, coord, peptide] + fields[1:]
            else:
                if kmer is None:
                    raise ValueError(
                        'continuation row found before any kmer row in '
                        '{}'.format(file_))
                fields = [kmer, coord] + fields #Use previous kmer instance
            rows.append(fields)
    return rows

In [91]:
# Note: corrected the inversion of coordinates, extracted the right matching coordinate position for 3 exon peptide, remove has_stop_codon == 0 break 

In [118]:
import gzip

In [168]:
# Accumulator filled by write_fasta_option_MI, one list per column:
# [0] junction coordinate, [1] peptide sequence, [2] sample, [3] expression.
jx_to_expression = [[],[],[],[]]


# Running peptide counter handed to write_fasta_option_MI.
pep_idx = 0 



# The experiment table is optional: it is only opened when a path is set.
# NOTE(review): `ep` is never closed by the active code -- the only close
# call sits in the commented-out section at the end of this cell.
if file_save_experiement is not None: 
    ep = open(file_save_experiement, 'w') # Experiment file to save
    write_peptide_to_experiment(ep, header=True)
# NOTE(review): opening in 'w' mode truncates the FASTA file, but every call
# that writes to `sp` is currently commented out below, so the file stays
# empty.
with open(file_save, 'w') as sp: # Fasta file to save
    ### Iterate over peptides 
    lines = readlines_custom(file_meta)
    # Only the first 10 rows are processed here.
    for line in lines[0:10]:
        # Label the positional fields of the row; `res` keeps the last row.
        res = extract_peptide_fields(line)

            
#         # --- TEST ---
#         if subset_run_test and (test_query in line['peptide']):
#             print(f'\n METADATA (ImmunoPepper derived): \n {line}')
#             break
#         # --- END TEST ---

        #res = extract_peptide_fields(line)

#         # some peptides do not contain a junction
#         if int(res['isIsolated']):
#             continue 

#         kmer = None

#         # Genomic coordinates : get include 3' or 5' flag 
#         #[ iterate only needed for old (May 2020-June 2022) 
#         # collapsed version of peptide files with multiple coordinates 
#         # searated by /]
#         for pep_orig_coord in res['originalExonsCoord']:
#             pep_orig_coord = split_coord(pep_orig_coord)
#             start_cds = [ first_exon[0] for first_exon in 
#                          gene_cds_begin_dict_bis[res['geneName']] ] 
#             end_cds = [ last_exon[1] for last_exon in 
#                        gene_cds_end_dict_bis[res['geneName']] ] 
#             pep_start, pep_end = extract_end_starts(pep_orig_coord, res['geneStrand'])
#             pep_5include, pep_3include = get_include_flag(start_cds, end_cds, 
#                                                           pep_start, pep_end, 
#                                                           res['hasStopCodon'] )

#         # Modified genomic coordinates: get 
#         # "junction position" flag and "between codon" flag
#         # and "genomic coordinates"
#         #[ iterate only needed for old (May 2020-June 2022) 
#         # collapsed version of peptide files with multiple coordinates 
#         # searated by /]
#         for id_modi_coord, pep_modi_coord in enumerate(res['modifiedExonsCoord']):
#             pep_modi_coord = split_coord(pep_modi_coord)
#             if len(pep_modi_coord) <=2:
#                 continue
#             jx_list, shift_list, \
#             jx_list_ori = get_nt_len_with_aa_shift(pep_modi_coord)

#             aa_junction_pos, between_codons, \
#             aa_junction_pos1, between_codons1, \
#             aa_junction_pos1_from_start = get_aaPos_betweenFlag(shift_list,
#                                                                       jx_list)

#             genome_junction_pos, \
#             genome_junction_pos1 = get_genomic_coordinates(pep_modi_coord, res['geneStrand'])

#             # --- TEST ---
#             if subset_run_test and (test_query not in res['peptide']): 
#                 continue
#             # --- END TEST ---

#             print_stats(print_, kmer, res['peptide'], res['geneStrand'], pep_orig_coord, 
#                         pep_modi_coord, jx_list, jx_list_ori, genome_junction_pos,
#                         genome_junction_pos1, aa_junction_pos, aa_junction_pos1, 
#                         between_codons, between_codons1)

#             jx_pep1, jx_pep2 = cut_peptides(res['peptide'], jx_list, between_codons,
#                                             between_codons1, aa_junction_pos, 
#                                             aa_junction_pos1, 
#                                             aa_junction_pos1_from_start, 
#                                             print_=print_)

#             # skip duplicates
#             if (genome_junction_pos in peptide_junctions) and \
#                (genome_junction_pos1 in peptide_junctions):
#                 continue
#             else:
#                 peptide_junctions.add(genome_junction_pos)
#                 peptide_junctions.add(genome_junction_pos1)


#             # write fasta file 
#             pep_idx, jx_to_expression = write_fasta_option_MI(sp, res['peptide'], 
#                         pep_idx, aa_junction_pos,
#                         aa_junction_pos1_from_start, between_codons, 
#                         between_codons1, pep_5include, pep_3include, 
#                         res['geneName'], res['geneStrand'], 
#                         genome_junction_pos, genome_junction_pos1, 
#                         kmer, jx_pep1, jx_pep2, 
#                         res['readFrameAnnotated'], \
#                         CANCEL, \
#                         res['kmerType'], CANCEL, sample, 
#                         jx_to_expression, \
#                         do_write=write_)
#             if file_save_experiement is not None: 
#                 write_peptide_to_experiment(ep, pep_idx, res['peptide'],';'.join(
#                     kmer_file_idx[kmer]), peptide_junctions) 

# #                 expression_dict(res['peptide'], id_sample, n_samples, 
# #                                 CANCEL, genome_junction_pos, 
# #                                 genome_junction_pos1, id_modi_coord, 
# #                                 pep_orig_coord)
# if write_:
#     print('written to:')
#     print(file_save_experiement)
#     print(file_save)
#     if file_save_experiement is not None:
#         ep.close()


In [170]:
res


{'kmer': 'AAEHLRVPS',
 'kmer_coord': '184711444:184711461:184709721:184709731:None:None',
 'peptide': 'MLQTPESRGLPVPQAEGEKDGGHDGETRAPTASQERPKEELGAGREEGAAEPALTRKGARALAAKALARRRAYRRLNRTVAELVQFLLVKDKKKSPITRSEMVKYVIGDLKILFPDIIARAAEHLRVPSSTKALRDQIM',
 'id': 'ENSG00000177383.5:11_0:0:184711821:2-exons',
 'readFrame': '2',
 'readFrameAnnotated': 'True',
 'geneName': 'ENSG00000177383.5',
 'geneChr': 'chr3',
 'geneStrand': '-',
 'mutationMode': 'ref',
 'hasStopCodon': '1',
 'isInJunctionList': 'nan',
 'isIsolated': '0',
 'variantComb': 'nan',
 'variantSegExpr': 'nan',
 'modifiedExonsCoord': '184711444;184711821;184709532;184709731',
 'originalExonsCoord': '184711444;184712064;184709531;184709731',
 'vertexIdx': '11;0',
 'kmerType': '2-exons'}

In [129]:
line = line.decode().replace('\n', '').split('\t')

In [None]:
line

In [160]:
lines[0:10]

[b'AAAFTIDVL\\t5090083:5090090:5089248:5089268:None:None\\tTVLELGSGAGLTGLAICKMCRPRAYIFSDCHSRVLEQLRGNVLLNGLSLEADITAKLDSPRVTVAQLDWDVATVHQLSAFQPDVVIAAAFTIDVLYCPEAIMSLVGVLRRLAACREHQRAPEVYVAFTVRNPETCQLFTTEL\tENSG00000118894.15:20_12:0:5090348:2-exons\t1\tFalse\tENSG00000118894.15\tchr16\t-\tref\t0\tnan\t0\tnan\tnan\t5090083;5090348;5089107;5089268\t5090083;5090349;5089106;5089268\t20;12\t2-exons\n',
 b'AAASFSMDG\\t62492703:62492724:62477828:62477834:None:None\\tNISSNVLEESAVSDDVVSPDEEGICSGKYFTESGLVGLLEQAAASFSMDGKRMFGTYFRVGFYGTKFGDLDEQEF\tENSG00000116641.18:77_54:0:62492847:2-exons\t0\tFalse\tENSG00000116641.18\tchr1\t-\tref\t0\tnan\t0\tnan\tnan\t62492703;62492847;62477753;62477834\t62492703;62492847;62477752;62477834\t77;54\t2-exons\n',
 b'NISSNVLEESAVSDDVVSPDEEGICSGKYFTESGLVGLLEQAAASFSMDGKRMFGTYFRVGFYGTKFGDLDEQEFVYKEPAITKLAEISHRLE\tENSG00000116641.18:77_52:0:62492847:2-exons\t0\tFalse\tENSG00000116641.18\tchr1\t-\tref\t0\tnan\t0\tnan\tnan\t62492703;62492847;62477699;62477834\t62492703;62492

In [162]:
lines

[['AAAFTIDVL',
  '5090083:5090090:5089248:5089268:None:None',
  'TVLELGSGAGLTGLAICKMCRPRAYIFSDCHSRVLEQLRGNVLLNGLSLEADITAKLDSPRVTVAQLDWDVATVHQLSAFQPDVVIAAAFTIDVLYCPEAIMSLVGVLRRLAACREHQRAPEVYVAFTVRNPETCQLFTTEL',
  'ENSG00000118894.15:20_12:0:5090348:2-exons',
  '1',
  'False',
  'ENSG00000118894.15',
  'chr16',
  '-',
  'ref',
  '0',
  'nan',
  '0',
  'nan',
  'nan',
  '5090083;5090348;5089107;5089268',
  '5090083;5090349;5089106;5089268',
  '20;12',
  '2-exons'],
 ['AAASFSMDG',
  '62492703:62492724:62477828:62477834:None:None',
  'NISSNVLEESAVSDDVVSPDEEGICSGKYFTESGLVGLLEQAAASFSMDGKRMFGTYFRVGFYGTKFGDLDEQEF',
  'ENSG00000116641.18:77_54:0:62492847:2-exons',
  '0',
  'False',
  'ENSG00000116641.18',
  'chr1',
  '-',
  'ref',
  '0',
  'nan',
  '0',
  'nan',
  'nan',
  '62492703;62492847;62477753;62477834',
  '62492703;62492847;62477752;62477834',
  '77;54',
  '2-exons'],
 ['AAASFSMDG',
  '62492703:62492724:62477828:62477834:None:None',
  'NISSNVLEESAVSDDVVSPDEEGICSGKYFTESGLVGLLEQAAASFSMDGKR

In [157]:
line


['AAEHLRVPS',
 '184711444:184711461:184709721:184709731:None:None',
 'MLQTPESRGLPVPQAEGEKDGGHDGETRAPTASQERPKEELGAGREEGAAEPALTRKGARALAAKALARRRAYRRLNRTVAELVQFLLVKDKKKSPITRSEMVKYVIGDLKILFPDIIARAAEHLRVPSSTKALRDQIM',
 ['ENSG00000177383.5:11_0:0:184711821:2-exons',
  '2',
  'True',
  'ENSG00000177383.5',
  'chr3',
  '-',
  'ref',
  '1',
  'nan',
  '0',
  'nan',
  'nan',
  '184711444;184711821;184709532;184709731',
  '184711444;184712064;184709531;184709731',
  '11;0',
  '2-exons']]

In [155]:
line

['AAEHLRVPS',
 '184711444:184711461:184709721:184709731:None:None',
 'MLQTPESRGLPVPQAEGEKDGGHDGETRAPTASQERPKEELGAGREEGAAEPALTRKGARALAAKALARRRAYRRLNRTVAELVQFLLVKDKKKSPITRSEMVKYVIGDLKILFPDIIARAAEHLRVPSSTKALRDQIM',
 ['ENSG00000177383.5:11_0:0:184711821:2-exons',
  '2',
  'True',
  'ENSG00000177383.5',
  'chr3',
  '-',
  'ref',
  '1',
  'nan',
  '0',
  'nan',
  'nan',
  '184711444;184711821;184709532;184709731',
  '184711444;184712064;184709531;184709731',
  '11;0',
  '2-exons']]

In [148]:
line = line[1:]

In [None]:
['hello']

In [151]:
line.insert(0, ['hello', 'world'])

In [152]:
line

[['hello', 'world'],
 'hello',
 'ENSG00000177383.5:11_0:0:184711821:2-exons',
 '2',
 'True',
 'ENSG00000177383.5',
 'chr3',
 '-',
 'ref',
 '1',
 'nan',
 '0',
 'nan',
 'nan',
 '184711444;184711821;184709532;184709731',
 '184711444;184712064;184709531;184709731',
 '11;0',
 '2-exons']

In [143]:
kmer_coord

NameError: name 'kmer_coord' is not defined

In [142]:
splitted_p

['AAEHLRVPS',
 '184711444:184711461:184709721:184709731:None:None',
 'MLQTPESRGLPVPQAEGEKDGGHDGETRAPTASQERPKEELGAGREEGAAEPALTRKGARALAAKALARRRAYRRLNRTVAELVQFLLVKDKKKSPITRSEMVKYVIGDLKILFPDIIARAAEHLRVPSSTKALRDQIM']

In [110]:
line.values


array(['AAGSEGSQG\\t40311229:40311245:40310756:40310767:None:None\\tSNQLSTRNERSPRAAGSEGSQGRCGCLWRARHPWTT\tENSG00000049089.15:61_59:0:40311284:2-exons\t1\tFalse\tENSG00000049089.15\tchr1\t-\tref\t0\tnan\t0\tnan\tnan\t40311229;40311284;40310714;40310767\t40311229;40311286;40310713;40310767\t61;59\t2-exons'],
      dtype=object)

In [108]:
line.values[0].split(' ')


['AAGSEGSQG\\t40311229:40311245:40310756:40310767:None:None\\tSNQLSTRNERSPRAAGSEGSQGRCGCLWRARHPWTT\tENSG00000049089.15:61_59:0:40311284:2-exons\t1\tFalse\tENSG00000049089.15\tchr1\t-\tref\t0\tnan\t0\tnan\tnan\t40311229;40311284;40310714;40310767\t40311229;40311286;40310713;40310767\t61;59\t2-exons']

In [93]:
%debug


> [0;32m/scratch/slurm-job.900915/ipykernel_879081/3616807193.py[0m(22)[0;36mextract_peptide_fields[0;34m()[0m
[0;32m     20 [0;31m            [0msplitted_p[0m [0;34m=[0m [0mp[0m[0;34m.[0m[0msplit[0m[0;34m([0m[0;34m'\\t'[0m[0;34m)[0m [0;31m#correction temp[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     21 [0;31m            [0;32mfor[0m [0mjdx[0m[0;34m,[0m [0mlabel[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mkmer_fields[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 22 [0;31m                [0mres[0m[0;34m[[0m[0mlabel[0m[0;34m][0m [0;34m=[0m [0msplitted_p[0m[0;34m[[0m[0mjdx[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     23 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     24 [0;31m            [0mres[0m[0;34m[[0m[0mcols_correct[0m[0;34m[[0m[0midx[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0mp[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> splitted_p
['NISSNVLEESA