In [1]:
import os 
import pandas as pd
from collections import defaultdict
import numpy as np 
import glob

This notebook performs the following task:

* Given a kmer_metadata file, generate a FASTA file with the metadata relevant for the tryptic digestion.


In [2]:
# Root folder of the run; the per-sample loop further down reads and writes files under
# `<experiment_folder>/<sample>_mutNone/`.
# NOTE(review): the saved output of that loop shows a `v3_37b9a8f_...` folder, not this
# `v3_84dc237_...` one, so the stored outputs presumably predate this path -- confirm and re-run.
experiment_folder = '/cluster/work/grlab/projects/projects2022-PCa_Immuno/peptides_generation/v3_84dc237_conf2_annotFrame_cap0_runs'

In [3]:
sample_file = '/cluster/work/grlab/projects/projects2022-PCa_Immuno/samples_list/PCa_samples_20220616'

In [4]:
# The sample list is read as a single-column table: `names` supplies the column label,
# so the first line of the file is treated as data rather than as a header row.
samples = pd.read_csv(sample_file, names = ['sample'])

In [5]:
samples.head()

Unnamed: 0,sample
0,C_scr_96_n1.all
1,C_scr_96_n2.all
2,C_scr_96_n3.all
3,C_scr_96_n4.all
4,C_siU6atac_96_n1.all


In [6]:
# First sample identifier (row 0, column 0), selected purely by position.
# A single `.iloc[0, 0]` replaces the chained `.iloc[0][0]`, whose trailing `[0]` was an
# integer key on a label-indexed Series (a deprecated positional fallback in recent pandas).
sample = samples.iloc[0, 0]

# Preprocess the annotation 
### Input: Annotation

In [7]:
ann_path = '/cluster/work/grlab/projects/projects2022-PCa_Immuno/annotation/gencode.v32.annotation.gtf'

In [8]:
from immunopepper.preprocess import attribute_item_to_dict
from immunopepper.preprocess import leq_strand

In [9]:
# Partial copy From Immunopepper preprocess.py 
# Parses the annotation file once and fills the lookup tables below.
transcript_to_gene_dict = {}    # transcript -> gene id

gene_to_transcript_dict = {}    # gene_id -> list of transcripts
gene_cds_begin_dict = {}        # gene -> list of first CDS exons

transcript_to_cds_dict = {}     # transcript -> list of CDS exons
transcript_cds_begin_dict = {}  # transcript -> first exon of the CDS
transcript_to_strand = {}       # transcript -> strand symbol taken from its CDS records

file_type = ann_path.split('.')[-1]
chromesome_set = set()
# collect information from annotation file
# (the `with` block guarantees the handle is closed, even if a line fails to parse)
with open(ann_path, 'r') as ann_handle:
    for line in ann_handle:
        if line[0] == '#':
            continue
        item = line.strip().split('\t')
        chromesome_set.add(item[0])
        feature_type = item[2]
        attribute_item = item[-1]
        attribute_dict = attribute_item_to_dict(attribute_item, file_type, feature_type)
        # store relationship between gene ID and its transcript IDs
        if feature_type in ['transcript', 'mRNA']:
            gene_id = attribute_dict['gene_id']
            transcript_id = attribute_dict['transcript_id']
            # only protein-coding transcripts of protein-coding genes are kept
            if attribute_dict['gene_type'] != 'protein_coding' or attribute_dict['transcript_type']  != 'protein_coding':
                continue
            assert (transcript_id not in transcript_to_gene_dict)
            transcript_to_gene_dict[transcript_id] = gene_id
            # append to the gene's list without ever resetting an existing list
            transcripts_of_gene = gene_to_transcript_dict.setdefault(gene_id, [])
            if transcript_id not in transcripts_of_gene:
                transcripts_of_gene.append(transcript_id)
            # Todo python is 0-based while gene annotation file(.gtf, .vcf, .maf) is one based
        elif feature_type == "CDS":
            parent_ts = attribute_dict['transcript_id']
            strand_mode = item[6]
            cds_left = int(item[3])-1   # convert the 1-based start to a 0-based one
            cds_right = int(item[4])
            frameshift = int(item[7])
            transcript_to_strand[parent_ts] = strand_mode
            transcript_to_cds_dict.setdefault(parent_ts, []).append((cds_left, cds_right, frameshift))
            # express the exon in reading direction: start/stop are swapped off the '+' strand
            if strand_mode == "+" :
                cds_start, cds_stop = cds_left, cds_right
            else:
                cds_start, cds_stop = cds_right, cds_left

            # we only consider the start of the whole CoDing Segment
            if parent_ts not in transcript_cds_begin_dict or \
               leq_strand(cds_start, transcript_cds_begin_dict[parent_ts][0], strand_mode):
                transcript_cds_begin_dict[parent_ts] = (cds_start, cds_stop, item)

# collect first CDS exons for all transcripts of a gene
for ts_key, target_gene in transcript_to_gene_dict.items():
    first_cds_exons = gene_cds_begin_dict.setdefault(target_gene, [])
    if ts_key in transcript_cds_begin_dict:
        first_cds_exons.append(transcript_cds_begin_dict[ts_key])


In [10]:
# Custom collection of CDS
# Per transcript: the first and the last CDS record as (start, stop, strand) in reading
# direction; per gene: the lists of those records over all its retained transcripts.
transcript_cds_begin_dict_bis = {}
transcript_cds_end_dict_bis = {}

gene_cds_begin_dict_bis = defaultdict(list)
gene_cds_end_dict_bis = defaultdict(list)

# the CDS records are taken in the order in which they were collected (reading order)
for ts_key, cds_exons in transcript_to_cds_dict.items():
    first_cds, last_cds = cds_exons[0], cds_exons[-1]
    if transcript_to_strand[ts_key] == '+':
        transcript_cds_begin_dict_bis[ts_key] = (first_cds[0], first_cds[1], '+')
        transcript_cds_end_dict_bis[ts_key] = (last_cds[0], last_cds[1], '+')
    else:
        # any other strand: swap left/right so that the tuple reads start -> stop
        transcript_cds_begin_dict_bis[ts_key] = (first_cds[1], first_cds[0], '-')
        transcript_cds_end_dict_bis[ts_key] = (last_cds[1], last_cds[0], '-')

    # sanity check against the table built by the ImmunoPepper-derived parsing cell
    begin_bis = transcript_cds_begin_dict_bis[ts_key]
    begin_ref = transcript_cds_begin_dict[ts_key]
    assert(begin_bis[0] == begin_ref[0])
    assert(begin_bis[1] == begin_ref[1])

# collect first, last CDS exons for all transcripts of a gene
for ts_key, target_gene in transcript_to_gene_dict.items():
    gene_cds_begin_dict_bis[target_gene].append(transcript_cds_begin_dict_bis[ts_key])
    gene_cds_end_dict_bis[target_gene].append(transcript_cds_end_dict_bis[ts_key])

# Format the peptide raw file to fasta 


In [11]:
# Column names of the peptide metadata table (a plain list, despite the `_dict` suffix).
# NOTE(review): 'readFrameAnnotated' and 'junctionExpr', both read by
# extract_peptide_fields_pq, are not listed here, and this list is not referenced anywhere
# else in the visible notebook -- confirm whether it is still needed / up to date.
fields_meta_peptide_dict = ['peptide', 'id', 'readFrame', 'geneName', 'geneChr', 'geneStrand',
                                'mutationMode',
                                'junctionAnnotated', 'hasStopCodon', 'isInJunctionList',
                                'isIsolated', 'variantComb', 'variantSegExpr', 'modifiedExonsCoord',
                                'originalExonsCoord', 'vertexIdx',
                                'kmerType']


In [12]:
def extract_peptide_fields_pq(pep):
    '''Pull the fields used downstream out of one peptide metadata record.

    Expected column order of the metadata table:
    0 peptide, 1 id, 2 readFrame, 3 readFrameAnnotated, 4 geneName, 5 geneChr,
    6 geneStrand, 7 mutationMode, 8 junctionAnnotated, 9 hasStopCodon,
    10 isInJunctionList, 11 isIsolated, 12 variantComb, 13 variantSegExpr,
    14 modifiedExonsCoord, 15 originalExonsCoord, 16 vertexIdx,
    junctionExpr and segmentExpr (sample mode only), 17 kmerType.

    Args:
        pep: mapping-like record indexable by column name.

    Returns:
        A 12-tuple: the record itself, peptide sequence, gene name, the original and the
        modified exon coordinates (each split on '/'), strand, stop-codon flag,
        readFrameAnnotated, junctionAnnotated, the k-mer type with '-' and newlines
        removed, the isolated flag and the junction expression string.
    '''
    def _alternatives(column):
        # several coordinate sets may be packed into one cell, separated by '/'
        return pep[column].split('/')

    def _clean(label):
        for unwanted in ('-', '\n'):
            label = label.replace(unwanted, '')
        return label

    return (
        pep,
        pep['peptide'],
        pep['geneName'],
        _alternatives('originalExonsCoord'),
        _alternatives('modifiedExonsCoord'),
        pep['geneStrand'],
        pep['hasStopCodon'],
        pep['readFrameAnnotated'],
        pep['junctionAnnotated'],
        _clean(pep['kmerType']),
        pep['isIsolated'],
        pep['junctionExpr'],
    )

In [13]:
def extract_end_starts(pep_orig_coord, gene_strand=None):
    '''Get the peptide start and end coordinates from one set of exon coordinates.

    Args:
        pep_orig_coord: sequence of coordinates (strings or numbers) for the exons of
            the peptide, two entries per exon.
        gene_strand: strand symbol of the gene. When omitted, the notebook-level
            variable `strand` is used, as before.

    Returns:
        (pep_start, pep_end) as Python ints. On the '+' strand these are the first and
        the last entry; otherwise the second and the second-to-last entry.
    '''
    if gene_strand is None:
        gene_strand = strand  # legacy behaviour: read the notebook-level variable
    if gene_strand == '+':
        first, last = pep_orig_coord[0], pep_orig_coord[-1]
    else:
        first, last = pep_orig_coord[1], pep_orig_coord[-2]
    # builtin int replaces `np.int`, an alias of it that newer NumPy releases removed
    return int(first), int(last)

In [14]:
def get_include_flag(start_cds, end_cds, pep_start, pep_end, has_stop_codon ):
    '''Use end and start coordinates for 3' 5' include flag.

    Args:
        start_cds: collection of annotated CDS start coordinates of the gene.
        end_cds: collection of annotated CDS end coordinates of the gene.
        pep_start: start coordinate of the peptide.
        pep_end: end coordinate of the peptide.
        has_stop_codon: stop-codon flag of the peptide; both the string '1' and the
            numeric/boolean value 1 count as "set" (previously only the string did, so
            a flag stored as a number was silently ignored).

    Returns:
        (pep_5include, pep_3include), each 0 or 1.
    '''
    stop_codon_set = has_stop_codon in (1, '1')
    # We will always miss things that are new in the graph
    pep_5include = 1 if pep_start in start_cds else 0
    pep_3include = 1 if (pep_end in end_cds) or stop_codon_set else 0
    return pep_5include, pep_3include

In [15]:
def get_nt_len_with_aa_shift(pep_modi_coord):
    '''Compute, per exon, the nucleotide length rounded down to whole codons.

    Coordinates come in (start, stop) pairs, 0-based with an open right end. The
    nucleotides left over after the last complete codon of an exon are carried into
    the next exon before that one is rounded down.

    Returns:
        (jx_list, shift_list, jx_list_ori): codon-aligned lengths, the leftover
        nucleotide counts (0-2), and the raw exon lengths.
    '''
    carried = 0
    aligned_lengths, leftovers, raw_lengths = [], [], []
    for left in range(0, len(pep_modi_coord), 2):
        span = int(pep_modi_coord[left + 1]) - int(pep_modi_coord[left])
        raw_lengths.append(span)
        whole_codons, carried = divmod(span + carried, 3)
        aligned_lengths.append(whole_codons * 3)
        leftovers.append(carried)

    return aligned_lengths, leftovers, raw_lengths

In [16]:
def get_aaPos_betweenFlag(shift_list, jx_list):
    '''Locate the junction(s) on the peptide, in amino-acid units.

    The junction coordinate is the 0-based position in the peptide of the amino acid
    that either overlaps the junction (junction in the middle of a codon), or sits
    immediately before it (junction between two codons).

    Returns:
        (aa_junction_pos0, between_codons0, aa_junction_pos1, between_codons1,
        aa_junction_pos1_from_start). The last three are None unless `shift_list` has
        more than two entries (third exon present). `aa_junction_pos1` is derived from
        the second entry of `jx_list` alone; the `_from_start` value adds the first
        junction position plus one.
    '''
    def _junction_after(exon_idx):
        # a non-zero leftover means the junction falls inside an amino acid
        if shift_list[exon_idx]:
            return int((jx_list[exon_idx] / 3)), 0
        # junction between amino acids: step back one because positions are 0-based
        return int((jx_list[exon_idx] / 3) - 1), 1

    aa_junction_pos0, between_codons0 = _junction_after(0)

    if len(shift_list) <= 2:
        return aa_junction_pos0, between_codons0, None, None, None

    # third exon: a second junction exists
    aa_junction_pos1, between_codons1 = _junction_after(1)
    return (aa_junction_pos0, between_codons0, aa_junction_pos1, between_codons1,
            aa_junction_pos1 + aa_junction_pos0 + 1)

In [17]:
def get_genomic_coordinates(pep_modi_coord, gene_strand=None):
    '''Build the genomic junction identifiers '<coordinate>_<coordinate>'.

    We have in + case: exon1_start, exon1_stop, exon2_start, exon2_stop, exon3_start, exon3_stop
    In the - case: exon1_stop, exon1_start, exon2_stop, exon2_start, exon3_stop, exon3_start

    Args:
        pep_modi_coord: exon coordinates of the peptide (at least two exons).
        gene_strand: strand symbol of the gene. When omitted, the notebook-level
            variable `strand` is used, as before.

    Returns:
        (genome_junction_pos0, genome_junction_pos1); the second one is None unless
        more than four coordinates are given.
    '''
    if gene_strand is None:
        gene_strand = strand  # legacy behaviour: read the notebook-level variable

    # index pairs of the two coordinates flanking each junction
    if gene_strand == '+':
        first_jx, second_jx = (1, 2), (3, 4)
    else:
        first_jx, second_jx = (0, 3), (2, 5)

    genome_junction_pos0 = '{}_{}'.format(pep_modi_coord[first_jx[0]],
                                          pep_modi_coord[first_jx[1]])
    genome_junction_pos1 = None
    if len(pep_modi_coord) > 4:
        genome_junction_pos1 = '{}_{}'.format(pep_modi_coord[second_jx[0]],
                                              pep_modi_coord[second_jx[1]])
    return genome_junction_pos0, genome_junction_pos1

In [18]:
def split_coord(pep_coord):
    '''Split a ';'-separated coordinate string, dropping 'None' / 'nan' placeholders.'''
    placeholders = ('None', 'nan')
    return [token for token in pep_coord.split(';') if token not in placeholders]

In [19]:
def preprocess_line(line):
    '''Split a raw text line into its k-mer and the peptide records that follow it.

    NOTE(review): the unconditional ``assert(False)`` below makes every call raise
    AssertionError (unless assertions are disabled), so the parsing code after it is
    effectively dead -- presumably a leftover from a text-based input format; confirm
    and delete if it is no longer needed.

    Returns:
        (line, kmer, peptides): the line with record separators inserted, its first
        comma-separated field, and the remainder split on '@'.
    '''
    assert(False)
    # turn the space that follows each k-mer type tag into an '@' record separator
    line = line.replace('3-exons_9-mer ', '3-exons_9-mer@').replace('2-exons ', '2-exons@')
    # the k-mer is the first comma-separated field ...
    kmer = line.split(',')[0]
    # ... everything after it is re-joined and split into records on '@'
    peptides = ','.join(line.split(',')[1:])
    peptides = peptides.split('@')
    return line, kmer, peptides

In [20]:
def write_peptide_to_experiment(filepointer, pep_idx=None, pep_seq=None,\
                                idx=None, header=False):
    '''Write one tab-separated line of the peptide-to-experiment table.

    With `header` set, only the column header line is written. Otherwise a
    (peptide id, sequence, experiment ids) row is written, provided `pep_idx` is given;
    without a `pep_idx` nothing is written.
    '''
    if header:
        filepointer.write('peptide_id\tpeptide_sequence\texperiment_ids\n')
        return
    if pep_idx is None:
        return
    filepointer.write(f'{pep_idx}\t{pep_seq}\t{idx}\n')


In [21]:
def write_fasta_option(filepointer, pep_idx, aa_junction_pos, 
                aa_junction_pos1_from_start, between_codons, between_codons1,
                pep_5include, pep_3include, pep_gene, 
                genome_junction_pos, genome_junction_pos1, 
                kmer, jx_pep1, jx_pep2, readFrameAnnotated, \
                junctionAnnotated, kmer_type, do_write=True, peptide_seq=None):
    '''Write the FASTA record(s) of one peptide, one record per junction.

    The first junction always yields a record. A second record is produced when a
    second junction exists (`aa_junction_pos1_from_start` is not None) and either no
    k-mer is given or the k-mer lies in `jx_pep2`.

    Args:
        filepointer: writable text handle receiving the records. Previously ignored in
            favour of the notebook-level handle `sp`; the argument is now the handle
            that is written to.
        pep_idx: running peptide counter on entry. Both records of a peptide are
            labelled with this incoming value.
        do_write: when false, nothing is written but the counter is advanced exactly
            as if the records had been written (the flag used to be ignored).
        peptide_seq: amino-acid sequence written under each header. When omitted, the
            notebook-level variable `pep_seq` is used, as before.
        (remaining arguments are copied verbatim into the FASTA header)

    Returns:
        The counter advanced by the number of records produced (1 or 2).

    Raises:
        AssertionError: if a k-mer is given that is not contained in `jx_pep1`.
    '''
    if (kmer is not None):
        assert(kmer in jx_pep1)

    header_template = ('>pepID-{};jx_pos-{};between_codons-{};includes_5\'-{};includes_3\'-{};'
                       'gene-{};jx_coord-{};kmer-{};readFrameAnnotated-{};junctionAnnotated-{};origin-{}')

    def _record_header(jx_pos, is_between, jx_coord):
        # only the junction-specific fields differ between the two records
        return header_template.format(pep_idx, jx_pos, is_between, pep_5include,
                                      pep_3include, pep_gene, jx_coord, kmer,
                                      readFrameAnnotated, junctionAnnotated, kmer_type)

    headers = [_record_header(aa_junction_pos, between_codons, genome_junction_pos)]
    if aa_junction_pos1_from_start is not None:
        if (kmer is None) or (kmer in jx_pep2):
            headers.append(_record_header(aa_junction_pos1_from_start, between_codons1,
                                          genome_junction_pos1))

    if do_write:
        if peptide_seq is None:
            peptide_seq = pep_seq  # legacy behaviour: read the notebook-level variable
        for header in headers:
            filepointer.write(header + '\n')
            filepointer.write(peptide_seq + '\n')

    return pep_idx + len(headers)



In [22]:
def cut_peptides(jx_list, between_codons, between_codons1, aa_junction_pos, 
                 aa_junction_pos1, aa_junction_pos1_from_start, 
                 print_ = False, peptide_seq=None):
    '''Cut the peptide into the two sub-peptides that each span one junction.

    The sequence is split into exon1 / junction amino acid 1 / exon2 / junction amino
    acid 2 / exon3. A junction amino acid is only non-empty when the junction falls
    inside a codon (the matching `between_codons*` flag is false).

    Args:
        jx_list: per-exon lengths; only its length (2 or 3 exons) is used.
        between_codons, between_codons1: truthy when junction 1 / 2 lies between codons.
        aa_junction_pos: 0-based amino-acid position of the first junction.
        aa_junction_pos1: position of the second junction as produced by
            get_aaPos_betweenFlag (3-exon case only).
        aa_junction_pos1_from_start: second junction position counted from the peptide
            start; must equal aa_junction_pos1 + aa_junction_pos + 1.
        print_: print the individual pieces for debugging.
        peptide_seq: amino-acid sequence to cut. When omitted, the notebook-level
            variable `pep_seq` is used, as before.

    Returns:
        (exon1 + jx1 + exon2, exon2 + jx2 + exon3). Pieces that do not exist are empty
        strings, so any other exon count yields ('', '').

    Raises:
        AssertionError: in the 3-exon case, if the two second-junction positions disagree.
    '''
    exon1, aa_jx1, exon2, aa_jx2, exon3 = '', '', '', '', ''
    n_exons = len(jx_list)
    after_jx1 = aa_junction_pos + 1

    if n_exons in (2, 3):
        if peptide_seq is None:
            peptide_seq = pep_seq  # legacy behaviour: read the notebook-level variable
        # a junction between codons leaves no junction amino acid: exon1 runs up to it
        exon1_end = after_jx1 if between_codons else aa_junction_pos
        exon1 = peptide_seq[:exon1_end]
        aa_jx1 = peptide_seq[exon1_end:after_jx1]

    if n_exons == 2:
        exon2 = peptide_seq[after_jx1:]
    elif n_exons == 3:
        jx2_pos = aa_junction_pos1 + aa_junction_pos + 1
        after_jx2 = jx2_pos + 1
        assert(aa_junction_pos1_from_start == jx2_pos)
        exon2_end = after_jx2 if between_codons1 else jx2_pos
        exon2 = peptide_seq[after_jx1:exon2_end]
        aa_jx2 = peptide_seq[exon2_end:after_jx2]
        exon3 = peptide_seq[after_jx2:]

    if print_:
        print(f'exon1:{exon1}, aa_containing_jx1:{aa_jx1}, exon2:{exon2}, aa_containing_jx2:{aa_jx2}, exon3:{exon3}')
        print(f'junction positions jx1: {aa_junction_pos}, jx2:{aa_junction_pos1_from_start}')
        print(f'is junction between a codon jx1: {between_codons}, jx2: {between_codons1}')
        print('\n')
    return exon1 + aa_jx1 + exon2, exon2 + aa_jx2 + exon3 

In [23]:
def print_stats(print_, kmer, pep_seq, strand, pep_orig_coord, pep_modi_coord, jx_list,
               jx_list_ori, genome_junction_pos, genome_junction_pos1,
               aa_junction_pos, aa_junction_pos1, between_codons, between_codons1):
    '''Print a debugging summary of one peptide instance when `print_` is truthy.

    The amino-acid positions and between-codon flags are accepted but not printed.
    '''
    if not print_:
        return

    original_joined = ';'.join(pep_orig_coord)
    modified_joined = ';'.join(pep_modi_coord)
    print(f'INSTANCE: \n kmer {kmer}/ sequence {pep_seq}/ strand {strand} / \n original coordinates {original_joined} / \n modif coordinates {modified_joined} /  \n junction list origin {jx_list_ori}/ junction list {jx_list} / \n junction coordinates 1 {genome_junction_pos} / junction coordinates 2 {genome_junction_pos1}')
    print('peptide length', len(pep_seq))


In [24]:
def update_expression_dict(genome_junction_pos, expr_val, n_samples, id_sample, pep_seq):
    '''Record the expression of one junction in one sample, plus the peptide showing it.

    Updates the notebook-level tables `jx_to_expression` (junction -> array with one
    slot per sample, zero-initialised) and `jx_to_peptides` (junction -> peptides).

    Args:
        genome_junction_pos: junction identifier used as the key in both tables.
        expr_val: expression value (number or numeric string).
        n_samples: number of slots of a newly created expression array.
        id_sample: index of the current sample.
        pep_seq: peptide sequence to append to the junction's peptide list.

    Raises:
        AssertionError: if a different non-zero value is already stored for this
            junction in this sample.
    '''
    expr_val = float(expr_val)
    per_sample = jx_to_expression.setdefault(genome_junction_pos, np.zeros(n_samples))
    recorded = per_sample[id_sample]
    # Check that each junction position in one sample has one expression value.
    # A stored 0 is the placeholder of a slot that has not been filled yet: a junction
    # first seen in another sample must not fail this check. The price is that a
    # genuine 0.0 later followed by a non-zero value in the same sample goes unnoticed.
    if recorded != 0:
        assert recorded == expr_val, (
            'junction {} already has expression {} in sample {}, got {}'.format(
                genome_junction_pos, recorded, id_sample, expr_val))
    # Update the expression
    per_sample[id_sample] = expr_val
    # Update the peptides
    jx_to_peptides[genome_junction_pos].append(pep_seq)

In [45]:
def expression_dict(pep_seq, id_sample, n_samples, junctionExpr,
                    genome_junction_pos, genome_junction_pos1,
                    id_modi_coord, pep_orig_coord):
    '''Parse the junction expression string and record the value(s) for this peptide.

    junctionExpr: string, for example '(1,1)/(2,0)'.
    - Each tuple is one expression-value instance.
    - The first entry of a tuple is the expression of the first junction and the
      second entry is the expression of the second junction.

    The instance at index `id_modi_coord` is used, clamped to the last available one.
    Its first value is recorded for `genome_junction_pos` and, when a second junction
    is given, its second value for `genome_junction_pos1` (both through
    update_expression_dict). Debug information is printed on every call; those prints
    read the notebook-level record `pep`.
    '''
    # strip tuple punctuation and blanks, then split into instances and values
    cleaned = junctionExpr
    for token in (',)', '(', ')', ' '):
        cleaned = cleaned.replace(token, '')
    expression = [instance.split(',') for instance in cleaned.split('/')]

    # Edge case: some junctions have the same expression values
    # because they have the same position inside the same peptide!
    last_instance = len(expression) - 1
    id_modi_coord = min(last_instance, id_modi_coord)
    expr_val = expression[id_modi_coord]

    # first junction (present in the 2-exon and the 3-exon case)
    update_expression_dict(genome_junction_pos, expr_val[0],
                           n_samples, id_sample, pep_seq)
    # second junction (3-exon case only)
    if genome_junction_pos1:
        update_expression_dict(genome_junction_pos1, expr_val[1],
                               n_samples, id_sample, pep_seq)

    # TEST prints
    print(junctionExpr)
    print(pep['peptide'])
    print('strand', pep['geneStrand'])
    print('modif', pep['modifiedExonsCoord'], 'orig', pep_orig_coord)
    print(expression)
    print(id_modi_coord, 'id of coordinate')
    print(expr_val, 'expression value')
    print('\n')

In [46]:
## Format and create flags 
# Running peptide counter: passed to and returned by write_fasta_option in the loop below.
pep_idx = 0 
# NOTE(review): not referenced anywhere else in the visible notebook -- confirm it is still needed.
kmer_len = 9 
# Path of the optional peptide-to-experiment table; None disables that output.
file_save_experiement = None 

# Verbose per-peptide debugging output (forwarded to print_stats and cut_peptides).
print_ = False
# Peptide (sub)sequence used by the `subset_run_test` filters of the main loop.
test_query = 'MRGQRSLLLGPARLCLRLLLLLGYRRRCPPLLRGLVQRWRYGKVCLRSLLYNSFGGSDTAVDAAFEPVYWLVDNVIRWFGVVSDVQGAGKGVLWGAEGHVSHRPNPGFGGRPPPPGKLRVSFGHSSCAMSP'
# Switches the main loop into the test mode that filters on `test_query`.
subset_run_test = False
# Forwarded as `do_write` to write_fasta_option; also gates the "written to:" summary.
write_= False


In [47]:
# Per-sample driver: reads the peptide metadata of every sample, writes one FASTA file per
# sample and accumulates the junction expression / junction peptide tables.
# The names `pep`, `pep_seq`, `strand` and `sp` are deliberately kept at notebook level:
# several helper functions defined above read them as globals.
jx_to_expression = dict()            # junction -> per-sample expression array
jx_to_peptides = defaultdict(list)   # junction -> peptides containing it
n_samples = len(samples['sample'])

for id_sample, sample in enumerate(samples['sample']):
    print(sample)

    file_meta = os.path.join(experiment_folder,
                             f'{sample}_mutNone/ref_sample_peptides_meta.mig_filtered.pq')

    file_save = os.path.join(experiment_folder,
                             f'{sample}_mutNone/ref_sample_peptides_meta.mig_filtered.fasta')

    ep = None  # optional experiment file; stays None when no path is configured
    try:
        if file_save_experiement is not None: 
            # NOTE(review): opened with 'w' for every sample, so each sample overwrites
            # the table of the previous one -- confirm that this is intended.
            ep = open(file_save_experiement, 'w') # Experiment file to save
            write_peptide_to_experiment(ep, header=True)
        with open(file_save, 'w') as sp: # Fasta file to save
            fp = pd.read_parquet(file_meta) # metadata from immunopepper as parquet
            print("open {}".format(file_meta))
            ### Iterate over peptides 
            for j, line in fp.iterrows():
                print('new')
                peptide_junctions = set() #each line is one peptide / peptides matching a kmer
                                          #each peptide contains at most one kmer

                # --- TEST ---
                if subset_run_test and (test_query in line['peptide']):
                    print(f'\n METADATA (ImmunoPepper derived): \n {line}')
                    break
                # --- END TEST ---

                pep, pep_seq, pep_gene, pep_orig_coordS, pep_modif_coordS, \
                strand, has_stop_codon, readFrameAnnotated, \
                junctionAnnotated, kmer_type, is_isolated, \
                junctionExpr = extract_peptide_fields_pq(line)

                # some peptides do not contain a junction
                if int(is_isolated):
                    continue 

                kmer = None

                # CDS boundaries of the gene: identical for every coordinate set of the
                # peptide, so they are built once per peptide
                start_cds = [ first_exon[0] for first_exon in 
                             gene_cds_begin_dict_bis[pep_gene] ] 
                end_cds = [ last_exon[1] for last_exon in 
                           gene_cds_end_dict_bis[pep_gene] ] 

                # Iterate over original genomic coordinates : get include 3' or 5' flag
                # (the flags of the last coordinate set are the ones used below)
                for pep_orig_coord in pep_orig_coordS:
                    pep_orig_coord = split_coord(pep_orig_coord)
                    pep_start, pep_end = extract_end_starts(pep_orig_coord)
                    pep_5include, pep_3include = get_include_flag(start_cds, end_cds, 
                                                                  pep_start, pep_end, 
                                                                  has_stop_codon )

                # Iterate over modified genomic coordinates: get 
                # "junction position" flag and "between codon" flag
                #"and genomic coordinates"
                for id_modi_coord, pep_modi_coord in enumerate(pep_modif_coordS):
                    pep_modi_coord = split_coord(pep_modi_coord)
                    if len(pep_modi_coord) <=2:
                        continue
                    jx_list, shift_list, \
                    jx_list_ori = get_nt_len_with_aa_shift(pep_modi_coord)

                    aa_junction_pos, between_codons, \
                    aa_junction_pos1, between_codons1, \
                    aa_junction_pos1_from_start = get_aaPos_betweenFlag(shift_list,
                                                                              jx_list)

                    genome_junction_pos, \
                    genome_junction_pos1 = get_genomic_coordinates(pep_modi_coord)

                    # --- TEST ---
                    if subset_run_test and (test_query not in pep_seq): 
                        continue
                    # --- END TEST ---

                    print_stats(print_, kmer, pep_seq, strand, pep_orig_coord, 
                                pep_modi_coord, jx_list, jx_list_ori, genome_junction_pos,
                                genome_junction_pos1, aa_junction_pos, aa_junction_pos1, 
                                between_codons, between_codons1)

                    jx_pep1, jx_pep2 = cut_peptides(jx_list, between_codons,
                                                    between_codons1, aa_junction_pos, 
                                                    aa_junction_pos1, 
                                                    aa_junction_pos1_from_start, 
                                                    print_=print_)

                    # skip duplicates
                    if (genome_junction_pos in peptide_junctions) and \
                       (genome_junction_pos1 in peptide_junctions):
                        continue
                    else:
                        peptide_junctions.add(genome_junction_pos)
                        peptide_junctions.add(genome_junction_pos1)
                    print('aa position', aa_junction_pos, aa_junction_pos1_from_start)
                    print('genome position', genome_junction_pos, genome_junction_pos1 )
                    # write fasta file 
                    pep_idx = write_fasta_option(sp, pep_idx, aa_junction_pos,
                                aa_junction_pos1_from_start, between_codons, 
                                between_codons1, pep_5include, pep_3include, pep_gene, 
                                genome_junction_pos, genome_junction_pos1, 
                                kmer, jx_pep1, jx_pep2, 
                                readFrameAnnotated, \
                                junctionAnnotated, \
                                kmer_type, \
                                do_write=write_)
                    if file_save_experiement is not None: 
                        # The stray fifth positional argument (`peptide_junctions`) was
                        # dropped: it was bound to `header` and made every call write the
                        # header line instead of the peptide row.
                        # NOTE(review): `kmer_file_idx` is not defined in the visible
                        # notebook and `kmer` is always None here, so this branch cannot
                        # run as is -- confirm before enabling the experiment file.
                        write_peptide_to_experiment(ep, pep_idx, pep_seq,';'.join(
                            kmer_file_idx[kmer])) 

                    expression_dict(pep_seq, id_sample, n_samples, 
                                    junctionExpr, genome_junction_pos, 
                                    genome_junction_pos1, id_modi_coord, 
                                    pep_orig_coord)
    finally:
        # always release the experiment file, whatever `write_` says and even on errors
        if ep is not None:
            ep.close()
    if write_:
        print('written to:')
        print(file_save_experiement)
        print(file_save)


C_scr_96_n1.all
open /cluster/work/grlab/projects/projects2022-PCa_Immuno/peptides_generation/v3_37b9a8f_conf2_annotFrame_cap0_runs/C_scr_96_n1.all_mutNone/ref_sample_peptides_meta.mig_filtered.pq
new
aa position 34 None
genome position 60536859_60014152 None
(17.0,)
MSFRFGQHLIKPSVVFLKTELSFALVNRKPVVPGHVLVCPLRPVERFHDLRPDEVADLFQTTQRVGTVVEKHFHGTSLTFSMQ
strand -
modif 60536859;60536962;60014006;60014152;None;None orig ['60536859', '60536979', '60014006', '60014152']
[['17.0']]
0 id of coordinate
['17.0'] expression value


new
aa position 34 None
genome position 60536859_60014152 None
(17.0,)
MSFRFGQHLIKPSVVFLKTELSFALVNRKPVVPGHVLVCPLRPVERFHDLRPDEVADLFQTTQRVGTVVEKHFHGTSL
strand -
modif 60536859;60536962;60014021;60014152;None;None orig ['60536859', '60536979', '60014019', '60014152']
[['17.0']]
0 id of coordinate
['17.0'] expression value


new
new
aa position 30 None
genome position 19608272_19608973 None
(0.0,)
HFSPENANDTAKETCLNWFFKIASIRELIPRLRGGIHPEM
strand +
modif 19608179;19608272;1960

AssertionError: 

In [49]:
154618949 - 154585206

33743

In [41]:
'MQAEGRGTGGSTGDADGPGGPGIPDGPGGNAGGPGEAGATGGRGPRGAGAARASGPGGGAPRGPHGGAASGLNGCCRCGARGPESRLLDYLAMPFATPMEAELARRSLAQDAPPLPVPGVLLK'\
== 'MQAEGRGTGGSTGDADGPGGPGIPDGPGGNAGGPGEAGATGGRGPRGAGAARASGPGGGAPRGPHGGAASGLNGCCRCGARGPESRLLEFYLAMPFATPMEAELARRSLAQDAPPLPVPGVLLKEFTVSGNILT'

False

In [None]:
# NOTE(review): scratch cell that was never executed (no execution count). The first
# argument of `defaultdict` must be a callable default factory or None, so passing a list
# raises TypeError -- candidate for deletion.
defaultdict([1,1,1])

In [71]:
foo = dict()

In [76]:
foo.setdefault(123, np.zeros(32))[5] = 22

In [74]:
foo.setdefault(123, np.zeros(32))[10] = 8

In [77]:
foo

{123: array([ 0.,  0.,  0.,  0.,  0., 22.,  0.,  0.,  0.,  0.,  8.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.])}

In [59]:
if genome_junction_pos1: 
    print('t')

In [46]:
154618949 - 154619218

-269

In [47]:
154618104 - 154618273

-169

In [48]:
154585206 - 154585475

-269

In [49]:
154586151 - 154586320

-169

In [27]:
test = '(1,1)/(2,0)'

In [28]:
# '(1,1)/(2,0)': each tuple represents one expression-value instance.
# - The first entry of a tuple is the expression value of the first junction and
# - the second entry is the expression value of the second junction.
# The punctuation is stripped, the string is split into instances on '/' and into values
# on ',', and the result is converted to a float array (one row per instance).
np.array([pair.split(',') for pair in 
 test.replace(',)', '').replace('(', '').replace(')', '').replace(' ', '').split('/')],
         dtype = float)

array([[1., 1.],
       [2., 0.]])