In [1]:
import os 
import pandas as pd
from collections import defaultdict
import numpy as np 
import glob
from helpers_minor_intron import * 

This notebook performs the following task:

* Given a kmer_metadata file, generate a FASTA file with the metadata relevant for tryptic digestion.


In [2]:
# Root folder of the peptide-generation run. The per-sample `<sample>_mutNone/` inputs/outputs
# and the final expression table are all resolved relative to this folder.
experiment_folder = '/cluster/work/grlab/projects/projects2022-PCa_Immuno/peptides_generation/v3_84dc237_conf2_annotFrame_cap0_runs'

In [3]:
# File listing the samples to process; it is read with pandas into a single 'sample' column.
sample_file = '/cluster/work/grlab/projects/projects2022-PCa_Immuno/samples_list/PCa_samples_20220616'

In [4]:
# Header-less sample list: every line of the file becomes one row of the 'sample' column.
samples = pd.read_csv(sample_file, header=None, names=['sample'])

In [5]:
# Preview the first rows of the sample table.
samples.head(n=5)

Unnamed: 0,sample
0,C_scr_96_n1.all
1,C_scr_96_n2.all
2,C_scr_96_n3.all
3,C_scr_96_n4.all
4,C_siU6atac_96_n1.all


In [6]:
# First sample name. Purely positional (row 0, column 0) lookup: the previous
# `samples.iloc[0][0]` indexed a label-indexed Series with an integer key, which
# pandas 2.x deprecates (integer keys will be treated as labels).
sample = samples.iloc[0, 0]

# Preprocess the annotation 
### Input: Annotation

In [7]:
# Path to the gene annotation (GTF) that is parsed below to collect the CDS exons per transcript.
ann_path = '/cluster/work/grlab/projects/projects2022-PCa_Immuno/annotation/gencode.v32.annotation.gtf'

In [8]:
from immunopepper.preprocess import attribute_item_to_dict
from immunopepper.preprocess import leq_strand

In [9]:
# Partial copy from Immunopepper preprocess.py
# Parses the annotation file once and fills the gene / transcript / CDS lookup tables below.
transcript_to_gene_dict = {}    # transcript -> gene id
gene_to_transcript_dict = {}    # gene_id -> list of transcripts
gene_cds_begin_dict = {}        # gene -> list of first CDS exons

transcript_to_cds_dict = {}     # transcript -> list of CDS exons, in file order
transcript_cds_begin_dict = {}  # transcript -> first exon of the CDS
transcript_to_strand = {}       # transcript -> strand taken from its CDS records

file_type = ann_path.split('.')[-1]
chromesome_set = set()
# collect information from annotation file
# (`with` makes sure the file handle is closed, also when parsing fails half-way)
with open(ann_path, 'r') as ann_file:
    for line in ann_file:
        # skip comment lines and blank lines (a blank line has no feature column)
        if line[0] == '#' or not line.strip():
            continue
        item = line.strip().split('\t')
        chromesome_set.add(item[0])
        feature_type = item[2]
        attribute_item = item[-1]
        attribute_dict = attribute_item_to_dict(attribute_item, file_type, feature_type)
        # store relationship between gene ID and its transcript IDs
        if feature_type in ['transcript', 'mRNA']:
            gene_id = attribute_dict['gene_id']
            transcript_id = attribute_dict['transcript_id']
            # only protein-coding transcripts of protein-coding genes are kept
            if attribute_dict['gene_type'] != 'protein_coding' or attribute_dict['transcript_type'] != 'protein_coding':
                continue
            assert transcript_id not in transcript_to_gene_dict
            transcript_to_gene_dict[transcript_id] = gene_id
            # append without ever resetting the list of an already known gene
            gene_transcripts = gene_to_transcript_dict.setdefault(gene_id, [])
            if transcript_id not in gene_transcripts:
                gene_transcripts.append(transcript_id)
            # Todo python is 0-based while gene annotation file(.gtf, .vcf, .maf) is one based
        elif feature_type == "CDS":
            parent_ts = attribute_dict['transcript_id']
            strand_mode = item[6]
            cds_left = int(item[3]) - 1   # start column shifted by one (1-based -> 0-based)
            cds_right = int(item[4])      # end column kept as is
            frameshift = int(item[7])
            transcript_to_strand[parent_ts] = strand_mode
            transcript_to_cds_dict.setdefault(parent_ts, []).append((cds_left, cds_right, frameshift))
            # order the pair so that the first element is the exon start on the given strand
            if strand_mode == "+":
                cds_start, cds_stop = cds_left, cds_right
            else:
                cds_start, cds_stop = cds_right, cds_left

            # we only consider the start of the whole CoDing Segment
            # NOTE(review): leq_strand presumably means "not later in transcription
            # direction" -- confirm against immunopepper.preprocess
            if parent_ts not in transcript_cds_begin_dict or \
               leq_strand(cds_start, transcript_cds_begin_dict[parent_ts][0], strand_mode):
                transcript_cds_begin_dict[parent_ts] = (cds_start, cds_stop, item)

# collect first CDS exons for all transcripts of a gene
for ts_key, target_gene in transcript_to_gene_dict.items():
    first_cds_exons = gene_cds_begin_dict.setdefault(target_gene, [])
    if ts_key in transcript_cds_begin_dict:
        first_cds_exons.append(transcript_cds_begin_dict[ts_key])


In [10]:
# Custom collection of CDS
# For every transcript, keep its first and last CDS exon as (start, stop, strand),
# where start/stop are swapped on the '-' strand.
transcript_cds_begin_dict_bis = {}   # transcript -> first CDS exon
transcript_cds_end_dict_bis = {}     # transcript -> last CDS exon

gene_cds_begin_dict_bis = defaultdict(list)   # gene -> first CDS exons of its transcripts
gene_cds_end_dict_bis = defaultdict(list)     # gene -> last CDS exons of its transcripts

# will be in reading order
# (relies on the CDS records of a transcript being listed in reading order in the
#  annotation; the assertions below check this against transcript_cds_begin_dict)
for ts_key, cds_exons in transcript_to_cds_dict.items():
    first_exon = cds_exons[0]
    last_exon = cds_exons[-1]
    if transcript_to_strand[ts_key] == '+':
        transcript_cds_begin_dict_bis[ts_key] = (first_exon[0], first_exon[1], '+')
        transcript_cds_end_dict_bis[ts_key] = (last_exon[0], last_exon[1], '+')
    else:
        transcript_cds_begin_dict_bis[ts_key] = (first_exon[1], first_exon[0], '-')
        transcript_cds_end_dict_bis[ts_key] = (last_exon[1], last_exon[0], '-')

    assert transcript_cds_begin_dict_bis[ts_key][0] == transcript_cds_begin_dict[ts_key][0], \
        f'first CDS start mismatch for {ts_key}'
    assert transcript_cds_begin_dict_bis[ts_key][1] == transcript_cds_begin_dict[ts_key][1], \
        f'first CDS stop mismatch for {ts_key}'

# collect first, last CDS exons for all transcripts of a gene
for ts_key, target_gene in transcript_to_gene_dict.items():
    # a transcript without any CDS record has no entry; skip it instead of raising
    # KeyError (same guard as for gene_cds_begin_dict)
    if ts_key not in transcript_cds_begin_dict_bis:
        continue
    gene_cds_begin_dict_bis[target_gene].append(transcript_cds_begin_dict_bis[ts_key])
    gene_cds_end_dict_bis[target_gene].append(transcript_cds_end_dict_bis[ts_key])

# Format the raw peptide file to FASTA


In [11]:
## Format and create flags 
# Switches read by the main per-sample loop.

# k-mer length. NOTE(review): not referenced in the cells visible here -- confirm it is still needed.
kmer_len = 9 
# Optional path of an extra "experiment" output file; when not None the main loop opens it
# for writing and passes it to write_peptide_to_experiment.
file_save_experiement = None 

# Forwarded to print_stats / cut_peptides (presumably a verbosity switch -- confirm in helpers).
print_ = False
# Peptide sequence looked for when subset_run_test is enabled.
test_query = 'MRGQRSLLLGPARLCLRLLLLLGYRRRCPPLLRGLVQRWRYGKVCLRSLLYNSFGGSDTAVDAAFEPVYWLVDNVIRWFGVVSDVQGAGKGVLWGAEGHVSHRPNPGFGGRPPPPGKLRVSFGHSSCAMSP'
# When True, the main loop prints the first metadata row whose peptide contains test_query
# and stops iterating over that sample's rows.
subset_run_test = False
# Forwarded as do_write to write_fasta_option_MI; also gates the "written to:" printout.
write_= False


In [12]:
# Per-sample pass over the ImmunoPepper peptide metadata: for every junction-spanning
# peptide, derive the include / junction-position / between-codon flags and hand them
# to write_fasta_option_MI.
# jx_to_expression holds four parallel lists; it is threaded through
# write_fasta_option_MI and turned into the expression table afterwards.
jx_to_expression = [[],[],[],[]]

for id_sample, sample in enumerate(samples['sample']):
    pep_idx = 0
    print(sample)
    file_meta = os.path.join(experiment_folder,
                             f'{sample}_mutNone/ref_sample_peptides_meta.mig_filtered.pq')

    file_save = os.path.join(experiment_folder,
                             f'{sample}_mutNone/ref_sample_peptides_meta.mig_filtered.fasta')

    # Read the metadata (parquet from immunopepper) before any output file is opened,
    # so that a missing or unreadable input does not leave a truncated output behind.
    fp = pd.read_parquet(file_meta)
    print("open {}".format(file_meta))

    # NOTE(review): the same file_save_experiement path is re-opened in 'w' mode for
    # every sample, so earlier samples are overwritten -- confirm this is intended.
    ep = None
    if file_save_experiement is not None:
        ep = open(file_save_experiement, 'w') # Experiment file to save
        write_peptide_to_experiment(ep, header=True)
    try:
        # NOTE(review): file_save is opened in 'w' mode (hence truncated) even when
        # write_ is False -- confirm that emptying an existing fasta is acceptable.
        with open(file_save, 'w') as sp: # Fasta file to save
            ### Iterate over peptides
            for j, line in fp.iterrows():
                peptide_junctions = set() #each line is one peptide / peptides matching a kmer
                                          #each peptide contains at most one kmer
                # --- TEST ---
                if subset_run_test and (test_query in line['peptide']):
                    print(f'\n METADATA (ImmunoPepper derived): \n {line}')
                    break
                # --- END TEST ---

                pep, pep_seq, pep_gene, pep_orig_coordS, pep_modif_coordS, \
                strand, has_stop_codon, readFrameAnnotated, \
                junctionAnnotated, kmer_type, is_isolated, \
                junctionExpr, minorIntron = extract_peptide_fields_pq(line)

                # some peptides do not contain a junction
                if int(is_isolated):
                    continue

                kmer = None

                # Genomic coordinates : get include 3' or 5' flag
                #[ iterate only needed for old (May 2020-June 2022)
                # collapsed version of peptide files with multiple coordinates
                # separated by /]
                # NOTE(review): pep_orig_coord, pep_5include and pep_3include keep the
                # values of the LAST coordinate and would be unbound (or stale from the
                # previous row) if pep_orig_coordS were empty.
                for pep_orig_coord in pep_orig_coordS:
                    pep_orig_coord = split_coord(pep_orig_coord)
                    start_cds = [ first_exon[0] for first_exon in
                                 gene_cds_begin_dict_bis[pep_gene] ]
                    end_cds = [ last_exon[1] for last_exon in
                               gene_cds_end_dict_bis[pep_gene] ]
                    pep_start, pep_end = extract_end_starts(pep_orig_coord, strand)
                    pep_5include, pep_3include = get_include_flag(start_cds, end_cds,
                                                                  pep_start, pep_end,
                                                                  has_stop_codon )

                # Modified genomic coordinates: get
                # "junction position" flag and "between codon" flag
                # and "genomic coordinates"
                #[ iterate only needed for old (May 2020-June 2022)
                # collapsed version of peptide files with multiple coordinates
                # separated by /]
                for id_modi_coord, pep_modi_coord in enumerate(pep_modif_coordS):
                    pep_modi_coord = split_coord(pep_modi_coord)
                    # two or fewer coordinates: nothing to process for this entry
                    if len(pep_modi_coord) <=2:
                        continue
                    jx_list, shift_list, \
                    jx_list_ori = get_nt_len_with_aa_shift(pep_modi_coord)

                    aa_junction_pos, between_codons, \
                    aa_junction_pos1, between_codons1, \
                    aa_junction_pos1_from_start = get_aaPos_betweenFlag(shift_list,
                                                                              jx_list)

                    genome_junction_pos, \
                    genome_junction_pos1 = get_genomic_coordinates(pep_modi_coord, strand)

                    # --- TEST ---
                    if subset_run_test and (test_query not in pep_seq):
                        continue
                    # --- END TEST ---

                    print_stats(print_, kmer, pep_seq, strand, pep_orig_coord,
                                pep_modi_coord, jx_list, jx_list_ori, genome_junction_pos,
                                genome_junction_pos1, aa_junction_pos, aa_junction_pos1,
                                between_codons, between_codons1)

                    jx_pep1, jx_pep2 = cut_peptides(pep_seq, jx_list, between_codons,
                                                    between_codons1, aa_junction_pos,
                                                    aa_junction_pos1,
                                                    aa_junction_pos1_from_start,
                                                    print_=print_)

                    # skip duplicates
                    if (genome_junction_pos in peptide_junctions) and \
                       (genome_junction_pos1 in peptide_junctions):
                        continue
                    else:
                        peptide_junctions.add(genome_junction_pos)
                        peptide_junctions.add(genome_junction_pos1)

                    # write fasta file
                    pep_idx, jx_to_expression = write_fasta_option_MI(sp, pep_seq,
                                pep_idx, aa_junction_pos,
                                aa_junction_pos1_from_start, between_codons,
                                between_codons1, pep_5include, pep_3include,
                                pep_gene, strand,
                                genome_junction_pos, genome_junction_pos1,
                                kmer, jx_pep1, jx_pep2,
                                readFrameAnnotated, \
                                junctionAnnotated, \
                                kmer_type, minorIntron, junctionExpr, sample,
                                jx_to_expression, \
                                do_write=write_)
                    if file_save_experiement is not None:
                        # NOTE(review): kmer_file_idx is not defined in any cell shown
                        # here (it may come from the star import) and kmer is always
                        # None at this point -- verify before enabling this output.
                        write_peptide_to_experiment(ep, pep_idx, pep_seq,';'.join(
                            kmer_file_idx[kmer]), peptide_junctions)

    #                 expression_dict(pep_seq, id_sample, n_samples,
    #                                 junctionExpr, genome_junction_pos,
    #                                 genome_junction_pos1, id_modi_coord,
    #                                 pep_orig_coord)
    finally:
        # always release the experiment file, independently of write_ and of errors
        if ep is not None:
            ep.close()

    if write_:
        print('written to:')
        print(file_save_experiement)
        print(file_save)


C_scr_96_n1.all
open /cluster/work/grlab/projects/projects2022-PCa_Immuno/peptides_generation/v3_84dc237_conf2_annotFrame_cap0_runs/C_scr_96_n1.all_mutNone/ref_sample_peptides_meta.mig_filtered.pq
C_scr_96_n2.all
open /cluster/work/grlab/projects/projects2022-PCa_Immuno/peptides_generation/v3_84dc237_conf2_annotFrame_cap0_runs/C_scr_96_n2.all_mutNone/ref_sample_peptides_meta.mig_filtered.pq
C_scr_96_n3.all
open /cluster/work/grlab/projects/projects2022-PCa_Immuno/peptides_generation/v3_84dc237_conf2_annotFrame_cap0_runs/C_scr_96_n3.all_mutNone/ref_sample_peptides_meta.mig_filtered.pq
C_scr_96_n4.all
open /cluster/work/grlab/projects/projects2022-PCa_Immuno/peptides_generation/v3_84dc237_conf2_annotFrame_cap0_runs/C_scr_96_n4.all_mutNone/ref_sample_peptides_meta.mig_filtered.pq
C_siU6atac_96_n1.all
open /cluster/work/grlab/projects/projects2022-PCa_Immuno/peptides_generation/v3_84dc237_conf2_annotFrame_cap0_runs/C_siU6atac_96_n1.all_mutNone/ref_sample_peptides_meta.mig_filtered.pq
C_siU

In [None]:
# Leftover post-mortem debugger call, disabled so that "Restart & Run All" is not
# interrupted by an interactive prompt. Uncomment to inspect the last traceback.
# %debug

In [None]:
# jx_to_expression is a list of four lists: build one row per list, then transpose
# so that each list becomes a column.
expression_table = pd.DataFrame(jx_to_expression).transpose()

In [None]:
# Name the four columns, in the order of the four lists held in jx_to_expression.
expression_table.columns = ['junction_coordinates', 'peptide_sequence', 'sample', 'expression']

In [None]:
# Preview the first rows of the expression table.
expression_table.head(n=5)


In [None]:
# Remove fully identical rows, keeping the first occurrence of each.
expression_table = expression_table.drop_duplicates(keep='first')

In [None]:
 # Save the table as a tab-separated file, without the index column, in the experiment folder.
 expression_table.to_csv(
     os.path.join(experiment_folder, 'expression_counts_minor_Introns_unormalized.tsv'),
     sep='\t',
     index=None,
 )

In [None]:
# (rows, columns) of the expression table at this point in the notebook.
expression_table.shape

In [None]:
# Show the path built from experiment_folder and the expression-table file name.
print(os.path.join(experiment_folder, 'expression_counts_minor_Introns_unormalized.tsv'))