In [1]:

import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord


In [2]:

# Load the SQANTI classification table and keep the rows whose associated gene
# name starts with 'novel'.
sqanti_classification = pd.read_table('./jurkat_classification.txt')
sqanti_novel_genes = sqanti_classification[
    # na=False: a missing associated_gene counts as "not novel". Without it the
    # mask would contain NaN and the boolean indexing would raise.
    sqanti_classification['associated_gene'].str.startswith('novel', na=False)
]

# Set of isoform ids for fast membership tests while scanning the FASTA file.
novel_gene_isoforms = set(sqanti_novel_genes['isoform'])

# Keep only the FASTA records whose id is one of the novel-gene isoforms.
novel_gene_sequences = [
    record
    for record in SeqIO.parse('./jurkat_corrected.fasta', 'fasta')
    if record.id in novel_gene_isoforms
]

# Last expression of the cell: the value returned by SeqIO.write is displayed.
SeqIO.write(novel_gene_sequences, './jurkat.novel_gene_isoforms.fasta', 'fasta')



13240

In [3]:
# CPAT has to be run before this cell: the two files read below are its outputs.
cpat_best_orfs = pd.read_table('./jurkat_novel_gene.ORF_prob.best.tsv')
best_orf_accs = set(cpat_best_orfs['ID'])

# Translate every ORF listed in the best-ORF table into a protein record.
best_orf_fasta = []
for orf_record in SeqIO.parse('./jurkat_novel_gene.ORF_seqs.fa', 'fasta'):
    if orf_record.id not in best_orf_accs:
        continue

    # The accession is everything before the first underscore of the ORF id.
    # NOTE(review): presumably ORF ids look like '<transcript>_ORF_<n>' - confirm.
    accession = orf_record.id.split('_')[0]

    best_orf_fasta.append(
        SeqRecord(
            # stop_symbol='' drops stop codons from the translated sequence.
            orf_record.seq.translate(stop_symbol=''),
            id=f'nv|{accession}|fullname GN=NOVELGENE',
            description='',
        )
    )

# Last expression of the cell: the value returned by SeqIO.write is displayed.
SeqIO.write(best_orf_fasta, './jurkat.novel_gene.CPAT.best.fasta', 'fasta')

        


13209

In [13]:
# Best-ORF rows whose coding probability is strictly above 0.364.
# NOTE(review): 0.364 is presumably the CPAT coding cutoff in use - confirm.
high_scoring = cpat_best_orfs.loc[cpat_best_orfs['Coding_prob'].gt(0.364)]

# Peptide table that the later cells annotate.
novel_gene_peptides = pd.read_excel('./NovelGenePeptides.xlsx')




In [14]:
# Number of rows in sqanti_novel_genes for each structural_category value.
sqanti_novel_genes.groupby('structural_category').size()


structural_category
antisense        1871
genic_intron       21
intergenic      11348
dtype: int64

In [17]:
def get_higheset_cpat_score(protein_accession, cpat_scores):
    """Return the largest CPAT score among the accessions of one peptide row.

    Parameters
    ----------
    protein_accession : str
        One or more accessions joined with '|'.
    cpat_scores : dict
        Maps an accession to its CPAT score.

    Returns
    -------
    The highest score found for any of the accessions. The result is never
    below 0.0: that value is returned when no accession is present in
    ``cpat_scores`` or when none of the scores exceeds it.
    """
    matched_scores = [
        cpat_scores[token]
        for token in protein_accession.split('|')
        if token in cpat_scores
    ]
    # Seeding with 0.0 keeps the floor: a score must be strictly greater to win.
    return max([0.0, *matched_scores])

def get_sqanti_category(protein_accession, sqanti_categories):
    """Return the distinct SQANTI categories of the accessions of one peptide row.

    Parameters
    ----------
    protein_accession : str
        One or more accessions joined with '|'.
    sqanti_categories : dict
        Maps an accession to its structural category.

    Returns
    -------
    str
        The distinct categories of the accessions found in
        ``sqanti_categories``, joined with '|' in alphabetical order. An empty
        string when none of the accessions is found.
    """
    acc_categories = {
        sqanti_categories[acc]
        for acc in protein_accession.split('|')
        if acc in sqanti_categories
    }
    # Sort before joining: set iteration order for strings can differ between
    # interpreter runs, which would make the label order non-reproducible.
    return '|'.join(sorted(acc_categories))



# Lookup tables: seq_ID -> Coding_prob (CPAT best-ORF table) and
# isoform -> structural_category (SQANTI novel-gene rows).
cpat_scores = cpat_best_orfs.set_index('seq_ID')['Coding_prob'].to_dict()

sqanti_categories = sqanti_novel_genes.set_index('isoform')['structural_category'].to_dict()

# Annotate every peptide row from its '|'-separated 'Protein Accession' value.
novel_gene_peptides['Highest CPAT Score'] = novel_gene_peptides['Protein Accession'].apply(
    get_higheset_cpat_score, args=(cpat_scores,)
)
novel_gene_peptides['SQANTI structural category'] = novel_gene_peptides['Protein Accession'].apply(
    get_sqanti_category, args=(sqanti_categories,)
)

# Peptides whose highest CPAT score is strictly above 0.364, and the set of
# every accession named by those peptides.
peps_high_cpat = novel_gene_peptides.loc[novel_gene_peptides['Highest CPAT Score'].gt(0.364)]
isoforms_with_peps_high_cpat = {
    accession
    for accession_group in peps_high_cpat['Protein Accession'].str.split('|')
    for accession in accession_group
}


In [11]:

# SQANTI novel-gene rows whose isoform id is in isoforms_with_peps_high_cpat.
# NOTE(review): this cell's execution count is lower than that of the cell
# defining isoforms_with_peps_high_cpat; re-run top to bottom on a fresh
# kernel to confirm the displayed table is current.
sqanti_novel_genes[sqanti_novel_genes['isoform'].isin(isoforms_with_peps_high_cpat)]


Unnamed: 0,isoform,chrom,strand,length,exons,structural_category,associated_gene,associated_transcript,ref_length,ref_exons,...,CDS_genomic_end,predicted_NMD,perc_A_downstream_TTS,seq_A_downstream_TTS,dist_to_cage_peak,within_cage_peak,dist_to_polya_site,within_polya_site,polyA_motif,polyA_dist
21744,PB.901.1,chr1,-,3883,1,intergenic,novelGene_483,novel,,,...,,,60.0,AAAAACAAAAATTAGGAGGT,,,,,,
85270,PB.13014.5,chr17,-,2930,1,intergenic,novelGene_9431,novel,,,...,,,95.0,AAAAAGAAAAAAAAAAAAAA,,,,,,
96654,PB.13888.20,chr17,+,2629,2,intergenic,novelGene_9827,novel,,,...,,,80.0,AAAAATAAATAAATAAAAAT,,,,,,
110890,PB.15039.1,chr19,-,2614,1,antisense,novelGene_ENSG00000105497.8_AS,novel,,,...,,,80.0,AAGAAAAAAAAAAAAAAGGT,,,,,,
110891,PB.15039.2,chr19,-,1536,2,antisense,novelGene_ENSG00000105497.8_AS,novel,,,...,,,65.0,AAAAATACAAAAAATTAGCC,,,,,,
195978,PB.7033.1,chr8,-,1923,3,antisense,novelGene_ENSG00000176595.4_AS,novel,,,...,,,100.0,AAAAAAAAAAAAAAAAAAAA,,,,,,


In [18]:
# Display the peptide table, including the 'Highest CPAT Score' and
# 'SQANTI structural category' columns added earlier in the notebook.
novel_gene_peptides

Unnamed: 0,File Name,Scan Number,Scan Retention Time,Num Experimental Peaks,Total Ion Current,Precursor Scan Number,Precursor Charge,Precursor MZ,Precursor Mass,Score,...,Cumulative Target,Cumulative Decoy,QValue,Cumulative Target Notch,Cumulative Decoy Notch,QValue Notch,PEP,PEP_QValue,Highest CPAT Score,SQANTI structural category
0,120426_Jurkat_highLC_Frac17,10620,88.51345,200,2007754.0,10617,2,811.38343,1620.7523,18.266,...,18067,0,0.0,17977,0,0.0,0.000187,8.3e-05,0.999924,intergenic
1,120426_Jurkat_highLC_Frac8_120430121912,4652,46.01599,200,867044.1,4649,2,875.92231,1749.83006,11.049,...,57669,24,0.000414,57273,21,0.000365,0.841718,0.045523,0.007278,antisense
2,120426_Jurkat_highLC_Frac11,20333,148.36445,169,1105796.0,20330,2,728.88694,1455.75933,10.289,...,59639,28,0.000469,59226,25,0.000422,0.301155,0.009264,0.101972,intergenic
3,120426_Jurkat_highLC_Frac17,4589,53.38795,193,1426055.0,4583,1,717.44904,716.44176,10.103,...,63709,76,0.001192,63267,72,0.001137,0.036388,0.00166,0.017964,intergenic
4,120426_Jurkat_highLC_Frac2,1910,29.95894,200,915674.6,1904,2,409.20625,816.39794,10.093,...,63924,85,0.001329,63476,81,0.001275,0.763988,0.034482,0.060343,intergenic
5,120426_Jurkat_highLC_Frac9,1834,27.62694,175,1684308.0,1833,2,468.25664,934.49872,10.068,...,64369,99,0.001537,63903,92,0.001439,0.002824,0.000263,0.993432,antisense
6,120426_Jurkat_highLC_Frac27,16997,162.04191,200,156263.7,16994,2,790.90874,1579.80293,10.034,...,64842,111,0.00171,64335,102,0.001584,0.837348,0.044724,0.820876,intergenic
7,120426_Jurkat_highLC_Frac27,10561,108.58996,128,11661580.0,10560,2,708.35335,1414.69215,9.225,...,67736,136,0.002006,67220,126,0.001872,0.028647,0.001413,0.2791,intergenic
8,120426_Jurkat_highLC_Frac17,4369,52.11873,175,4884039.0,4366,2,398.21637,794.41819,9.204,...,68147,146,0.002142,67624,136,0.002011,0.006238,0.000437,0.021512,intergenic
9,120426_Jurkat_highLC_Frac9,4012,46.18971,200,2700783.0,4006,2,494.77506,987.53557,9.164,...,69077,177,0.002562,68545,166,0.002421,0.295279,0.009085,0.189291,intergenic


In [19]:
# List all column names of the peptide table.
novel_gene_peptides.columns

Index(['File Name', 'Scan Number', 'Scan Retention Time',
       'Num Experimental Peaks', 'Total Ion Current', 'Precursor Scan Number',
       'Precursor Charge', 'Precursor MZ', 'Precursor Mass', 'Score',
       'Delta Score', 'Notch', 'Base Sequence', 'Full Sequence',
       'Essential Sequence', 'PSM Count (unambiguous, <0.01 q-value)', 'Mods',
       'Mods Chemical Formulas', 'Mods Combined Chemical Formula',
       'Num Variable Mods', 'Missed Cleavages', 'Peptide Monoisotopic Mass',
       'Mass Diff (Da)', 'Mass Diff (ppm)', 'Protein Accession',
       'Protein Name', 'Gene Name', 'Organism Name',
       'Identified Sequence Variations', 'Splice Sites', 'Contaminant',
       'Decoy', 'Peptide Description', 'Start and End Residues In Protein',
       'Previous Amino Acid', 'Next Amino Acid', 'Theoreticals Searched',
       'Decoy/Contaminant/Target', 'Matched Ion Series',
       'Matched Ion Mass-To-Charge Ratios', 'Matched Ion Mass Diff (Da)',
       'Matched Ion Mass Diff (Ppm

In [23]:
# Write the annotated peptide table as a tab-separated file, leaving out the
# DataFrame index.
novel_gene_peptides.to_csv('./novel_gene_peptides.tsv', sep='\t', index=False)