In [1]:

import pandas as pd
from Bio import SeqIO
import gtfparse

In [2]:

def get_cpat_best_translations(best_orfs, orf_fasta,
                               output_fasta='GENCODE.CPAT.best.translations.fa'):
    """Translate the selected ORFs and write the proteins to a FASTA file.

    Parameters
    ----------
    best_orfs : pandas.DataFrame
        Table with an ``ID`` column listing the ORF record ids to keep.
    orf_fasta : str
        Path to the FASTA file of ORF sequences; only records whose id
        appears in ``best_orfs['ID']`` are used.
    output_fasta : str, optional
        Path of the FASTA file the translated records are written to.
        Defaults to the file name the notebook has always used.

    Returns
    -------
    dict
        Maps a transcript id (the part of the ORF record id before the
        first ``|``) to its translated record. If several selected ORFs
        share a transcript id, the last one read wins in this mapping,
        while all of them are still written to ``output_fasta``.
    """
    # A set gives constant-time membership tests; testing against the array
    # returned by ``unique()`` rescans every id for every FASTA record.
    best_orf_ids = set(best_orfs['ID'].unique())
    best_orf_sequences = []
    best_orf_map = {}

    for record in SeqIO.parse(orf_fasta, 'fasta'):
        if record.id not in best_orf_ids:
            continue
        transcript = record.id.split('|')[0]
        # Reduce the header to the bare transcript id.
        record.id = transcript
        record.description = ''
        record.name = ''
        # stop_symbol='' leaves stop codons out of the protein string.
        record.seq = record.seq.translate(stop_symbol='')
        best_orf_sequences.append(record)
        best_orf_map[transcript] = record

    with open(output_fasta, 'w') as ofile:
        SeqIO.write(best_orf_sequences, ofile, 'fasta')
    return best_orf_map

# CPAT side: table of best ORFs plus the FASTA holding their sequences.
orf_fasta = './GENCODE.ORF_seqs.fa'
cpat_best_orfs = pd.read_table('./GENCODE.ORF_prob.best.tsv')
gencode_cpat_best_map = get_cpat_best_translations(cpat_best_orfs, orf_fasta)

# Reference side: index the GENCODE translations by the second
# '|'-separated field of each record id.
gencode_sequences = {}
for record in SeqIO.parse('./gencode.v35.pc_translations.fa', 'fasta'):
    transcript = record.id.split('|')[1]
    gencode_sequences.update({transcript: record})
    


In [None]:
def compare_reference_to_cpat(reference, cpat):
    """Compare every reference sequence with the CPAT sequence of the same key.

    Parameters
    ----------
    reference : dict
        Maps a transcript id to an object exposing a ``seq`` attribute.
    cpat : dict
        Maps a transcript id to an object exposing a ``seq`` attribute.

    Returns
    -------
    list of list
        One ``[key, is_match, reference_sequence, cpat_sequence]`` row per
        key in ``reference``, in iteration order. Keys absent from ``cpat``
        yield ``is_match = False`` and an empty CPAT sequence.
    """
    sequence_comparisons = []
    for key, reference_record in reference.items():
        reference_seq = str(reference_record.seq)
        # The "missing" case must be decided per key. Attaching the else to
        # the for loop instead runs it once, after the loop, for the last key.
        if key in cpat:
            is_match = reference_record.seq == cpat[key].seq
            sequence_comparisons.append(
                [key, is_match, reference_seq, str(cpat[key].seq)])
        else:
            sequence_comparisons.append([key, False, reference_seq, ''])
    return sequence_comparisons

# Build the per-transcript comparison table (reference vs. CPAT translation).
sequence_comparisons = compare_reference_to_cpat(gencode_sequences, gencode_cpat_best_map)
sequence_comparisons_df = pd.DataFrame(
    sequence_comparisons,
    columns=['Transcript', 'Match', 'Ref-Seq', 'CPAT-Seq'])
# A bare expression in the middle of a cell is never displayed, so print
# the match / mismatch counts explicitly.
print(sequence_comparisons_df.groupby('Match').size())
sequence_comparisons_df.to_csv('./GENCODE.CPAT.COMPARISIONS.tsv', sep='\t', index=False)


gtf_filename = './gencode.v35.basic.annotation.gtf'
gtf = gtfparse.read_gtf(gtf_filename)


# Keep only transcript-level rows and the annotation columns of interest.
transcripts = gtf[gtf['feature'] == 'transcript']
transcripts = transcripts[[
    'seqname', 'source', 'strand', 'gene_type',
    'gene_name', 'transcript_id', 'transcript_type',
    'transcript_name', 'transcript_support_level',
    'tag', 'protein_id']]
# Left merge: every compared transcript is kept even without a GTF entry.
sequence_comparison = sequence_comparisons_df.merge(
    transcripts, how='left', left_on='Transcript', right_on='transcript_id')
# Persist the annotated table: a later step reads this file back, and
# without this write a fresh run would depend on a file left over from a
# previous session.
sequence_comparison.to_csv('./GENCODE.CPAT.comparison_with_meta.tsv', sep='\t', index=False)



def first_n_match(sequence, other_sequence, n):
    """Report whether two strings agree on their first ``n`` characters.

    If either argument is not exactly of type ``str``, both arguments are
    printed and ``False`` is returned. When at least one string is shorter
    than ``n``, the result is ``True`` only if the two strings have the
    same length and are equal.
    """
    both_are_str = type(sequence) is str and type(other_sequence) is str
    if not both_are_str:
        print(sequence)
        print(other_sequence)
        return False

    # Both strings are long enough: compare the leading n characters.
    if min(len(sequence), len(other_sequence)) >= n:
        return sequence[:n] == other_sequence[:n]

    # Too short for an n-character window: equal-length strings are
    # compared in full, anything else counts as a mismatch.
    if len(sequence) == len(other_sequence):
        return sequence == other_sequence
    return False

def last_n_match(sequence, other_sequence, n):
    """Report whether two strings agree on their last ``n`` characters.

    Parameters
    ----------
    sequence, other_sequence : str
        Strings to compare. If either is not exactly of type ``str``, both
        are printed and ``False`` is returned.
    n : int
        Non-negative length of the suffix to compare.

    Returns
    -------
    bool
        ``True`` if both strings have at least ``n`` characters and their
        last ``n`` characters are equal, or if both are shorter than ``n``,
        have the same length and are equal; ``False`` otherwise.
    """
    if type(sequence) is not str or type(other_sequence) is not str:
        print(sequence)
        print(other_sequence)
        return False
    if len(sequence) >= n and len(other_sequence) >= n:
        # Slice from an explicit start index: ``sequence[-n:]`` returns the
        # whole string when n == 0, which would compare the full sequences
        # instead of an empty suffix.
        return (sequence[len(sequence) - n:]
                == other_sequence[len(other_sequence) - n:])
    elif len(sequence) == len(other_sequence):
        # Both strings are shorter than n: compare them in full.
        return sequence == other_sequence
    else:
        return False
 


# Reload the annotated comparison table and discard the rows whose
# 'CPAT-Seq' value is missing.
sequence_comparison = pd.read_table('./GENCODE.CPAT.comparison_with_meta.tsv')
sequence_comparison.dropna(subset=['CPAT-Seq'], inplace=True)

# For each window size, flag whether the reference and CPAT sequences agree
# on their first n and on their last n characters.
for n in (5, 10, 25):
    starts_agree = sequence_comparison.apply(
        lambda row: first_n_match(row['Ref-Seq'], row['CPAT-Seq'], n),
        axis=1)
    sequence_comparison[f'first_{n}_match'] = starts_agree

    ends_agree = sequence_comparison.apply(
        lambda row: last_n_match(row['Ref-Seq'], row['CPAT-Seq'], n),
        axis=1)
    sequence_comparison[f'last_{n}_match'] = ends_agree

sequence_comparison.to_csv('GENCODE.CPAT.comparison_with_meta.tsv', sep='\t', index=False)

