In [7]:
# This script gets gene annotations for the known protein-coding genes in GENCODE v19; results are saved to data/anno_refs
import re
import operator
import pickle

In [8]:
def _span_length(info):
    """Return the genomic span length used to rank one transcript record.

    Uses the recorded 'transcript' interval when present; otherwise falls back
    to the outer bounds of the collected exon/CDS/UTR intervals, and to 0 when
    no interval was recorded at all.
    """
    span = info['transcript']
    if span:
        return abs(span[1] - span[0])
    intervals = info['exon'] + info['CDS'] + info['UTR']
    if not intervals:
        return 0
    lowest = min(start for start, _ in intervals)
    highest = max(end for _, end in intervals)
    return abs(highest - lowest)


def get_annotation(annotation_file_path):
    """Collect per-transcript annotations for known protein-coding genes from a GTF file.

    Only rows whose first column starts with "chr" are read. A row is kept when
    its feature is not 'gene', its attribute text contains both
    'protein_coding' and 'KNOWN', and it carries either the 'appris_principal'
    or the 'appris_candidate_longest' tag.

    When a gene name ends up with several transcripts, only the one with the
    largest genomic span is kept in the transcript table (the first one seen
    wins a tie).

    Args:
        annotation_file_path: path to a tab-separated GTF annotation file.

    Returns:
        A tuple ``(name_dict, transcript_info_dict)``:

        * ``name_dict`` maps transcript ID (version suffix removed) to gene
          name. It keeps an entry for every accepted transcript, including
          those later pruned from ``transcript_info_dict``.
        * ``transcript_info_dict`` maps transcript ID to a dict with keys
          'strand', 'chr', 'exon', 'CDS', 'UTR' (lists of ``[start, end]``)
          and 'transcript' (a single ``[start, end]``, or ``[]`` if no
          'transcript' row was seen).

    Raises:
        ValueError: if an annotation row has fewer than nine tab-separated
            columns or a non-integer start/end on an accepted row.
        IndexError: if an accepted row lacks a recognisable transcript ID or
            quoted gene name at the expected attribute position.
    """
    # The version suffix (".N") is intentionally left out of the transcript ID.
    transcript_id_re = re.compile(r'(ENST\d+|ENSTR\d+)')
    quoted_value_re = re.compile(r'\"(.*?)\"')
    interval_features = ('exon', 'CDS', 'UTR')

    name_dict = {}  # {transcript_id: gene_name}
    transcript_dict = {}  # {gene_name: [transcript_id, ...]}
    transcript_info_dict = {}  # {transcript_id: {'strand', 'chr', 'exon', 'CDS', 'UTR', 'transcript'}}

    with open(annotation_file_path, 'r') as annotation_f:
        for line in annotation_f:
            if not line.startswith("chr"):
                continue  # header/comment lines and anything not on a chr* sequence

            columns = line.split('\t')
            chr_n, _source, feature, start_pos, end_pos = columns[0:5]
            strand, _frame, attribute = columns[6:9]  # column 5 (score) is unused

            if feature == 'gene':
                continue
            if 'protein_coding' not in attribute or 'KNOWN' not in attribute:
                continue
            if ('appris_principal' not in attribute
                    and 'appris_candidate_longest' not in attribute):
                continue

            # Attributes are read by position: field 1 is expected to hold
            # transcript_id and field 4 gene_name (assumes the GENCODE
            # attribute ordering of the input file).
            attribute_split = attribute.split(';')
            transcript_id = transcript_id_re.findall(attribute_split[1])[0]
            gene_name = quoted_value_re.findall(attribute_split[4])[0]
            position_list = [int(start_pos), int(end_pos)]

            gene_transcripts = transcript_dict.setdefault(gene_name, [])
            if transcript_id not in gene_transcripts:
                gene_transcripts.append(transcript_id)

            info = transcript_info_dict.get(transcript_id)
            if info is None:
                info = {
                    'strand': strand,
                    'chr': chr_n,
                    'exon': [],
                    'CDS': [],
                    'UTR': [],
                    'transcript': [],
                }
                transcript_info_dict[transcript_id] = info

            if feature in interval_features:
                info[feature].append(position_list)
            elif feature == 'transcript':
                info['transcript'] = position_list

            name_dict.setdefault(transcript_id, gene_name)

    # For genes with two or more transcripts, keep only the longest one.
    # max() returns the first maximal item, so ties keep the earliest transcript.
    for transcript_ids in transcript_dict.values():
        if len(transcript_ids) < 2:
            continue
        keeper = max(
            transcript_ids,
            key=lambda tid: _span_length(transcript_info_dict[tid]),
        )
        for tid in transcript_ids:
            if tid != keeper:
                del transcript_info_dict[tid]

    return name_dict, transcript_info_dict

In [9]:
# Parse the GENCODE v19 GTF into the transcript->gene-name map and the
# per-transcript annotation table.
f_anno = '../data/anno_refs/gencode_v19/gencode.v19.annotation.gtf'
dict_name, dict_transcript_info = get_annotation(annotation_file_path=f_anno)
# Persisting the two tables is currently disabled; uncomment to write the pickles.
# pickle.dump(dict_name, open('../data/proc_refs/dict_name_062121.pkl','wb'))
# pickle.dump(dict_transcript_info, open('../data/proc_refs/dict_transcript_info_062121.pkl','wb'))