In [2]:
import re
import operator
import pickle
import pandas as pd
import mmap
import tqdm as tqdm, notebook

In [42]:
final_genelist = []
with open('../anno_ref/genes-final.csv','r') as f:
    for lines in f:
        final_genelist.append(lines.strip('\n'))

In [70]:
def get_annotation(annotation_file_path):
    name_dict = {}  # {transcript:[gene_name]}
    transcript_dict = {}  # {'gene_name':['ENST00000']}
    transcript_info_dict = {}  # {transcriptID:'exon'[]; 'CDS'[];'gene'[];'UTR
    
    with open(annotation_file_path, 'r') as annotation_f:
        for lines in annotation_f:
            if lines.startswith("chr"):
                line_split = lines.split('\t')
                chr_n, source, feature, start_pos, end_pos = line_split[0:5]
                strand, frame, attribute = line_split[6:9]
                position_list = [int(start_pos), int(end_pos)]
                    
                # feature = line_split[2] # 'exon' / 'CDS'/ 'UTR'/ 'start_codon'/ 'stop_codon'/ 'transcript'/ 'gene'
                if feature !='gene':
                    # if this is a protein coding gene and is the primary transcript:
                    if 'protein_coding' in attribute:
                        if 'appris_principal' in attribute or 'appris_candidate_longest' in attribute:
                            # parse the attributes
                            attribute_split = attribute.split(';')
                            gene_id_col, transcript_id_col, gene_type_col, gene_name_col, transcript_type_col, \
                            transcript_name_col = attribute_split[0:6]
                            # not include the version number
                            gene_id = re.findall(r'(ENSG\d+|ENSGR\d+)', gene_id_col)[0]
                            transcript_id = re.findall(r'(ENST\d+|ENSTR\d+)', transcript_id_col)[0]
                            gene_name = re.findall(r'\"(.*?)\"', gene_name_col)[0]
#                             print(gene_id, gene_name)

                            # ignore the genes not in final gene_list
                            if gene_name not in final_genelist:
                                continue

                            if gene_name not in transcript_dict:
                                transcript_dict[gene_name] = []
                            if transcript_id not in transcript_dict[gene_name]:
                                transcript_dict[gene_name].append(transcript_id)

                            if transcript_id not in transcript_info_dict:
                                transcript_info_dict[transcript_id] = {}
                                transcript_info_dict[transcript_id]['strand'] = strand
                                transcript_info_dict[transcript_id]['chr'] = chr_n
                                transcript_info_dict[transcript_id]['exon'] = []
                                transcript_info_dict[transcript_id]['CDS'] = []
                                transcript_info_dict[transcript_id]['UTR'] = []
                                transcript_info_dict[transcript_id]['transcript'] = []

                                # if the transcript and genes are known, parse according to feature type
                            if feature == 'exon':
                                # exon_n = re.findall(r'\d+', attribute_split[8])
                                transcript_info_dict[transcript_id]['exon'].append(position_list)
                            elif feature == 'CDS':
                                transcript_info_dict[transcript_id]['CDS'].append(position_list)
                            elif feature == 'UTR':
                                transcript_info_dict[transcript_id]['UTR'].append(position_list)
                            elif feature == 'transcript':
                                transcript_info_dict[transcript_id]['transcript'] = position_list

                            if transcript_id not in name_dict:
                                name_dict[transcript_id] = gene_name
                                

    # delete the transcript record that have 2 or more transcripts, only keep the longest transcript
    for names in transcript_dict:
        if len(transcript_dict[names]) > 1:
            dict_t={}
            for t in transcript_dict[names]:
                len_t = abs(transcript_info_dict[t]['transcript'][1] - transcript_info_dict[t]['transcript'][0])
                dict_t[t]=len_t
            dt = sorted(dict_t.items(), key=operator.itemgetter(1), reverse=True)[1:]
            for t in dt:
                del transcript_info_dict[t[0]]
        

    return name_dict, transcript_info_dict

In [71]:
f_anno = ('../anno_ref/gencode_v38/gencode.v38.annotation.gtf')
dict_name, dict_transcript_info = get_annotation(f_anno)
# pickle.dump(dict_name, open('../proc_09152020/dict_name_v38.pkl','wb'))
# pickle.dump(dict_transcript_info, open('../proc_09152020/dict_transcript_info_v38.pkl','wb'))

In [7]:
f_anno = ('../anno_ref/gencode_v38/gencode.v38.annotation.gtf')
df_anno = pd.read_csv(f_anno, skiprows = 5, sep = '\t')

In [8]:
df_anno

Unnamed: 0,chr1,HAVANA,gene,11869,14412,.,+,..1,"gene_id ""ENSG00000223972.4""; transcript_id ""ENSG00000223972.4""; gene_type ""pseudogene""; gene_status ""KNOWN""; gene_name ""DDX11L1""; transcript_type ""pseudogene""; transcript_status ""KNOWN""; transcript_name ""DDX11L1""; level 2; havana_gene ""OTTHUMG00000000961.2"";"
0,chr1,HAVANA,transcript,11869,14409,.,+,.,"gene_id ""ENSG00000223972.4""; transcript_id ""EN..."
1,chr1,HAVANA,exon,11869,12227,.,+,.,"gene_id ""ENSG00000223972.4""; transcript_id ""EN..."
2,chr1,HAVANA,exon,12613,12721,.,+,.,"gene_id ""ENSG00000223972.4""; transcript_id ""EN..."
3,chr1,HAVANA,exon,13221,14409,.,+,.,"gene_id ""ENSG00000223972.4""; transcript_id ""EN..."
4,chr1,ENSEMBL,transcript,11872,14412,.,+,.,"gene_id ""ENSG00000223972.4""; transcript_id ""EN..."
...,...,...,...,...,...,...,...,...,...
2619438,chrM,ENSEMBL,transcript,15888,15953,.,+,.,"gene_id ""ENSG00000210195.2""; transcript_id ""EN..."
2619439,chrM,ENSEMBL,exon,15888,15953,.,+,.,"gene_id ""ENSG00000210195.2""; transcript_id ""EN..."
2619440,chrM,ENSEMBL,gene,15956,16023,.,-,.,"gene_id ""ENSG00000210196.2""; transcript_id ""EN..."
2619441,chrM,ENSEMBL,transcript,15956,16023,.,-,.,"gene_id ""ENSG00000210196.2""; transcript_id ""EN..."


In [3]:
dict_transcript_info = pickle.load(open('../proc_09152020/dict_transcript_info_v38.pkl','rb'))

In [4]:
dict_transcript_info['ENST00000322527']

{'strand': '-',
 'chr': 'chr13',
 'exon': [[102758702, 102759072],
  [102758543, 102758576],
  [102758410, 102758452],
  [102729367, 102750388]],
 'CDS': [[102758702, 102758932],
  [102758543, 102758576],
  [102758410, 102758452],
  [102729454, 102750388]],
 'UTR': [[102758933, 102759072], [102729367, 102729453]],
 'transcript': [102729367, 102759072]}