-
Notifications
You must be signed in to change notification settings - Fork 0
/
Transcript.py
124 lines (96 loc) · 4.36 KB
/
Transcript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# -*- encoding: utf-8 -*-
import sys
from Exon import *
class Transcript( object ):
    """
    A single transcript built from annotation records.

    The constructor copies the identifying fields of a transcript record;
    the process_* methods then attach exon, CDS and start/stop codon
    records that belong to the same transcript.

    Coordinates are stored exactly as the records provide them (they may be
    strings) and are converted with int() wherever arithmetic is needed.
    """

    def __init__( self, record ):
        """
        Initialise the transcript from its own annotation record.

        record -- object exposing group_dict['transcript_id'], source,
                  seqname, start, end and strand.
        """
        self.transcript_id = record.group_dict['transcript_id']
        self.source        = record.source   # protein_coding|...|miRNA| etc
        self.seqname       = record.seqname
        self.start         = record.start    # 1-based
        self.end           = record.end      # 1-based
        self.strand        = record.strand
        self.exons         = dict()          # exon_number (int) -> Exon
        self.CDS           = list()          # exons by the way
        self.UTR           = dict()          # exons by the way (never filled: process_UTR is a no-op)
        self.start_codon   = None            # set by process_start_codon
        self.stop_codon    = None            # set by process_stop_codon
    #=============================================================================
    def is_complete( self ):
        """Return True when both a start codon and a stop codon were recorded."""
        return self.start_codon is not None and self.stop_codon is not None
    #=============================================================================
    def region_str( self, zero_based=False ):
        """
        Return the transcript span as "seqname:start-end".

        With zero_based=True both coordinates are shifted down by one;
        otherwise the stored values are printed unchanged.
        """
        if zero_based:
            return "%s:%s-%s" % ( self.seqname, int( self.start ) - 1, int( self.end ) - 1 )
        return "%s:%s-%s" % ( self.seqname, self.start, self.end )
    #=============================================================================
    def cds_region_str( self, zero_based=False ):
        """
        Return the span from the start codon through the stop codon as
        "seqname:lo-hi", or None when the strand is neither "+" nor "-".

        The +/-2 offsets extend the stored stop codon coordinate to cover
        the whole 3-base stop codon. int() raises TypeError if a codon
        position it needs is still None.

        NOTE(review): the "-" branch treats the stored codon coordinates as
        the highest base of each codon -- confirm what record.start holds
        for minus-strand codon records.
        """
        shift = 1 if zero_based else 0
        if self.strand == "+":
            hi = int( self.stop_codon ) + 2 - shift
            # the one-based form prints the stored start codon untouched
            lo = int( self.start_codon ) - 1 if zero_based else self.start_codon
            return "%s:%s-%s" % ( self.seqname, lo, hi )
        if self.strand == "-":
            lo = int( self.stop_codon ) - 2 - shift
            hi = int( self.start_codon ) - 1 if zero_based else self.start_codon
            return "%s:%s-%s" % ( self.seqname, lo, hi )
        return None
    #=============================================================================
    def get_cds_length( self ):
        """Return the summed inclusive length (end - start + 1) of all CDS segments."""
        return sum( int( E.end ) - int( E.start ) + 1 for E in self.CDS )
    #=============================================================================
    def get_full_length( self ):
        """Return the summed inclusive length (end - start + 1) of all exons."""
        # .values() instead of the Python-2-only .iteritems(): the keys are unused
        return sum( int( E.end ) - int( E.start ) + 1 for E in self.exons.values() )
    #=============================================================================
    def utr_region_str( self, zero_based=False ):
        """
        Return the span between the transcript boundary and the start codon
        as "seqname:lo-hi", or None when the strand is neither "+" nor "-".

        On "+" this runs from the transcript start to the base before the
        start codon; on "-" from the base after the start codon coordinate
        to the transcript end. int() raises TypeError if the start codon is
        still None.

        NOTE(review): the "-" branch treats the stored start codon
        coordinate as the codon's highest base -- confirm against the
        record parser.
        """
        if self.strand == "+":
            if zero_based:
                return "%s:%s-%s" % ( self.seqname, int( self.start ) - 1, int( self.start_codon ) - 2 )
            return "%s:%s-%s" % ( self.seqname, self.start, int( self.start_codon ) - 1 )
        if self.strand == "-":
            if zero_based:
                return "%s:%s-%s" % ( self.seqname, int( self.start_codon ), int( self.end ) - 1 )
            return "%s:%s-%s" % ( self.seqname, int( self.start_codon ) + 1, self.end )
        return None
    #=============================================================================
    def process_exon( self, record ):
        """Store an Exon built from record, keyed by its integer exon_number."""
        self.exons[int( record.group_dict['exon_number'] )] = Exon( record )
    #=============================================================================
    def process_CDS( self, record ):
        """Append an Exon built from a CDS record to the CDS list."""
        self.CDS.append( Exon( record ))
    #=============================================================================
    def process_UTR( self, record ):
        """UTR records are currently ignored."""
        pass
    #=============================================================================
    def process_start_codon( self, record ):
        """Remember the start coordinate of the start codon record."""
        self.start_codon = record.start
    #=============================================================================
    def process_stop_codon( self, record ):
        """Remember the start coordinate of the stop codon record."""
        self.stop_codon = record.start
    #=============================================================================
    def __repr__( self ):
        """
        Return "id<TAB>[outer(start_codon):(stop_codon)outer]".

        On "-" the transcript end is printed first and the start last. Any
        other strand value uses the "+" layout, so repr() always yields a
        string instead of failing on an unexpected strand.
        """
        if self.strand == "-":
            first, last = self.end, self.start
        else:
            first, last = self.start, self.end
        return "%s\t[%s(%s):(%s)%s]" % ( self.transcript_id, first, self.start_codon, self.stop_codon, last )
    #=============================================================================
    def __eq__( self, T ):
        """
        Two transcripts are equal when their start codon and stop codon
        coordinates match (used as a proxy for having the same CDS).

        Returns NotImplemented when T does not carry codon attributes, so
        comparisons against unrelated objects (e.g. None) evaluate to False
        instead of raising AttributeError.

        Note: no __hash__ is defined alongside __eq__, so on Python 3
        instances are unhashable.
        """
        try:
            other_start, other_stop = T.start_codon, T.stop_codon
        except AttributeError:
            return NotImplemented
        return self.start_codon == other_start and self.stop_codon == other_stop
    #=============================================================================
    def __ne__( self, T ):
        """Inverse of __eq__; defined explicitly because Python 2 does not derive it."""
        outcome = self.__eq__( T )
        if outcome is NotImplemented:
            return outcome
        return not outcome