forked from jeffhussmann/ribosomes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gtf.py
158 lines (135 loc) · 5.43 KB
/
gtf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from collections import defaultdict, Counter
from Sequencing import genomes, utilities
import positions
import transcript as transcript_utils
class Feature(object):
def __init__(self, line=None):
if line == None:
# Allow __init__ to be called with no arguments to allow the
# @classmethod constructor below.
return
fields = line.strip().split('\t')
self.seqname = fields[0]
self.source = fields[1]
self.feature = fields[2]
self.start = int(fields[3]) - 1
self.end = int(fields[4]) - 1
self.score = fields[5]
self.strand = fields[6]
self.frame = fields[7]
if self.frame != '.':
self.frame = int(self.frame)
self.attribute_string = fields[8]
self.parse_attribute_string()
@classmethod
def from_fields(cls, seqname, source, feature, start, end, score, strand, frame, attribute_string):
obj = cls()
obj.seqname = seqname
obj.source = source
obj.feature = feature
obj.start = start
obj.end = end
obj.score = score
obj.strand = strand
obj.frame = frame
obj.attribute_string = attribute_string
obj.parse_attribute_string()
return obj
@classmethod
def sequence_edge(cls, seqname, position):
obj = cls()
obj.seqname = seqname
obj.feature = 'edge'
obj.start = position
obj.end = position
obj.strand = '.'
obj.source = '.'
obj.score = '.'
obj.frame = '.'
obj.attribute_string = '.'
obj.parse_attribute_string()
return obj
def parse_attribute_string(self):
if self.attribute_string == '.':
parsed = {}
else:
fields = self.attribute_string.strip(';').split('; ')
pairs = [field.split() for field in fields]
parsed = {name: value.strip('"') for name, value in pairs}
self.attribute = parsed
def __str__(self):
fields = (self.seqname,
self.source,
self.feature,
str(self.start + 1),
str(self.end + 1),
self.score,
self.strand,
str(self.frame),
self.attribute_string,
)
line = '\t'.join(fields)
return line
@property
def pasteable(self):
return '{0}:{1}-{2}'.format(self.seqname, self.start, self.end)
def __hash__(self):
return hash(str(self))
def __eq__(self, other):
return str(self) == str(other)
@property
def comparison_key(self):
key = (self.seqname,
self.start,
self.end,
self.feature,
self.strand,
)
return key
def __lt__(self, other):
return self.comparison_key < other.comparison_key
def is_contained_in(self, other):
return self.seqname == other.seqname and \
self.start >= other.start and \
self.end <= other.end
def get_all_features(gtf_fn):
all_features = [Feature(line) for line in open(gtf_fn)]
return all_features
def get_noncoding_RNA_transcripts(gtf_fn):
all_features = get_all_features(gtf_fn)
transcripts = transcript_utils.get_transcripts(all_features, '/dev/null', '/dev/null')
rRNA_transcripts = []
tRNA_transcripts = []
other_ncRNA_transcripts = []
for transcript in transcripts:
if any(exon.source == 'rRNA' for exon in transcript.exons):
rRNA_transcripts.append(transcript)
elif any(exon.source == 'tRNA' for exon in transcript.exons):
tRNA_transcripts.append(transcript)
elif any('RNA' in exon.source for exon in transcript.exons):
other_ncRNA_transcripts.append(transcript)
return rRNA_transcripts, tRNA_transcripts, other_ncRNA_transcripts
def get_translated_transcripts(transcripts):
''' Returns a list of all elements in transcripts that are translated.
'''
translated_transcripts = [transcript for transcript in transcripts if any(transcript.CDSs)]
return translated_transcripts
def get_CDSs(gtf_fn, genome_dir, utr_fn):
all_features = get_all_features(gtf_fn)
transcripts = transcript.get_transcripts(all_features, genome_dir, utr_fn)
translated_transcripts = get_translated_transcripts(transcripts)
return sorted(translated_transcripts)
def make_yeast_list():
weinberg_fn = '/home/jah/projects/ribosomes/experiments/weinberg/most_weinberg_transcripts.txt'
yeast_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_list.txt'
gtf_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome'
all_features = get_all_features(gtf_fn)
transcripts = get_transcripts(all_features, genome_dir)
translated = get_translated_transcripts(transcripts)
nonoverlapping, _ = get_nonoverlapping_transcripts(transcripts, require_same_strand=True)
weinberg_names = {line.strip() for line in open(weinberg_fn)}
final_set = ({t.name for t in translated} & {n.name for n in nonoverlapping}) | weinberg_names
with open(yeast_fn, 'w') as yeast_fh:
for name in sorted(final_set):
yeast_fh.write('{0}\n'.format(name))