# fetch_shelve_reads2.py (forked from skiniry/Trips-Viz)
import os
import collections
from bokeh.palettes import all_palettes
from sqlitedict import SqliteDict


# Merge two dictionaries of read counts ({readlen: {position: count}}),
# summing counts where both share a read length and position.
def merge_dicts(dict1, dict2):
    for readlen in dict2:
        if readlen not in dict1:
            dict1[readlen] = dict2[readlen]
        else:
            for pos in dict2[readlen]:
                if pos in dict1[readlen]:
                    dict1[readlen][pos] += dict2[readlen][pos]
                else:
                    dict1[readlen][pos] = dict2[readlen][pos]
    return dict1
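

# A minimal sketch of the merge semantics (illustrative values, not from any
# real shelve): counts at shared (readlen, pos) keys are summed, and dict1 is
# modified in place as well as returned.
#   merge_dicts({28: {100: 5}}, {28: {100: 2, 101: 1}, 29: {50: 3}})
#   -> {28: {100: 7, 101: 1}, 29: {50: 3}}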


# Create a dictionary of read counts at each position in a transcript.
def get_reads(read_type, min_read, max_read, tran, user_files, tranlen, coverage, organism, subcodon, noisered, primetype, filetype, readscore, secondary_readscore=1, pcr=False, get_mismatches=False):
    mismatch_dict = collections.OrderedDict()
    mismatch_dict["A"] = {}
    mismatch_dict["T"] = {}
    mismatch_dict["G"] = {}
    mismatch_dict["C"] = {}
    if get_mismatches == True:
        for i in range(0, tranlen+1):
            mismatch_dict["A"][i] = 0
            mismatch_dict["T"][i] = 0
            mismatch_dict["G"][i] = 0
            mismatch_dict["C"][i] = 0
    master_dict = collections.OrderedDict()
    three_frame_dict = {0: {}, 1: {}, 2: {}}
    for i in range(0, tranlen):
        master_dict[i] = 0
    master_file_dict = {}
    list_master_dict = {}
    # First make a master dict consisting of all the read dicts from each filename
    offset_dict = {}
    secondary_offset_dict = {}
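    # Each shelve is assumed (judging from the lookups below) to map a
    # transcript ID to {"unambig": {readlen: {pos: count}}, "ambig": {...},
    # "seq": {pos: {nuc: count}}, ...}, plus a top-level "offsets" entry
    # keyed by primetype.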
    if read_type == "unambig":
        if filetype in user_files:
            for file_id in user_files[filetype]:
                filename = user_files[filetype][file_id]
                if os.path.isfile(filename):
                    sqlite_db = SqliteDict(filename, autocommit=False)
                else:
                    return "File not found", filename.split("/")[-1]
                accepted_offsets = {}
                accepted_secondary_offsets = {}
                if "offsets" in sqlite_db:
                    all_offsets = sqlite_db["offsets"][primetype]["offsets"]
                    scores = sqlite_db["offsets"][primetype]["read_scores"]
                else:
                    all_offsets = {}
                    scores = {}
                # No stored read scores: give every read length a score of 1
                # and a default offset of 15
                if scores == {}:
                    for i in range(25, 150):
                        scores[i] = 1
                        all_offsets[i] = 15
                for rl in scores:
                    if float(scores[rl]) >= float(readscore):
                        if rl in all_offsets:
                            accepted_offsets[rl] = all_offsets[rl]
                        else:
                            accepted_offsets[rl] = 15
                if accepted_offsets != {}:
                    offset_dict[filename] = {}
                    for length in accepted_offsets:
                        #if length not in accepted_secondary_offsets:
                        offset_dict[filename][length] = accepted_offsets[length]
                if get_mismatches == True:
                    try:
                        sqlite_db_seqvar = sqlite_db[tran]["seq"]
                        for pos in sqlite_db_seqvar:
                            # convert to one based
                            fixed_pos = pos+1
                            for char in sqlite_db_seqvar[pos]:
                                #if char != "N":
                                if fixed_pos not in mismatch_dict[char]:
                                    mismatch_dict[char][fixed_pos] = 0
                                count = sqlite_db_seqvar[pos][char]
                                mismatch_dict[char][fixed_pos] += count
                    except Exception:
                        pass
                try:
                    alltrandict = sqlite_db[tran]
                    unambig_tran_dict = alltrandict["unambig"]
                    if pcr == True:
                        if "unambig_pcr" in alltrandict:
                            unambig_tran_dict = merge_dicts(unambig_tran_dict, alltrandict["unambig_pcr"])
                    sqlite_db.close()
                    master_file_dict[filename] = unambig_tran_dict
                except Exception as e:
                    print("error: {}".format(e))
    else:
        if filetype in user_files:
            for file_id in user_files[filetype]:
                filename = user_files[filetype][file_id]
                if os.path.isfile(filename):
                    sqlite_db = SqliteDict(filename, autocommit=False)
                else:
                    return "File not found", mismatch_dict
                accepted_offsets = {}
                accepted_secondary_offsets = {}
                if "offsets" in sqlite_db:
                    all_offsets = sqlite_db["offsets"][primetype]["offsets"]
                    scores = sqlite_db["offsets"][primetype]["read_scores"]
                else:
                    all_offsets = {}
                    scores = {}
                # Oyster has no read scores, so subcodon profiles were not
                # displaying; give every read length a score of 1 as a fallback
                if scores == {}:
                    for i in range(25, 50):
                        scores[i] = 1
                        all_offsets[i] = 15
                for rl in scores:
                    if float(scores[rl]) >= float(readscore):
                        if rl in all_offsets:
                            accepted_offsets[rl] = all_offsets[rl]
                        else:
                            accepted_offsets[rl] = 15
                if accepted_offsets != {}:
                    offset_dict[filename] = {}
                    for length in accepted_offsets:
                        offset_dict[filename][length] = accepted_offsets[length]
                try:
                    sqlite_db_seqvar = sqlite_db[tran]["seq"]
                    for pos in sqlite_db_seqvar:
                        # convert to one based
                        fixed_pos = pos+1
                        for char in sqlite_db_seqvar[pos]:
                            if char != "N":
                                if fixed_pos not in mismatch_dict[char]:
                                    mismatch_dict[char][fixed_pos] = 0
                                count = sqlite_db_seqvar[pos][char]
                                mismatch_dict[char][fixed_pos] += count
                except Exception:
                    pass
                try:
                    alltrandict = sqlite_db[tran]
                    sqlite_db.close()
                    unambig_tran_dict = alltrandict["unambig"]
                    if "ambig" in alltrandict:
                        ambig_tran_dict = alltrandict["ambig"]
                    else:
                        ambig_tran_dict = {}
                    # TODO: Change merge_dicts to take a list of dicts instead of two
                    trandict = merge_dicts(unambig_tran_dict, ambig_tran_dict)
                    if pcr == True:
                        if "unambig_pcr" in alltrandict:
                            trandict = merge_dicts(trandict, alltrandict["unambig_pcr"])
                        if "ambig_pcr" in alltrandict:
                            trandict = merge_dicts(trandict, alltrandict["ambig_pcr"])
                    master_file_dict[filename] = trandict
                except Exception as e:
                    print("error: {}".format(e))
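    # At this point master_file_dict maps filename -> {readlen: {pos: count}}
    # of raw read positions; offsets are applied below depending on the
    # requested plot type.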
    # Next check coverage; if true, calculate coverage for each read length and return the dict
    if coverage == True and subcodon == False:
        for filename in master_file_dict:
            for readlen in master_file_dict[filename]:
                if readlen >= min_read and readlen <= max_read:
                    for pos in master_file_dict[filename][readlen]:
                        count = master_file_dict[filename][readlen][pos]
                        if pos-1 not in master_dict:
                            master_dict[pos-1] = 0
                        for i in range(pos, pos+(readlen+1)):
                            if i in master_dict:
                                master_dict[i] += count
                            else:
                                master_dict[i] = count
                            # use this so the line graph does not have 'ramps'
                            if i+1 not in master_dict:
                                master_dict[i+1] = 0
        return master_dict, mismatch_dict
    # Next check if subcodon is true; if not, just apply an offset of 15 to everything
    if subcodon == False:
        sorted_master_dict = collections.OrderedDict()
        for filename in master_file_dict:
            for readlen in master_file_dict[filename]:
                if readlen >= min_read and readlen <= max_read:
                    for pos in master_file_dict[filename][readlen]:
                        # use this so the line graph does not have 'ramps'
                        count = master_file_dict[filename][readlen][pos]
                        offset_pos = pos+15
                        master_dict[offset_pos] += count
                        if offset_pos+1 not in master_dict:
                            master_dict[offset_pos+1] = 0
        return master_dict, mismatch_dict
    # Fetch subcodon reads
    if subcodon == True:
        if coverage == False:
            for filename in master_file_dict:
                if filename not in offset_dict:
                    continue
                for readlen in master_file_dict[filename]:
                    if readlen >= min_read and readlen <= max_read:
                        if readlen in offset_dict[filename]:
                            offset = offset_dict[filename][readlen]
                            for pos in master_file_dict[filename][readlen]:
                                count = master_file_dict[filename][readlen][pos]
                                if primetype == "threeprime":
                                    pos += readlen
                                offset_pos = pos+offset
                                try:
                                    master_dict[offset_pos] += count
                                except Exception as e:
                                    print("Error: tried adding to position {} but tranlen is only {}".format(e, tranlen))
        elif coverage == True:
            for filename in master_file_dict:
                if filename not in offset_dict:
                    continue
                for readlen in master_file_dict[filename]:
                    if readlen >= min_read and readlen <= max_read:
                        if readlen in offset_dict[filename]:
                            offset = offset_dict[filename][readlen]
                            for pos in master_file_dict[filename][readlen]:
                                count = master_file_dict[filename][readlen][pos]
                                if primetype == "threeprime":
                                    pos += readlen
                                for i in range(0, readlen, 3):
                                    new_offset_pos = (i+pos)+(offset % 3)
                                    try:
                                        master_dict[new_offset_pos] += count
                                    except Exception as e:
                                        print("Error: tried adding to position {} but tranlen is only {}".format(e, tranlen))
    return master_dict, mismatch_dict
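

# A hypothetical call of get_reads (a sketch only; the file path, transcript ID
# and user_files layout are illustrative, not taken from this repo):
#   files = {"riboseq": {1: "/path/to/sample.sqlite"}}
#   counts, mismatches = get_reads("unambig", 25, 35, "ENST00000311936", files,
#                                  3000, False, "homo_sapiens", True, False,
#                                  "fiveprime", "riboseq", 0.5)
#   # counts maps transcript position -> read count at the offset-adjusted site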


# Create a dictionary of counts at each position, grouped by the average
# read length at that position (used to assign plot colours).
def get_readlength_breakdown(read_type, min_read, max_read, tran, user_files, offset_dict, tranlen, coverage, organism, subcodon, noisered, primetype, preprocess, filetype, colorbar_minread, colorbar_maxread):
    master_dict = {}
    color_range = float(colorbar_maxread - colorbar_minread)
    color_list = all_palettes["RdYlGn"][10]
    for i in range(0, tranlen+max_read):
        master_dict[i] = {}
        for x in range(min_read, max_read+1):
            master_dict[i][x] = 0
    # colored_master_dict maps a colour (hex string from the palette) to a
    # dictionary of position:count
    colored_master_dict = {}
    master_file_dict = {}
    # First make a master dict consisting of all the read dicts from each filename
    if read_type == "unambig":
        if filetype in user_files:
            for file_id in user_files[filetype]:
                filename = user_files[filetype][file_id]
                if os.path.isfile(filename):
                    openshelf = SqliteDict(filename)
                else:
                    continue
                if tran in openshelf:
                    alltrandict = dict(openshelf[tran])
                    unambig_tran_dict = alltrandict["unambig"]
                    openshelf.close()
                    trandict = unambig_tran_dict
                    master_file_dict[filename] = trandict
    else:
        if filetype in user_files:
            for file_id in user_files[filetype]:
                filename = user_files[filetype][file_id]
                if os.path.isfile(filename):
                    openshelf = SqliteDict(filename)
                else:
                    continue
                if tran in openshelf:
                    alltrandict = dict(openshelf[tran])
                    openshelf.close()
                    unambig_tran_dict = alltrandict["unambig"]
                    ambig_tran_dict = alltrandict["ambig"]
                    # TODO: Change merge_dicts to take a list of dicts instead of two
                    trandict = merge_dicts(unambig_tran_dict, ambig_tran_dict)
                    master_file_dict[filename] = trandict
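    # master_file_dict now maps filename -> {readlen: {pos: count}}; aggregate
    # per-position counts below, using either full-read coverage or a stored
    # fiveprime offset (default 15) per read length.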
    for filename in master_file_dict:
        for readlen in master_file_dict[filename]:
            if readlen >= min_read and readlen <= max_read:
                if coverage == True:
                    for pos in master_file_dict[filename][readlen]:
                        count = master_file_dict[filename][readlen][pos]
                        for i in range(pos, pos+(readlen+1)):
                            master_dict[i][readlen] += count
                else:
                    if os.path.isfile(filename):
                        openshelf = SqliteDict(filename)
                    else:
                        continue
                    offsets = openshelf["offsets"]["fiveprime"]["offsets"]
                    if readlen in offsets:
                        offset = offsets[readlen]
                    else:
                        offset = 15
                    for pos in master_file_dict[filename][readlen]:
                        count = master_file_dict[filename][readlen][pos]
                        if (pos+offset) in master_dict:
                            master_dict[pos+offset][readlen] += count
    sorted_master_dict = collections.OrderedDict()
    for key in sorted(master_dict.keys()):
        sorted_master_dict[key] = master_dict[key]
    for pos in sorted_master_dict:
        count = sum(sorted_master_dict[pos].values())
        tot_readlen = 0.0
        tot_count = 0.0001
        for readlen in sorted_master_dict[pos]:
            tot_count += sorted_master_dict[pos][readlen]
            tot_readlen += (sorted_master_dict[pos][readlen]*readlen)
        avg_readlen = int(tot_readlen/tot_count)
        if avg_readlen < colorbar_minread:
            avg_readlen = colorbar_minread
        if avg_readlen > colorbar_maxread:
            avg_readlen = colorbar_maxread
        # find where this avg readlen lies in the range of min readlen to max readlen and use that to assign a colour
        y = avg_readlen - colorbar_minread
        per = y/color_range
        final_per = int(per*10)
        if final_per > 9:
            final_per = 9
        color = color_list[final_per]
        if color not in colored_master_dict:
            colored_master_dict[color] = collections.OrderedDict()
            for i in range(0, tranlen+max_read+1):
                colored_master_dict[color][i] = 0
        if pos not in colored_master_dict[color]:
            colored_master_dict[color][pos] = 0
        colored_master_dict[color][pos] += count
    return color_list, colored_master_dict
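

# A worked example of the colour binning above (illustrative numbers): with
# colorbar_minread=25 and colorbar_maxread=34, color_range is 9.0, so an
# average read length of 30 gives per = (30-25)/9.0 ~= 0.56 and final_per = 5,
# i.e. the sixth colour of the 10-colour RdYlGn palette.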


# Create a dictionary of mismatches at each position
def get_seq_var(user_files, tranlen, organism, tran):
    mismatch_dict = collections.OrderedDict()
    mismatch_dict["A"] = {}
    mismatch_dict["T"] = {}
    mismatch_dict["G"] = {}
    mismatch_dict["C"] = {}
    for i in range(0, tranlen+1):
        mismatch_dict["A"][i] = 0
        mismatch_dict["T"][i] = 0
        mismatch_dict["G"][i] = 0
        mismatch_dict["C"][i] = 0
    for filetype in ["riboseq", "rnaseq"]:
        if filetype in user_files:
            for file_id in user_files[filetype]:
                filename = user_files[filetype][file_id]
                if os.path.isfile(filename):
                    sqlite_db = SqliteDict(filename, autocommit=False)
                else:
                    return "File not found"
                if tran in sqlite_db:
                    if "seq" in sqlite_db[tran]:
                        sqlite_db_seqvar = dict(sqlite_db[tran]["seq"])
                        for pos in sqlite_db_seqvar:
                            # convert to one based
                            fixed_pos = pos+1
                            for char in sqlite_db_seqvar[pos]:
                                if char != "N":
                                    if fixed_pos not in mismatch_dict[char]:
                                        mismatch_dict[char][fixed_pos] = 0
                                    count = sqlite_db_seqvar[pos][char]
                                    mismatch_dict[char][fixed_pos] += count
                sqlite_db.close()
    return mismatch_dict
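

# A hypothetical call of get_seq_var (a sketch; the path and transcript ID are
# illustrative):
#   files = {"riboseq": {1: "/path/to/sample.sqlite"}}
#   seq_var = get_seq_var(files, 3000, "homo_sapiens", "ENST00000311936")
#   # seq_var["A"][pos] is the count of reads carrying an A at the one-based
#   # position pos where the reference differs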