-
Notifications
You must be signed in to change notification settings - Fork 0
/
trans_dist_mmei_v1_batch.py
431 lines (370 loc) · 20.7 KB
/
trans_dist_mmei_v1_batch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
py_name = 'trans_dist_mmei_v1_batch.py'
# Leo Vo, Feb 2019
# For measuring distance between end of protospacer to TN7 flanks for NGS reads from an MmeI NGS library
# Reads in FastQ files outputed from GENEIOUS Prime Software, which have transposon flank sequences removed
# and are reverse complemented from the original Illumina fastq output.
# Uses a master csv to run multiple files in a batch
# Outputs: The script can output multiple things depending on the output options in the code below:
# - An Excel file containing all relevant information
# - Plots of transposition distances; either RL and LR plotted separately, or overlapped together
import os
import fnmatch
from Bio import SeqIO
import xlsxwriter
from pathlib import Path
import datetime as dt
import csv
import heapq
import matplotlib.pyplot as plt
plt.rcParams['svg.fonttype'] = 'none' # important so that text stays as characters in the svg output
# change directory based on where input files are
os.chdir('C:\\Users\\Leo Vo\\Desktop\\SEK_NGS\\190217_Integration_site_analysis_SEK')
# change if necessary
exp_date = "190202"
user = "Leo Vo"
# output options
excel = False # outputs an excel spreadsheet summary
excel_name = '{}_transposition_distance_mmei_duplicates.xlsx'.format(exp_date) ## Change output excel file name here
plot = True # outputs plots of integration around primary site
plot_overlap = True # overlaps the distributions of the opposite orientations **Will not plot if plot = False above
#reads in BL21DE3 RefSeq
for record in SeqIO.parse("genome.fasta", "fasta"):
genome = record.seq.upper() # remember to convert to upper case
genome_rc = genome.reverse_complement()
if excel:
# initializes excel output file, define formats used below
excel_out_path = Path(excel_name)
if not excel_out_path.is_file():
log = xlsxwriter.Workbook(excel_name)
bold = log.add_format({'bold': True})
upsizebold = log.add_format()
upsizebold.set_font_size(16)
upsizebold.set_bold()
percentage_format = log.add_format()
percentage_format.set_num_format('0.00%')
deci3_format = log.add_format()
deci3_format.set_num_format('0.000')
red_font = log.add_format({'font_color': 'red'})
blue_font = log.add_format({'font_color': 'blue'})
red_percent = log.add_format()
red_percent.set_num_format('0.00%')
red_percent.set_font_color('red')
blue_percent = log.add_format()
blue_percent.set_num_format('0.00%')
blue_percent.set_font_color('blue')
red_deci3 = log.add_format()
red_deci3.set_num_format('0.000')
red_deci3.set_font_color('red')
blue_deci3 = log.add_format()
blue_deci3.set_num_format('0.000')
blue_deci3.set_font_color('blue')
# make landing info sheet containing all information shared by sheets in the excel file
infosheet = log.add_worksheet("General Info")
infosheet.write(0, 0, "NGS MmeI Analysis of Transposition Distance ", upsizebold)
infosheet.set_column(0, 0, 35)
infosheet.write(2, 0, "NextSeq/NGS Data Collection Date", bold)
infosheet.write(2, 1, exp_date)
infosheet.write(3, 0, "Python Data Analysis Date", bold)
infosheet.write(3, 1, str(dt.datetime.now())[0:10])
infosheet.write(4, 0, "Username/Initials", bold)
infosheet.write(4, 1, user)
infosheet.write(5, 0, "Python Code Used", bold)
infosheet.write(5, 1, py_name)
infosheet.write(6, 0, "Notes", bold)
infosheet.write(7, 0, '"Query Window" - larger window for query distances (not the same as on-target window'
', same strand as the BL21 RefSeq')
infosheet.write(8, 0, '"Genomic Base" - Base directly 5\' of integration site, on the SAME strand as protospacer')
infosheet.write(9, 0, '"Example Reads" - Reads here are trimmed to 17bp and the reverse complement of reads in'
'the Geneious fastq output', )
# read in csv file containing sample codes, info and spacers.
# locate fastq file based on code
# for each file, determine spacer orientation then switch genome to revcom if needed (refseq)
# match spacer with refseq
# That + 32 = spacer_end. From refseq take window 400 before and 400 after spacer_end as refseq_query
# for each fastq take reads, reverse and trim them down to the last 17 bp. Then map to refseq_query or revcom of that
# if it maps to location on query, then it's RL, and trans_dist is location + 17 - spacer_end
# if it maps to location on query revcom, it's LR, then trans_dist is len refseq - location -17 + 5 - spacer_end
# add these trans_dist into separate out_tally lists for RL and LR, and also into a common out_tally
# determine most freq trans_dist from common out_tally - main_site
# set on_target as the window -50 and +50 from trans_site.
# go through the separate out tally lists and remove any trans_dist not within on_target
# sum of number of items from these two lists versus total of reads in the fastq is on-target freq
# ratio of the 2 lists is the orientation bias
# output the separate trans dists into excel output in different colors, and also a common column
# use matplotlib to plot and save separately graphs for the 2 orientations
def dist_find(code, psl, description, filename, direction, refseq, spacer, date, excel, plot, plot_overlap): # main analysis function
# map spacer to refseq and determine query window
query_length = 500
if refseq.find(spacer) >= 0:
spacer_end = refseq.find(spacer) + 32
query = refseq[spacer_end-90:spacer_end+query_length] # '-99' accounts for the -50bp of the on-target later
query_rc = query.reverse_complement()
spacer_end = 90 # resets spacer end index to middle of query window (no longer using full refseq)
else:
print("ERROR - Spacer not found within RefSeq")
return
total = 0 # counts total reads in fastq file
out_list_all = [] # list holding common trans_dist
out_list_rl = [] # list holding indv RL trans_dist values
out_list_lr = [] # list holding indv LR trans_dist values
# these lists are longer than query_length to hold negative values of trans_dist
# the output excel cuts of list using query_length so those values will not show
out_tally_rl = [0] * (query_length+spacer_end+20) # list tallying freq of tran_dist for RL
out_tally_lr = [0] * (query_length+spacer_end+20) # list tallying freq of tran_dist for LR
example_reads_rl = ['X'] * (query_length+spacer_end+20) # to hold example reads mapping to each trans_dist for RL
example_reads_lr = ['X'] * (query_length+spacer_end+20) # to hold example reads mapping to each trans_dist for LR
for record in SeqIO.parse(filename, "fastq"): # loop through all reads in fastq file
total += 1
rev_seq = record.seq.reverse_complement()
new_seq = rev_seq[len(rev_seq)-17:] # trim to last 17 base pair
if query.find(new_seq) >= 0: # corresponds to RL
trans_dist = query.find(new_seq) + 17 - spacer_end # distance in bp from end of protospacer
out_list_all.append(trans_dist) # append to holding lists for processing later
out_list_rl.append(trans_dist)
if out_tally_rl[trans_dist] == 0: # add read to example list if this is the first occurrence
example_reads_rl[trans_dist] = new_seq
out_tally_rl[trans_dist] += 1 # count into tally list
elif query_rc.find(new_seq) >= 0: # corresponds to LR
trans_dist = len(query) - query_rc.find(new_seq) - 17 + 5 - spacer_end # dist in bp from end of protospacer
out_list_all.append(trans_dist) # append to tally lists for processing later
out_list_lr.append(trans_dist)
if out_tally_lr[trans_dist] == 0: # add read to example list if this is the first occurrence
example_reads_lr[trans_dist] = new_seq
out_tally_lr[trans_dist] += 1 # count into tally list
# determine most frequent trans_dist
out_tally_all = [0] * query_length
for i in range(0, len(out_tally_all)):
out_tally_all[i] = out_tally_rl[i] + out_tally_lr[i]
for x, y in enumerate(out_tally_all):
if y == max(out_tally_all):
main_site = x + spacer_end # remember to convert dist to site of integration
# define on target window
on_target_lower = main_site - 50
on_target_upper = main_site + 50
# move any trans_dist within this window into a final holding list and clears old holding list
final_list_rl = []
for dist in out_list_rl:
if on_target_lower <= (dist + spacer_end) <= on_target_upper: # convert dist to site of integration
final_list_rl.append(dist)
final_list_lr = []
for dist in out_list_lr:
if on_target_lower <= (dist + spacer_end) <= on_target_upper: # convert dist to site of integration
final_list_lr.append(dist)
# determine on target frequency
on_target_total = len(final_list_rl) + len(final_list_lr)
off_target = total - on_target_total
# determine top 3 most common trans_dist for highlight box
# for combined RL and LR
indices = [] # for zipping with out_tally lists
for i in range(0, query_length):
indices.append(i)
top_3 = heapq.nlargest(3, zip(out_tally_all, indices)) # exists as a list of smaller 2-item lists
if excel:
#set up excel output sheet
logsheet = log.add_worksheet(code)
logsheet.set_column(0, 0, 24)
logsheet.set_column(1, 1, 20)
logsheet.set_column(2, 6, 17)
logsheet.set_column(7, 12, 19)
logsheet.write(3, 3, " ") # for clearer aesthetic
logsheet.write(0, 0, "Sample ID", bold)
logsheet.write(0, 1, code)
logsheet.write(1, 0, "Description", bold)
logsheet.write(1, 1, description)
logsheet.write(2, 0, "Target Location", bold)
if direction == 'fw':
logsheet.write(2, 1, "5' of Integration Site")
else:
logsheet.write(2, 1, "3' of Integration Site (RevCom)")
logsheet.write(3, 0, "Query Window", bold)
if direction == 'fw':
logsheet.write(3, 1, str(query))
if direction == 'rv':
logsheet.write(3, 1, str(query_rc))
logsheet.write(4, 0, "Plasmid encoding gRNA", bold)
logsheet.write(4, 1, psl)
logsheet.write(5, 0, "Protospacer", bold)
logsheet.write(5, 1, spacer)
logsheet.write(6, 0, "Total Reads", bold)
logsheet.write(6, 1, total)
logsheet.write(7, 0, "On Target Reads", bold)
logsheet.write(7, 1, on_target_total)
logsheet.write(7, 2, on_target_total / total, percentage_format)
logsheet.write(8, 0, "Off Target Reads", bold)
logsheet.write(8, 1, off_target)
logsheet.write(8, 2, off_target / total, percentage_format)
logsheet.write(9, 0, "On Target Reads in RL Orientation", bold)
logsheet.write(9, 1, len(final_list_rl))
logsheet.write(9, 2, len(final_list_rl) / total, percentage_format)
logsheet.write(10, 0, "On Target Reads in LR Orientation", bold)
logsheet.write(10, 1, len(final_list_lr))
logsheet.write(10, 2, len(final_list_lr) / total, percentage_format)
logsheet.write(11, 1, "Protospacer-Transposon Distance", bold)
for i in range(0, query_length):
logsheet.write(i + 12, 1, i)
logsheet.write(11, 0, "Genomic Base", bold)
for i in range(-1, query_length - 1):
logsheet.write(i + 13, 0, query[i+spacer_end]) # shift back 1 to get the base right before transposition
logsheet.write(11, 2, "Number of Reads (RL)", bold)
for i in range(0, query_length):
logsheet.write(i + 12, 2, out_tally_rl[i], red_font)
logsheet.write(11, 3, "% of Total Reads (RL)", bold)
for i in range(0, query_length):
logsheet.write(i + 12, 3, out_tally_rl[i]/total, red_percent)
logsheet.write(11, 4, "Normalized Read Count (RL)", bold)
if max(out_tally_rl) > 0:
for i in range(0, query_length):
logsheet.write(i + 12, 4, out_tally_rl[i] / max(out_tally_rl), red_deci3)
logsheet.write(11, 5, "Number of Reads (LR)", bold)
for i in range(0, query_length):
logsheet.write(i + 12, 5, out_tally_lr[i], blue_font)
logsheet.write(11, 6, "% of Total Reads (LR)", bold)
for i in range(0, query_length):
logsheet.write(i + 12, 6, out_tally_lr[i] / total, blue_percent)
logsheet.write(11, 7, "Normalized Read Count (LR)", bold)
if max(out_tally_lr) > 0:
for i in range(0, query_length):
logsheet.write(i + 12, 7, out_tally_lr[i] / max(out_tally_lr), blue_deci3)
logsheet.write(11, 8, "Number of Reads (Combined)", bold)
for i in range(0, query_length):
logsheet.write(i + 12, 8, out_tally_all[i])
logsheet.write(11, 9, "% of Total Reads (Combined)", bold)
for i in range(0, query_length):
logsheet.write(i + 12, 9, out_tally_all[i] / total, percentage_format)
logsheet.write(11, 10, "Normalized Read Count (Combined)", bold)
if max(out_tally_all) > 0:
for i in range(0, query_length):
logsheet.write(i + 12, 10, out_tally_all[i] / max(out_tally_all), deci3_format)
logsheet.write(11, 11, "Example Reads RL", bold)
for i in range(0, query_length):
logsheet.write(i + 12, 11, str(example_reads_rl[i]), red_font)
logsheet.write(11, 12, "Example Reads LR", bold)
for i in range(0, query_length):
logsheet.write(i + 12, 12, str(example_reads_lr[i]), blue_font)
# 'highlight box', take from top_3 list determined above
logsheet.write(0, 4, 'Most Frequent Transposition Distances (bp)', bold)
logsheet.write(1, 4, top_3[0][1])
logsheet.write(1, 5, top_3[0][0] / total, percentage_format)
logsheet.write(1, 6, out_tally_rl[top_3[0][1]]/total, red_percent)
logsheet.write(1, 7, out_tally_lr[top_3[0][1]] / total, blue_percent)
logsheet.write(2, 4, top_3[1][1])
logsheet.write(2, 5, top_3[1][0] / total, percentage_format)
logsheet.write(2, 6, out_tally_rl[top_3[1][1]] / total, red_percent)
logsheet.write(2, 7, out_tally_lr[top_3[1][1]] / total, blue_percent)
logsheet.write(3, 4, top_3[2][1])
logsheet.write(3, 5, top_3[2][0] / total, percentage_format)
logsheet.write(3, 6, out_tally_rl[top_3[2][1]] / total, red_percent)
logsheet.write(3, 7, out_tally_lr[top_3[2][1]] / total, blue_percent)
logsheet.write(4, 4, 'On Target Frequency', bold)
logsheet.write(4, 5, on_target_total / total, percentage_format)
logsheet.write(5, 4, 'Orientation Bias (R->L:L->R)', bold)
logsheet.write(5, 5, '{} : 1'.format(round(len(final_list_rl)/(len(final_list_lr)+0.00000001), 2))) # in case LR is 0
# plot and save graphs if plot setting = True
# Only plots a certain window (e.g. from 40bp to 60bp) - change that in the xlim options below
if plot:
x_axis = [] # artificial x-axis
for i in range(20, 61):
x_axis.append(i)
y_rl = out_tally_rl[20:61]
y_lr = out_tally_lr[20:61]
max_y = max(max(y_rl), max(y_lr)) # for scaling y axis
if not plot_overlap:
fig, axs = plt.subplots(1, 2)
fig. tight_layout(rect=[0.15, 0.1, 1, 0.9])
title = fig.suptitle("{} - {}\nOn-target frequency: {}%\nOrientation bias (R->L:L->R): {}:1"
.format(code, description, round(100*on_target_total/total, 1),
round(len(final_list_rl)/(len(final_list_lr)+0.00000001), 2)))
title.set_y(0.88)
# first graph on the left in red
axs[0].bar(x_axis, y_rl, color='tab:orange', width=1.0)
axs[0].set_title("R->L Integration Events")
# second graph on the right in blue
axs[1].bar(x_axis, y_lr, color='tab:blue', width=1.0)
axs[1].set_title("L->R Integration Events")
fig.subplots_adjust(wspace=0.7)
for axs in axs.flat:
axs.spines['top'].set_visible(False)
axs.spines['right'].set_visible(False)
# ax.spines['bottom'].set_visible(False)
# axs.spines['left'].set_visible(False)
axs.spines['bottom'].set_position('zero')
axs.spines['left'].set_bounds(0, max_y)
axs.set_xticks([40,45,50,55,60])
axs.set_xticklabels([40,45,50,55,60])
axs.set_yticks([0, max_y])
axs.set_yticklabels([0, max_y])
axs.set_xlim(left=39, right=61) ## Change window here
axs.set_ylim(bottom=0, top=1.05*(max_y))
axs.set(xlabel="Distance from target site (bp)", ylabel="Read count")
axs.yaxis.set_label_coords(-0.1,0.5)
fig.set_size_inches(6, 4.2)
fig.subplots_adjust(top=0.65)
#plt.xlabel("Distance from target site (bp)")
#plt.ylabel("Read count")
#plt.gca().set_xlim(left=35, right=60)
#plt.gca().set_ylim(bottom=0, top=(1.3*max_y))
# plt.savefig('test.svg', dpi=250)
plt.savefig('Dist_Output\\{}_{}_{}.png'.format(date, code, description), dpi=300)
plt.close()
if plot_overlap:
fig, axs = plt.subplots(1, 1, tight_layout=True)
title = fig.suptitle("{} - {} / On-target = {}% / Bias = {} :1".format(
code, description, round(100*on_target_total/total, 1),
round(len(final_list_rl)/(len(final_list_lr)+0.00000001), 2)))
title.set_y(0.9)
# LR graph is colorless with a border
axs.bar(x_axis, y_lr, color='none', edgecolor='#153C6B', linewidth=1.0, width=1.01, zorder=1)
# RL graph is blue with no border (behind bordered RL)
axs.bar(x_axis, y_rl, color='#83B0DD', edgecolor='#83B0DD', linewidth=1.0, width=1.01, zorder=0)
axs.spines['top'].set_visible(False)
axs.spines['right'].set_visible(False)
# ax.spines['bottom'].set_visible(False)
# axs.spines['left'].set_visible(False)
axs.spines['bottom'].set_position('zero')
axs.spines['left'].set_bounds(0, max_y)
axs.set_xticks([40, 42, 45, 50, 55, 60])
axs.set_xticklabels([40, 0, 45, 50, 55, 60])
axs.set_yticks([0, max_y])
axs.set_yticklabels([0, max_y])
axs.set_xlim(left=42, right=58) ## Change window here
axs.set_ylim(bottom=0, top=1.25 * (max_y))
axs.set(xlabel="Distance from target site (bp)", ylabel="Read count")
axs.yaxis.set_label_coords(-0.05, 0.4)
fig.set_size_inches(5, 4.2)
# plt.xlabel("Distance from target site (bp)")
# plt.ylabel("Read count")
# plt.gca().set_xlim(left=35, right=60)
# plt.gca().set_ylim(bottom=0, top=(1.3*max_y))
# plt.savefig('test.svg', dpi=250)
plt.savefig('Dist_Output_Overlap\\{}_{}_overlapped_{}.svg'.format(date, code, description), dpi=500)
plt.close()
# analyze all input files back-to-back using a master input .csv file
with open('input_spacers.csv', newline='') as csvfile:
reader = csv.reader(csvfile, delimiter=',')
next(reader, None)
for row in reader:
code = row[0]
filename = 'none'
for i in os.listdir('.'):
if fnmatch.fnmatch(i, "*{}*.fastq".format(code)):
filename = i
break
if filename != 'none':
description = row[6]
psl = row[7]
direction = str(row[4]).lower() ## spacer direction
## set refeq based on direction
if direction == 'fw':
refseq = genome
elif direction == 'rv':
refseq = genome_rc
else:
print("Direction Error = {}".format(code))
break
spacer = str(row[10]).upper() #spacer sequence, convert to uppercase
# run main dist_find function with the relevant input variables
dist_find(code, psl, description, filename, direction, refseq, spacer, exp_date, excel, plot, plot_overlap)
else:
print("WARNING - File Not Found For {}".format(code))
if excel:
log.close()