-
Notifications
You must be signed in to change notification settings - Fork 0
/
linear_hist_sites_batch_v1.py
167 lines (132 loc) · 6.45 KB
/
linear_hist_sites_batch_v1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
py_name = 'linear_hist_sites_batch_v1.py'
################
# Leo Vo Feb 2019
# Script Function:
# Bins coverage/integration read data across the genome additively into uniform bins of defined size. Outputs a bar
# plot for visualization.
# Script Input(s):
# 1/ Csv files from either Geneious output (as Coverage) or from python code (as Reads representing
# integration 'events'), containing coordinates and count (2 columns). These csvs needs to have exactly 1 header row.
# 2/ One csv file containing information about the csv files being run (such as Sample ID, Description, etc)
# In order to run multiple csv files in one batch continuously.
# Script Output(s):
# One genome-wide Tn-Seq bar plot with peaks corresponding to where Coverage/Integration Reads are found, per input csv.
#################
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import fnmatch
# font control
plt.rcParams['svg.fonttype'] = 'none' # important so that text stays as intact characters in the output
plt.rcParams['font.sans-serif'] = "Arial"
plt.rcParams['font.family'] = "sans-serif"
# change directory based on where input files are
os.chdir('C:\\Users\\Leo Vo\\Desktop\\SEK_NGS\\NGS data cleaned unique hits\\Himar\\Trans_Sites')
# turns of interactive mode for matplotlib, optional
plt.ioff()
# some run parameters
bin_size = 5000 # in bp
exp_date = 190202
mode = 'Trans_Sites' # either 'Coverage' or 'Trans_Sites', for csv input file discovery
norm_on = False # normalize each data point against total (genome mapped) reads in each sample (as percentage)
def readnbin(filename, bin_size): # main binning function
# read in csv files
csv = np.genfromtxt(filename, delimiter=",", skip_header=1)
z = csv[:,1] # takes only the "Coverage" column into the input list
a = [int(i) for i in z] # convert everything from floats to int
# sets up empty list which will become the output list
a2_size = int((len(a)-1)/bin_size) + 1 # determines size of out list (number of bins post binning)
a2 = [0]*a2_size
# bin data additively
y = 0
while y <= len(a):
x = 0
while x < bin_size:
if y + x < len(a):
a2[int(y / bin_size)] += a[y + x]
x += 1
else:
x += bin_size
y += bin_size
# set up second list that will serve as x-axis ('coordinate') for output graph
b = []
for i in range(1, len(a2)+1):
b.append(i)
# convert lists to arrays for matplotlib
b = np.asarray(b)
a2 = np.asarray(a2)
return b, a2 # return the lists. If binning using coverage, also return max(a) here for ytick labels later
def plot(filename, bin_size, date, desc, psl, spacer_location, norm_on): # main graphing function
spacer_bin = int(spacer_location/bin_size) + 0.5 # determine which bin the spacer lies in
# set up figure and call readnbin()
fig, axs = plt.subplots(1, 1, tight_layout=True)
b, a2 = readnbin(filename, bin_size) # call readnbin to get arrays for xy axes
total_reads = sum(a2)
if norm_on: # normalize based on total reads (percentage)
a2 = (100*a2)/total_reads # make sure a2 is returned as an array from readnbin()
# change y-max based on normalization option
if norm_on:
max_y = 100
else:
max_y = max(a2) # if using coverage, take max(a) from readnbin() instead of max(a2) (the 'pre binning' maximum)
axs.scatter(spacer_bin, -0.03 * max_y, marker="^", c="#A32E79", s=33) # 1 scatter plot point for protospacer marker
axs.bar(b, a2, color='#0D62AC', width=1.05) # main bar graph, width slightly >1 to avoid gaps between bars
# configure spines of the graph; 'remove' top and right spines
axs.spines['top'].set_visible(False)
axs.spines['right'].set_visible(False)
axs.spines['bottom'].set_position('zero')
axs.spines['left'].set_bounds(0, max_y)
axs.spines['bottom'].set_bounds(0, len(a2)) # bottom line starts and end with length of genome
axs.spines['left'].set_position('zero')
# 2 ticks on the y axis, one at 0 and on at max value of y (max reads)
axs.set_yticks([0, max_y])
if norm_on:
axs.set_yticklabels([0, '100%'])
else:
axs.set_yticklabels([0, max(a2)])
# set up xticks
bin_scale = int(100000/bin_size)
axs.set_xticks([0,5*bin_scale,10*bin_scale,15*bin_scale,20*bin_scale,25*bin_scale,30*bin_scale,35*bin_scale,
40*bin_scale,45*bin_scale])
axs.set_xticklabels([0,0.5,1,1.5,2,2.5,3,3.5,4,4.5])
# size of output figures
fig.set_size_inches(8.9, 3.1)
# axis labeling
#plt.xlabel("E. coli genomic coordinate (Mbp)")
#plt.ylabel("Read count")
axs.yaxis.set_label_coords(-0.02, 0.5)
# set limits of graph area - different from spine limits
# leave extra space on top for info text, and space on bottom for the spacer marker
plt.gca().set_xlim(left=-0.15*len(a2), right=len(a2))
plt.gca().set_ylim(bottom=-0.12*max_y, top=1.20*max_y)
# info text
plt.text(5*bin_scale, 1.17*max_y, "{}-{}\n{}\ngRNA Plasmid = {}\nTotal Reads = {}"
.format(code, date, desc, psl, total_reads))
# save figures
plt.savefig('plots\\TEST_{}_{}_{}.svg'.format(date, code, desc), dpi=500) # check folders
#plt.savefig('plots\\{}_{}_{}.png'.format(date, code, desc), dpi=500)
plt.close() # closes the matplotlib preview popup window
# read in the info csv file and begin processing files as a batch
with open('input.csv', newline='') as csvfile:
reader = csv.reader(csvfile, delimiter=',')
next(reader, None) # skips header row
for row in reader:
code = row[0]
# search for file in cwd based on Sample ID ('code')
filename = 'none'
for i in os.listdir('.'):
if fnmatch.fnmatch(i, "TEST{}*{}.csv".format(code, mode)):
filename = i
break
if filename != 'none':
desc = row[6]
psl = row[7]
if row[8] == '-' or row[8] == ' ':
spacer_location = -100000000 # set a large negative value if spacer isn't found
else:
spacer_location = int(row[8])
plot(filename, bin_size, exp_date, desc, psl, spacer_location, norm_on)
else:
print("WARNING - File Not Found For {}".format(code))