-
Notifications
You must be signed in to change notification settings - Fork 11
/
pca.py
209 lines (176 loc) · 8.12 KB
/
pca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#PCA.py
#Written by: Erik Poppleton
#Date: 3/6/2019
#Performs a principal component analysis on a trajectory
#the output JSON can be loaded into the viewere where it will overlay as arrows
import numpy as np
from UTILS.readers import LorenzoReader2, Cal_confs, get_input_parameter
from sys import exit
import argparse
from json import load, dumps
from Bio.SVDSuperimposer import SVDSuperimposer
from random import randint
from UTILS import parallelize
from warnings import catch_warnings, simplefilter
from os import environ
from compute_mean import compute_cms
fetch_np = lambda conf: np.array([
n.cm_pos for n in conf._nucleotides
])
def make_heatmap(covariance):
"""
Produces a heatmat plot of the covariance between every particle
Parameters:
covariance (numpy.array): The covariance matrix of particle positions
Displays:
A matplotlib imshow plot of the covariance
"""
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
a = ax.imshow(covariance, cmap='viridis', origin='lower')
ax.set(title = "interaction network",
ylabel="nucleotide id*3",
xlabel="nucleotide id*3")
b = fig.colorbar(a, ax=ax)
b.set_label("covariance", rotation = 270)
plt.show()
def get_pca(reader, align_conf, num_confs, start=None, stop=None):
"""
Performs principal component analysis on deviations from the mean structure
Parameters:
reader (readers.LorenzoReader2): An active reader on the trajectory file to analyze.
mean_structure (numpy.array): The position of each particle in the mean configuration. A 3xN array.
num_confs (int): The number of configurations in the reader.
<optional> start (int): The starting configuration ID to begin averaging at. Used if parallel.
<optional> stop (int): The configuration ID on which to end the averaging. Used if parallel.
Returns:
deviations_marix (numpy.array): The difference in position from the mean for each configuration.
"""
if stop is None:
stop = num_confs
else: stop = int(stop)
if start is None:
start = 0
else: start = int(start)
mysystem = reader._get_system(N_skip = start)
deviations_matrix = np.empty((stop, (len(align_conf))*3))
sup = SVDSuperimposer()
confid = 0
#for every configuration in the trajectory chunk, align it to the mean and compute positional difference for every particle
while mysystem != False and confid < stop:
print("-->", mysystem._time)
#mysystem.inbox_system()
cur_conf = fetch_np(mysystem)
sup.set(align_conf, cur_conf)
sup.run()
rot, tran = sup.get_rotran()
#equivalent to taking the dot product of the rotation array and every vector in the deviations array
cur_conf = np.einsum('ij, ki -> kj', rot, cur_conf) + tran
deviations_matrix[confid] = (cur_conf-align_conf).flatten()
confid += 1
mysystem = reader._get_system()
return deviations_matrix
def change_basis(reader, align_conf, num_confs, start=None, stop=None):
if stop is None:
stop = num_confs
else: stop = int(stop)
if start is None:
start = 0
else: start = int(start)
linear_terms = np.empty((stop, len(evectors[1])))
mysystem = reader._get_system(N_skip=start)
i = 0
sup = SVDSuperimposer()
while mysystem != False and i < stop:
print("-->", mysystem._time)
mysystem.inbox_system()
cur_conf = fetch_np(mysystem)
sup.set(align_conf, cur_conf)
sup.run()
rot, tran = sup.get_rotran()
#equivalent to taking the dot product of the rotation array and every vector in the deviations array
cur_conf = np.einsum('ij, ki -> kj', rot, cur_conf) + tran
cur_conf = cur_conf.flatten()
with catch_warnings(): #this produces an annoying warning about casting complex values to real values that is not relevant
simplefilter("ignore")
linear_terms[i] = np.linalg.solve(evectors, cur_conf)
i += 1
mysystem = reader._get_system()
return linear_terms
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Calculates a principal component analysis of nucleotide deviations over a trajectory")
parser.add_argument('inputfile', type=str, nargs=1, help="The inputfile used to run the simulation")
parser.add_argument('trajectory', type=str, nargs=1, help='the trajectory file you wish to analyze')
parser.add_argument('meanfile', type=str, nargs=1, help='The mean structure .json file from compute_mean.py')
parser.add_argument('outfile', type=str, nargs=1, help='the name of the .json file where the PCA will be written')
parser.add_argument('-p', nargs=1, type=int, dest='parallel', help="(optional) How many cores to use")
args = parser.parse_args()
conf_file = args.trajectory[0]
inputfile = args.inputfile[0]
mean_file = args.meanfile[0]
outfile = args.outfile[0]
parallel = args.parallel
if parallel:
n_cpus = args.parallel[0]
top_file = get_input_parameter(inputfile, "topology")
if "RNA" in get_input_parameter(inputfile, "interaction_type"):
environ["OXRNA"] = "1"
else:
environ["OXRNA"] = "0"
import UTILS.base #this needs to be imported after the model type is set
num_confs = Cal_confs(conf_file, top_file)
with open(mean_file) as file:
align_conf = load(file)['g_mean']
cms = compute_cms(align_conf) #all structures must have the same center of mass
align_conf -= cms
if not parallel:
r = LorenzoReader2(conf_file,top_file)
deviations_matrix = get_pca(r, align_conf, num_confs)
if parallel:
out = parallelize.fire_multiprocess(conf_file, top_file, get_pca, num_confs, align_conf)
deviations_matrix = np.concatenate([i for i in out])
#now that we have the deviations matrix we're gonna get the covariance and PCA it
#note that in the future we might want a switch for covariance vs correlation matrix because correlation (cov/stdev so all diagonals are 1) is better for really floppy structures
print("calculating covariance")
covariance = np.cov(deviations_matrix.T)
#make_heatmap(covariance)
print("calculating eigenvectors")
evalues, evectors = np.linalg.eig(covariance)
sort = evalues.argsort()[::-1]
evalues = evalues[sort]
evectors = evectors[sort].T
import matplotlib.pyplot as plt
plt.scatter(range(0, len(evalues)), evalues, s=6)
plt.xlabel("component")
plt.ylabel("eigen value")
plt.show()
mul = np.einsum('ij,i->ij',evectors[0:3], evalues[0:3])
out = np.matmul(deviations_matrix, mul.T).astype(float)
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(out[:,0], out[:,1], out[:,2], c='g', s=25)
plt.show()
weighted_sum = np.zeros_like(evectors[0])
for i in range(0, 1): #how many eigenvalues do you want?
weighted_sum += evalues[i]*evectors[i]
prep_pos_for_json = lambda conf: list(
list(p) for p in conf
)
with catch_warnings(): #this produces an annoying warning about casting complex values to real values that is not relevant
simplefilter("ignore")
output_vectors = weighted_sum.reshape(int(weighted_sum.shape[0]/3), 3).astype(float)
with open(outfile, "w+") as file:
file.write(dumps({
"pca" : prep_pos_for_json(output_vectors)
}))
#Now we're going to reconstruct each conf from the eigenvectors and use those weights to cluster the structures
if not parallel:
r = LorenzoReader2(conf_file,top_file)
linear_terms = change_basis(r, align_conf, num_confs)
if parallel:
out = parallelize.fire_multiprocess(conf_file, top_file, change_basis, num_confs, n_cpus, align_conf)
linear_terms = np.concatenate([i for i in out])
#truncated_terms = linear_terms[:,0:3]
from clustering import perform_DBSCAN
labs = perform_DBSCAN(linear_terms, num_confs, conf_file, inputfile)