From 73ce4b7bc1d170824ed70696275850b373a365a9 Mon Sep 17 00:00:00 2001
From: Sam Ireland
Date: Thu, 11 Jul 2019 23:35:00 +0100
Subject: [PATCH] Update speed script

---
NOTE(review): line breaks and indentation below were reconstructed from a
whitespace-collapsed copy of this patch. Blank context lines are inferred from
the hunk headers; in the last hunk two of them were placed after each
plt.clf() as a best guess -- verify against the repository before applying.
Angle-bracketed text (author address, markup inside the query string
literals) appears to have been stripped upstream and is not restored here.

 .gitignore       |  1 +
 scripts/speed.py | 49 +++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index ef8f1a10..34a946f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ notes
 profiles
 *.png
 *.json
+*.svg
diff --git a/scripts/speed.py b/scripts/speed.py
index 7a7c6708..bb35d6ab 100644
--- a/scripts/speed.py
+++ b/scripts/speed.py
@@ -20,7 +20,8 @@ def get_string(code):
     text = response.content if code.endswith(".mmtf") else response.text
     return text
 
-if len(sys.argv) > 1 and sys.argv[1] == "-rebuild":
+
+if len(sys.argv) > 1 and sys.argv[1] == "--rebuild":
     query = ""\
     "org.pdb.query.simple.ChemCompFormulaQuery"\
     "ZN"
@@ -70,8 +71,9 @@ def get_string(code):
     with open("scripts/speed.json", "w") as f:
         json.dump(data, f)
 
+
 with open("scripts/speed.json") as f:
-    data = json.load(f)[:500]
+    data = json.load(f)
 
 print("There are {} data points".format(len(data)))
 
@@ -80,27 +82,52 @@ def get_string(code):
 mmtfs_x, mmtfs_y = zip(*[[d["atoms"], d[".mmtf"]] for d in data if d[".mmtf"] and d["models"] == 1 and d[".mmtf"] < 20])
 bios_x, bios_y = zip(*[[d["atoms"], d["biopython"]] for d in data if d["biopython"] and d["models"] == 1 and d["biopython"] < 20])
 
+def best_fit(X, Y, label):
+
+    xbar = sum(X)/len(X)
+    ybar = sum(Y)/len(Y)
+    n = len(X) # or len(Y)
+
+    numer = sum([xi*yi for xi,yi in zip(X, Y)]) - n * xbar * ybar
+    denum = sum([xi**2 for xi in X]) - n * xbar**2
+
+    b = numer / denum
+    a = ybar - b * xbar
+
+    print('{} best fit line:\ny = {:.6f} + {:.6f}x'.format(label, a, b))
+
+    return a, b
+
 plt.xscale("log")
 plt.yscale("log")
-plt.scatter(pdbs_x, pdbs_y, s=6, c="#58B19F", label=".pdb", alpha=0.8, linewidths=0)
-plt.scatter(cifs_x, cifs_y, s=6, c="#FD7272", label=".cif", alpha=0.8, linewidths=0)
-plt.scatter(mmtfs_x, mmtfs_y, s=6, c="#182C61", label=".mmtf", alpha=0.8, linewidths=0)
+best_fit(cifs_x, cifs_y, "cif")
+best_fit(pdbs_x, pdbs_y, "pdb")
+best_fit(mmtfs_x, mmtfs_y, "mmtf")
+best_fit(bios_x, bios_y, "biopython")
+plt.scatter(cifs_x, cifs_y, s=6, c="#FD7272", label=".cif", alpha=0.3, linewidths=0)
+plt.scatter(pdbs_x, pdbs_y, s=6, c="#58B19F", label=".pdb", alpha=0.3, linewidths=0)
+plt.scatter(mmtfs_x, mmtfs_y, s=6, c="#182C61", label=".mmtf", alpha=0.3, linewidths=0)
 plt.xlabel("Atom Count")
 plt.ylabel("Parse time (s)")
-plt.legend()
-plt.savefig("scripts/compare.png", dpi=1000)
+plt.xlim([100, 1000000])
+plt.ylim([0.001, 100])
+plt.legend(loc=2)
+plt.savefig("scripts/format-speed.svg", dpi=1000)
 plt.clf()
 
 
 plt.xscale("log")
 plt.yscale("log")
-plt.scatter(pdbs_x, pdbs_y, s=6, c="#58B19F", label=".pdb (atomium)", alpha=0.8, linewidths=0)
-plt.scatter(bios_x, bios_y, s=6, c="#D6A2E8", label=".pdb (biopython)", alpha=0.8, linewidths=0)
+plt.scatter(pdbs_x, pdbs_y, s=6, c="#58B19F", label=".pdb (atomium)", alpha=0.5, linewidths=0)
+plt.scatter(bios_x, bios_y, s=6, c="#D6A2E8", label=".pdb (biopython)", alpha=0.5, linewidths=0)
 plt.xlabel("Atom Count")
 plt.ylabel("Parse time (s)")
-plt.legend()
-plt.savefig("scripts/compare2.png", dpi=1000)
+plt.xlim([100, 100000])
+plt.ylim([0.001, 10])
+plt.legend(loc=2)
+plt.savefig("scripts/library-speed.svg", dpi=1000)
 plt.clf()
 
 
 #plt.scatter(bios_x, bios_y, s=4, c="#F97F51")
+