From 73ce4b7bc1d170824ed70696275850b373a365a9 Mon Sep 17 00:00:00 2001
From: Sam Ireland <sam.ireland.uk@gmail.com>
Date: Thu, 11 Jul 2019 23:35:00 +0100
Subject: [PATCH] Update speed script

---
 .gitignore       |  1 +
 scripts/speed.py | 49 +++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 39 insertions(+), 11 deletions(-)
diff --git a/.gitignore b/.gitignore
index ef8f1a10..34a946f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ notes
 profiles
 *.png
 *.json
+*.svg
diff --git a/scripts/speed.py b/scripts/speed.py
index 7a7c6708..bb35d6ab 100644
--- a/scripts/speed.py
+++ b/scripts/speed.py
@@ -20,7 +20,8 @@ def get_string(code):
         text = response.content if code.endswith(".mmtf") else response.text
     return text
 
-if len(sys.argv) > 1 and sys.argv[1] == "-rebuild":
+
+if len(sys.argv) > 1 and sys.argv[1] == "--rebuild":
     query = "<orgPdbQuery>"\
     "<queryType>org.pdb.query.simple.ChemCompFormulaQuery</queryType>"\
     "<formula>ZN</formula></orgPdbQuery>"
@@ -70,8 +71,9 @@ def get_string(code):
         with open("scripts/speed.json", "w") as f:
             json.dump(data, f)
 
+
 with open("scripts/speed.json") as f:
-    data = json.load(f)[:500]
+    data = json.load(f)
 
 print("There are {} data points".format(len(data)))
 
@@ -80,27 +82,52 @@ def get_string(code):
 mmtfs_x, mmtfs_y = zip(*[[d["atoms"], d[".mmtf"]] for d in data if d[".mmtf"] and d["models"] == 1 and d[".mmtf"] < 20])
 bios_x, bios_y = zip(*[[d["atoms"], d["biopython"]] for d in data if d["biopython"] and d["models"] == 1 and d["biopython"] < 20])
 
+def best_fit(X, Y, label):
+    # Fit the least-squares line y = a + b*x to the (X, Y) points; print it and return (a, b).
+    xbar = sum(X)/len(X)
+    ybar = sum(Y)/len(Y)
+    n = len(X) # number of points; assumes len(X) == len(Y) (zip below would truncate otherwise)
+    # Slope terms: numer = sum(x*y) - n*xbar*ybar, denum = sum(x^2) - n*xbar^2.
+    numer = sum([xi*yi for xi,yi in zip(X, Y)]) - n * xbar * ybar
+    denum = sum([xi**2 for xi in X]) - n * xbar**2
+    # NOTE(review): denum is (near) zero when all x values are equal; an exact 0 raises ZeroDivisionError.
+    b = numer / denum
+    a = ybar - b * xbar
+    # Report the fitted line to stdout, prefixed with the caller-supplied label.
+    print('{} best fit line:\ny = {:.6f} + {:.6f}x'.format(label, a, b))
+    # a is the intercept and b the slope of the fitted line.
+    return a, b
+
 plt.xscale("log")
 plt.yscale("log")
-plt.scatter(pdbs_x, pdbs_y, s=6, c="#58B19F", label=".pdb", alpha=0.8, linewidths=0)
-plt.scatter(cifs_x, cifs_y, s=6, c="#FD7272", label=".cif", alpha=0.8, linewidths=0)
-plt.scatter(mmtfs_x, mmtfs_y, s=6, c="#182C61", label=".mmtf", alpha=0.8, linewidths=0)
+best_fit(cifs_x, cifs_y, "cif")
+best_fit(pdbs_x, pdbs_y, "pdb")
+best_fit(mmtfs_x, mmtfs_y, "mmtf")
+best_fit(bios_x, bios_y, "biopython")
+plt.scatter(cifs_x, cifs_y, s=6, c="#FD7272", label=".cif", alpha=0.3, linewidths=0)
+plt.scatter(pdbs_x, pdbs_y, s=6, c="#58B19F", label=".pdb", alpha=0.3, linewidths=0)
+plt.scatter(mmtfs_x, mmtfs_y, s=6, c="#182C61", label=".mmtf", alpha=0.3, linewidths=0)
 plt.xlabel("Atom Count")
 plt.ylabel("Parse time (s)")
-plt.legend()
-plt.savefig("scripts/compare.png", dpi=1000)
+plt.xlim([100, 1000000])
+plt.ylim([0.001, 100])
+plt.legend(loc=2)
+plt.savefig("scripts/format-speed.svg", dpi=1000)
 plt.clf()
 
 
 plt.xscale("log")
 plt.yscale("log")
-plt.scatter(pdbs_x, pdbs_y, s=6, c="#58B19F", label=".pdb (atomium)", alpha=0.8, linewidths=0)
-plt.scatter(bios_x, bios_y, s=6, c="#D6A2E8", label=".pdb (biopython)", alpha=0.8, linewidths=0)
+plt.scatter(pdbs_x, pdbs_y, s=6, c="#58B19F", label=".pdb (atomium)", alpha=0.5, linewidths=0)
+plt.scatter(bios_x, bios_y, s=6, c="#D6A2E8", label=".pdb (biopython)", alpha=0.5, linewidths=0)
 plt.xlabel("Atom Count")
 plt.ylabel("Parse time (s)")
-plt.legend()
-plt.savefig("scripts/compare2.png", dpi=1000)
+plt.xlim([100, 100000])
+plt.ylim([0.001, 10])
+plt.legend(loc=2)
+plt.savefig("scripts/library-speed.svg", dpi=1000)
 plt.clf()
 
 
 #plt.scatter(bios_x, bios_y, s=4, c="#F97F51")
+