In [None]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
from matplotlib.text import Text
import numpy as np

In [None]:
# Load every benchmark CSV found in the working directory into one frame.
# glob returns matches in arbitrary order, so the names are sorted to keep the
# concatenated row order reproducible from run to run.
metrics_filenames = sorted(glob.glob("*.csv"))
if not metrics_filenames:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError("No metrics CSV files (*.csv) found in the current working directory")

# Each file is read with a two-row header and its first column as the index.
dfs = [
    pd.read_csv(filename, header=[0, 1], skipinitialspace=True, index_col=0)
    for filename in metrics_filenames
]
metrics = pd.concat(dfs)

# Flatten the two-level header into plain column names. The assignment is
# positional, so it assumes every CSV stores its columns in exactly this order
# (TODO confirm against the script that writes the files).
# Per the original header labels, memory columns are in GB and execution times
# in seconds. The earlier `rename` of those labels was dropped because this
# assignment overwrote its result.
metrics.columns = [
    "max_memory_sklearn", "max_memory_chunkdot",
    "matrix_memory_sklearn", "matrix_memory_chunkdot",
    "execution_time_sklearn", "execution_time_chunkdot",
    "embedding_dim",
    "max_memory_to_use",
    "top_k"
]
metrics.head()

In [None]:
# Show the distinct benchmark parameters present in the loaded metrics.
print("Embedding Dim:", metrics["embedding_dim"].unique())
print("Max Memory to Use:", metrics["max_memory_to_use"].unique())
print("Top K:", metrics["top_k"].unique())
# The index holds the number of items (later cells group by "n_items"),
# so it must not be labelled "Top K" like the line above.
print("N Items:", metrics.index.unique())

In [None]:
# Keep only the runs whose index value (number of items) is at most 1E5.
is_small_run = metrics.index <= 1E5
small_metrics = metrics.loc[is_small_run]

# The plots below assume a single embedding dimension and a single top_k;
# on failure the assertion message lists the offending values.
for parameter in ("embedding_dim", "top_k"):
    assert small_metrics[parameter].nunique() == 1, f"{small_metrics[parameter].unique()}"

embedding_dim, top_k = (
    small_metrics[parameter].unique()[0] for parameter in ("embedding_dim", "top_k")
)
print(embedding_dim, top_k)

# Legend font size for the figures drawn afterwards.
plt.rc("legend", fontsize=11)

In [None]:
def _budget_series(frame, memory_budget_gb, column, label):
    """Select `column` for the runs made with `memory_budget_gb` and rename it to `label`.

    A one-column DataFrame is returned so that `.plot` uses `label` as the legend entry.
    """
    in_budget = frame["max_memory_to_use"] == memory_budget_gb
    return frame.loc[in_budget, [column]].rename(columns={column: label})


def _plot_chunkdot_runs(ax, frame, column, linewidth):
    """Draw the ChunkDot curves of `column` for the 5, 10 and 20 GB budgets on `ax`."""
    colors = {5: "darkorange", 10: "darkturquoise", 20: "darkorchid"}
    for memory_budget_gb, color in colors.items():
        runs = _budget_series(frame, memory_budget_gb, column, f"ChunkDot ({memory_budget_gb}GB)")
        if memory_budget_gb == 20:
            # Only the 20GB runs are averaged per n_items, presumably because
            # that budget was measured more than once -- TODO confirm.
            runs = runs.groupby("n_items").mean()
        runs.plot(ax=ax, linewidth=linewidth, color=color)


def _style_axis(ax, ylabel, title, label_fontsize):
    """Apply the axis labels, title, dashed grid and tick formatting shared by both panels."""
    ax.set_xlabel("Number of items", fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)
    ax.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
    ax.set_title(title, fontsize=15)
    ax.yaxis.grid(color='gray', linestyle='dashed')
    ax.xaxis.grid(color='gray', linestyle='dashed')
    ax.tick_params(axis='both', labelsize=13)


fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
fig.suptitle("Cosine Similarity Top K", fontsize=18)
linewidth = 5
label_fontsize = 15

# --- Left panel: peak memory -------------------------------------------------
# The Scikit-Learn columns are taken from the rows of the 10GB budget.
scikit_learn = _budget_series(small_metrics, 10, "max_memory_sklearn", "Scikit-Learn")
# Extrapolated curve 8 * N^2 / 1E9, shown only for N > 5E4 (presumably 8 bytes
# per entry of a dense N x N matrix, expressed in GB -- TODO confirm).
scikit_learn_extrapolated = scikit_learn.copy()
scikit_learn_extrapolated["Scikit-Learn extrapolated (8 * N^2)"] = [
    x ** 2 * 8 / 1E9 if x > 5E4 else None for x in scikit_learn.index
]
scikit_learn_extrapolated = scikit_learn_extrapolated[["Scikit-Learn extrapolated (8 * N^2)"]]

scikit_learn.plot(ax=axes[0], linewidth=linewidth, color="limegreen")
scikit_learn_extrapolated.plot(ax=axes[0], linewidth=linewidth, color="crimson", style="*")
_plot_chunkdot_runs(axes[0], small_metrics, "max_memory_chunkdot", linewidth)
_style_axis(axes[0], "Gigabytes", "Memory consumption", label_fontsize)

# --- Right panel: execution time ---------------------------------------------
scikit_learn = _budget_series(small_metrics, 10, "execution_time_sklearn", "Scikit-Learn")
scikit_learn.plot(ax=axes[1], linewidth=linewidth, color="limegreen")
# small_metrics is already limited to index <= 1E5, so the 20GB runs need no
# additional upper bound on the number of items here.
_plot_chunkdot_runs(axes[1], small_metrics, "execution_time_chunkdot", linewidth)
_style_axis(axes[1], "Seconds", "Execution time", label_fontsize)

fig.savefig('../images/cosine_similarity_top_k-1E5.png', bbox_inches='tight')

In [None]:
# Keep only the runs executed with a 20 (GB) memory budget, whatever their size.
uses_20gb = metrics["max_memory_to_use"] == 20
big_metrics = metrics.loc[uses_20gb]

# The figure below assumes a single embedding dimension and a single top_k;
# on failure the assertion message lists the offending values.
for parameter in ("embedding_dim", "top_k"):
    assert big_metrics[parameter].nunique() == 1, f"{big_metrics[parameter].unique()}"

embedding_dim, top_k = (
    big_metrics[parameter].unique()[0] for parameter in ("embedding_dim", "top_k")
)
print(embedding_dim, top_k)

In [None]:
# --- Left panel data: peak memory of the 20GB ChunkDot runs -------------------
chunkdot_20 = big_metrics[["max_memory_chunkdot"]].rename(columns={"max_memory_chunkdot": "ChunkDot (20GB)"})
# Average the rows that share the same n_items.
chunkdot_20 = chunkdot_20.groupby("n_items").mean()
# Extrapolated curve 8 * N^2 / 1E9 on a grid from 0 to 1E6 in steps of 1E5;
# the N = 0 point is left empty (None) by the x > 5E4 condition.
n_items_grid = range(0, int(1E6) + 1, int(1E5))
scikit_learn_extrapolated = pd.DataFrame(
    index=n_items_grid,
    data={"Scikit-Learn extrapolated (8 * N^2)": [x ** 2 * 8 / 1E9 if x > 5E4 else None for x in n_items_grid]},
)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
fig.suptitle("Cosine Similarity Top K", fontsize=18)
linewidth = 3
label_fontsize = 15

scikit_learn_extrapolated.plot(ax=axes[0], linewidth=linewidth, color="crimson", style="*", logy=True)
chunkdot_20.plot(ax=axes[0], linewidth=linewidth, color="darkorchid", marker="o")

# Add an explicit tick at 20 (the memory budget) next to the automatic log ticks.
# Read the automatic ticks and their label texts first, then fix the positions
# with set_yticks BEFORE assigning labels: set_yticklabels is only meant to be
# used once tick positions are fixed, and warns otherwise.
y_tick_positions = list(axes[0].get_yticks()) + [20]
y_tick_labels = [label.get_text() for label in axes[0].get_yticklabels()] + ['20']
axes[0].set_yticks(y_tick_positions)
axes[0].set_yticklabels(y_tick_labels)

axes[0].set_xlabel("Number of items", fontsize=label_fontsize)
axes[0].set_ylabel("Gigabytes", fontsize=label_fontsize)
axes[0].ticklabel_format(style='sci', axis='x', scilimits=(0,0))
axes[0].set_title("Memory consumption", fontsize=15)
axes[0].yaxis.grid(color='gray', linestyle='dashed')
axes[0].xaxis.grid(color='gray', linestyle='dashed')
axes[0].set_ylim(1, 1E4)
axes[0].tick_params(axis='both', labelsize=13)

# --- Right panel data: execution time, converted from seconds to minutes ------
chunkdot_20 = (big_metrics[["execution_time_chunkdot"]] / 60).rename(columns={"execution_time_chunkdot": "ChunkDot (20GB)"})
chunkdot_20 = chunkdot_20.groupby("n_items").mean()

chunkdot_20.plot(ax=axes[1], linewidth=linewidth, color="darkorchid", marker="o", ylim=(-5, 120))
axes[1].set_xlabel("Number of items", fontsize=label_fontsize)
axes[1].set_ylabel("Minutes", fontsize=label_fontsize)
axes[1].ticklabel_format(style='sci', axis='x', scilimits=(0,0))
axes[1].set_title("Execution time", fontsize=15)
axes[1].yaxis.grid(color='gray', linestyle='dashed')
axes[1].xaxis.grid(color='gray', linestyle='dashed')
axes[1].tick_params(axis='both', labelsize=13)

fig.savefig('../images/cosine_similarity_top_k-1E6.png', bbox_inches='tight')