# Collect data from runs

In [None]:
import logging
import sys
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from profiling import get_memory, get_time
from chunkdot.cosine_similarity_top_k import cosine_similarity_top_k
from chunkdot.utils import get_memory_available, warm_up_chunked_dot

In [None]:
# Run chunkdot's warm-up routine once before any measurement is taken.
# NOTE(review): presumably this triggers JIT compilation up front so that
# compile time is not counted in the timings below — confirm.
warm_up_chunked_dot()

In [None]:
# Configure the root logger to emit records of level DEBUG and above to stdout.
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

In [None]:
# Show the memory chunkdot reports as available, scaled down by 1e9.
# NOTE(review): assumes get_memory_available() returns bytes, which would
# make this value GB — confirm.
get_memory_available() / 1E9

In [None]:
import math
# NOTE(review): looks like a back-of-the-envelope estimate — the side length
# of a square matrix of 8-byte elements that occupies M bytes; confirm intent.
M = 5E9
math.sqrt(M / 8)

In [None]:
# Benchmark configuration.
embedding_dim = 256
max_n_items = 1E5
top_k = 100
max_memory_to_use = 10E9
n_steps = 10
# Largest problem size for which the scikit-learn baseline is still run.
# NOTE(review): presumably larger sizes are skipped because the dense
# n x n similarity matrix becomes too large — confirm.
sklearn_max_n_items = 50000

# Problem sizes to benchmark: multiples of `step` up to `max_n_items`.
step = int(max_n_items / n_steps)
n_items = range(step, int(max_n_items) + 1, step)
numba_function_kwargs = {"top_k": top_k, "max_memory": max_memory_to_use, "force_memory": True}

# One entry per problem size in `n_items`; None marks a skipped run.
max_memory = {
    "sklearn": [],
    "chunkdot": [],
}
matrix_memory = {
    "sklearn": [],
    "chunkdot": [],
}
execution_time = {
    "sklearn": [],
    "chunkdot": [],
}


def _benchmark(label, function, n, function_kwargs=None):
    """Profile memory and time of `function` for `n` items, printing progress.

    Uses the module-level `embedding_dim` for the embedding size.

    Args:
        label: Name printed in the progress messages.
        function: Callable handed to `get_memory` and `get_time`.
        n: Number of items, passed through as `n_items`.
        function_kwargs: Optional dict forwarded as `function_kwargs` to both
            profiling helpers; omitted from the calls entirely when None.

    Returns:
        A `(max_size, matrix_size, time)` tuple holding the two values
        returned by `get_memory` followed by the value returned by `get_time`.
    """
    # Only forward function_kwargs when supplied, so the baseline calls stay
    # exactly as they were (relying on the profiling helpers' own default).
    extra = {} if function_kwargs is None else {"function_kwargs": function_kwargs}

    print(f"\n\t{label} memory")
    max_size, matrix_size = get_memory(function, n_items=n, embedding_dim=embedding_dim, **extra)
    print(f"\tMax:{max_size / 1E9:0.2f}GB  Matrix:{matrix_size / 1E9:0.2f}GB")

    print(f"\n\t{label} time")
    _time = get_time(function, n_items=n, embedding_dim=embedding_dim, n_iterations=1, **extra)
    print(f"\t{_time:0.2f} sec")
    return max_size, matrix_size, _time


for i, n in enumerate(n_items):
    print(f"\nN: {n} - {i + 1}/{len(n_items)}")

    if n > sklearn_max_n_items:
        print("\n\tSklearn memory: Skipped")
        print("\n\tSklearn time: Skipped")
        # Keep the result lists aligned with n_items by recording placeholders.
        max_size, matrix_size, _time = None, None, None
    else:
        max_size, matrix_size, _time = _benchmark("Sklearn", cosine_similarity, n)
    max_memory["sklearn"].append(max_size)
    matrix_memory["sklearn"].append(matrix_size)
    execution_time["sklearn"].append(_time)

    max_size, matrix_size, _time = _benchmark(
        "ChunkDot", cosine_similarity_top_k, n, function_kwargs=numba_function_kwargs
    )
    max_memory["chunkdot"].append(max_size)
    matrix_memory["chunkdot"].append(matrix_size)
    execution_time["chunkdot"].append(_time)

In [None]:
import pandas as pd

# Memory budget scaled down by 1e9 and truncated to an integer; used both as
# a column value and in the output file name.
max_memory_to_use_GB = int(max_memory_to_use / 1E9)

# One frame per metric; the memory figures are divided by 1e9 to match the
# "(GB)" labels, while execution times are kept as collected.
dfs = {
    "max_memory (GB)": pd.DataFrame(max_memory).div(1E9),
    "matrix_memory (GB)": pd.DataFrame(matrix_memory).div(1E9),
    "execution_time (s)": pd.DataFrame(execution_time),
}

# Put the per-metric frames side by side, with the metric name as the outer
# column level.
df = pd.concat(list(dfs.values()), keys=list(dfs.keys()), axis=1)

# Attach the run parameters as extra columns and index the rows by problem size.
df = df.assign(
    n_items=n_items,
    embedding_dim=embedding_dim,
    max_memory_to_use_GB=max_memory_to_use_GB,
    top_k=top_k,
)
df = df.set_index("n_items")

# Persist the metrics; the file name encodes the benchmark configuration.
df.to_csv(
    f"metrics_embedding_dim-{embedding_dim}-top_k-{top_k}"
    f"-max_memory_to_use-{max_memory_to_use_GB}GB.csv"
)
df