In [1]:
import pandas as pd
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
import logging

import sys
import os
# Put the current working directory on the import path; presumably this is
# what lets the project-local imports that follow resolve (TODO confirm).
notebook_dir = os.getcwd()
sys.path.append(notebook_dir)
from model import Model
from dataloader import DataLoader
from constants import ARR_LENS, S_VALUES

In [2]:
# Send INFO-and-above records to merge.log; filemode="w" truncates the file,
# so every run of this cell on a fresh kernel starts with an empty log.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename="merge.log",
    filemode="w",
)

In [11]:
def run_experiment(args):
    """Run a single sort experiment and collect its cost metrics.

    The parameters arrive as one tuple so the function can be handed
    directly to ``Pool.imap``.

    Args:
        args: ``(arr_len, S)`` pair. ``arr_len`` is forwarded to
            ``DataLoader`` and ``S`` is passed to ``Model`` as its second
            positional argument.

    Returns:
        dict: ``"Array Length"`` and ``"S"`` echo the inputs;
        ``"Execution Time"`` and ``"Key Comparisons"`` are read from
        ``model.execution_time`` and ``model.key_cmp`` after ``train()``.
    """
    arr_len, S = args
    array = DataLoader(arr_len=arr_len).generate_data()
    model = Model(array, S)

    # train() is the call that actually sorts the array, so the start and
    # completion messages have to bracket it for the log timestamps to mean
    # anything.
    logging.info(f"Starting sort for arr_len={arr_len} with S={S}")
    model.train()
    logging.info(f"Completed sort for arr_len={arr_len} with S={S}")

    return {
        "Array Length": arr_len,
        # Record S as well: experiments may share an array length and differ
        # only in S, and their rows would otherwise be indistinguishable.
        "S": S,
        "Execution Time": model.execution_time,
        "Key Comparisons": model.key_cmp
    }

In [12]:
# Smoke test: sort a small array and check the result before launching the
# long-running experiments.
arr_len = 30
S = 3

dataloader = DataLoader(arr_len=arr_len)
array = dataloader.generate_data()

model = Model(array, S)

# Snapshot the input first so the check below does not depend on whether
# train() sorts in place or rebinds model.array.
values_before = list(model.array)

print(model.array)
model.train()
print(model.array)

# Fail loudly on a broken sort instead of relying on eyeballing the printouts.
assert list(model.array) == sorted(values_before), "Model.train() did not sort the array"

[20 30  3 18 23  5  8 28 21 25 29 22  4 26 19 13  6  2 15  9 27 10 11 14
 16  1 17 24 12  7]
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30]


In [None]:
# Each entry is an (arr_len, S) pair handed to run_experiment as one argument.
experiments = [(10_000_000, 7), (10_000_000, 1)]

# Never start more workers than there are experiments: the surplus processes
# would only sit idle. The floor of 1 keeps Pool() valid for an empty list.
n_workers = max(1, min(cpu_count(), len(experiments)))

# NOTE(review): run_experiment is defined in the notebook itself. Under the
# "spawn" start method the workers may be unable to import it, which can show
# up as a progress bar stuck at 0% -- confirm on the target platform.
with Pool(n_workers) as pool:
    # imap yields results in submission order, so results[i] belongs to
    # experiments[i].
    results = list(
        tqdm(
            pool.imap(run_experiment, experiments),
            total=len(experiments),
            desc="Processing Arrays",
        )
    )

df_results = pd.DataFrame(results)
df_results.to_parquet("merge.parquet", index=False)

print("Execution complete")

Processing Arrays:   0%|          | 0/2 [00:00<?, ?it/s]