In [1]:
import pandas as pd
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
import logging

import sys
import os
notebook_dir = os.getcwd()
sys.path.append(notebook_dir)
from model import Model
from dataloader import DataLoader
from constants import ARR_LENS, S_VALUES

In [2]:
# Send INFO-and-above records to merge.log, timestamped; mode "w" starts the
# file fresh each time this configuration is actually applied.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename="merge.log",
    filemode="w",
)

In [3]:
def run_experiment(args):
    """Generate one array, run the model on it, and report its metrics.

    The function reads nothing from notebook-level state other than the
    imported ``DataLoader`` and ``Model`` classes, so it can be executed in a
    worker process regardless of which other cells have been run.

    Args:
        args: The array length; forwarded to ``DataLoader`` as ``arr_len``.

    Returns:
        dict: ``"Array Length"`` (the requested length), ``"Execution Time"``
        (``model.execution_time``) and ``"Key Comparisons"``
        (``model.key_cmp``), both read after ``model.train()`` has returned.
    """
    arr_len = args
    dataloader = DataLoader(arr_len=arr_len)
    array = dataloader.generate_data()

    model = Model(array)

    # Only arr_len is logged: it is the sole parameter of this experiment.
    # Lazy %-style arguments avoid building the message when INFO is disabled.
    logging.info("Starting sort for arr_len=%s", arr_len)
    model.train()
    logging.info("Completed sort for arr_len=%s", arr_len)

    return {
        "Array Length": arr_len,
        "Execution Time": model.execution_time,
        "Key Comparisons": model.key_cmp
    }

In [4]:
# TEST
# Smoke test on a small input: run the model once, show the data before and
# after, and verify the result instead of relying on visual inspection.
arr_len = 30
S = 3  # not read by this cell; kept as a notebook-level global

dataloader = DataLoader(arr_len=arr_len)
array = dataloader.generate_data()
# Snapshot the expected order before the model sees the data, in case the
# model reorders the input in place.
expected_sorted = sorted(array)

model = Model(array)

print(model.array)
model.train()
print(model.array)

assert list(model.array) == expected_sorted, "model.train() did not leave model.array sorted"

[ 5 26  8  1 16 20  6 27  9  3 12  4 11 23 22 18 29 17 19 21 25 24 15 13
 28 14  2 10 30  7]
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30]


In [5]:
# Fan the experiments out over one worker per CPU. imap yields results in the
# same order as ARR_LENS, so the progress bar advances as ordered results arrive.
with Pool(processes=cpu_count()) as pool:
    results = list(
        tqdm(
            pool.imap(run_experiment, ARR_LENS),
            total=len(ARR_LENS),
            desc="Processing Arrays",
        )
    )

# One row per array length; persist without the positional index.
df_results = pd.DataFrame(results)
df_results.to_parquet("merge.parquet", index=False)

print("Execution complete")

Processing Arrays: 100%|██████████| 100/100 [01:40<00:00,  1.00s/it]


Execution complete
