In [1]:
import numpy as np
import pandas as pd
import time
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
import logging

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Route INFO-and-above records to a log file; filemode="w" starts the file
# afresh instead of appending to a previous run's log.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename="experiments.log",
    filemode="w",
)

In [3]:
# Experiment grid.
# Array lengths: 100 sizes, from 100_000 up to 10_000_000 in steps of 100_000.
ARR_LENS = [step * 100_000 for step in range(1, 101)]
# Hybrid-sort thresholds: the 50 odd values 1, 3, ..., 99.
S_VALUES = [2 * k + 1 for k in range(50)]

In [4]:
class DataLoader:
    """Produce shuffled integer arrays used as sorting input.

    Each loader owns a private ``numpy.random.Generator`` rather than drawing
    from the process-global ``np.random`` state. Forked worker processes
    inherit a copy of the global state, so relying on it lets several workers
    emit the very same "random" arrays; a per-instance generator seeded from
    fresh entropy (``seed=None``) avoids that, and an explicit ``seed`` makes
    the data reproducible.

    Parameters
    ----------
    arr_len : int
        Number of elements in each generated array.
    seed : int or None, optional
        Seed forwarded to ``np.random.default_rng``. ``None`` (the default)
        seeds the generator from OS entropy.
    """

    def __init__(self, arr_len, seed=None):
        self.arr_len = arr_len
        self.seed = seed
        self._rng = np.random.default_rng(seed)

    def generate_data(self):
        """Return a new random permutation of the integers ``1..arr_len``."""
        return self._rng.permutation(np.arange(1, self.arr_len + 1))

In [5]:
class Model:
    """Top-down merge sort and a merge/insertion hybrid, instrumented for benchmarks.

    Sorting happens in place on ``array`` (the object passed in is the one
    that gets modified). After a run, two measurements are available:

    * ``key_cmp`` - number of element-to-element comparisons performed;
    * ``execution_time`` - elapsed seconds measured with ``time.perf_counter``.

    Both attributes exist from construction (``0`` and ``None``) and are
    reset at the start of every ``train`` / ``train_hybrid`` call.

    Parameters
    ----------
    array : list or 1-D numpy.ndarray
        Sequence to sort; must support slicing and slice assignment.
    S : int
        Threshold for the hybrid sort: sub-arrays holding at most ``S``
        elements are handed to insertion sort instead of being split further.
    """

    def __init__(self, array, S):
        self.S = S
        self.array = array
        # Defined up front so results can be inspected before any run
        # without raising AttributeError.
        self.execution_time = None
        self.key_cmp = 0

    def _timed_run(self, sort_fn):
        """Reset the counters, run ``sort_fn`` over the whole array and time it."""
        self.execution_time = None
        self.key_cmp = 0
        start_time = time.perf_counter()
        sort_fn(0, len(self.array) - 1)
        self.execution_time = time.perf_counter() - start_time

    def train(self):
        """Sort the array with plain merge sort, recording time and comparisons."""
        self._timed_run(self.merge_sort)

    def train_hybrid(self):
        """Sort the array with the merge/insertion hybrid, recording time and comparisons."""
        self._timed_run(self.merge_insertion_sort)

    def merge(self, n, m):
        """Merge the sorted halves ``array[n..mid]`` and ``array[mid+1..m]`` in place.

        ``mid`` is ``(n + m) // 2`` and both bounds are inclusive. Every
        comparison between a left and a right element increments ``key_cmp``.
        """
        mid = (n + m) // 2
        # With a NumPy array these slices are views; that is safe because
        # nothing is written back until `merged` is fully built.
        left = self.array[n:mid + 1]
        right = self.array[mid + 1:m + 1]
        left_len, right_len = len(left), len(right)

        merged = []
        i = j = 0
        while i < left_len and j < right_len:
            self.key_cmp += 1
            if left[i] <= right[j]:  # ties take the left element first (stable)
                merged.append(left[i])
                i += 1
            else:
                merged.append(right[j])
                j += 1

        # At most one side still has elements; they are already in order and
        # need no further comparisons.
        merged.extend(left[i:])
        merged.extend(right[j:])

        # Copy merged values back into the original array.
        self.array[n:m + 1] = merged

    def merge_insertion_sort(self, n, m):
        """Hybrid sort of ``array[n..m]`` (inclusive bounds).

        Ranges of at most ``S`` elements are finished with insertion sort;
        larger ranges are split at the midpoint, sorted recursively and merged.
        """
        if m - n <= 0:
            return

        if m - n + 1 <= self.S:
            self.insertion_sort(n, m)
            return

        mid = (n + m) // 2
        # A two-element range has single-element halves, so recursion is skipped.
        if m - n > 1:
            self.merge_insertion_sort(n, mid)
            self.merge_insertion_sort(mid + 1, m)

        self.merge(n, m)

    def merge_sort(self, n, m):
        """Plain top-down merge sort of ``array[n..m]`` (inclusive bounds)."""
        if m - n <= 0:
            return

        mid = (n + m) // 2
        # A two-element range has single-element halves, so recursion is skipped.
        if m - n > 1:
            self.merge_sort(n, mid)
            self.merge_sort(mid + 1, m)

        self.merge(n, m)

    def insertion_sort(self, n, m):
        """Insertion sort of ``array[n..m]`` (inclusive) using adjacent swaps.

        Each test of an element against its left neighbour increments
        ``key_cmp``; an element stops moving at the first neighbour that is
        not greater than it.
        """
        arr = self.array
        for i in range(n + 1, m + 1):
            j = i
            while j > n:
                self.key_cmp += 1
                if arr[j] < arr[j - 1]:
                    arr[j], arr[j - 1] = arr[j - 1], arr[j]
                    j -= 1
                else:
                    break

In [6]:
def run_experiment(args):
    """Run one hybrid-sort trial and report its measurements.

    Parameters
    ----------
    args : tuple
        ``(arr_len, S)`` - the array length to generate and the threshold
        handed to ``Model``. A single tuple is used so the function can be
        mapped directly over a list of experiment settings.

    Returns
    -------
    dict
        Keys ``"Array Length"``, ``"Threshold"``, ``"Execution Time"`` and
        ``"Key Comparisons"`` for this trial.
    """
    arr_len, S = args

    # Fresh input for this trial, handed straight to the sorter.
    data = DataLoader(arr_len=arr_len).generate_data()
    sorter = Model(data, S)

    logging.info(f"Starting sort for arr_len={arr_len} with S={S}")
    sorter.train_hybrid()
    logging.info(f"Completed sort for arr_len={arr_len} with S={S}")

    record = {}
    record["Array Length"] = arr_len
    record["Threshold"] = S
    record["Execution Time"] = sorter.execution_time
    record["Key Comparisons"] = sorter.key_cmp
    return record

In [7]:
# Smoke test: sort a small array with both algorithms and verify the result,
# instead of relying on eyeballing the printed arrays.
arr_len = 30
S = 3

dataloader = DataLoader(arr_len=arr_len)
array = dataloader.generate_data()
# Sorted copy taken up front: Model sorts `array` itself in place.
expected = np.sort(array)

model = Model(array, S)

print(model.array)
model.train()
print(model.array)
assert np.array_equal(model.array, expected), "merge sort produced an unsorted result"

# The benchmark runs the hybrid variant, so check it too (this is where S is used).
hybrid_input = dataloader.generate_data()
hybrid_expected = np.sort(hybrid_input)
hybrid_model = Model(hybrid_input, S)
hybrid_model.train_hybrid()
assert np.array_equal(hybrid_model.array, hybrid_expected), "hybrid sort produced an unsorted result"

[25 12 26 13 24 21 28 27 24 17 12 22 18  4 23 28 28 23  9 27  4  1 16 25
 18 26  6  7 24 28]
[ 1  4  4  6  7  9 12 12 13 16 17 18 18 21 22 23 23 24 24 24 25 25 26 26
 27 27 28 28 28 28]


In [8]:
# Full grid of (array length, threshold) settings; array length varies slowest.
experiments = [
    (length, threshold)
    for length in ARR_LENS
    for threshold in S_VALUES
]

# One worker per reported CPU; imap yields results in submission order, so
# the rows of the results table line up with `experiments`.
worker_count = cpu_count()
with Pool(worker_count) as pool:
    progress = tqdm(
        pool.imap(run_experiment, experiments),
        total=len(experiments),
        desc="Processing Arrays",
    )
    results = list(progress)

# Persist the measurements for later analysis.
df_results = pd.DataFrame(results)
df_results.to_parquet("baseline.parquet", index=False)

print("Execution complete")

Processing Arrays: 100%|██████████| 5000/5000 [1:16:03<00:00,  1.10it/s]


Execution complete
