This notebook shows how to generate the mHeight function dataset. This dataset is unbalanced; we show how to generate the full dataset and how to sample the data so that the heights all show up an equal number of times. The dataset in ML4AlgComb is the full dataset, not the sampled version.

Author: Herman Chau

In [1]:
# For each permutation size n, the number of mHeight labels kept when building
# the balanced sample: the sampling cell draws rows whose label lies in
# 0 .. mHeightRanges[n] - 1.
mHeightRanges = {8: 3, 9: 3, 10: 3, 11: 4, 12: 4}

In [2]:
ntokens = 5
# Generates all mHeights.
# (`random` is not used in this cell; the import is kept so the set of names
# this cell brings into the notebook is unchanged.)
import itertools, os, random


def compute_mheight(w):
    """Return the mHeight of the permutation ``w``, or None if it has none.

    ``w`` is a sequence of distinct integers. Every choice of four positions
    a < b < c < d is inspected:

    * a 3412 pattern (w[c] < w[d] < w[a] < w[b]) contributes the candidate
      value w[a] - w[d]; the result is the smallest candidate seen;
    * a 4231 pattern (w[d] < w[b] < w[c] < w[a]) disqualifies ``w``.

    Returns:
        The minimum candidate (always >= 1), or None when ``w`` contains a
        4231 pattern or contains no 3412 pattern. These are exactly the
        permutations that are left out of the dataset file.
    """
    best = None
    for (a, b, c, d) in itertools.combinations(range(len(w)), 4):
        if w[d] < w[b] < w[c] < w[a]:
            # A single 4231 occurrence is enough to exclude the permutation.
            return None
        if w[c] < w[d] < w[a] < w[b]:
            candidate = w[a] - w[d]
            if best is None or candidate < best:
                best = candidate
    return best


# Create the output directory so the cell also runs on a fresh checkout
# instead of failing with FileNotFoundError.
os.makedirs("./data", exist_ok=True)
with open(f"./data/mHeight_{ntokens}.txt", "w") as f:
    for w in itertools.permutations(range(ntokens)):
        m = compute_mheight(w)
        if m is not None:
            # One row per permutation: "<tuple repr>;<label>". The label is
            # m - 1, so the smallest possible label is 0.
            f.write(f"{w};{m-1}\n")

In [7]:
# Print out how many permutations there are of each mHeight
mHeight_distribution = {}
with open(f"data/mHeight_{ntokens}.txt", "r") as f:
    for line in f:
        # Each row is "<permutation>;<label>"; only the label is needed here.
        _, m = line.split(";")
        # The generation cell writes the label as a plain integer, so int()
        # is used instead of eval(): same value, but nothing read from the
        # file is ever executed. int() tolerates the trailing newline.
        m = int(m)
        mHeight_distribution[m] = mHeight_distribution.get(m, 0) + 1
print(mHeight_distribution)

{0: 14, 1: 1}


In [17]:
# Produce a sampled version of the data in which the labels
# 0, 1, ..., max_height-1 all show up an equal number of times: each label is
# cut down to the number of rows carrying the label max_height-1. A label with
# fewer rows than that contributes all of its rows.
import random

# Dedicated, seeded generator: the sample is reproducible across runs and the
# global `random` state is left untouched.
SAMPLING_SEED = 0
sampling_rng = random.Random(SAMPLING_SEED)

for n in range(8, 13):
    max_height = mHeightRanges[n]

    # One pass over the file: for every label, remember the row numbers
    # (0-based) of the rows that carry it.
    mHeight_indices = {}
    with open(f"data/mHeight_{n}_full.txt", "r") as f:
        for idx, line in enumerate(f):
            _, m = line.split(";")
            # Labels are expected to be plain integers (they are looked up by
            # the integer keys 0 .. max_height-1 below), so int() replaces
            # eval() and nothing read from the file is executed.
            m = int(m)
            mHeight_indices.setdefault(m, []).append(idx)

    # Fail with a clear message instead of a bare KeyError further down.
    missing = [h for h in range(max_height) if h not in mHeight_indices]
    if missing:
        raise ValueError(
            f"data/mHeight_{n}_full.txt has no rows with label(s) {missing}"
        )

    # Target size per label: the number of rows with the largest kept label.
    samples = len(mHeight_indices[max_height - 1])

    # Pick `samples` random row numbers for every kept label.
    indices = set()
    for i in range(max_height):
        sampling_rng.shuffle(mHeight_indices[i])
        indices.update(mHeight_indices[i][:samples])

    # Copy the selected rows, in their original order, to the sampled file.
    with open(f"data/mHeight_{n}_full.txt", "r") as f:
        with open(f"data/mHeight_{n}_sampled.txt", "w") as out_f:
            for idx, line in enumerate(f):
                if idx in indices:
                    out_f.write(line)

In [20]:
import random

# Split every sampled file into two halves: a train file and a test file.
# A dedicated, seeded generator makes the split reproducible across runs and
# leaves the global `random` state untouched.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)

for n in range(8, 13):
    lines = []
    with open(f"data/mHeight_{n}_sampled.txt", "r") as f:
        for row in f:
            fields = row.split(";")
            # The permutation is kept as the exact text found in the file.
            permutation = fields[0]
            # int() instead of eval(): the label is a plain integer and
            # nothing read from the file should be executed.
            mHeight = int(fields[1])
            lines.append((permutation, mHeight))
    split_rng.shuffle(lines)
    # With an odd number of rows the test file receives the extra row.
    half = len(lines) // 2
    with open(f"data/mHeight_{n}_train.txt", "w") as f:
        for permutation, mHeight in lines[:half]:
            f.write(f"{permutation};{mHeight}\n")
    with open(f"data/mHeight_{n}_test.txt", "w") as f:
        for permutation, mHeight in lines[half:]:
            f.write(f"{permutation};{mHeight}\n")

In [2]:
# Rewrite the n = 11 and n = 12 data files in place. Each input row is
# "<permutation>;<sequence>"; the new label is the first element of that
# sequence minus one.
# NOTE(review): this looks like a one-off conversion. After it has run, the
# label is a single number, so running the cell again would presumably fail
# at the `[0]` lookup; confirm before re-running.
for n in range(11, 13):
    lines = []
    with open(f"data/mHeight_{n}.txt", "r") as f:
        for row in f:
            fields = row.split(";")
            permutation = eval(fields[0])
            mHeight = eval(fields[1])[0]-1
            lines.append((permutation, mHeight))
    # Everything has been read into memory; now overwrite the same file.
    with open(f"data/mHeight_{n}.txt", "w") as f:
        for permutation, mHeight in lines:
            f.write(f"{permutation};{mHeight}\n")

In [10]:
# Convert data to inversion vector format
def inversion_vector(permutation):
    """Return the 0/1 inversion indicators of ``permutation``.

    All pairs of positions i < j are visited in lexicographic order
    ((0, 1), (0, 2), ..., (n-2, n-1)). The entry for a pair is 1 when
    permutation[i] > permutation[j] and 0 otherwise, so the result is a list
    of n*(n-1)/2 integers.
    """
    size = len(permutation)
    return [
        1 if permutation[i] > permutation[j] else 0
        for i in range(size)
        for j in range(i + 1, size)
    ]

import ast

# For n = 5, 6, 7: read "<permutation tuple>;<label>" rows and write
# "<inversion bits>;<label>" rows to the corresponding *_full.txt file.
for n in range(5, 8):
    lines = []
    with open(f"data/mHeight_{n}.txt", "r") as f:
        for row in f:
            fields = row.split(";")
            # literal_eval accepts only Python literals (here the tuple
            # written by the generation cell), so unlike eval() it cannot
            # execute code found in the file.
            bits = inversion_vector(ast.literal_eval(fields[0]))
            # Concatenate the 0/1 indicators into a single string such as "0110".
            permutation = "".join(str(x) for x in bits)
            # The label is a plain integer; int() tolerates the trailing newline.
            mHeight = int(fields[1])
            lines.append((permutation, mHeight))
    with open(f"data/mHeight_{n}_full.txt", "w") as f:
        for permutation, mHeight in lines:
            f.write(f"{permutation};{mHeight}\n")