## Dataset creation from GSM-Infinite

We create four datasets of increasing difficulty by grouping consecutive `ops_*` splits from the [hard GSM-Infinite dataset (zero-noise setting)](https://huggingface.co/datasets/InfiniAILab/gsm_infinite_hard_0). The group boundaries are chosen so that each of the four levels contains roughly the same number of samples.

The examples in each level are then shuffled and divided into a 90/10 train/test split.



In [None]:
from datasets import load_dataset, concatenate_datasets, DatasetDict
from huggingface_hub import login

# Record how many examples each `ops_k` split holds, for k = 2 .. total_ops.
total_ops = 30

# The two leading zeros are padding so that lengths[k] is the size of split
# `ops_k`; the loop below starts at ops_2, so indices 0 and 1 stay empty.
lengths = [0, 0]
for ops_index in range(2, total_ops + 1):
    split_ds = load_dataset('InfiniAILab/gsm_infinite_hard_0', split=f'ops_{ops_index}')
    lengths.append(len(split_ds))

# Total number of examples across all loaded splits (padding adds nothing).
num_samples = sum(lengths)

In [None]:

# Greedily cut ops_2 .. ops_{total_ops} into consecutive levels. A level keeps
# absorbing the next split until its size exceeds a quarter of all samples or
# the splits run out, so every level except possibly the last overshoots the
# quarter mark.
level_dataset_num_samples = num_samples // 4

# Level boundaries: level i covers ops in [split_points[i], split_points[i + 1]).
split_points = [2]
# Realised number of samples in each level, in the same order as the levels.
actual_lengths = []

current_level_dataset_size = 0
start_point = split_points[-1]

while start_point <= total_ops:
    # Every level contains at least its first split.
    current_level_dataset_size = lengths[start_point]
    end_point = start_point + 1  # exclusive upper bound of the level

    for candidate in range(start_point + 1, total_ops + 1):
        if current_level_dataset_size > level_dataset_num_samples:
            break  # target exceeded: close the level before `candidate`
        current_level_dataset_size += lengths[candidate]
        end_point = candidate + 1

    actual_lengths.append(current_level_dataset_size)
    split_points.append(end_point)

    # The next level begins where this one stopped.
    start_point = end_point
    current_level_dataset_size = 0


In [None]:
# Build one dataset per level, split it into train/test and upload it.
level_wise_datasets = []

# Adjacent entries of split_points delimit one level: [start_index, end_index).
for start_index, end_index in zip(split_points, split_points[1:]):
    # Stack every ops_* split that belongs to this level into one dataset.
    level_ds = concatenate_datasets([
        load_dataset('InfiniAILab/gsm_infinite_hard_0', split=f'ops_{ops}')
        for ops in range(start_index, end_index)
    ])

    # Hold out 10% of the level (rounded down) as the test set; the fixed seed
    # makes the shuffle reproducible.
    held_out = int(0.1 * len(level_ds))
    level_split = level_ds.train_test_split(test_size=held_out, seed=42, shuffle=True)

    # NOTE: end_index is exclusive, so the `_e_` value in the repo name is one
    # past the last ops split actually included in the level.
    repo_id = f'anirudhb11/gsm_infinite_hard_0_s_{start_index}_e_{end_index}'
    level_split.push_to_hub(repo_id)
        