In [4]:
import numpy as np
import json

# --- Token vocabulary -------------------------------------------------------
# Each list below holds every token of one category. The categories are
# collected into `vocab` (name -> tokens) and flattened, in the same order,
# into `flat_vocab`.

# Structural markers. Every marker is wrapped in "<...>"; the last entry was
# previously spelled "hit_object_end>" (missing "<"), which did not match the
# "<hit_object_end>" token emitted by encode().
special_tokens = ["<beatmap_start>", "<beatmap_end>", "<hit_object_start>", "<hit_object_end>"]
types = [f"type_{t}" for t in ["circle", "slider", "spinner"]]
# Positions on a 32-unit grid: x_0..x_512 and y_0..y_384.
x = [f"x_{i}" for i in range(0, 513, 32)]
y = [f"y_{i}" for i in range(0, 386, 32)]
# NOTE(review): only even values 0..14 are listed; encode() emits the raw
# `hit_sound` column value, so confirm odd values never occur in the data.
hitsounds = [f"hit_sound_{i}" for i in range(0, 16, 2)]
path = ["<start_path>", "<end_path>", "B", "L", "P"]
# repeat_30 / tick_50 / dt_2000 are the largest tokens of their categories;
# the get_* helpers emit them repeatedly to represent larger values.
repeat = [f"repeat_{i}" for i in range(0, 31)] +  ["<start_repeat>", "<end_repeat>"]
new_combos = [f"new_combo_{i}" for i in range(2)]
# sv_0.0 .. sv_36.9 in steps of 0.1; round() removes float-step noise.
slider_velocity = [f"sv_{round(i,1)}" for i in np.arange(0, 37, 0.1)]
sample_set = [f"sample_set_{i}" for i in range(4)]
volume = [f"vol_{i}" for i in range(0, 101, 10)]
effects = [f"effects_{i}" for i in range(2)]
tick = [f"tick_{i}" for i in range(51)] + ["<start_tick>", "<end_tick>"]
delta_time = [f"dt_{i}" for i in range(2001)] + ["<start_delta_time>", "<end_delta_time>"]

vocab = {
    "special": special_tokens,
    "type": types,
    "x": x,
    "y": y,
    "hit_sound": hitsounds,
    "path": path,
    "repeat": repeat,
    "new_combo": new_combos,
    "slider_velocity": slider_velocity,
    "sample_set": sample_set,
    "volume": volume,
    "effects": effects,
    "tick": tick,
    "delta_time": delta_time
}

# Flat token list, in category insertion order.
flat_vocab = [token for group in vocab.values() for token in group]

# Persist both views of the vocabulary as indented JSON: first the grouped
# mapping (category -> tokens), then the flat token list.
for target, payload in (
    ("/home/saliherdemk/projects/osu-dataset-generator/Tokenizer/vocab/vocab.json", vocab),
    ("/home/saliherdemk/projects/osu-dataset-generator/Tokenizer/vocab/vocab_list.json", flat_vocab),
):
    with open(target, "w") as f:
        json.dump(payload, f, indent=2)

# Total number of tokens across all categories.
print(len(flat_vocab))


2528


In [7]:
import pandas as pd

def parse_path(path):
    """Convert a "|"-separated slider path string into a comma-joined token string.

    The first "|"-field is emitted verbatim; every following field must have
    the form "<x>:<y>" and becomes the two tokens "x_<x>" and "y_<y>". The
    result is wrapped in "<start_path>" / "<end_path>".

    Args:
        path: String such as "P|47:195|74:292".

    Returns:
        Comma-joined tokens, e.g.
        "<start_path>,P,x_47,y_195,x_74,y_292,<end_path>".

    Raises:
        ValueError: If a point field does not split into exactly two parts
            on ":".
    """
    # NOTE(review): coordinates are emitted exactly as written, whereas
    # encode() snaps object positions to multiples of 32 — confirm the
    # vocabulary is meant to cover these unrounded tokens.
    head, *points = path.split("|")
    tokens = ["<start_path>", head]
    for point in points:
        px, py = point.split(":")
        tokens.extend(("x_" + px, "y_" + py))
    tokens.append("<end_path>")
    return ",".join(tokens)


def get_delta_time(dt):
    """Encode `dt` as a comma-joined run of delta-time tokens.

    The value is split into whole multiples of 2000 plus a remainder: one
    "dt_2000" token is emitted per whole multiple, followed by a single
    "dt_<remainder>" token (always present, even when the remainder is 0).
    The run is wrapped in "<start_delta_time>" / "<end_delta_time>".

    Args:
        dt: Integer value to encode.

    Returns:
        Comma-joined token string, e.g.
        "<start_delta_time>,dt_2000,dt_500,<end_delta_time>" for 2500.
    """
    # Floor-division semantics: a negative dt yields no "dt_2000" tokens and
    # a non-negative remainder.
    full_chunks, remainder = divmod(dt, 2000)
    tokens = ["<start_delta_time>"]
    tokens.extend("dt_2000" for _ in range(full_chunks))
    tokens.append("dt_" + str(remainder))
    tokens.append("<end_delta_time>")
    return ",".join(tokens)

def get_tick(t):
    """Encode `t` as a comma-joined run of tick tokens.

    One "tick_50" token is emitted for every whole multiple of 50 contained
    in `t`, followed by a single "tick_<remainder>" token (emitted even when
    the remainder is 0). The run is wrapped in "<start_tick>" / "<end_tick>".

    Args:
        t: Integer value to encode.

    Returns:
        Comma-joined token string, e.g.
        "<start_tick>,tick_50,tick_3,<end_tick>" for 53.
    """
    saturated, leftover = divmod(t, 50)
    body = ["tick_50" for _ in range(saturated)]
    body.append("tick_" + str(leftover))
    return ",".join(["<start_tick>"] + body + ["<end_tick>"])

def get_repeat(r):
    """Encode `r` as a comma-joined run of repeat tokens.

    `r` is decomposed into whole multiples of 30 and a remainder. Each whole
    multiple contributes one "repeat_30" token; the remainder contributes a
    final "repeat_<remainder>" token, which is always emitted. The run is
    wrapped in "<start_repeat>" / "<end_repeat>".

    Args:
        r: Integer value to encode.

    Returns:
        Comma-joined token string, e.g.
        "<start_repeat>,repeat_30,repeat_1,<end_repeat>" for 31.
    """
    whole, rest = divmod(r, 30)
    pieces = ["<start_repeat>"]
    pieces += ["repeat_30" for _ in range(whole)]
    pieces += ["repeat_" + str(rest), "<end_repeat>"]
    return ",".join(pieces)

def encode(beatmap):
    """Serialise the rows of one beatmap into a single token string.

    Each row becomes one hit object: a comma-joined token run that starts
    with "<hit_object_start>" and ends with "<hit_object_end>". The whole
    string is wrapped in "<beatmap_start>" / "<beatmap_end>".

    Per row, in order:
      * type, x, y, hit_sound, new_combo, sample_set, volume, effects and
        delta_time tokens are always emitted;
      * rows whose type is "slider" additionally get path, repeat and
        slider-velocity tokens;
      * rows whose type is not "circle" additionally get tick tokens.

    x and y are rounded to the nearest multiple of 32, volume to the nearest
    multiple of 10 and slider velocity to one decimal place, all using the
    built-in round().

    Args:
        beatmap: pandas DataFrame with the columns "type", "x", "y",
            "hit_sound", "new_combo", "sample_set", "volume", "effects" and
            "delta_time"; slider rows also need "path", "repeat" and
            "slider_velocity", and non-circle rows need "tick".

    Returns:
        The encoded beatmap as one string.
    """
    encoded = ["<beatmap_start>"]

    for _, row in beatmap.iterrows():
        hit_obj_type = row["type"]

        # Inner subscripts use single quotes so these f-strings also parse on
        # Python versions before 3.12, which reject reusing the outer quote.
        hit_obj = [
            "<hit_object_start>",
            "type_" + hit_obj_type,
            f"x_{round(row['x'] / 32) * 32}",
            f"y_{round(row['y'] / 32) * 32}",
            f"hit_sound_{row['hit_sound']}",
            f"new_combo_{int(row['new_combo'])}",
            f"sample_set_{row['sample_set']}",
            f"vol_{round(row['volume'] / 10) * 10}",
            f"effects_{row['effects']}",
            get_delta_time(row["delta_time"]),
        ]

        if hit_obj_type == "slider":
            hit_obj += [
                parse_path(row["path"]),
                get_repeat(row["repeat"]),
                f"sv_{round(row['slider_velocity'], 1)}",
            ]

        if hit_obj_type != "circle":
            hit_obj.append(get_tick(row["tick"]))

        hit_obj.append("<hit_object_end>")
        encoded.append(",".join(hit_obj))

    encoded.append("<beatmap_end>")
    # NOTE(review): hit objects are concatenated with no separator, so the
    # result is not uniformly comma-delimited (e.g.
    # "<hit_object_end><hit_object_start>"). Kept as-is to preserve the
    # existing output format — confirm this is what the tokenizer expects.
    return "".join(encoded)

In [8]:
# Load the formatted hit-object table and encode each beatmap (one group per
# distinct `id` value) into a single token string.
df = pd.read_csv("/home/saliherdemk/osu_dataset/formatted/formatted.csv")
grouped = df.groupby('id')

# The loop variables are deliberately NOT named `df`: rebinding `df` here
# would leave it pointing at the last group instead of the full table once
# the cell has run.
dataset = []
for beatmap_id, beatmap_rows in grouped:
    dataset.append({"beatmap_id": beatmap_id, "encoded": encode(beatmap_rows)})

# One row per beatmap: its id and its encoded token string.
new_df = pd.DataFrame(dataset)
new_df

Unnamed: 0,beatmap_id,encoded
0,101861-1,"<beatmap_start><hit_object_start>,type_slider,..."
1,101861-2,"<beatmap_start><hit_object_start>,type_slider,..."
2,101861-3,"<beatmap_start><hit_object_start>,type_circle,..."
3,101861-4,"<beatmap_start><hit_object_start>,type_circle,..."
4,101861-5,"<beatmap_start><hit_object_start>,type_spinner..."
...,...,...
404,980783-1,"<beatmap_start><hit_object_start>,type_slider,..."
405,980783-2,"<beatmap_start><hit_object_start>,type_slider,..."
406,992892-0,"<beatmap_start><hit_object_start>,type_slider,..."
407,992892-1,"<beatmap_start><hit_object_start>,type_circle,..."


In [9]:
# Show the full encoded string of the beatmap at index label 1.
new_df.loc[1, "encoded"]

'<beatmap_start><hit_object_start>,type_slider,x_64,y_128,hit_sound_0,new_combo_1,sample_set_2,vol_80,effects_1,<start_delta_time>,dt_0,<end_delta_time>,<start_path>,P,x_47,y_195,x_74,y_292,<end_path>,<start_repeat>,repeat_1,<end_repeat>,sv_2.0,<start_tick>,tick_3,<end_tick>,<hit_object_end><hit_object_start>,type_slider,x_192,y_256,hit_sound_0,new_combo_0,sample_set_2,vol_80,effects_1,<start_delta_time>,dt_529,<end_delta_time>,<start_path>,P,x_208,y_202,x_181,y_105,<end_path>,<start_repeat>,repeat_1,<end_repeat>,sv_2.0,<start_tick>,tick_3,<end_tick>,<hit_object_end><hit_object_start>,type_slider,x_480,y_64,hit_sound_0,new_combo_1,sample_set_2,vol_80,effects_1,<start_delta_time>,dt_529,<end_delta_time>,<start_path>,P,x_393,y_36,x_308,y_102,<end_path>,<start_repeat>,repeat_1,<end_repeat>,sv_2.0,<start_tick>,tick_3,<end_tick>,<hit_object_end><hit_object_start>,type_slider,x_320,y_160,hit_sound_0,new_combo_0,sample_set_2,vol_80,effects_1,<start_delta_time>,dt_353,<end_delta_time>,<start_p