In [1]:
import os
import pandas as pd

data_path = "/mnt/L-HDD/try_dataset/hit_objects_formatted.csv"
df = pd.read_csv(data_path)

In [2]:
df[:1]

Unnamed: 0,unique_id,ID,Time,Type,X,Y,HitSound,Path,Repeat,Length,beatmap_id,MFCC,RMS,beat_length,meter,slider_velocity,sample_set,volume,effects
0,9380,1178020-0,73016.0,circle,153,120,2,0,0,0.0,1178020,[-229.50597 140.8695 -10.333648 23.3...,0.0442147739231586,288.461538,4.0,-333.333333,2.0,35.0,0.0


# Parse Path 

In [5]:
import numpy as np

def parse(row):
    x = row["Path"]
    curve_type = 0
    spinner_time = 0
    path = []
    if(pd.isna(x)):
        pass
    elif not '|' in x:
        spinner_time = x
    else:
        curve_type = x[0]
        coordinates_str = x[2:].split('|')
        path = [list(map(int, point.split(':'))) for point in coordinates_str]
    row["curve_type"] = curve_type
    row["spinner_time"] = spinner_time
    row["Path"] = path
    return row


In [6]:
df = df.apply(parse, axis=1)

In [7]:
max_point_num = df["Path"].apply(lambda x: len(x)).max()
max_point_num

np.int64(7291)

In [10]:
from tqdm import tqdm
def parse_splitted_vectorized(df):
    padded_paths = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        if len(row["Path"]) > 0:
            flattened = np.concatenate(row["Path"])
        else:
            flattened = np.array([])

        padded = np.pad(flattened[:max_point_num], (0, max_point_num - min(len(flattened), max_point_num)), mode='constant')
        padded_paths.append(padded)
    
    column_names = [f"Path_{i+1}" for i in range(max_point_num)]
    
    path_df = pd.DataFrame(padded_paths, columns=column_names, index=df.index)
    return pd.concat([df, path_df], axis=1)
    

In [11]:
df = parse_splitted_vectorized(df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35657/35657 [00:03<00:00, 11719.77it/s]


In [13]:
df.drop(columns="Path", inplace=True)

In [14]:
df

Unnamed: 0,unique_id,ID,Time,Type,X,Y,HitSound,Repeat,Length,beatmap_id,...,Path_7282,Path_7283,Path_7284,Path_7285,Path_7286,Path_7287,Path_7288,Path_7289,Path_7290,Path_7291
0,9380,1178020-0,73016.000000,circle,153,120,2,0,0.0,1178020,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9603,1178020-0,126525.000000,circle,374,366,8,0,0.0,1178020,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9602,1178020-0,126453.000000,circle,345,338,8,0,0.0,1178020,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9601,1178020-0,126381.000000,circle,334,299,10,0,0.0,1178020,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9600,1178020-0,126309.000000,circle,320,261,8,0,0.0,1178020,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35652,17449,2333780-4,62441.000000,circle,320,212,0,0,0.0,2333780,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35653,17448,2333780-4,61879.000000,slider,188,300,0,1,120.0,2333780,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35654,17447,2333780-4,61504.000000,circle,76,344,0,0,0.0,2333780,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35655,17455,2333780-4,64504.000000,circle,416,324,2,0,0.0,2333780,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Parse MFCC

In [15]:
import re
import numpy as np
def parse_mfcc(mfcc_str):
    mfcc_str = mfcc_str.replace("[", "").replace("]", "")
    mfcc_values = re.split(r'\s+', mfcc_str.strip())
    return np.array(mfcc_values, dtype=np.float32)


In [16]:
df["MFCC"] = df["MFCC"].apply(parse_mfcc)

In [17]:
mfcc_length = 20

for i in range(mfcc_length):
    df[f"MFCC_{i+1}"] = df["MFCC"].apply(lambda x: x[i])

In [18]:
df.drop(columns="MFCC", axis = 1, inplace=True)

In [19]:
df

Unnamed: 0,unique_id,ID,Time,Type,X,Y,HitSound,Repeat,Length,beatmap_id,...,MFCC_11,MFCC_12,MFCC_13,MFCC_14,MFCC_15,MFCC_16,MFCC_17,MFCC_18,MFCC_19,MFCC_20
0,9380,1178020-0,73016.000000,circle,153,120,2,0,0.0,1178020,...,11.343530,8.967937,0.799711,-5.866954,-1.356839,-0.878800,-4.733579,-1.973073,2.539490,6.523636
1,9603,1178020-0,126525.000000,circle,374,366,8,0,0.0,1178020,...,-4.875385,-4.688524,-13.085867,-15.676073,-16.041273,-6.807100,-6.504179,-1.459217,0.903399,6.706066
2,9602,1178020-0,126453.000000,circle,345,338,8,0,0.0,1178020,...,-4.338229,-4.789364,-10.858992,-4.951115,-15.150204,-10.841499,-9.163681,-6.965391,-9.764526,-1.651822
3,9601,1178020-0,126381.000000,circle,334,299,10,0,0.0,1178020,...,-15.729964,-13.745026,-17.386240,-12.231793,-12.330675,-7.527829,-11.683446,-8.709246,-13.845087,-4.004777
4,9600,1178020-0,126309.000000,circle,320,261,8,0,0.0,1178020,...,-5.186029,-5.826274,-13.748964,-9.270420,-15.204746,-6.538778,-10.489502,-9.449425,-13.066658,-5.377833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35652,17449,2333780-4,62441.000000,circle,320,212,0,0,0.0,2333780,...,-8.491925,14.687662,-5.103133,6.275972,6.803501,15.748437,22.351376,-5.638035,6.645770,-6.884073
35653,17448,2333780-4,61879.000000,slider,188,300,0,1,120.0,2333780,...,1.943956,8.678497,6.565650,11.228224,9.392997,6.691104,-2.179216,0.551466,-4.256234,6.310300
35654,17447,2333780-4,61504.000000,circle,76,344,0,0,0.0,2333780,...,-16.312738,18.657139,8.465160,15.762417,-1.111063,-1.997209,-14.861074,-11.058609,11.255178,-3.289972
35655,17455,2333780-4,64504.000000,circle,416,324,2,0,0.0,2333780,...,-15.034908,-2.621220,0.739698,-1.871383,-7.724210,-7.028070,-9.876436,-13.320180,-19.953823,-12.130484
