In [None]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import os
import numpy as np

class BeatmapDataset(Dataset):
    """Map-style dataset yielding ``(mel, tokens)`` pairs for beatmaps.

    Every CSV row must provide a ``beatmap_id`` column, whose text before the
    first ``-`` is the beatmapset id, and a ``tokenized`` column holding
    comma-separated integer token ids.  The spectrogram for a row is loaded
    from ``<mel_folder>/<beatmapset_id>.npy``.
    """

    def __init__(self, input_file, mel_folder):
        """Read the beatmap table and remember where the spectrograms live.

        Args:
            input_file: Path of the CSV file described in the class docstring.
            mel_folder: Directory containing the ``<beatmapset_id>.npy`` files.
        """
        self.df = pd.read_csv(input_file)
        self.mel_folder = mel_folder

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample stored at positional row ``idx``.

        Returns:
            A ``(mel, tokens)`` tuple: ``mel`` is the saved array as a float32
            tensor and ``tokens`` is a 1-D int64 tensor of token ids.

        Raises:
            FileNotFoundError: If the spectrogram file does not exist.
            ValueError: If a non-empty token is not an integer.
        """
        row = self.df.iloc[idx]
        # str() so an id column that pandas parsed as a number still works.
        beatmapset_id = str(row["beatmap_id"]).split("-")[0]

        mel_path = os.path.join(self.mel_folder, f"{beatmapset_id}.npy")
        mel = torch.tensor(np.load(mel_path), dtype=torch.float)

        # Skip empty pieces so a trailing or doubled comma does not crash int().
        tokens = [int(t) for t in row["tokenized"].split(",") if t.strip()]
        tokens = torch.tensor(tokens, dtype=torch.long)

        # NOTE(review): batching with the default DataLoader collation needs
        # every sample to have the same shape -- confirm the token sequences
        # are already padded, or pass a collate_fn.
        return mel, tokens

In [None]:
import numpy as np

# Load one stored mel spectrogram and report its dimensions.
mel = np.load(
    "/home/saliherdemk/osu-try/data/formatted/mels/7922.npy"
)
np.shape(mel)

In [None]:
import numpy as np

# Same check for a second beatmapset, to compare dimensions across files.
mel = np.load(
    "/home/saliherdemk/osu-try/data/formatted/mels/10204.npy"
)
np.shape(mel)

In [None]:
from torch.utils.data import DataLoader

# Locations of the encoded beatmap table and of the per-set mel spectrograms.
csv_file = "/home/saliherdemk/osu-try/data/formatted/encoded.csv"
mel_folder = "/home/saliherdemk/osu-try/data/formatted/mels/"

# Wrap the dataset in a loader that serves shuffled batches of up to 32 samples.
dataset = BeatmapDataset(input_file=csv_file, mel_folder=mel_folder)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)

In [None]:
# Each batch is the collated (mel, tokens) pair produced by
# BeatmapDataset.__getitem__ -- not an (index, data) pair -- so name the two
# tensors accordingly and use enumerate() for the batch number.  Shapes are
# printed instead of the full tensors to keep the output readable.
for batch_idx, (mel_batch, token_batch) in enumerate(dataloader):
    print(batch_idx, mel_batch.shape, token_batch.shape)

In [None]:
import pandas as pd

# Load the encoded beatmap table for manual inspection.
df = pd.read_csv(
    filepath_or_buffer="/home/saliherdemk/osu-try-dataset/formatted/encoded.csv"
)

In [None]:
# Lift pandas' row limit so frames are rendered without truncation.
pd.set_option("display.max_rows", None)

In [None]:
# Preview only the first rows: display.max_rows is set to None in the cell
# above, so a bare `df` would render every row of the CSV.
df.head(20)

In [None]:
# Split a comma-separated token string, dropping the empty piece that a
# trailing comma leaves behind.
text = "<beatmap_start>,"
ids = list(filter(len, text.split(",")))
ids

In [5]:
import numpy as np

# Load two stored mel spectrograms so they can be compared in the next cells.
a = np.load(
    "/home/saliherdemk/osu-try-dataset/formatted/mels/4229_7.npy"
)
b = np.load(
    "/home/saliherdemk/osu-try-dataset/formatted/mels/18779_9.npy"
)


In [6]:
# Show the dimensions of both spectrograms side by side.
tuple(arr.shape for arr in (a, b))


((512, 128), (512, 128))

In [7]:
# Inspect the raw values of the first spectrogram.
# NOTE(review): the recorded output ends in all-zero rows, which looks like
# padding up to 512 rows -- confirm against the preprocessing code.
a

array([[-35.589306, -15.382883, -14.869236, ..., -53.917366, -57.329285,
        -69.95265 ],
       [-31.094816, -18.681942, -18.030474, ..., -50.625275, -53.4004  ,
        -65.24948 ],
       [-25.338524, -22.125801, -22.323944, ..., -42.237732, -44.19422 ,
        -55.47326 ],
       ...,
       [  0.      ,   0.      ,   0.      , ...,   0.      ,   0.      ,
          0.      ],
       [  0.      ,   0.      ,   0.      , ...,   0.      ,   0.      ,
          0.      ],
       [  0.      ,   0.      ,   0.      , ...,   0.      ,   0.      ,
          0.      ]], shape=(512, 128), dtype=float32)

In [8]:
# Inspect the raw values of the second spectrogram.
# NOTE(review): the recorded output shows -80.0 in the last columns and
# all-zero trailing rows; presumably a dB floor and padding -- confirm.
b

array([[-20.117947, -18.944096, -15.781652, ..., -80.      , -80.      ,
        -80.      ],
       [-18.093306, -17.523674, -16.785995, ..., -80.      , -80.      ,
        -80.      ],
       [-17.765316, -17.544176, -16.488117, ..., -80.      , -80.      ,
        -80.      ],
       ...,
       [  0.      ,   0.      ,   0.      , ...,   0.      ,   0.      ,
          0.      ],
       [  0.      ,   0.      ,   0.      , ...,   0.      ,   0.      ,
          0.      ],
       [  0.      ,   0.      ,   0.      , ...,   0.      ,   0.      ,
          0.      ]], shape=(512, 128), dtype=float32)

In [31]:
import os
from tqdm import tqdm
import shutil

input_folder = "/home/saliherdemk/a/"
dataset_path = "/home/saliherdemk/osu-dataset/"

# For every beatmapset folder under input_folder, copy the audio file named by
# its .osu file into <dataset_path>/audio/<set id>/ and then delete the source
# folder.  Folders that cannot be processed are reported and left in place so
# a single bad entry no longer aborts the whole run.
beatmap_folders = [
    entry
    for entry in os.listdir(input_folder)
    if os.path.isdir(os.path.join(input_folder, entry))
]

for entry in tqdm(beatmap_folders):
    entry_path = os.path.join(input_folder, entry)

    # The set id is the second "-"-separated part of the folder name
    # (e.g. "beatmapset-38720" -> "38720").
    name_parts = entry.split("-")
    if len(name_parts) < 2:
        print("Unexpected folder name", entry_path, "skipping.")
        continue
    audio_folder = os.path.join(dataset_path, "audio", name_parts[1])

    # List the folder once and reuse the listing for both lookups below.
    entry_files = os.listdir(entry_path)

    osu_files = [f for f in entry_files if f.lower().endswith(".osu")]
    if not osu_files:
        print(
            "Couldn't find .osu file in",
            entry_path,
            "You may need to add manually.",
        )
        continue
    osu_file = osu_files[0]

    # Lower-cased name -> real name, so the AudioFilename lookup is
    # case-insensitive.
    audio_files = {f.lower(): f for f in entry_files}

    audio_filename = ""
    # errors="replace" keeps one undecodable byte in the .osu file from
    # raising and stopping the loop.
    with open(
        os.path.join(entry_path, osu_file), "r", encoding="utf-8", errors="replace"
    ) as f:
        for line in f:
            if line.startswith("AudioFilename"):
                # partition() keeps any ":" inside the file name and yields ""
                # when the line has no ":" at all.
                audio_filename = line.partition(":")[2].strip().lower()
                break

    audio_file = audio_files.get(audio_filename)
    if audio_file is None:
        print(
            "Couldn't find audio file",
            repr(audio_filename),
            "in",
            entry_path,
            "skipping.",
        )
        continue

    # Create the destination only once there is something to put in it.
    os.makedirs(audio_folder, exist_ok=True)
    shutil.copy2(
        os.path.join(entry_path, audio_file),
        os.path.join(audio_folder, audio_file),
    )
    # Reached only after a successful copy; failed entries keep their folder.
    shutil.rmtree(entry_path)
    # 7231  (NOTE(review): unexplained marker carried over from the original cell)


  0%|                                                                                                                                                                                      | 0/120 [00:00<?, ?it/s]

Couldn't find .osu file in /home/saliherdemk/a/beatmapset-38720 You may need to add manually.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 120/120 [01:03<00:00,  1.89it/s]


In [1]:
# Beatmapset ids to filter out of the dataset tables; 38720 is the set that
# the audio-copy cell reported as having no .osu file.
exclude = [
    "811908",
    "685725",
    "25637",
    "38720",
]

In [19]:
import pandas as pd

In [20]:
# Load the three dataset tables that are filtered in the next cell.
beatmap_df = pd.read_csv(
    filepath_or_buffer="/home/saliherdemk/osu-dataset/beatmaps.csv"
)
hit_objects_df = pd.read_csv(
    filepath_or_buffer="/home/saliherdemk/osu-dataset/hit_objects.csv"
)
timing_points_df = pd.read_csv(
    filepath_or_buffer="/home/saliherdemk/osu-dataset/timing_points.csv"
)

In [21]:
# Keep only the rows whose ``id`` prefix (the text before the first "-") is
# not listed in ``exclude``.
# NOTE(review): the recorded length check below shows timing_points unchanged
# (2722447 rows before and after) -- confirm whether the excluded sets simply
# have no rows there or the prefix does not match the ids in ``exclude``.
beatmap_df_new = beatmap_df.loc[
    ~beatmap_df["id"].str.split("-").str.get(0).isin(exclude)
]
hit_objects_df_new = hit_objects_df.loc[
    ~hit_objects_df["id"].str.split("-").str.get(0).isin(exclude)
]
timing_points_df_new = timing_points_df.loc[
    ~timing_points_df["id"].str.split("-").str.get(0).isin(exclude)
]

In [24]:
# Row counts after and before filtering, to see how many rows were dropped.
tuple(map(len, (timing_points_df_new, timing_points_df)))

(2722447, 2722447)