In [1]:
import json

import pandas as pd

In [2]:
# Load the raw song export. The encoding is pinned to UTF-8 (the standard
# encoding for JSON) instead of relying on the platform's locale default.
with open("data/deluxe_I_know_a_little.json", "r", encoding="utf-8") as file:
    song_data = json.load(file)

## Note Level Features

In [3]:
# Note records of the loaded song; falls back to an empty list when the
# "Notes" key is absent.
song_notes = song_data.get("Notes", [])

In [4]:
# Song-level metadata. The fallback is an empty dict (not a list) because the
# later cells read individual fields from it with `.get(...)`.
song_metadata = song_data.get("Song", {})

In [5]:
# Tabular view of the note records: one row per entry in `song_notes`.
notes_df = pd.DataFrame(data=song_notes)

In [6]:
# Preview the first five note rows.
notes_df.head(n=5)

Unnamed: 0,NoteId,SongId,TrackVersionId,NoteTypeId,StageId,Column,TickStart,TickEnd,ArrowDirection,SwitchHoldTicks,MatchHeight
0,1962238,426,2,1,1,1,293.877551,293.877551,,,0
1,1962239,426,2,1,1,2,293.877551,293.877551,,,0
2,1962240,426,2,5,1,3,312.244898,312.244898,3.0,,0
3,1962241,426,2,5,1,2,330.612245,330.612245,5.0,,0
4,1962242,426,2,5,1,1,348.979592,348.979592,1.0,,0


In [7]:
# Row count of the notes table, i.e. how many notes the chart contains.
total_notes = notes_df.shape[0]
print(total_notes)

643


In [8]:
# Song length as text in the same "<minutes>m <seconds>s" form as the default.
# Using `or` also covers a "Duration" field that is present but null/empty,
# which would otherwise break the string handling in the following cells.
duration_str = song_metadata.get("Duration") or "0m 0s"

In [9]:
# Break the duration text into its single-space-separated tokens.
duration_parts = duration_str.split(sep=" ")

In [10]:
# Convert the duration tokens to a total number of seconds.
# Each token is interpreted by its unit suffix rather than by its position, so
# seconds-only ("45s") and minutes-only ("2m") durations are both handled.
# Tokens that are empty or not of the form "<digits>m" / "<digits>s" are
# ignored, leaving the corresponding component at 0.
minutes = 0
seconds = 0
for part in duration_parts:
    amount, unit = part[:-1], part[-1:]
    if not amount.isdigit():
        continue
    if unit == "m":
        minutes = int(amount)
    elif unit == "s":
        seconds = int(amount)
duration_seconds = minutes * 60 + seconds

In [11]:
# Show the parsed song length in seconds.
duration_seconds

133

Calculate Note Density

Total number of notes divided by the song's duration in seconds (notes per second). A higher density means more actions per second, which increases difficulty.

In [12]:
# Notes per second; a non-positive duration falls back to 0 rather than
# dividing by zero.
if duration_seconds > 0:
    note_density = total_notes / duration_seconds
else:
    note_density = 0

In [13]:
# Show the unrounded notes-per-second value.
note_density

4.834586466165414

Calculate Unique Directions (ArrowDirection)

A greater number of unique arrow directions may require more skill to react to.

In [14]:
# Number of distinct arrow directions used by the chart. `nunique` skips NaN,
# so notes without a direction are not counted. A chart with no notes yields a
# frame without columns, so the lookup is guarded and falls back to 0.
unique_directions = notes_df["ArrowDirection"].nunique() if not notes_df.empty else 0

In [15]:
# Show the number of distinct arrow directions.
unique_directions

8

Calculate Unique Columns

Songs that use more columns (or change columns frequently) can be harder to play.

In [16]:
# Number of distinct columns that notes appear in. A chart with no notes
# yields a frame without columns, so the lookup is guarded and falls back to 0.
unique_columns = notes_df["Column"].nunique() if not notes_df.empty else 0

In [17]:
# Show the number of distinct columns used.
unique_columns

3

Count Hold Notes (non-null SwitchHoldTicks)

Count or percentage of SwitchHoldTicks values that are non-null, as hold notes can add complexity.

In [18]:
# Hold notes are the rows whose SwitchHoldTicks is populated. The column is
# only looked up when there are notes: an empty chart produces a frame with no
# columns, and the lookup would raise before the zero fallback could apply.
if total_notes > 0:
    hold_notes_count = notes_df["SwitchHoldTicks"].notnull().sum()
    hold_notes_percentage = (hold_notes_count / total_notes) * 100
else:
    hold_notes_count = 0
    hold_notes_percentage = 0
print(f"Hold Notes Count: {hold_notes_count}")
print(f"Total Notes Count: {total_notes}")
print(f"Hold Notes Percentage: {round(hold_notes_percentage, 2)}%")

Hold Notes Count: 7
Total Notes Count: 643
Hold Notes Percentage: 1.09%


In [19]:
# One-row summary of the note-level features, built column by column.
# Density and hold percentage are rounded to two decimals for readability.
note_level_df = pd.DataFrame({
    "Duration (s)": [duration_seconds],
    "Note Density": [round(note_density, 2)],
    "Unique Directions": [unique_directions],
    "Unique Columns": [unique_columns],
    "Hold Notes Percentage": [round(hold_notes_percentage, 2)],
})

note_level_df.head(n=5)

Unnamed: 0,Duration (s),Note Density,Unique Directions,Unique Columns,Hold Notes Percentage
0,133,4.83,8,3,1.09


## Stage Level Features

In [20]:
# Stage records of the loaded song; falls back to an empty list when the
# "Stages" key is absent.
song_stages = song_data.get("Stages", [])

In [21]:
# Tabular view of the stage records: one row per entry in `song_stages`.
stages_df = pd.DataFrame(data=song_stages)

Number of stages

Songs with more stages or sections could feel harder due to their length and variability.

In [22]:
# Row count of the stages table, i.e. how many stages the song has.
num_stages = stages_df.shape[0]
num_stages

5

Average TickSize

TickSize represents timing precision or granularity in each stage. Smaller tick sizes likely demand more accuracy from players.

In [23]:
# Mean TickSize across stages; a song without stages falls back to 0 instead
# of looking up a column that does not exist.
if stages_df.empty:
    average_tick_size = 0
else:
    average_tick_size = stages_df["TickSize"].mean()
# Display the value as a plain float rounded to two decimals.
round(float(average_tick_size), 2)

13.1

In [24]:
# One-row summary of the stage-level features, built column by column.
stage_level_df = pd.DataFrame({
    "Number of Stages": [num_stages],
    "Average Tick Size": [round(average_tick_size, 2)],
})
stage_level_df.head(n=5)

Unnamed: 0,Number of Stages,Average Tick Size
0,5,13.1


## Song Level Features

BPM

Higher BPM indicates a faster pace, generally increasing difficulty.

In [25]:
# Tempo read from the song metadata; defaults to 0 when "BPM" is absent.
bpm = song_metadata.get("BPM", 0)
# Show the value that was read.
bpm

196

Difficulty Name & ID 

The target we want to predict.

In [26]:
# Prediction target: the difficulty label and its numeric id, with sentinel
# fallbacks ("Unknown" / -1) when the metadata lacks them.
difficulty_name = song_metadata.get("DifficultyName", "Unknown")
difficulty_id = song_metadata.get("DifficultyId", -1)
# Emit the name and the id on separate lines.
print(difficulty_name, difficulty_id, sep="\n")

Extreme
3


In [27]:
# One-row summary of the song-level feature (BPM) and the difficulty target,
# built column by column.
song_level_df = pd.DataFrame({
    "BPM": [bpm],
    "Difficulty Name": [difficulty_name],
    "Difficulty ID": [difficulty_id],
})

song_level_df.head(n=5)

Unnamed: 0,BPM,Difficulty Name,Difficulty ID
0,196,Extreme,3


## Final song representation

features + target (difficulty)

In [28]:
# Place the three one-row summaries side by side so the song is described by
# a single row of features plus the difficulty target.
final_df = pd.concat(
    objs=[note_level_df, stage_level_df, song_level_df],
    axis="columns",
)
final_df.head(n=5)

Unnamed: 0,Duration (s),Note Density,Unique Directions,Unique Columns,Hold Notes Percentage,Number of Stages,Average Tick Size,BPM,Difficulty Name,Difficulty ID
0,133,4.83,8,3,1.09,5,13.1,196,Extreme,3
