# Main notebook for presenting results

**[WARNING]:** *Do not forget to run the `setup.py`script, and to choose the correct virtual environment for this notebook*  

In [1]:
%run setup.py

Checking the dataset existence and integrity.
Dataset does not exist or is corrupted. Downloading again...
Started downloading dataset from https://os.unil.cloud.switch.ch/fma/fma_metadata.zip...
Download: |████████████████████████████████████████| 100.0%
Dowload Finished !
Unzipping files...
Files extracted in the following folder: /home/onyxia/applied-statistical-learning/data/fma_metadata
Installing dependencies
Creating a new virtual environment...
Installing dependancies from requirements.txt...
Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m49.9 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.2
    Uninstalling pip-25.2:
      Successfully uninstalled pip-25.2
Successfully installed pip-25.3
Collecting pandas (from -r /home/onyxia/applied-st

Removing info rows of the features and small preprocessing

In [28]:
import pandas as pd
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning) #to avoid column type warnings


def preprocessing(df):
    h1 = df.iloc[0].astype(str)
    h2 = df.iloc[1].astype(str)

    new_columns = []
    for col, p1, p2 in zip(df.columns.astype(str), h1, h2):
        parts = [p for p in (col, p1, p2) if p and p.lower() != "nan"]

        if len(parts) >= 3:
            name = f"{parts[0]}_({parts[1]},{parts[2]})"
        elif len(parts) == 2:
            name = f"{parts[0]}_({parts[1]})"
        else:
            name = parts[0]

        new_columns.append(name)

    df.columns = new_columns
    df = df.rename(columns={new_columns[0]: "track_id"})
    df = df[3:].apply(pd.to_numeric, errors="coerce")
    return df

features = preprocessing(pd.read_csv("data/fma_metadata/features.csv"))
genres = pd.read_csv("data/fma_metadata/tracks.csv")
genres = genres[["Unnamed: 0", "track.7"]] # track.8 for non str label
genres = genres.rename(columns={"Unnamed: 0": "track_id", "track.7": "label"})
features = features.merge(genres, on="track_id", how="left")
features = features[~features["label"].isna()] #67k NAs
features.head()


Unnamed: 0,track_id,"chroma_cens_(kurtosis,01)","chroma_cens.1_(kurtosis,02)","chroma_cens.2_(kurtosis,03)","chroma_cens.3_(kurtosis,04)","chroma_cens.4_(kurtosis,05)","chroma_cens.5_(kurtosis,06)","chroma_cens.6_(kurtosis,07)","chroma_cens.7_(kurtosis,08)","chroma_cens.8_(kurtosis,09)",...,"tonnetz.40_(std,05)","tonnetz.41_(std,06)","zcr_(kurtosis,01)","zcr.1_(max,01)","zcr.2_(mean,01)","zcr.3_(median,01)","zcr.4_(min,01)","zcr.5_(skew,01)","zcr.6_(std,01)",label
16382,26532,-0.302969,-0.470872,-0.469196,-0.255122,-0.187806,-0.585547,-0.742369,-0.640302,-0.226568,...,0.027796,0.02893,5.608563,0.437988,0.055578,0.046387,0.003418,1.845556,0.033841,Jazz
16383,26533,0.440674,-0.503167,-0.675728,-0.749545,-0.088118,0.738185,-0.663051,-0.204217,-0.585312,...,0.025859,0.026092,3.398937,0.392578,0.05532,0.045898,0.0,1.542672,0.03764,Jazz
16384,26534,-0.312054,-0.323059,-0.49649,-0.432532,-0.44476,-0.27629,-0.425854,0.264916,-0.371155,...,0.02853,0.027341,17.46587,0.516113,0.056354,0.043945,0.0,3.443325,0.045574,Jazz
16387,26542,0.055924,-1.130357,-0.907664,-0.969065,-0.995403,-1.078394,-0.195077,0.31182,-0.855434,...,0.019759,0.021637,10.866811,0.44873,0.077721,0.07666,0.003418,1.72253,0.035665,Rock
16388,26543,-0.538276,0.881777,4.649347,1.704443,0.861879,-0.952233,-1.208602,-0.898565,0.106316,...,0.017965,0.019144,3.862004,0.36377,0.085055,0.07666,0.00293,1.54065,0.038215,Rock


Some desc stats

In [29]:
from collections import Counter

Counter(features[~features["label"].isna()]["label"]) #very asymetric, plotable

Counter({'Rock': 10727,
         'Experimental': 8896,
         'Electronic': 7639,
         'Hip-Hop': 2960,
         'Folk': 2054,
         'Pop': 1987,
         'Instrumental': 1936,
         'International': 1118,
         'Classical': 1039,
         'Jazz': 354,
         'Spoken': 268,
         'Old-Time / Historic': 215,
         'Soul-RnB': 115,
         'Country': 103,
         'Blues': 51,
         'Easy Listening': 23})