In [7]:
%reload_ext autoreload
%autoreload 2

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import sys
code_dir = os.path.dirname(os.getcwd())
sys.path.append(code_dir)

import json
import math
from glob import glob
from itertools import permutations
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import IPython.display as ipd

# Absolute, machine-specific locations of the two FMA metadata CSV files read by
# this notebook; adjust them when running on another machine.
tracks_csv_path = "/home/oaraz/nextcore/fingerprinting/datasets/neural-audio-fp-dataset/extras/fma_info/tracks.csv"
genres_csv_path = "/home/oaraz/nextcore/fingerprinting/datasets/neural-audio-fp-dataset/extras/fma_info/genres.csv"

# Figures are written to a "figures" directory next to the current working directory.
figure_dir = os.path.join("..", "figures")

## Load FMA Annotations

In [8]:
def load_tracks_csv(filepath: str) -> pd.DataFrame:
    """Read a tracks CSV with a two-row column header and normalise its dtypes.

    The first CSV column becomes the index and the first two rows become a
    two-level column index. After loading:

    * the tag / genre list columns are parsed from their string form with
      ``ast.literal_eval``;
    * the date columns are converted with ``pd.to_datetime``;
    * ``('set', 'subset')`` becomes an ordered categorical
      (``small < medium < large``), so it can be compared with ``<=``;
    * the remaining low-cardinality text columns become plain categoricals.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with the conversions above applied.

    Raises:
        KeyError: If one of the expected columns is missing from the file.
    """
    # ast is not imported at the top of the notebook, so keep the import local.
    import ast

    list_columns = [
        ('track', 'tags'),
        ('album', 'tags'),
        ('artist', 'tags'),
        ('track', 'genres'),
        ('track', 'genres_all'),
    ]
    date_columns = [
        ('track', 'date_created'),
        ('track', 'date_recorded'),
        ('album', 'date_created'),
        ('album', 'date_released'),
        ('artist', 'date_created'),
        ('artist', 'active_year_begin'),
        ('artist', 'active_year_end'),
    ]
    category_columns = [
        ('track', 'genre_top'),
        ('track', 'license'),
        ('album', 'type'),
        ('album', 'information'),
        ('artist', 'bio'),
    ]
    subsets = ('small', 'medium', 'large')

    tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

    # Lists are stored in the CSV as their Python repr, e.g. "[21, 103]".
    for column in list_columns:
        tracks[column] = tracks[column].map(ast.literal_eval)

    for column in date_columns:
        tracks[column] = pd.to_datetime(tracks[column])

    # Build the dtype explicitly instead of first trying the
    # astype('category', categories=..., ordered=...) keywords that pandas
    # removed in 0.25 and relying on the resulting exception.
    subset_dtype = pd.CategoricalDtype(categories=subsets, ordered=True)
    tracks['set', 'subset'] = tracks['set', 'subset'].astype(subset_dtype)

    for column in category_columns:
        tracks[column] = tracks[column].astype('category')

    return tracks

def count_genres(df):
    """Count tracks per top genre, with missing genres reported under "None".

    Args:
        df: DataFrame with a ``('track', 'genre_top')`` column.

    Returns:
        A dict mapping genre label to track count, ordered alphabetically by
        label. Tracks whose top genre is missing (NaN) are counted under the
        string key ``"None"``.
    """
    raw_counts = df['track', 'genre_top'].value_counts(dropna=False).to_dict()
    # NaN never compares equal to itself, so a dict membership test only finds
    # it when the key happens to be the very same object as np.nan. pd.isna
    # recognises every missing-value key, which also keeps the sort below from
    # mixing a float key with string keys.
    labelled_counts = {
        ("None" if pd.isna(genre) else genre): count
        for genre, count in raw_counts.items()
    }
    return dict(sorted(labelled_counts.items()))

In [9]:
# Load the track metadata, report its (rows, columns) shape and preview the first rows.
df = load_tracks_csv(tracks_csv_path)
print(df.shape)
df.head()

(106574, 52)


Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
20,0,2008-11-26 01:45:05,2009-01-06,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [10]:
# Second-level column names to discard, grouped by their first-level column.
# NOTE(review): in the original list the album tags/title, artist members/tags
# and track lyricist/tags entries carried a bare trailing "#" marker whose
# meaning is not documented.
_dropped_fields = {
    'album': [
        'comments', 'date_created', 'date_released', 'engineer', 'favorites',
        'information', 'listens', 'producer', 'tags', 'title',
    ],
    'artist': [
        'active_year_begin', 'active_year_end', 'associated_labels', 'bio',
        'comments', 'date_created', 'favorites', 'latitude', 'location',
        'longitude', 'members', 'related_projects', 'tags', 'website',
        'wikipedia_page',
    ],
    'set': ['split'],
    'track': [
        'bit_rate', 'comments', 'composer', 'date_created', 'date_recorded',
        'duration', 'favorites', 'information', 'interest', 'language_code',
        'license', 'listens', 'lyricist', 'publisher', 'tags',
    ],
}
drop_cols = [
    (group, field)
    for group, fields in _dropped_fields.items()
    for field in fields
]

# Only drop what is still present: the cell can then be re-run without a
# KeyError once the columns are already gone.
df = df.drop(columns=[column for column in drop_cols if column in df.columns])
df.head()

Unnamed: 0_level_0,album,album,album,artist,artist,set,track,track,track,track,track
Unnamed: 0_level_1,id,tracks,type,id,name,subset,genre_top,genres,genres_all,number,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
2,1,7,Album,1,AWOL,small,Hip-Hop,[21],[21],3,Food
3,1,7,Album,1,AWOL,medium,Hip-Hop,[21],[21],4,Electric Ave
5,1,7,Album,1,AWOL,small,Hip-Hop,[21],[21],6,This World
10,6,2,Album,6,Kurt Vile,small,Pop,[10],[10],1,Freeway
20,4,13,Album,4,Nicky Cook,large,,"[76, 103]","[17, 10, 76, 103]",3,Spiritual Level


In [11]:
# Track count per top genre over the whole table ("None" collects the tracks
# that have no single top genre).
full_top_genre_counts = count_genres(df)
print(json.dumps(full_top_genre_counts, indent=4))

# Alphabetical list of the real top genres, i.e. everything but the "None" bucket
top_genres = sorted(genre for genre in full_top_genre_counts if genre != "None")
print(len(top_genres))
print()
print("\n".join(top_genres))
print()

{
    "Blues": 110,
    "Classical": 1230,
    "Country": 194,
    "Easy Listening": 24,
    "Electronic": 9372,
    "Experimental": 10608,
    "Folk": 2803,
    "Hip-Hop": 3552,
    "Instrumental": 2079,
    "International": 1389,
    "Jazz": 571,
    "None": 56976,
    "Old-Time / Historic": 554,
    "Pop": 2332,
    "Rock": 14182,
    "Soul-RnB": 175,
    "Spoken": 423
}
16

Blues
Classical
Country
Easy Listening
Electronic
Experimental
Folk
Hip-Hop
Instrumental
International
Jazz
Old-Time / Historic
Pop
Rock
Soul-RnB
Spoken



In [None]:
# Keep only the tracks that have a top genre assigned (non-missing genre_top)
df_single_top_genre = df.loc[df['track', 'genre_top'].notna()].copy()
print(df_single_top_genre.shape)
df_single_top_genre.head()

In [None]:
# Get the medium subset. ('set', 'subset') is an ordered categorical
# (small < medium < large), so "<=" keeps the small tracks as well.
df_medium = df[df['set', 'subset'] <= "medium"].copy()
print(df_medium.shape)
# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly here.
ipd.display(df_medium.head())

# Top genre distribution of the medium subset
medium_top_genre_counts = count_genres(df_medium)
print()
print(len(medium_top_genre_counts))
print(json.dumps(medium_top_genre_counts, indent=4))

## Genre CSV

In [14]:
# Load the genre table, using the first CSV column as the index, then report
# its shape and preview the first rows.
df_genres = pd.read_csv(genres_csv_path, index_col=0)
print(df_genres.shape)
df_genres.head()

(163, 4)


Unnamed: 0_level_0,#tracks,parent,title,top_level
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,8693,38,Avant-Garde,38
2,5271,0,International,2
3,1752,0,Blues,3
4,4126,0,Jazz,4
5,4106,0,Classical,5


In [18]:
# Distinct values of the 'top_level' column, in order of first appearance.
top_genre_ids = df_genres['top_level'].unique()
print(top_genre_ids)

[  38    2    3    4    5   20    8    9   10   14   12   13   15   17
 1235   21]


In [None]:
# Tracks without a top genre (genre_top is missing)
_has_no_top_genre = df[('track', 'genre_top')].isna()
df_multi_top_genre = df.loc[_has_no_top_genre].copy()
df_multi_top_genre.head()

In [61]:
# Collect every distinct set of top-level genre ids found among the tracks
# without a top genre, in order of first appearance.
multi_top_genres = []
_seen_combinations = set()
for track_id, row in df_multi_top_genre['track', 'genres'].items():
    # Map the track's genre ids to their top-level ids
    genres = set(df_genres.loc[row, 'top_level'].unique().tolist())
    if frozenset(genres) in _seen_combinations:
        continue
    _seen_combinations.add(frozenset(genres))
    multi_top_genres.append(genres)

In [62]:
# Show the distinct top-level genre combinations collected in the previous cell.
multi_top_genres

[{10, 17},
 {10, 12},
 {2, 15},
 {38, 1235},
 {4, 15, 38},
 {3, 4},
 set(),
 {17, 1235},
 {15, 38},
 {4, 38},
 {2, 8},
 {5, 15},
 {12, 17},
 {2, 4},
 {12, 15},
 {9, 12, 17, 38},
 {2, 14, 15},
 {12, 38},
 {10, 12, 17},
 {20, 38},
 {12, 15, 38},
 {12, 20},
 {9, 12},
 {13, 38},
 {5, 8},
 {10, 15},
 {2, 12},
 {2, 12, 17},
 {2, 17},
 {10, 15, 17},
 {10, 38},
 {9, 12, 17},
 {17, 20},
 {10, 12, 38},
 {2, 4, 17, 38},
 {2, 17, 38},
 {12, 21, 38},
 {15, 38, 1235},
 {17, 38},
 {2, 9, 12},
 {12, 20, 1235},
 {12, 17, 38},
 {4, 12, 38},
 {15, 17, 38},
 {10, 17, 20},
 {9, 17},
 {5, 9},
 {2, 4, 38},
 {8, 20, 38},
 {21, 38},
 {8, 38},
 {10, 15, 38},
 {15, 21},
 {8, 9, 17},
 {2, 8, 38},
 {8, 9},
 {3, 12, 17},
 {4, 12},
 {2, 38, 1235},
 {14, 15, 21},
 {2, 14, 21},
 {9, 10, 12, 17},
 {10, 12, 15},
 {12, 17, 1235},
 {12, 1235},
 {14, 21},
 {15, 1235},
 {12, 17, 38, 1235},
 {5, 38},
 {15, 20},
 {13, 15, 1235},
 {4, 15},
 {10, 14, 21},
 {14, 15},
 {12, 14},
 {8, 17},
 {3, 17},
 {2, 38},
 {2, 3, 15},
 {5, 38,

In [None]:
# Map every genre title to a consecutive integer (0, 1, 2, ...) that follows
# the row order of df_genres
genre_id_dict = df_genres['title'].to_dict()
genre_id_dict = dict(zip(genre_id_dict.values(), range(len(genre_id_dict))))

In [None]:
def _lookup_top_genre_id(row):
    """Integer id of the row's top genre, or -1 when it has none (NaN)."""
    return genre_id_dict.get(row['track', 'genre_top'], -1)


# Record the integer id of each track's top genre in a new column
df['track', 'genre_top_id'] = df.apply(_lookup_top_genre_id, axis=1)
df['track', 'genre_top_id'].unique()

In [11]:
# Titles of the genres whose id appears in the 'top_level' column.
set(df_genres.loc[df_genres['top_level'].unique(), 'title'].values)

{'Blues',
 'Classical',
 'Country',
 'Easy Listening',
 'Electronic',
 'Experimental',
 'Folk',
 'Hip-Hop',
 'Instrumental',
 'International',
 'Jazz',
 'Old-Time / Historic',
 'Pop',
 'Rock',
 'Soul-RnB',
 'Spoken'}

In [12]:
# Display the genre table.
df_genres

Unnamed: 0_level_0,#tracks,parent,title,top_level
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,8693,38,Avant-Garde,38
2,5271,0,International,2
3,1752,0,Blues,3
4,4126,0,Jazz,4
5,4106,0,Classical,5
...,...,...,...,...
1032,60,102,Turkish,2
1060,30,46,Tango,2
1156,26,130,Fado,2
1193,72,763,Christmas,38


In [14]:
# Genres whose 'parent' is 0, ordered by '#tracks' from largest to smallest.
df_genres[df_genres['parent'] == 0].sort_values('#tracks', ascending=False)

Unnamed: 0_level_0,#tracks,parent,title,top_level
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
38,38154,0,Experimental,38
15,34413,0,Electronic,15
12,32923,0,Rock,12
1235,14938,0,Instrumental,1235
10,13845,0,Pop,10
17,12706,0,Folk,17
21,8389,0,Hip-Hop,21
2,5271,0,International,2
4,4126,0,Jazz,4
5,4106,0,Classical,5


### Tree

In [3]:
import pydot

class Genres:
    """Helper for exploring the parent/child hierarchy of a genre table.

    The wrapped DataFrame is indexed by genre id and is read through its
    'title', '#tracks' and 'parent' columns. A 'parent' value of 0 marks a
    genre without a parent.
    """

    def __init__(self, genres_df):
        self.df = genres_df

    def create_tree(self, roots, depth=None):
        """Build a pydot digraph of the genres found below ``roots``.

        Args:
            roots: A single genre id, or any iterable of genre ids (list,
                tuple, set, array, ...).
            depth: How many levels of children to add below each root.
                ``None`` follows the hierarchy all the way down and ``0`` adds
                the root nodes only.

        Returns:
            A strict ``pydot.Dot`` digraph. Every node is labelled with the
            genre title, its id and its '#tracks' value.
        """
        from collections.abc import Iterable

        # Accept any iterable of ids, not just a list; strings are treated as
        # one id rather than being split into characters.
        if isinstance(roots, (str, bytes)) or not isinstance(roots, Iterable):
            roots = [roots]
        else:
            roots = list(roots)
        graph = pydot.Dot(graph_type='digraph', strict=True)

        def create_node(genre_id):
            title = self.df.at[genre_id, 'title']
            ntracks = self.df.at[genre_id, '#tracks']
            # The label is wrapped in double quotes before being given to pydot.
            name = '"{}\n{} / {}"'.format(title, genre_id, ntracks)
            return pydot.Node(name)

        def add_children(parent_id, parent_node, remaining_depth):
            # remaining_depth counts down to 0; None never reaches 0, so the
            # recursion only stops at genres without children.
            if remaining_depth == 0:
                return
            children = self.df[self.df['parent'] == parent_id]
            next_depth = None if remaining_depth is None else remaining_depth - 1
            for genre_id in children.index:
                child_node = create_node(genre_id)
                graph.add_edge(pydot.Edge(parent_node, child_node))
                add_children(genre_id, child_node, next_depth)

        for root in roots:
            root_node = create_node(root)
            graph.add_node(root_node)
            add_children(root, root_node, depth)

        return graph

    def find_roots(self):
        """Return the ids of all genres without a parent, in table order.

        Raises:
            RuntimeError: If a genre names a non-zero parent id that is not in
                the table index.
        """
        roots = []
        for gid, parent, title in zip(self.df.index, self.df['parent'], self.df['title']):
            if parent == 0:
                roots.append(gid)
            elif parent not in self.df.index:
                msg = '{} ({}) has parent {} which is missing'.format(gid, title, parent)
                raise RuntimeError(msg)
        return roots

In [4]:
# Wrap the genre table and build a one-level tree below genre ids 25 and 31.
g = Genres(df_genres)
graph = g.create_tree([25, 31], 1)
# Optional inline rendering of the graph as a PNG (disabled).
#ipd.Image(graph.create_png())

In [6]:
# Build the full hierarchy below every parentless genre and export it as a PDF.
roots = g.find_roots()
print('{} roots'.format(len(roots)))
graph = g.create_tree(roots)
# Relative file name: the PDF lands in the current working directory, not in figure_dir.
graph.write_pdf('genre_hierarchy.pdf')

16 roots


## Examine k-fold Splits

In [None]:
# Absolute, machine-specific directory holding one sub-directory per fold.
five_fold_dir = "/home/oaraz/nextcore/fingerprinting/datasets/5_fold-fma_full_subset/"

# fold_analysis_dir = os.path.join(five_fold_dir, "fold_analysis")
# os.makedirs(fold_analysis_dir, exist_ok=True)

# Per-fold figures go to <figure_dir>/5_fold, created on demand.
folds_fig_dir = os.path.join(figure_dir, "5_fold")
os.makedirs(folds_fig_dir, exist_ok=True)

In [None]:
def _fold_track_ids(list_path):
    """Return the integer track id encoded in each file name listed in ``list_path``."""
    with open(list_path, "r") as in_f:
        # Blank lines carry no path and are skipped.
        paths = [line.strip() for line in in_f if line.strip()]
    # int() parses zero-padded stems on its own; stripping the zeros by hand
    # would turn an all-zero stem into "" and make int() fail.
    return [int(os.path.splitext(os.path.basename(path))[0]) for path in paths]


def _fold_top_genre_counts(track_ids):
    """Count the given tracks per top genre, sorted by genre name.

    Missing top genres are counted under "None" and every entry of
    ``top_genres`` is present in the result, with 0 when the fold has none.
    """
    labels = []
    for track_id in track_ids:
        genre = df.loc[track_id, ('track', 'genre_top')]
        labels.append("None" if pd.isna(genre) else genre)
    counts = Counter(labels)
    for genre in top_genres:
        counts.setdefault(genre, 0)
    return dict(sorted(counts.items()))


# One figure per fold: top genre distribution of the full table, of the fold's
# training set and of its test query set.
for fold_id in range(5):

    fold_dir = os.path.join(five_fold_dir, str(fold_id))

    train_track_ids = _fold_track_ids(os.path.join(fold_dir, "train.txt"))
    test_query_track_ids = _fold_track_ids(os.path.join(fold_dir, "test_query.txt"))

    train_top_genre_counter = _fold_top_genre_counts(train_track_ids)
    test_query_top_genre_counter = _fold_top_genre_counts(test_query_track_ids)

    fig, ax = plt.subplots(nrows=3, figsize=(15, 12), constrained_layout=True)
    ax = np.array(ax).flatten()
    fig.suptitle(f"Top Genre Distributions of FMA_full vs Fold{fold_id}", fontsize=20)

    ax[0].bar(full_top_genre_counts.keys(), full_top_genre_counts.values())
    ax[0].set_title("FMA Full (Multiple Top Genre is 'None' Column)", fontsize=15)

    ax[1].bar(train_top_genre_counter.keys(), train_top_genre_counter.values())
    ax[1].set_title("Training Set")

    ax[2].bar(test_query_top_genre_counter.keys(), test_query_top_genre_counter.values())
    ax[2].set_title("Test Query Set")

    # Same log-scaled y axis on every panel so the three are directly comparable.
    for _x in ax:
        _x.grid()
        _x.set_ylabel("#Tracks (log)")
        _x.set_xlabel("Genres")
        _x.tick_params(axis='x', labelrotation=15)
        _x.set_yscale("log")
        _x.set_yticks(10**np.arange(6))
        _x.set_ylim([0.1, 10**5])

    fig.savefig(os.path.join(folds_fig_dir, f"{fold_id}.png"))

    # Close this specific figure so the loop does not accumulate open figures.
    plt.close(fig)

## Sample Here

In [None]:
from sklearn.model_selection import train_test_split

# Number of tracks wanted in the training, validation and test query sets.
N_TRAIN = 10000
N_VAL = 1000
N_TEST = 5000

# Total number of tracks drawn across the three sets.
N_TOTAL = N_TRAIN + N_VAL + N_TEST

# Figures of the sampling experiments go to <figure_dir>/sample, created on demand.
sample_figures_dir = os.path.join(figure_dir, "sample")
os.makedirs(sample_figures_dir, exist_ok=True)

### From Single Top Genre

#### Not stratified

In [None]:
# Two-step variant: first draw N_TOTAL tracks at random, then split that sample
# into a development part (train + validation) and a test query part.
train_val_test_df = df_single_top_genre.copy().sample(n=N_TOTAL, random_state=27)

train_val_df, test_query_df = train_test_split(
    train_val_test_df, test_size=N_TEST, random_state=27
)
train_df, val_df = train_test_split(train_val_df, test_size=N_VAL, random_state=27)
assert len(train_df) + len(val_df) + len(test_query_df) == N_TOTAL
del train_val_df

train_top_genre_counter, val_top_genre_counter, test_query_top_genre_counter = (
    count_genres(split_df) for split_df in (train_df, val_df, test_query_df)
)

# Plot: full table on top, development set in the middle, test query set below
fig, ax = plt.subplots(nrows=3, figsize=(15, 12), constrained_layout=True)
ax = np.array(ax).flatten()
fig.suptitle("Top Genre Distributions of FMA_full vs Our Splits", fontsize=20)

ax[0].bar(full_top_genre_counts.keys(), full_top_genre_counts.values())
ax[0].set_title("FMA Full (Multiple Top Genre is 'None' Column)", fontsize=15)

# Validation bars are drawn over the training bars on the same axis
ax[1].bar(train_top_genre_counter.keys(), train_top_genre_counter.values(), label="Train")
ax[1].bar(
    val_top_genre_counter.keys(),
    val_top_genre_counter.values(),
    color="orange",
    label="Validation",
)
ax[1].set_title("Development Set (Only Single Top Genre)", fontsize=15)
ax[1].legend()

ax[2].bar(test_query_top_genre_counter.keys(), test_query_top_genre_counter.values())
ax[2].set_title("Test Query Set (Only Single Top Genre)", fontsize=15)

# Identical log-scaled y axis on every panel
for _x in ax:
    _x.grid()
    _x.set_xlabel("Genres", fontsize=13)
    _x.set_ylabel("#Tracks (log)", fontsize=13)
    _x.tick_params(axis='x', labelrotation=15)
    _x.set_yscale("log")
    _x.set_yticks(10**np.arange(6))
    _x.set_ylim([0.1, 10**5])

plt.show()

fig.savefig(os.path.join(sample_figures_dir, "large_single-our_splits-2_step.png"))

In [None]:
# One-step variant: take the development and test query sets straight from the
# single-top-genre tracks, then split development into train and validation.
train_val_df, test_query_df = train_test_split(
    df_single_top_genre.copy(),
    train_size=N_TRAIN + N_VAL,
    test_size=N_TEST,
    random_state=27,
)
train_df, val_df = train_test_split(
    train_val_df, train_size=N_TRAIN, test_size=N_VAL, random_state=27
)
assert len(train_df) + len(val_df) + len(test_query_df) == N_TOTAL
del train_val_df

train_top_genre_counter, val_top_genre_counter, test_query_top_genre_counter = (
    count_genres(split_df) for split_df in (train_df, val_df, test_query_df)
)

# Plot: full table on top, development set in the middle, test query set below
fig, ax = plt.subplots(nrows=3, figsize=(15, 12), constrained_layout=True)
ax = np.array(ax).flatten()
fig.suptitle("Top Genre Distributions of FMA_full vs Our Splits", fontsize=20)

ax[0].bar(full_top_genre_counts.keys(), full_top_genre_counts.values())
ax[0].set_title("FMA Full (Multiple Top Genre is 'None' Column)", fontsize=15)

# Validation bars are drawn over the training bars on the same axis
ax[1].bar(train_top_genre_counter.keys(), train_top_genre_counter.values(), label="Train")
ax[1].bar(
    val_top_genre_counter.keys(),
    val_top_genre_counter.values(),
    color="orange",
    label="Validation",
)
ax[1].set_title("Development Set (Only Single Top Genre)", fontsize=15)
ax[1].legend()

ax[2].bar(test_query_top_genre_counter.keys(), test_query_top_genre_counter.values())
ax[2].set_title("Test Query Set (Only Single Top Genre)", fontsize=15)

# Identical log-scaled y axis on every panel
for _x in ax:
    _x.grid()
    _x.set_xlabel("Genres", fontsize=13)
    _x.set_ylabel("#Tracks (log)", fontsize=13)
    _x.tick_params(axis='x', labelrotation=15)
    _x.set_yscale("log")
    _x.set_yticks(10**np.arange(6))
    _x.set_ylim([0.1, 10**5])

plt.show()

fig.savefig(os.path.join(sample_figures_dir, "large_single-our_splits.png"))

#### Stratified

In [None]:
# Stratified variant: both splits keep the top genre proportions of their input.
train_val_df, test_query_df = train_test_split(
    df_single_top_genre.copy(),
    train_size=N_TRAIN + N_VAL,
    test_size=N_TEST,
    stratify=df_single_top_genre['track', 'genre_top'],
    random_state=27,
)
train_df, val_df = train_test_split(
    train_val_df,
    train_size=N_TRAIN,
    test_size=N_VAL,
    stratify=train_val_df['track', 'genre_top'],
    random_state=27,
)
assert len(train_df) + len(val_df) + len(test_query_df) == N_TOTAL
del train_val_df

train_top_genre_counter, val_top_genre_counter, test_query_top_genre_counter = (
    count_genres(split_df) for split_df in (train_df, val_df, test_query_df)
)

# Plot: full table on top, development set in the middle, test query set below
fig, ax = plt.subplots(nrows=3, figsize=(15, 12), constrained_layout=True)
ax = np.array(ax).flatten()
fig.suptitle("Top Genre Distributions of FMA_full vs Our Stratified Splits", fontsize=20)

ax[0].bar(full_top_genre_counts.keys(), full_top_genre_counts.values())
ax[0].set_title("FMA Full (Multiple Top Genre is 'None' Column)", fontsize=15)

# Validation bars are drawn over the training bars on the same axis
ax[1].bar(train_top_genre_counter.keys(), train_top_genre_counter.values(), label="Train")
ax[1].bar(
    val_top_genre_counter.keys(),
    val_top_genre_counter.values(),
    color="orange",
    label="Validation",
)
ax[1].set_title("Development Set (Only Single Top Genre)", fontsize=15)
ax[1].legend()

ax[2].bar(test_query_top_genre_counter.keys(), test_query_top_genre_counter.values())
ax[2].set_title("Test Query Set (Only Single Top Genre)", fontsize=15)

# Identical log-scaled y axis on every panel
for _x in ax:
    _x.grid()
    _x.set_xlabel("Genres", fontsize=13)
    _x.set_ylabel("#Tracks (log)", fontsize=13)
    _x.tick_params(axis='x', labelrotation=15)
    _x.set_yscale("log")
    _x.set_yticks(10**np.arange(6))
    _x.set_ylim([0.1, 10**5])

plt.show()

fig.savefig(os.path.join(sample_figures_dir, "large_single-our_splits-stratified.png"))

### From Medium

In [None]:
# Stratified sampling restricted to the medium subset.
train_val_df, test_query_df = train_test_split(
    df_medium.copy(),
    train_size=N_TRAIN + N_VAL,
    test_size=N_TEST,
    stratify=df_medium['track', 'genre_top'],
    random_state=27,
)
train_df, val_df = train_test_split(
    train_val_df,
    train_size=N_TRAIN,
    test_size=N_VAL,
    stratify=train_val_df['track', 'genre_top'],
    random_state=27,
)
assert len(train_df) + len(val_df) + len(test_query_df) == N_TOTAL
del train_val_df

train_top_genre_counter, val_top_genre_counter, test_query_top_genre_counter = (
    count_genres(split_df) for split_df in (train_df, val_df, test_query_df)
)

# Plot: full table, medium subset, development set and test query set, top to bottom
fig, ax = plt.subplots(nrows=4, figsize=(15, 16), constrained_layout=True)
ax = np.array(ax).flatten()
fig.suptitle("Top Genre Distributions of FMA_full vs Our Stratified Splits", fontsize=20)

ax[0].bar(full_top_genre_counts.keys(), full_top_genre_counts.values())
ax[0].set_title("FMA Full (Multiple Top Genre is 'None' Column)", fontsize=15)

ax[1].bar(medium_top_genre_counts.keys(), medium_top_genre_counts.values())
ax[1].set_title("FMA Medium", fontsize=15)

# Validation bars are drawn over the training bars on the same axis
ax[2].bar(train_top_genre_counter.keys(), train_top_genre_counter.values(), label="Train")
ax[2].bar(
    val_top_genre_counter.keys(),
    val_top_genre_counter.values(),
    color="orange",
    label="Validation",
)
ax[2].set_title("Development Set (Subset of FMA_medium)", fontsize=15)
ax[2].legend()

ax[3].bar(test_query_top_genre_counter.keys(), test_query_top_genre_counter.values())
ax[3].set_title("Test Query Set (Subset of FMA_medium)", fontsize=15)

# Identical log-scaled y axis on every panel
for _x in ax:
    _x.grid()
    _x.set_xlabel("Genres", fontsize=13)
    _x.set_ylabel("#Tracks (log)", fontsize=13)
    _x.tick_params(axis='x', labelrotation=15)
    _x.set_yscale("log")
    _x.set_yticks(10**np.arange(6))
    _x.set_ylim([0.1, 10**5])

plt.show()

fig.savefig(os.path.join(sample_figures_dir, "medium-our_splits-stratified.png"))

### From Large

In [None]:
# Stratified sampling over the whole table. The integer genre_top_id column is
# used as the stratification label, so tracks without a top genre (id -1) form
# a class of their own.
train_val_df, test_query_df = train_test_split(
    df.copy(),
    train_size=N_TRAIN + N_VAL,
    test_size=N_TEST,
    stratify=df['track', 'genre_top_id'],
    random_state=27,
)
train_df, val_df = train_test_split(
    train_val_df,
    train_size=N_TRAIN,
    test_size=N_VAL,
    stratify=train_val_df['track', 'genre_top_id'],
    random_state=27,
)
assert len(train_df) + len(val_df) + len(test_query_df) == N_TOTAL
del train_val_df

train_top_genre_counter, val_top_genre_counter, test_query_top_genre_counter = (
    count_genres(split_df) for split_df in (train_df, val_df, test_query_df)
)

# Plot: full table on top, development set in the middle, test query set below
fig, ax = plt.subplots(nrows=3, figsize=(15, 12), constrained_layout=True)
ax = np.array(ax).flatten()
fig.suptitle("Top Genre Distributions of FMA_full vs Our Stratified Splits", fontsize=20)

ax[0].bar(full_top_genre_counts.keys(), full_top_genre_counts.values())
ax[0].set_title("FMA Full", fontsize=15)

# Validation bars are drawn over the training bars on the same axis
ax[1].bar(train_top_genre_counter.keys(), train_top_genre_counter.values(), label="Train")
ax[1].bar(
    val_top_genre_counter.keys(),
    val_top_genre_counter.values(),
    color="orange",
    label="Validation",
)
ax[1].set_title("Development Set (Subset of FMA_full)", fontsize=15)
ax[1].legend()

ax[2].bar(test_query_top_genre_counter.keys(), test_query_top_genre_counter.values())
ax[2].set_title("Test Query Set (Subset of FMA_full)", fontsize=15)

# Identical log-scaled y axis on every panel
for _x in ax:
    _x.grid()
    _x.set_xlabel("Genres", fontsize=13)
    _x.set_ylabel("#Tracks (log)", fontsize=13)
    _x.tick_params(axis='x', labelrotation=15)
    _x.set_yscale("log")
    _x.set_yticks(10**np.arange(6))
    _x.set_ylim([0.1, 10**5])

plt.show()

fig.savefig(os.path.join(sample_figures_dir, "full-our_splits-stratified.png"))

## Examine the NAFP Splits

In [None]:
import glob
import os
from itertools import combinations
import random

In [None]:
# Absolute, machine-specific root of the NAFP music folders.
music_dir = "/home/oaraz/nextcore/fingerprinting/datasets/neural-audio-fp-dataset/music"

train_dir = os.path.join(music_dir, "train-10k-30s")
val_dir = os.path.join(music_dir, "val-query-db-500-30s")
test_dummy_dir = os.path.join(music_dir, "test-dummy-db-100k-full/fma_full")
test_query_db_dir = os.path.join(music_dir, "test-query-db-500-30s/db")

# Recursively list the .wav files of each split.
# `glob.glob` needs `glob` to be the module: the `import glob` cell just above
# must have run, since the first cell binds `glob` to the function instead.
_wav_pattern = "**/*.wav"
train_fps = sorted(glob.glob(os.path.join(train_dir, _wav_pattern), recursive=True))
val_fps = sorted(glob.glob(os.path.join(val_dir, _wav_pattern), recursive=True))
test_dummy_fps = sorted(glob.glob(os.path.join(test_dummy_dir, _wav_pattern), recursive=True))
test_query_db_fps = sorted(glob.glob(os.path.join(test_query_db_dir, _wav_pattern), recursive=True))

# Unique file names per split, with their counts
train_fnames = {os.path.basename(fp) for fp in train_fps}
print(len(train_fnames))
val_fnames = {os.path.basename(fp) for fp in val_fps}
print(len(val_fnames))
test_dummy_fnames = {os.path.basename(fp) for fp in test_dummy_fps}
print(len(test_dummy_fnames))
test_query_db_fnames = {os.path.basename(fp) for fp in test_query_db_fps}
print(len(test_query_db_fnames))
print()

# Number of file names shared by each pair of splits
_named_splits = [
    ('train', train_fnames),
    ('val', val_fnames),
    ('test_dummy', test_dummy_fnames),
    ('test_query', test_query_db_fnames),
]
for (name_a, fnames_a), (name_b, fnames_b) in combinations(_named_splits, 2):
    print(name_a, name_b, len(fnames_a & fnames_b))

In [None]:
# Track id of every file = its file name without the extension. int() parses
# zero-padded names on its own, so all three splits are parsed the same way
# (stripping zeros by hand would also break an all-zero name).
train_track_ids = [int(os.path.splitext(os.path.basename(fp))[0]) for fp in train_fps]
val_track_ids = [int(os.path.splitext(os.path.basename(fp))[0]) for fp in val_fps]
test_query_db_track_ids = [int(os.path.splitext(os.path.basename(fp))[0]) for fp in test_query_db_fps]

# Metadata rows of each split
train_df = df.loc[train_track_ids].copy()
val_df = df.loc[val_track_ids].copy()
test_query_db_df = df.loc[test_query_db_track_ids].copy()

train_top_genre_counter = count_genres(train_df)
val_top_genre_counter = count_genres(val_df)
test_query_top_genre_counter = count_genres(test_query_db_df)

# Plot: full table on top, development set in the middle, test query set below
fig, ax = plt.subplots(nrows=3, figsize=(15, 12), constrained_layout=True)
ax = np.array(ax).flatten()
fig.suptitle("Top Genre Distributions of FMA_full vs NAFP Splits", fontsize=20)

ax[0].bar(full_top_genre_counts.keys(), full_top_genre_counts.values())
ax[0].set_title("FMA Full (Multiple Top Genre is 'None' Column)", fontsize=15)

# Validation bars are drawn over the training bars on the same axis
ax[1].bar(train_top_genre_counter.keys(), train_top_genre_counter.values(), label="Train")
ax[1].bar(val_top_genre_counter.keys(), val_top_genre_counter.values(), color="orange", label="Validation")
ax[1].set_title("Development Set (Only Single Top Genre)", fontsize=15)
ax[1].legend()

ax[2].bar(test_query_top_genre_counter.keys(), test_query_top_genre_counter.values())
ax[2].set_title("Test Query Set (Only Single Top Genre)", fontsize=15)

# Identical log-scaled y axis on every panel
for _x in ax:
    _x.grid()
    _x.set_ylabel("#Tracks (log)")
    _x.set_xlabel("Genres")
    _x.tick_params(axis='x', labelrotation=15)
    _x.set_yscale("log")
    _x.set_yticks(10**np.arange(6))
    _x.set_ylim([0.1, 10**5])

fig.savefig(os.path.join(figure_dir, "nafp-splits.png"))

plt.show()

### Genre Experiment

In [None]:
# Track id of every file = its file name without the extension. int() parses
# zero-padded names on its own, so all three splits are parsed the same way.
train_track_ids = [int(os.path.splitext(os.path.basename(fp))[0]) for fp in train_fps]
val_track_ids = [int(os.path.splitext(os.path.basename(fp))[0]) for fp in val_fps]
test_query_db_track_ids = [int(os.path.splitext(os.path.basename(fp))[0]) for fp in test_query_db_fps]

train_df = df.loc[train_track_ids].copy()
val_df = df.loc[val_track_ids].copy()
test_query_db_df = df.loc[test_query_db_track_ids].copy()

# Development tracks (train + validation) are removed from the candidate pool
dev_track_ids = train_track_ids + val_track_ids

# Disabled alternative that samples from the medium subset instead:
#_df = df_medium.drop(dev_track_ids)
# _ , new_test_query_db_df = train_test_split(_df,
#                                             test_size=500,
#                                             random_state=34,
#                                             stratify=_df['track', 'genre_top'])

# Sample 500 tracks, stratified by top genre, from the remaining
# single-top-genre tracks; they may coincide with the old test query db.
_df = df_single_top_genre.drop(dev_track_ids)
_, new_test_query_db_df = train_test_split(
    _df,
    test_size=500,
    random_state=34,
    stratify=_df['track', 'genre_top'],
)

# Number of tracks shared by the new sample and the old test query db
new_test_indices = set(new_test_query_db_df.index.to_list())
print(len(new_test_indices.intersection(set(test_query_db_track_ids))))

In [None]:
# Top genre counts of the candidate pool left after removing the development tracks.
_df['track', 'genre_top'].value_counts()

In [None]:
# Top genre counts of the development sets and of the old and new test query sets
(
    train_top_genre_counter,
    val_top_genre_counter,
    test_query_top_genre_counter,
    new_test_query_top_genre_counter,
) = (
    count_genres(split_df)
    for split_df in (train_df, val_df, test_query_db_df, new_test_query_db_df)
)

# Plot: full table, development set, old test query set, new test query set
fig, ax = plt.subplots(nrows=4, figsize=(15, 16), constrained_layout=True)
ax = np.array(ax).flatten()

fig.suptitle("Top Genre Distributions of FMA_full vs Test Query", fontsize=20)

ax[0].bar(full_top_genre_counts.keys(), full_top_genre_counts.values())
ax[0].set_title("FMA Full (Multiple Top Genre is 'None' Column)", fontsize=15)

# Validation bars are drawn over the training bars on the same axis
ax[1].bar(train_top_genre_counter.keys(), train_top_genre_counter.values(), label="Train")
ax[1].bar(
    val_top_genre_counter.keys(),
    val_top_genre_counter.values(),
    color="orange",
    label="Validation",
)
ax[1].set_title("Development Set (Only Single Top Genre)", fontsize=15)
ax[1].legend()

ax[2].bar(test_query_top_genre_counter.keys(), test_query_top_genre_counter.values())
ax[2].set_title("Old Test Query Set", fontsize=15)

ax[3].bar(new_test_query_top_genre_counter.keys(), new_test_query_top_genre_counter.values())
ax[3].set_title("New Test Query Set", fontsize=15)

# Identical log-scaled y axis on every panel
for _x in ax:
    _x.grid()
    _x.set_xlabel("Genres")
    _x.set_ylabel("#Tracks (log)")
    _x.tick_params(axis='x', labelrotation=15)
    _x.set_yscale("log")
    _x.set_yticks(10**np.arange(6))
    _x.set_ylim([0.1, 10**5])

fig.savefig(os.path.join(figure_dir, f"nafp-new_splits-single_top.png"))

plt.show()

## test_ids

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Array of test ids stored as a .npy file.
test_ids_path = "/home/oaraz/nextcore/fingerprinting/neural-audio-fp/eval/test_ids_icassp2021.npy"
test_ids = np.load(file=test_ids_path)

In [None]:
# Per-track boundaries; the cells below index it as columns 0 and 1 and
# treat both ends as inclusive.
boundaries_path = "/home/oaraz/nextcore/fingerprinting/neural-audio-fp/logs/emb/640_lamb/101/db-track_boundaries.npy"
boundaries = np.load(file=boundaries_path)

In [None]:
# Tally how many test ids fall inside each track.
# One slot per row of `boundaries` rather than a hard-coded 500: with fewer
# rows the zero-count statistic would include phantom tracks, with more rows
# the increment below would raise KeyError.
counter = {n: 0 for n in range(len(boundaries))}
for test_id in test_ids:
    # Row whose inclusive [start, end] range contains this test id.
    inside_track = (boundaries[:, 0] <= test_id) & (boundaries[:, 1] >= test_id)
    track_idx = np.where(inside_track)[0][0]
    counter[track_idx] += 1

In [None]:
# Number of tracks that received no test id at all.
sum(1 for c in counter.values() if c == 0)

In [None]:
# Median number of test ids per track.
np.median([*counter.values()])

In [None]:
# Largest number of test ids that landed in a single track.
max(counter.values())

In [None]:
# Bar chart of the per-track tally held in `counter` (track index -> count).
x = list(counter.keys())
y = list(counter.values())
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(x, y)
# Label the figure so it can be read without the surrounding code.
ax.set_title("Number of test IDs per track")
ax.set_xlabel("Track index (row of boundaries)")
ax.set_ylabel("#Test IDs")
ax.grid()
plt.show()

In [None]:
test_seq_len = [1, 3, 5, 9, 15, 19]
# Use the longest sequence length explicitly instead of relying on list order.
longest_seq_len = max(test_seq_len)
# Distance between consecutive candidate start ids inside a track.
test_id_hop = 11

test_ids = []
for s, e in boundaries:
    # Cut the query into segments of test_seq_len.
    # If the last segment is shorter than test_seq_len, ignore it.
    # `e` is inclusive, so the last start whose longest segment still fits is
    # e + 1 - longest_seq_len. np.arange excludes `stop`, hence the extra +1;
    # without it a segment ending exactly on `e` was silently dropped.
    test_ids.append(np.arange(s, e + 2 - longest_seq_len, test_id_hop))
test_ids = np.concatenate(test_ids)
print(len(test_ids))

# Tally the generated ids per track; one slot per row of `boundaries`
# rather than a hard-coded 500 so the zero count below is exact.
counter = {n: 0 for n in range(len(boundaries))}
for test_id in test_ids:
    # Row whose inclusive [start, end] range contains this test id.
    inside_track = (boundaries[:, 0] <= test_id) & (boundaries[:, 1] >= test_id)
    track_idx = np.where(inside_track)[0][0]
    counter[track_idx] += 1
# Number of tracks that received no test id.
print(sum(1 for c in counter.values() if c == 0))

x = list(counter.keys())
y = list(counter.values())
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(x, y)
ax.set_title(f"Number of test IDs per track (hop={test_id_hop})")
ax.set_xlabel("Track index (row of boundaries)")
ax.set_ylabel("#Test IDs")
ax.grid()
plt.show()

In [None]:
test_seq_len = [1, 3, 5, 9, 15, 19]
# Use the longest sequence length explicitly instead of relying on list order.
longest_seq_len = max(test_seq_len)
# Distance between consecutive candidate start ids inside a track.
test_id_hop = 5

test_ids = []
for s, e in boundaries:
    # Cut the query into segments of test_seq_len.
    # If the last segment is shorter than test_seq_len, ignore it.
    # `e` is inclusive, so the last start whose longest segment still fits is
    # e + 1 - longest_seq_len. np.arange excludes `stop`, hence the extra +1;
    # without it a segment ending exactly on `e` was silently dropped.
    test_ids.append(np.arange(s, e + 2 - longest_seq_len, test_id_hop))
test_ids = np.concatenate(test_ids)

print(len(test_ids))

# Tally the generated ids per track; one slot per row of `boundaries`
# rather than a hard-coded 500.
counter = {n: 0 for n in range(len(boundaries))}
for test_id in test_ids:
    # Row whose inclusive [start, end] range contains this test id.
    inside_track = (boundaries[:, 0] <= test_id) & (boundaries[:, 1] >= test_id)
    track_idx = np.where(inside_track)[0][0]
    counter[track_idx] += 1

x = list(counter.keys())
y = list(counter.values())
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(x, y)
ax.set_title(f"Number of test IDs per track (hop={test_id_hop})")
ax.set_xlabel("Track index (row of boundaries)")
ax.set_ylabel("#Test IDs")
ax.grid()
plt.show()

### Check whether test segments stay within track boundaries

In [None]:
# Reload the stored test ids, replacing any `test_ids` currently in memory.
test_ids_path = "/home/oaraz/nextcore/fingerprinting/neural-audio-fp/eval/test_ids_icassp2021.npy"
test_ids = np.load(file=test_ids_path)

# Per-track boundaries, indexed below as columns 0 and 1.
boundaries_path = "/home/oaraz/nextcore/fingerprinting/neural-audio-fp/logs/emb/640_lamb/101/db-track_boundaries.npy"
boundaries = np.load(file=boundaries_path)

In [None]:
# Sequence lengths checked against the track boundaries below; each is added
# to a test id as an offset (unit presumably matches the ids — TODO confirm).
test_seq_len = [1, 3, 5, 9, 15, 19]

In [None]:
# Count (test id, sequence length) pairs whose last element would fall past
# the inclusive upper bound of the track containing the test id.
counter = 0
for test_id in test_ids:
    inside_track = (boundaries[:, 0] <= test_id) & (boundaries[:, 1] >= test_id)
    track_idx = np.where(inside_track)[0][0]
    _, upper_bound = boundaries[track_idx]
    for seq_len in test_seq_len:
        if test_id + seq_len - 1 > upper_bound:
            counter += 1
# Percentage of all pairs that overflow their track.
print(100*counter/(len(test_seq_len)*len(test_ids)))

## Track Duration Analysis 

In [None]:
# Collect every .wav file under fma_wav_dir (searched recursively).
# NOTE(review): `fma_wav_dir`, `time` and `wave` must already be defined/imported
# by an earlier cell; they are not set up in this one.
wav_paths = sorted(glob(fma_wav_dir + "/**/*.wav", recursive=True))
print("Number of wav files:", len(wav_paths))

t0 = time.time()
# One JSON object per line: file name, duration in seconds and sample rate.
with open("/home/oaraz/nextcore/fingerprinting/datasets/fma_wav_8k/track_information-loading.json", "w") as out_f:

    for i,wav_path in enumerate(wav_paths):

        # The context manager closes the wav file even if reading raises.
        with wave.open(wav_path, 'r') as pt_wav:
            fs = pt_wav.getframerate()
            n_samples = pt_wav.getnframes()
            # A freshly opened reader starts at frame 0, so no seek is needed.
            x = pt_wav.readframes(n_samples)
        # Duration is derived from the number of int16 values actually read.
        # NOTE(review): this equals the frame count only for mono 16-bit files.
        x = np.frombuffer(x, dtype=np.int16)

        out_f.write(json.dumps({'file_name': os.path.basename(wav_path),
                                'duration(s)': x.shape[0]/fs,
                                'fs': fs})+"\n")

        if (i+1)%1000==0:
            # NOTE(review): t0 is reset below, so this is the time spent on the
            # last 1000 files rather than a running total.
            delta_t = time.time()-t0
            print(f"{i+1}/{len(wav_paths)} [total time: {delta_t:.2f}(s)]")
            t0 = time.time()