In [1]:
%reload_ext autoreload
%autoreload 2

import os
# Must be set before any GPU-aware library is imported; "-1" is the conventional
# value for exposing no CUDA devices (CPU-only run) -- NOTE(review): confirm intent.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import sys
# Put the parent of the current working directory on the import path,
# presumably so that the `model` package imported below can be resolved.
code_dir = os.path.dirname(os.getcwd())
sys.path.append(code_dir)

from glob import glob
import json
import glob
import wave
import math
from itertools import permutations

import numpy as np
from scipy import stats, signal
import pandas as pd

import essentia.standard as es

from model.utils.audio_utils import load_audio, background_mix, max_normalize, ir_aug

import IPython.display as ipd
from model.utils.audio_utils import load_audio, get_fns_seg_dict

from model.dataset import Dataset

[   INFO   ] MusicExtractorSVM: no classifier models were configured by default


## FMA Tests

In [4]:
# Naive first load of the FMA track metadata: pandas uses only the first CSV line as the header.
tracks_csv_path = "/home/oaraz/nextcore/fingerprinting/datasets/neural-audio-fp-dataset/extras/fma_info/tracks.csv"
df_full = pd.read_csv(tracks_csv_path, low_memory=False)
# Report the table size, then preview the first rows.
print(df_full.shape)
df_full.head()

(106576, 53)


Unnamed: 0.1,Unnamed: 0,album,album.1,album.2,album.3,album.4,album.5,album.6,album.7,album.8,...,track.10,track.11,track.12,track.13,track.14,track.15,track.16,track.17,track.18,track.19
0,,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
1,track_id,,,,,,,,,,...,,,,,,,,,,
2,2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
4,5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World


In [5]:
# Column labels produced by the naive load (group names with .1, .2, ... suffixes).
df_full.columns

Index(['Unnamed: 0', 'album', 'album.1', 'album.2', 'album.3', 'album.4',
       'album.5', 'album.6', 'album.7', 'album.8', 'album.9', 'album.10',
       'album.11', 'album.12', 'artist', 'artist.1', 'artist.2', 'artist.3',
       'artist.4', 'artist.5', 'artist.6', 'artist.7', 'artist.8', 'artist.9',
       'artist.10', 'artist.11', 'artist.12', 'artist.13', 'artist.14',
       'artist.15', 'artist.16', 'set', 'set.1', 'track', 'track.1', 'track.2',
       'track.3', 'track.4', 'track.5', 'track.6', 'track.7', 'track.8',
       'track.9', 'track.10', 'track.11', 'track.12', 'track.13', 'track.14',
       'track.15', 'track.16', 'track.17', 'track.18', 'track.19'],
      dtype='object')

In [6]:
# First row of the naive load; it holds the second-level field names rather than track values.
df_full.iloc[0]

Unnamed: 0                  NaN
album                  comments
album.1            date_created
album.2           date_released
album.3                engineer
album.4               favorites
album.5                      id
album.6             information
album.7                 listens
album.8                producer
album.9                    tags
album.10                  title
album.11                 tracks
album.12                   type
artist        active_year_begin
artist.1        active_year_end
artist.2      associated_labels
artist.3                    bio
artist.4               comments
artist.5           date_created
artist.6              favorites
artist.7                     id
artist.8               latitude
artist.9               location
artist.10             longitude
artist.11               members
artist.12                  name
artist.13      related_projects
artist.14                  tags
artist.15               website
artist.16        wikipedia_page
set     

In [7]:
# Distinct values of the 'set' column (the leftover header label 'split' and NaN show up too).
df_full['set'].unique()

array(['split', nan, 'training', 'validation', 'test'], dtype=object)

In [8]:
# Shape of the sub-table belonging to each official split.
for split_name in ("training", "validation", "test"):
    print(df_full[df_full['set'] == split_name].shape)

(84353, 53)
(10958, 53)
(11263, 53)


In [9]:
# Distinct values of the 'set.1' column (the leftover header label 'subset' and NaN show up too).
df_full['set.1'].unique()

array(['subset', nan, 'small', 'medium', 'large'], dtype=object)

In [10]:
# Shape of the sub-table belonging to each subset label.
for subset_name in ("small", "medium", "large"):
    print(df_full[df_full['set.1'] == subset_name].shape)

(8000, 53)
(17000, 53)
(81574, 53)


In [11]:
# Row counts of the 'large', 'medium' and 'small' subsets, copied from the shapes printed earlier.
total_unique = sum((81574, 17000, 8000))
print(total_unique)

106574


### Understand NAFP Dataset
- All 8000 tracks of `small` plus 2000 tracks from `medium` are used for training.
- 500 tracks are used for validation.
- 500 tracks are used as queries.

Total tracks used: 11000

Total distinct tracks: 106574

In [17]:
# Training tracks plus the 500 validation and 500 query tracks.
total_used = sum((10000, 500, 500))
print(total_used)

11000


In [18]:
# Number of catalogued tracks left once the used ones are taken out.
total_unique - total_used

95574

In [19]:
# Gap to a 100000-track database -- presumably 93458 is the number of dummy-DB files
# actually present on disk; TODO confirm where this figure comes from.
100000-93458

6542

In [20]:
# Row labels shared by the 'small' and 'medium' selections.
# NOTE(review): both masks test the same column for different values, so no row can be in
# both and this intersection is empty by construction -- it does not verify anything about track ids.
set(df_full[df_full['set.1']=="small"].index.to_numpy()).intersection(set(df_full[df_full['set.1']=="medium"].index.to_numpy()))

set()

## Load Again Properly With known header row

In [72]:
# Reload using the second CSV line (field names) as the header, give the id column a
# proper name, and discard the leftover row labelled 0 that only carries the 'track_id' label.
df = (
    pd.read_csv(tracks_csv_path, low_memory=False, header=1)
    .rename(columns={'Unnamed: 0': 'track_id'})
    .drop(index=0)
)
print(df.shape)
df.head()

(106574, 53)


Unnamed: 0,track_id,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,...,information.1,interest,language_code,license,listens.1,lyricist,number,publisher,tags.2,title.1
1,2,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,,4656.0,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293.0,,3.0,,[],Food
2,3,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,,1470.0,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514.0,,4.0,,[],Electric Ave
3,5,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,,1933.0,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151.0,,6.0,,[],This World
4,10,0.0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4.0,6.0,,47632.0,,...,,54881.0,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135.0,,1.0,,[],Freeway
5,20,0.0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2.0,4.0,"<p> ""spiritual songs"" from Nicky Cook</p>",2710.0,,...,,978.0,en,Attribution-NonCommercial-NoDerivatives (aka M...,361.0,,3.0,,[],Spiritual Level


In [73]:
# Column labels after reloading with the field-name row as header.
df.columns

Index(['track_id', 'comments', 'date_created', 'date_released', 'engineer',
       'favorites', 'id', 'information', 'listens', 'producer', 'tags',
       'title', 'tracks', 'type', 'active_year_begin', 'active_year_end',
       'associated_labels', 'bio', 'comments.1', 'date_created.1',
       'favorites.1', 'id.1', 'latitude', 'location', 'longitude', 'members',
       'name', 'related_projects', 'tags.1', 'website', 'wikipedia_page',
       'split', 'subset', 'bit_rate', 'comments.2', 'composer',
       'date_created.2', 'date_recorded', 'duration', 'favorites.2',
       'genre_top', 'genres', 'genres_all', 'information.1', 'interest',
       'language_code', 'license', 'listens.1', 'lyricist', 'number',
       'publisher', 'tags.2', 'title.1'],
      dtype='object')

In [74]:
# Columns that are not needed for the analysis, grouped by where they sit in the CSV
# (pandas appends .1 / .2 to repeated field names, in album -> artist -> track order).
drop_cols = [
    # album-level fields
    "date_created", "date_released", "engineer", "favorites", "id",
    "producer", "tags", "listens",
    # artist-level fields
    "active_year_begin", "active_year_end", "associated_labels",
    "comments.1", "date_created.1", "favorites.1", "id.1",
    "latitude", "location", "longitude",
    "related_projects", "tags.1", "website", "wikipedia_page",
    # track-level fields
    "comments.2", "date_created.2", "date_recorded", "favorites.2",
    "information.1", "interest", "language_code", "license",
    "listens.1", "lyricist", "number", "publisher", "tags.2",
]
# Remove them from the frame in place.
df.drop(columns=drop_cols, inplace=True)

In [75]:
# Column labels that remain after the drop.
df.columns

Index(['track_id', 'comments', 'information', 'title', 'tracks', 'type', 'bio',
       'members', 'name', 'split', 'subset', 'bit_rate', 'composer',
       'duration', 'genre_top', 'genres', 'genres_all', 'title.1'],
      dtype='object')

In [68]:
# Peek at the per-track 'genres' column.
df['genres']

1                  [21]
2                  [21]
3                  [21]
4                  [10]
5             [76, 103]
              ...      
106570             [25]
106571             [25]
106572             [25]
106573             [25]
106574    [10, 12, 169]
Name: genres, Length: 106574, dtype: object

In [42]:
# Distinct top-level genre labels (NaN appears for tracks without one).
df['genre_top'].unique()

array(['Hip-Hop', 'Pop', nan, 'Rock', 'Experimental', 'Folk', 'Jazz',
       'Electronic', 'Spoken', 'International', 'Soul-RnB', 'Blues',
       'Country', 'Classical', 'Old-Time / Historic', 'Instrumental',
       'Easy Listening'], dtype=object)

In [76]:
# First track whose 'genre_top' is missing.
df[df['genre_top'].isna()].iloc[0]

track_id                                                      20
comments                                                     0.0
information            <p> "spiritual songs" from Nicky Cook</p>
title                                                      Niris
tracks                                                      13.0
type                                                       Album
bio            <p>Songs written by: Nicky Cook</p>\n<p>VOCALS...
members                                             Nicky Cook\n
name                                                  Nicky Cook
split                                                   training
subset                                                     large
bit_rate                                                256000.0
composer                                                     NaN
duration                                                   311.0
genre_top                                                    NaN
genres                   

In [39]:
# Peek at the per-track 'genres_all' column.
df['genres_all']

1                      [21]
2                      [21]
3                      [21]
4                      [10]
5         [17, 10, 76, 103]
                ...        
106570             [25, 12]
106571             [25, 12]
106572             [25, 12]
106573             [25, 12]
106574     [169, 10, 12, 9]
Name: genres_all, Length: 106574, dtype: object

In [51]:
# Rows whose 'genres' entry is missing.
df[df['genres'].isna()]

Unnamed: 0,track_id,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,...,information.1,interest,language_code,license,listens.1,lyricist,number,publisher,tags.2,title.1


In [53]:
# Rows whose 'genres_all' entry is missing.
df[df['genres_all'].isna()]

Unnamed: 0,track_id,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,...,information.1,interest,language_code,license,listens.1,lyricist,number,publisher,tags.2,title.1


In [77]:
# How many tracks labelled 'large' have no top-level genre.
large_without_top_genre = (df['subset'] == "large") & df['genre_top'].isna()
print(df[large_without_top_genre].shape)

(56976, 18)


In [84]:
# Distinct top-level genres among the tracks labelled 'medium'.
df[df['subset']=="medium"]['genre_top'].unique()

array(['Hip-Hop', 'Rock', 'Folk', 'Jazz', 'Electronic', 'Experimental',
       'Soul-RnB', 'Pop', 'Blues', 'Spoken', 'Country', 'Classical',
       'Old-Time / Historic', 'Instrumental', 'International',
       'Easy Listening'], dtype=object)

In [81]:
# Fourth track (position 3) labelled 'large' whose 'genre_top' is missing.
df[(df['subset']=="large") &  (df['genre_top'].isna())].iloc[3]

track_id                                                      46
comments                                                     0.0
information            <p> "spiritual songs" from Nicky Cook</p>
title                                                      Niris
tracks                                                      13.0
type                                                       Album
bio            <p>Songs written by: Nicky Cook</p>\n<p>VOCALS...
members                                             Nicky Cook\n
name                                                  Nicky Cook
split                                                   training
subset                                                     large
bit_rate                                                256000.0
composer                                                     NaN
duration                                                   104.0
genre_top                                                    NaN
genres                   

## Genre CSV

In [3]:
# Load the FMA genre table that sits next to tracks.csv.
genres_csv_path = "/home/oaraz/nextcore/fingerprinting/datasets/neural-audio-fp-dataset/extras/fma_info/genres.csv"
df_genres_full = pd.read_csv(genres_csv_path, low_memory=False)
# Report the table size, then preview the first rows.
print(df_genres_full.shape)
df_genres_full.head()

(163, 5)


Unnamed: 0,genre_id,#tracks,parent,title,top_level
0,1,8693,38,Avant-Garde,38
1,2,5271,0,International,2
2,3,1752,0,Blues,3
3,4,4126,0,Jazz,4
4,5,4106,0,Classical,5


## Examine the NAFP Splits, Find Missing tracks

In [None]:
import glob
import os
from itertools import combinations
import random

In [None]:
# Root of the NAFP audio and the four sub-directories examined below.
music_dir = "/home/oaraz/nextcore/fingerprinting/datasets/neural-audio-fp-dataset/music"

train_dir, val_dir, test_dummy_dir, test_query_db_dir = (
    os.path.join(music_dir, relative_dir)
    for relative_dir in (
        "train-10k-30s",
        "val-query-db-500-30s",
        "test-dummy-db-100k-full/fma_full",
        "test-query-db-500-30s/db",
    )
)

In [None]:
# Recursively collect the .wav files under each directory, sorted by path.
train_fps, val_fps, test_dummy_fps, test_query_db_fps = (
    sorted(glob.glob(os.path.join(split_dir, "**/*.wav"), recursive=True))
    for split_dir in (train_dir, val_dir, test_dummy_dir, test_query_db_dir)
)

In [None]:
# Reduce every path to its file name; sets make the overlap checks cheap.
train_fnames = {os.path.basename(fp) for fp in train_fps}
val_fnames = {os.path.basename(fp) for fp in val_fps}
test_dummy_fnames = {os.path.basename(fp) for fp in test_dummy_fps}
test_query_db_fnames = {os.path.basename(fp) for fp in test_query_db_fps}

# Number of distinct file names per split, in train / val / dummy / query order.
for split_fnames in (train_fnames, val_fnames, test_dummy_fnames, test_query_db_fnames):
    print(len(split_fnames))

In [None]:
# Number of extra tracks needed to bring the dummy database up to 100000 files.
required = 100_000 - len(test_dummy_fnames)
print(required)

In [None]:
# Size of the file-name overlap for every pair of splits.
named_splits = [
    ('train', train_fnames),
    ('val', val_fnames),
    ('test_dummy', test_dummy_fnames),
    ('test_query', test_query_db_fnames),
]
for (name_a, fnames_a), (name_b, fnames_b) in combinations(named_splits, 2):
    print(name_a, name_b, len(fnames_a & fnames_b))

In [None]:
# Candidate pool: every file name that appears in the training or validation split.
train_val = train_fnames.union(val_fnames)
print(len(train_val))

# Draw the `required` filler tracks reproducibly. A set of strings has no stable
# iteration order across interpreter runs, so the pool is sorted first, and a dedicated
# seeded generator is used so re-running the notebook yields the same selection
# (the result is written to disk further down).
SAMPLING_SEED = 0
missing_ids = random.Random(SAMPLING_SEED).sample(sorted(train_val), k=required)
print(len(missing_ids))
# Sanity checks: how many of the sampled names already exist in the dummy DB ...
print(len(set(missing_ids).intersection(test_dummy_fnames)))

# ... and in the test query DB.
print(len(set(missing_ids).intersection(test_query_db_fnames)))

['processing /mnt/mtgdb-audio/incoming/fma/audio/068/068586.mp3',
 'sox WARN rate: rate clipped 17 samples; decrease volume?',
 'sox WARN dither: dither clipped 14 samples; decrease volume?']

In [None]:
# Persist the sampled file names, one per line (no trailing newline).
missing_ids_text = "\n".join(missing_ids)
with open("/home/oaraz/nextcore/fingerprinting/datasets/neural-audio-fp-dataset-missing_6542/missing_ids.txt", "w") as o_f:
    o_f.write(missing_ids_text)

# Verify that every audio file on disk has a row in the metadata table.
# The 'track_id' column is read from a CSV column that also contained the literal label
# "track_id", so pandas most likely stores the ids as strings; comparing them with an int
# would then never match. Normalise to integers once and use a set for O(1) lookups
# instead of scanning the whole frame for every file.
known_track_ids = set(df['track_id'].astype(int))
for fnames in [train_fnames, val_fnames, test_dummy_fnames, test_query_db_fnames]:
    for fname in fnames:
        # File names are the zero-padded track id plus an extension.
        track_id = int(os.path.splitext(fname)[0])
        if track_id not in known_track_ids:
            print("wtf")

## Move files

In [None]:
# Read the sampled file names back, one per line, stripping surrounding whitespace.
# The loop variables are deliberately not called `id`, which would shadow the builtin
# for the rest of the kernel session.
with open("/home/oaraz/nextcore/fingerprinting/datasets/neural-audio-fp-dataset-missing_6542/missing_ids.txt", "r") as i_f:
    missing_ids = [line.strip() for line in i_f]
# Flag blank entries (e.g. caused by empty lines in the file).
for missing_id in missing_ids:
    if missing_id == "":
        print("wtf")

In [None]:
# Preview the first ten entries.
missing_ids[:10]

In [None]:
# Write, for every missing file, its path relative to the FMA audio root:
# <first three characters of the id>/<id>.mp3, one per line.
# The loop variable is not called `id`, to avoid shadowing the builtin in the kernel.
with open("/home/oaraz/nextcore/fingerprinting/datasets/neural-audio-fp-dataset-missing_6542/missing_ids-source_paths.txt", "w") as o_f:
    for missing_fname in missing_ids:
        track_stem = os.path.splitext(missing_fname)[0]
        # `source_path` stays a notebook-level name because the next cell displays it.
        source_path = os.path.join(track_stem[:3], f"{track_stem}.mp3") # "/mnt/mtgdb-audio/incoming/fma/audio"
        o_f.write(f"{source_path}\n")

In [None]:
# Show the last relative path built by the loop in the previous cell.
source_path

## test_ids

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the stored test ids shipped with the neural-audio-fp evaluation code.
test_ids_path = "/home/oaraz/nextcore/fingerprinting/neural-audio-fp/eval/test_ids_icassp2021.npy"
test_ids = np.load(test_ids_path)

In [None]:
# Load the per-track boundaries saved with the DB embeddings.
# Cells below index it as boundaries[:, 0] / boundaries[:, 1] and treat both ends as inclusive.
boundaries_path = "/home/oaraz/nextcore/fingerprinting/neural-audio-fp/logs/emb/640_lamb/101/db-track_boundaries.npy"
boundaries = np.load(boundaries_path)

In [None]:
# Count how many test ids fall inside each track (track indices 0..499).
counter = dict.fromkeys(range(500), 0)
lower_bounds, upper_bounds = boundaries[:, 0], boundaries[:, 1]
for test_id in test_ids:
    # Index of the first track whose [lower, upper] range (both inclusive) contains the id.
    containing_tracks = np.flatnonzero((lower_bounds <= test_id) & (upper_bounds >= test_id))
    track_idx = containing_tracks[0]
    counter[track_idx] += 1

In [None]:
# Number of tracks that received no test id at all.
sum(count == 0 for count in counter.values())

In [None]:
# Median number of test ids per track.
np.median(list(counter.values()))

In [None]:
# Largest number of test ids assigned to a single track.
max(counter.values())

In [None]:
# Bar chart: track index on the x axis, number of test ids on the y axis.
x, y = (list(column) for column in zip(*counter.items()))
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(x, y)
ax.grid()
plt.show()

In [None]:
test_seq_len = [1, 3, 5, 9, 15, 19]

# Rebuild the test ids: within every track take a start position every 11 steps, leaving
# room for the longest query length so that the segment does not run past the track end.
# NOTE(review): with an inclusive end `e`, the last start that still fits 19 items is e-18,
# but the exclusive arange stop e+1-19 leaves it out -- confirm whether this margin is intended.
longest_query = test_seq_len[-1]
test_ids = np.concatenate(
    [np.arange(start, end + 1 - longest_query, 11) for start, end in boundaries]
)
print(len(test_ids))

# Count the regenerated ids per track (track indices 0..499) and report the empty tracks.
counter = dict.fromkeys(range(500), 0)
lower_bounds, upper_bounds = boundaries[:, 0], boundaries[:, 1]
for test_id in test_ids:
    track_idx = np.flatnonzero((lower_bounds <= test_id) & (upper_bounds >= test_id))[0]
    counter[track_idx] += 1
print(sum(count == 0 for count in counter.values()))

# Bar chart of the per-track counts.
x, y = (list(column) for column in zip(*counter.items()))
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(x, y)
ax.grid()
plt.show()

In [None]:
test_seq_len = [1, 3, 5, 9, 15, 19]

# Same regeneration as with the coarser stride, but taking a start position every 5 steps.
# NOTE(review): with an inclusive end `e`, the last start that still fits 19 items is e-18,
# but the exclusive arange stop e+1-19 leaves it out -- confirm whether this margin is intended.
longest_query = test_seq_len[-1]
test_ids = np.concatenate(
    [np.arange(start, end + 1 - longest_query, 5) for start, end in boundaries]
)

print(len(test_ids))

# Count the regenerated ids per track (track indices 0..499).
counter = dict.fromkeys(range(500), 0)
lower_bounds, upper_bounds = boundaries[:, 0], boundaries[:, 1]
for test_id in test_ids:
    track_idx = np.flatnonzero((lower_bounds <= test_id) & (upper_bounds >= test_id))[0]
    counter[track_idx] += 1

# Bar chart of the per-track counts.
x, y = (list(column) for column in zip(*counter.items()))
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(x, y)
ax.grid()
plt.show()

### Check if boundaries are preserved

In [None]:
# Reload the stored test ids, replacing the regenerated ones from the cells above.
test_ids_path = "/home/oaraz/nextcore/fingerprinting/neural-audio-fp/eval/test_ids_icassp2021.npy"
test_ids = np.load(test_ids_path)

# Reload the per-track boundaries saved with the DB embeddings.
boundaries_path = "/home/oaraz/nextcore/fingerprinting/neural-audio-fp/logs/emb/640_lamb/101/db-track_boundaries.npy"
boundaries = np.load(boundaries_path)

In [None]:
# Query lengths checked against the track boundaries in the next cell.
test_seq_len = [1, 3, 5, 9, 15, 19]

In [None]:
# Percentage of (test id, query length) pairs whose segment would extend past the
# inclusive upper boundary of the track that contains the test id.
counter = 0
lower_bounds, upper_bounds = boundaries[:, 0], boundaries[:, 1]
for test_id in test_ids:
    track_idx = np.flatnonzero((lower_bounds <= test_id) & (upper_bounds >= test_id))[0]
    _, upper_bound = boundaries[track_idx]
    for seq_len in test_seq_len:
        # The last item of the segment sits at test_id + seq_len - 1.
        if test_id + seq_len - 1 > upper_bound:
            counter += 1
print(100*counter/(len(test_seq_len)*len(test_ids)))