In [42]:
import pandas as pd
import os
import ast

In [44]:
import numpy as np
from multiprocessing import Pool

In [46]:
import warnings

In [48]:
from scipy import stats

In [50]:
from tqdm import tqdm
import asyncio

In [52]:
import librosa

In [54]:
def load(filepath):

    filename = os.path.basename(filepath)

    if 'features' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=0)

    if 'tracks' in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
                   ('track', 'genres'), ('track', 'genres_all')]
        for column in COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
                   ('album', 'date_created'), ('album', 'date_released'),
                   ('artist', 'date_created'), ('artist', 'active_year_begin'),
                   ('artist', 'active_year_end')]
        for column in COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        SUBSETS = ('small', 'medium', 'large')
        try:
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                    'category', categories=SUBSETS, ordered=True)
        except (ValueError, TypeError):
            # the categories and ordered arguments were removed in pandas 0.25
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                     pd.CategoricalDtype(categories=SUBSETS, ordered=True))

        COLUMNS = [('track', 'genre_top'), ('track', 'license'),
                   ('album', 'type'), ('album', 'information'),
                   ('artist', 'bio')]
        for column in COLUMNS:
            tracks[column] = tracks[column].astype('category')

        return tracks


In [56]:
tracks = load("../../Original Data/fma_metadata/tracks.csv")

In [57]:
def get_audio_path(set_name, track_id):
    """
    Return the path to the mp3 given the directory where the audio is stored
    and the track ID.

    Examples
    --------
    >>> import utils
    >>> AUDIO_DIR = os.environ.get('AUDIO_DIR')
    >>> utils.get_audio_path(AUDIO_DIR, 2)
    '../data/fma_small/000/000002.mp3'

    """
    tid_str = '{:06d}'.format(track_id)
    print(os.path.join("fma_full", tid_str[:3], tid_str + '.mp3'))
    return os.path.join("fma_full", tid_str[:3], tid_str + '.mp3')

In [58]:
def columns():
    feature_sizes = dict(mel_spec=128)
    moments = ('mean', 'std', 'skew', 'kurtosis', 'median', 'min', 'max')

    columns = []
    for name, size in feature_sizes.items():
        for moment in moments:
            it = ((name, moment, '{:02d}'.format(i+1)) for i in range(size))
            columns.extend(it)

    names = ('feature', 'statistics', 'number')
    columns = pd.MultiIndex.from_tuples(columns, names=names)

    # More efficient to slice if indexes are sorted.
    return columns.sort_values()

In [59]:
async def compute_features(tid,set_name,zipe):

    features = pd.Series(index=columns(), dtype=np.float32, name=tid)

    # Catch warnings as exceptions (audioread leaks file descriptors).
    warnings.filterwarnings('error', module='librosa')
    
    def feature_stats(name, values):
        features[name, 'mean'] = np.mean(values, axis=1)
        features[name, 'std'] = np.std(values, axis=1)
        features[name, 'skew'] = stats.skew(values, axis=1)
        features[name, 'kurtosis'] = stats.kurtosis(values, axis=1)
        features[name, 'median'] = np.median(values, axis=1)
        features[name, 'min'] = np.min(values, axis=1)
        features[name, 'max'] = np.max(values, axis=1)

    try:
        filepath = get_audio_path(set_name,tid)
        file = await getFile(zipe,filepath)
        x, sr = librosa.load(filepath, sr=None, mono=True)  # kaiser_fast
        stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))
        assert stft.shape[0] == 1 + 2048 // 2
        assert np.ceil(len(x)/512) <= stft.shape[1] <= np.ceil(len(x)/512)+1
        del x
        
        f = librosa.feature.melspectrogram(S=stft)
        feature_stats('mel_spec', f)
        del stft
        os.remove("./"+filepath)

    except Exception as e:
        print('{}: {}'.format(tid, repr(e)))

    return features

In [60]:
async def getFile(zipe,filepath):
    return zipe.extract(filepath)

In [61]:
features = pd.DataFrame(index=tracks.index,
                            columns=columns(), dtype=np.float32)

In [62]:
async def getF():
    
    tids = tracks.iloc[94958:95000].index
    from remotezip import RemoteZip
    with RemoteZip('https://os.unil.cloud.switch.ch/fma/fma_full.zip') as zip:
        for ids in tids:
            set_id = tracks.iloc[94958:95000].loc[ids]['set','subset']
            f = await compute_features(ids,set_id,zip)
            features.loc[f.name] = f
    
        return features

    
        
    

In [63]:
mel_spec_features = await getF()

fma_full/138/138892.mp3
fma_full/138/138893.mp3
fma_full/138/138894.mp3
fma_full/138/138895.mp3
fma_full/138/138896.mp3
fma_full/138/138897.mp3
fma_full/138/138898.mp3
fma_full/138/138899.mp3
fma_full/138/138900.mp3
fma_full/138/138901.mp3
fma_full/138/138902.mp3
fma_full/138/138903.mp3
fma_full/138/138904.mp3
fma_full/138/138905.mp3
fma_full/138/138906.mp3
fma_full/138/138907.mp3
fma_full/138/138908.mp3
fma_full/138/138917.mp3
fma_full/138/138918.mp3
fma_full/138/138919.mp3
fma_full/138/138920.mp3
fma_full/138/138921.mp3
fma_full/138/138922.mp3
fma_full/138/138923.mp3
fma_full/138/138924.mp3
fma_full/138/138925.mp3
fma_full/138/138926.mp3
fma_full/138/138927.mp3
fma_full/138/138928.mp3
fma_full/138/138929.mp3
fma_full/138/138930.mp3
fma_full/138/138931.mp3
fma_full/138/138932.mp3
fma_full/138/138933.mp3
fma_full/138/138934.mp3
fma_full/138/138935.mp3
fma_full/138/138936.mp3
fma_full/138/138937.mp3
fma_full/138/138938.mp3
fma_full/138/138939.mp3
fma_full/138/138940.mp3
fma_full/138/138

In [71]:
mel_spec_features.to_csv("mel_spec_feature_all_94958_95000.csv")