In [1]:
import librosa

import pandas as pd
import numpy as np
import os

from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
def mfccs_from_file(file_path, maxlen=None):
    """Load an audio file and return a fixed-width MFCC matrix.

    The signal is decoded at 22050 Hz, 20 MFCCs are computed per frame, and
    the frame axis (axis 1) is truncated or right-padded with zeros so that
    it holds exactly ``maxlen`` frames.

    Args:
        file_path: Path of the audio file to read.
        maxlen: Number of frames to keep. Any falsy value (``None``, ``0``)
            selects the default of 256.

    Returns:
        The MFCC array with 20 rows and ``maxlen`` columns, or ``None`` when
        loading or feature extraction failed (the error is printed, not
        raised).
    """
    target_sr = 22050
    try:
        # Ask the loader for the target rate directly. librosa.load resamples
        # to 22050 Hz by default, so the former post-load
        # `librosa.resample(audio, sr, 22050, ...)` branch could never run, and
        # its positional arguments are rejected by librosa releases in which
        # orig_sr/target_sr are keyword-only.
        audio, _ = librosa.load(file_path, sr=target_sr, res_type='kaiser_fast')

        if len(audio) == 0:
            raise ValueError(f"Input signal length is too small in file: {file_path}")

        mfccs = librosa.feature.mfcc(y=audio, sr=target_sr, n_mfcc=20)

        n_frames = maxlen if maxlen else 256
        n_have = mfccs.shape[1]
        if n_have > n_frames:
            mfccs = mfccs[:, :n_frames]
        elif n_have < n_frames:
            mfccs = np.pad(mfccs, ((0, 0), (0, n_frames - n_have)), mode='constant')

        return mfccs
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort a batch
        # run. Report which file failed and signal the failure with None.
        print(f"mfccs_from_file failed for {file_path}: {e}")
        return None

In [3]:
def get_mfcc(wav_file_path, offset=0, duration=30):
    """Compute the MFCC matrix for a window of an audio file.

    Args:
        wav_file_path: Path of the audio file to read.
        offset: Start of the window in seconds, forwarded to
            ``librosa.load``. Defaults to 0, the previously hard-coded value.
        duration: Length of the window in seconds, forwarded to
            ``librosa.load``. Defaults to 30, the previously hard-coded value.

    Returns:
        ``numpy.ndarray`` holding the output of ``librosa.feature.mfcc`` with
        its default settings, computed at the sample rate returned by
        ``librosa.load``.
    """
    y, sr = librosa.load(wav_file_path, offset=offset, duration=duration)
    return np.array(librosa.feature.mfcc(y=y, sr=sr))

def get_melspectrogram(wav_file_path, offset=0, duration=30):
    """Compute the mel spectrogram for a window of an audio file.

    Args:
        wav_file_path: Path of the audio file to read.
        offset: Start of the window in seconds, forwarded to
            ``librosa.load``. Defaults to 0, the previously hard-coded value.
        duration: Length of the window in seconds, forwarded to
            ``librosa.load``. Defaults to 30, the previously hard-coded value.

    Returns:
        ``numpy.ndarray`` holding the output of
        ``librosa.feature.melspectrogram`` with its default settings, computed
        at the sample rate returned by ``librosa.load``.
    """
    y, sr = librosa.load(wav_file_path, offset=offset, duration=duration)
    return np.array(librosa.feature.melspectrogram(y=y, sr=sr))

def get_chroma_vector(wav_file_path):
    """Compute the STFT chromagram of an audio file.

    The whole file is decoded: no offset or duration is passed to
    ``librosa.load``. The chromagram comes from
    ``librosa.feature.chroma_stft`` with its default settings and is returned
    as a ``numpy.ndarray``.
    """
    signal, rate = librosa.load(wav_file_path)
    chromagram = librosa.feature.chroma_stft(y=signal, sr=rate)
    return np.array(chromagram)

def get_tonnetz(wav_file_path):
    """Compute the tonnetz (tonal centroid) features of an audio file.

    The whole file is decoded: no offset or duration is passed to
    ``librosa.load``. The features come from ``librosa.feature.tonnetz`` with
    its default settings and are returned as a ``numpy.ndarray``.
    """
    signal, rate = librosa.load(wav_file_path)
    tonal_centroids = librosa.feature.tonnetz(y=signal, sr=rate)
    return np.array(tonal_centroids)

def _mean_min_max(matrix):
    """Concatenate the per-row mean, minimum and maximum of a 2-D array."""
    return np.concatenate(
        (matrix.mean(axis=1), matrix.min(axis=1), matrix.max(axis=1))
    )


def get_feature(file_path):
    """Build one flat feature vector describing an audio file.

    Four feature matrices are extracted (MFCC, mel spectrogram, chromagram,
    tonnetz). Each is reduced along axis 1 to its mean, minimum and maximum,
    and those three vectors are concatenated per feature type.

    Args:
        file_path: Path of the audio file to describe.

    Returns:
        1-D ``numpy.ndarray`` laid out as
        ``[chroma | melspectrogram | mfcc | tonnetz]``, where each segment is
        ``[mean, min, max]`` of the corresponding matrix.
    """
    # Extraction order is kept as mfcc -> mel -> chroma -> tonnetz; only the
    # final concatenation uses the chroma-first layout.
    # NOTE(review): each get_* helper in this notebook calls librosa.load on
    # its own, so the file is decoded four times per call.
    mfcc_feature = _mean_min_max(get_mfcc(file_path))
    melspectrogram_feature = _mean_min_max(get_melspectrogram(file_path))
    chroma_feature = _mean_min_max(get_chroma_vector(file_path))
    tntz_feature = _mean_min_max(get_tonnetz(file_path))

    return np.concatenate(
        (chroma_feature, melspectrogram_feature, mfcc_feature, tntz_feature)
    )

In [25]:
# Read the training table from its parquet file.
TRAIN_TABLE_PATH = 'crowd_train_without_duplicates.pqt'
df = pd.read_parquet(TRAIN_TABLE_PATH)

In [26]:
# Number of distinct source_id values in the table.
df['source_id'].nunique()

1806

In [29]:
tqdm.pandas()

# Directory that the relative paths stored in `audio_path` are resolved against.
AUDIO_ROOT = 'crowd_train'


def _under_audio_root(rel_path):
    """Join ``rel_path`` onto AUDIO_ROOT unless it already starts with it.

    The guard makes this cell idempotent: re-running it no longer produces
    'crowd_train/crowd_train/...'.
    NOTE(review): assumes no raw path legitimately begins with 'crowd_train' +
    os.sep - confirm against the dataset.
    """
    if rel_path.startswith(AUDIO_ROOT + os.sep):
        return rel_path
    return os.path.join(AUDIO_ROOT, rel_path)


# Bracket assignment always targets the column; attribute assignment only does
# so when the column already exists.
df['audio_path'] = df['audio_path'].progress_apply(_under_audio_root)

  0%|          | 0/124569 [00:00<?, ?it/s]

In [30]:
# Run get_feature on every audio path (with a progress bar) and store the
# resulting vectors in a new column.
audio_feature_column = df['audio_path'].progress_apply(get_feature)
df['audio_feature'] = audio_feature_column

  0%|          | 0/124569 [00:00<?, ?it/s]

  return pitch_tuning(


In [31]:
# Save as pickle so that the np.array cells do not turn into strings.
EMBEDDED_TABLE_PATH = 'crowd_train_all_data_embedded.pkl'
df.to_pickle(EMBEDDED_TABLE_PATH)