This is a stand-alone notebook that generates FFT features. It is based on Giba's notebook: https://www.kaggle.com/titericz/0-309-baseline-logisticregression-using-fft

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
from scipy.interpolate import interp1d
import gc
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

import soundfile as sf
# Librosa Libraries
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score, label_ranking_average_precision_score

In [None]:
# Collect the paths of every train and test recording, then show how many
# were found together with one sample train path as a sanity check.
trainfiles, testfiles = (
    glob.glob(pattern)
    for pattern in (
        '../input/rfcx-species-audio-detection/train/*.flac',
        '../input/rfcx-species-audio-detection/test/*.flac',
    )
)
(len(trainfiles), len(testfiles), trainfiles[0])

In [None]:
# Load the two annotation tables (traint <- train_tp.csv, trainf <- train_fp.csv)
# and display their shapes.
traint, trainf = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/rfcx-species-audio-detection/train_tp.csv',
        '../input/rfcx-species-audio-detection/train_fp.csv',
    )
)
(traint.shape, trainf.shape)

In [None]:
# Preview the first five rows of the train_tp table.
traint.head(n=5)

In [None]:
# Preview the first five rows of the train_fp table.
trainf.head(n=5)

In [None]:
def extract_features(fn, n_bins=1000):
    """Return a fixed-length FFT magnitude profile for one audio file.

    The file is decoded with ``soundfile``, the magnitudes of the first
    ``len(data) // 2`` FFT bins are computed, and that spectrum is resampled
    onto ``n_bins`` evenly spaced points with a cubic interpolant, so every
    file yields a vector of the same length.

    Parameters
    ----------
    fn : str
        Path of the audio file to read.
    n_bins : int, optional
        Number of points in the returned vector. Defaults to 1000.

    Returns
    -------
    numpy.ndarray
        1-D array holding ``n_bins`` interpolated spectrum magnitudes.
    """
    data, _ = sf.read(fn)

    # soundfile returns a (frames, channels) array for multi-channel files.
    # The FFT must run along the time axis, so average the channels to mono.
    if data.ndim > 1:
        data = data.mean(axis=1)

    # The signal is real, so rfft yields the same non-negative-frequency bins
    # as the first half of a full fft without computing the mirrored half.
    half = len(data) // 2
    varfft = np.abs(np.fft.rfft(data)[:half])

    # Both grids span the same [0, half] range; only their density differs.
    x = np.linspace(0, half, num=half, endpoint=True)
    f1 = interp1d(x, varfft, kind='cubic')
    x = np.linspace(0, half, num=n_bins, endpoint=True)

    return f1(x)

In [None]:
# recording_id may repeat within traint. Decoding and transforming a file is
# the expensive step, so do it once per distinct recording and then expand
# the result back to one feature row per table row (same order as traint).
tp_rows, tp_ids = pd.factorize(traint.recording_id.values)
FT = Parallel(n_jobs=4)(
    delayed(extract_features)( '../input/rfcx-species-audio-detection/train/'+fn+'.flac' )
    for fn in tqdm(tp_ids)
)
FT = np.stack(FT)[tp_rows]
gc.collect()

FT.shape

In [None]:
# recording_id may repeat within trainf. Decoding and transforming a file is
# the expensive step, so do it once per distinct recording and then expand
# the result back to one feature row per table row (same order as trainf).
fp_rows, fp_ids = pd.factorize(trainf.recording_id.values)
FF = Parallel(n_jobs=4)(
    delayed(extract_features)( '../input/rfcx-species-audio-detection/train/'+fn+'.flac' )
    for fn in tqdm(fp_ids)
)
FF = np.stack(FF)[fp_rows]
gc.collect()

FF.shape

In [None]:
# Build the training matrix: the true-positive feature rows (FT) come first,
# followed by the false-positive feature rows (FF).
TRAIN = np.concatenate([FT, FF], axis=0)
TRAIN.shape

In [None]:
# Extract features for every test recording with four parallel workers and
# stack the per-file vectors into one matrix, one row per entry of testfiles.
TEST = np.stack(
    Parallel(n_jobs=4)(
        delayed(extract_features)(path) for path in tqdm(testfiles)
    )
)
gc.collect()

TEST.shape

In [None]:
# Label table matching the TRAIN feature matrix: the train_tp rows come first,
# then the train_fp rows, whose species_id is replaced by -1 so that they get
# no positive indicator in any of the s0..s23 columns.
tt = traint[['recording_id','species_id']].copy()
tf = trainf[['recording_id','species_id']].copy()
tf['species_id'] = -1

# ignore_index gives the combined table a unique 0..n-1 index; without it the
# labels of tt and tf overlap and a label-based lookup would return two rows.
TRAIN_TAB = pd.concat( (tt, tf), ignore_index=True )

# One 0/1 indicator column per species id.
for i in range(24):
    TRAIN_TAB['s'+str(i)] = (TRAIN_TAB.species_id == i).astype('int64')

TRAIN_TAB.head()

In [None]:
# Preview the first five rows of the combined label table.
TRAIN_TAB.head(n=5)

In [None]:
# Write both feature matrices to the working directory; np.save appends the
# ".npy" extension to each file name.
for stem, matrix in (('TRAIN', TRAIN), ('TEST', TEST)):
    np.save(stem, matrix)

In [None]:
# Label table for the rows of the TRAIN matrix.
TRAIN_TAB.to_csv('TRAIN_TAB.csv', index=False)

# The rows of TEST follow the order of `testfiles`, which comes from glob and
# is therefore arbitrary. Write the recording ids in that same order so the
# saved test features can be matched back to their recordings.
TEST_TAB = pd.DataFrame({
    'recording_id': [
        os.path.splitext(os.path.basename(path))[0] for path in testfiles
    ],
})
TEST_TAB.to_csv('TEST_TAB.csv', index=False)