In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

import soundfile as sf  
from tsfresh.feature_extraction import feature_calculators
import librosa
import pywt

from glob import glob 
from joblib import Parallel, delayed
from tqdm import tqdm_notebook

import gc

In [None]:
# List the contents of the dataset directory to see which files and folders are available.
os.listdir('../input/rfcx-species-audio-detection/')

In [None]:
# Load both annotation tables and stack them row-wise into a single training frame.
tp = pd.read_csv('../input/rfcx-species-audio-detection/train_tp.csv')
fp = pd.read_csv('../input/rfcx-species-audio-detection/train_fp.csv')
# DataFrame.append was removed in pandas 2.0. pd.concat produces the same
# result: rows of `fp` after rows of `tp`, original row indices preserved.
train = pd.concat([tp, fp])
# The two source frames are no longer needed once they are combined.
del(tp,fp)

In [None]:
# Collect the audio file paths. glob returns matches in arbitrary
# (filesystem-dependent) order, so sort them to make every run process the
# files in the same, reproducible order.
train_paths = sorted(glob('../input/rfcx-species-audio-detection/train/*'))
test_paths = sorted(glob('../input/rfcx-species-audio-detection/test/*'))

In [None]:
def audio_read(path):
    """Read an audio file with soundfile and return only its sample data.

    The sample rate that ``sf.read`` also returns is discarded.
    """
    samples, _sample_rate = sf.read(path)
    return samples

def denoise_signal_simple(x, wavelet='db4', level=1, uthresh=10):
    """Denoise a signal by hard-thresholding its wavelet detail coefficients.

    Parameters
    ----------
    x : array-like
        Signal to denoise.
    wavelet : str
        Wavelet name handed to ``pywt.wavedec`` / ``pywt.waverec``.
    level : int
        Unused. Kept only so existing calls keep working; ``pywt.wavedec`` is
        called without a level and therefore chooses the depth itself.
    uthresh : float
        Fixed threshold applied to every detail-coefficient array. Defaults
        to 10, the value that was previously hard-coded.

    Returns
    -------
    The signal reconstructed by ``pywt.waverec`` from the thresholded
    coefficients.
    """
    coeff = pywt.wavedec(x, wavelet, mode="per")
    # Leave the first (approximation) array untouched and hard-threshold every
    # remaining (detail) array with the same fixed value.
    coeff[1:] = [pywt.threshold(detail, value=uthresh, mode='hard') for detail in coeff[1:]]
    # Reconstruct the signal using the thresholded coefficients
    return pywt.waverec(coeff, wavelet, mode='per')


def feature_gen(path):
    """Compute the four-element feature vector for one audio file.

    Parameters
    ----------
    path : str
        Path of the audio file to read with ``audio_read``.

    Returns
    -------
    list
        ``[n_peaks, roll_std_p20, mfcc_18, mfcc_4]`` where

        * ``n_peaks`` is tsfresh's ``number_peaks`` (support 2) of the
          wavelet-denoised, median-centred signal,
        * ``roll_std_p20`` is the 20th percentile of the rolling
          (window 50) standard deviation of the median-centred signal,
        * ``mfcc_18`` / ``mfcc_4`` are the time-averaged MFCC rows at
          index 18 and 4.
    """
    X = audio_read(path)
    # Centre the signal on its median.
    sig = X - np.median(X, axis=0)
    den_sample_simple = denoise_signal_simple(sig)
    # Pass the signal by keyword: librosa >= 0.10 made the arguments of
    # feature.mfcc keyword-only, so the positional form raises a TypeError.
    # NOTE(review): audio_read discards the file's sample rate, so librosa's
    # default `sr` is used here - confirm this is intended.
    mfcc = librosa.feature.mfcc(y=sig)
    # Average each MFCC coefficient over time (axis 1).
    mfcc_mean = mfcc.mean(axis=1)
    rolling_std = pd.Series(sig).rolling(50).std().dropna().values
    percentile_roll50_std_20 = np.percentile(rolling_std, 20)

    return [feature_calculators.number_peaks(den_sample_simple, 2),percentile_roll50_std_20,mfcc_mean[18],mfcc_mean[4]]

In [None]:
%%time
#  28min 23s
max_len = len(train_paths)
X = pd.DataFrame(np.array([Parallel(n_jobs=4)(delayed(feature_gen)(filename) for filename in tqdm_notebook(train_paths[:max_len]))])[0])
X['recording_id'] = pd.Series(train_paths).apply(lambda x: x.split('/')[-1].split('.')[0])
train = train.merge(X,on='recording_id')
del(X,train_paths)

In [None]:
# Run a garbage-collection pass after the `del` of the feature-extraction intermediates.
gc.collect()

In [None]:
# Display the training frame after the extracted features were merged in.
train

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Build an encoder that returns a dense array. The keyword was renamed from
# `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was removed
# in 1.4, so try the new spelling first and fall back for older versions.
try:
    OHE = OneHotEncoder(sparse_output=False)
except TypeError:
    OHE = OneHotEncoder(sparse=False)
# Learn the set of species ids from the training labels (one column per id).
OHE.fit(train['species_id'].values.reshape(-1, 1))

In [None]:
# Keep the species label as the target, then strip the id and annotation
# columns so that only the extracted feature columns remain in `train`.
y = train['species_id']
non_feature_cols = ['recording_id','species_id','t_min','songtype_id','f_min','t_max','f_max']
train = train.drop(columns=non_feature_cols)

In [None]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Despite its name, `cat` is a LightGBM classifier with 1000 boosting rounds.
cat = LGBMClassifier(n_estimators=1000).fit(train, y)

# Accuracy on the very rows the model was fitted on - a fit check only,
# not an estimate of performance on unseen data.
train_predictions = cat.predict(train)
accuracy_score(y, train_predictions)

In [None]:
# Load the sample submission; its s0..s23 columns are overwritten with predictions below.
sub = pd.read_csv('../input/rfcx-species-audio-detection/sample_submission.csv')

In [None]:
# Display the submission template.
sub

In [None]:
# Number of test files found by the glob above.
len(test_paths)

In [None]:
# Extract the same four features for every test file, in 4 parallel workers.
test_features = Parallel(n_jobs=4)(delayed(feature_gen)(filename) for filename in tqdm_notebook(test_paths))
test = pd.DataFrame(np.array(test_features))

# The submission frame is filled by row position further down, but glob gives
# no guarantee that test_paths follows the submission's row order. Label each
# feature row with its recording id (file name without directory/extension)
# and reorder the rows to match `sub`, so every prediction lands on the row of
# the recording it was computed from.
# NOTE(review): assumes `sub` has a `recording_id` column - confirm.
test.index = [p.split('/')[-1].split('.')[0] for p in test_paths]
# .loc raises KeyError if a submission id has no matching test file.
test = test.loc[sub['recording_id']].reset_index(drop=True)

preds = pd.DataFrame(cat.predict(test))

In [None]:
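# Sanity check (disabled): fill the submission columns with one-hot encoded training labels to verify the assignment works.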
# sub.loc[:,'s0':'s23'] = OHE.transform(train['species_id'].values.reshape(-1, 1))[:sub.shape[0],:]

In [None]:
# One-hot encode the predicted species ids and write them into the s0..s23 columns.
predicted_species = preds.to_numpy().reshape(-1, 1)
sub.loc[:, 's0':'s23'] = OHE.transform(predicted_species)

In [None]:
# Write the filled-in submission to disk without the DataFrame index column.
sub.to_csv('submission.csv', index=False)