In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import cudf
import cuml
import cupy as cp
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
from scipy.interpolate import interp1d
import gc
from cuml.linear_model import LogisticRegression
from cuml.neighbors import KNeighborsClassifier
from cuml.svm import SVC
from sklearn.model_selection import GroupKFold

import soundfile as sf
# Librosa Libraries
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from sklearn.metrics import roc_auc_score, label_ranking_average_precision_score

In [None]:
# Collect the audio file paths. glob.glob() returns matches in arbitrary,
# filesystem-dependent order, so sort them to keep the row order of every
# array/table derived from these lists reproducible between runs.
trainfiles = sorted(glob.glob( '../input/rfcx-species-audio-detection/train/*.flac' ))
testfiles = sorted(glob.glob( '../input/rfcx-species-audio-detection/test/*.flac' ))
# Show the counts and one sample path (None when no training file was found,
# instead of failing with an IndexError).
len(trainfiles), len(testfiles), (trainfiles[0] if trainfiles else None)

In [None]:
def _read_annotations(csv_path):
    """Read an annotation CSV with cudf and add the box extents.

    Adds `t_dif` (t_max - t_min) and `f_dif` (f_max - f_min) columns and
    returns the resulting cudf DataFrame.
    """
    frame = cudf.read_csv( csv_path )
    frame['t_dif'] = frame['t_max'] - frame['t_min']
    frame['f_dif'] = frame['f_max'] - frame['f_min']
    return frame


traint = _read_annotations( '../input/rfcx-species-audio-detection/train_tp.csv' )
trainf = _read_annotations( '../input/rfcx-species-audio-detection/train_fp.csv' )

traint.shape, trainf.shape

In [None]:
# Preview the first rows of the table loaded from train_tp.csv.
traint.head()

In [None]:
# Preview the first rows of the table loaded from train_fp.csv.
trainf.head()

In [None]:
# Distinct values of f_dif (f_max - f_min) in the train_fp.csv table.
trainf.f_dif.unique()

In [None]:
# Quick sanity check: load the first training clip, report its sample array
# shape and sample rate, and draw its waveform.
sample_path = trainfiles[0]
data, samplerate = sf.read(sample_path)
print(data.shape, samplerate)
librosa.display.waveplot(y=data, sr=samplerate, color="#B50D")

In [None]:
# Summary statistics for the columns of the train_tp.csv table.
traint.describe()

In [None]:
# Summary statistics for the columns of the train_fp.csv table.
trainf.describe()

In [None]:
# Build one fixed-length (1000-value) FFT-magnitude vector per row of `traint`,
# computed only from the samples inside the annotated [t_min, t_max] window.

# Copy the needed annotation columns to host memory once, instead of converting
# whole cudf columns on every loop iteration.
tp_host = traint[['recording_id', 't_min', 't_max', 'species_id']].to_pandas()

TRAIN = []
TARGET = []
tp_rows = zip(tp_host.recording_id, tp_host.t_min, tp_host.t_max, tp_host.species_id)
for fn, tmin, tmax, species in tqdm(tp_rows, total=len(tp_host)):

    data, samplerate = sf.read( '../input/rfcx-species-audio-detection/train/'+fn+'.flac')
    # Timestamp (in seconds) of every sample in the clip.
    var_time = np.arange(0, data.shape[0]) / samplerate

    # Keep only the samples inside the annotated window. The FFT below runs in
    # NumPy, so the audio stays on the host and never visits the GPU.
    data = data[ (var_time >= tmin) & (var_time <= tmax) ]

    # Magnitude of the first half of the FFT of the windowed signal.
    varfft = np.abs( np.fft.fft(data)[:(len(data)//2)] )

    # Resample the magnitude curve to exactly 1000 points with a cubic
    # interpolant so windows of different duration give equal-length vectors.
    x = np.linspace(0, len(varfft), num=len(varfft), endpoint=True)
    f1 = interp1d(x, varfft, kind='cubic')
    x = np.linspace(0, len(varfft), num=1000, endpoint=True)
    varfft = f1(x)

    TRAIN.append( varfft )
    TARGET.append( species )

FT = np.stack(TRAIN)
TARGET = np.array(TARGET)
FT.shape, len(TARGET)

In [None]:
from joblib import Parallel, delayed

def extract_features( fn ):
    """Return a 1000-value FFT-magnitude profile of one training recording.

    `fn` is the recording id (file name without directory or extension) of a
    .flac file in the training folder. The whole clip is transformed with
    cupy's FFT, the magnitude of the first half of the result is taken, and
    that curve is resampled to 1000 points with a cubic interpolant.
    """
    samples, _ = sf.read( '../input/rfcx-species-audio-detection/train/'+fn+'.flac')

    samples_gpu = cp.array(samples)
    half = len(samples_gpu)//2
    magnitude = cp.abs( cp.fft.fft(samples_gpu)[:half] )

    n_bins = len(magnitude)
    src_axis = cp.linspace(0, n_bins, num=n_bins, endpoint=True)
    # scipy's interp1d works on host arrays, so copy both back from the GPU.
    resample = interp1d(src_axis.get(), magnitude.get(), kind='cubic')
    dst_axis = np.linspace(0, n_bins, num=1000, endpoint=True)

    return resample(dst_axis)
    
# Extract whole-clip features for every recording listed in `trainf`, using
# 4 parallel workers. `fn` must be the generator's own loop variable so that
# each job receives its own recording id.
fp_ids = trainf.recording_id.to_pandas()
FP = Parallel(n_jobs=4)(delayed(extract_features)(fn) for fn in tqdm(fp_ids, total=len(fp_ids)))
FP = np.stack(FP)
gc.collect()
FP.shape

In [None]:
def extract_features( fn ):
    """Return a 1000-value FFT-magnitude profile of the audio file at path `fn`.

    Unlike the earlier definition of the same name, which this one shadows,
    `fn` is used as a complete file path. The whole clip is transformed with
    cupy's FFT, the magnitude of the first half is kept, and the curve is
    resampled to 1000 points with a cubic interpolant.
    """
    audio, _ = sf.read(fn)
    audio_dev = cp.array(audio)

    spectrum = cp.fft.fft(audio_dev)
    mag = cp.abs( spectrum[:(len(audio_dev)//2)] )
    size = len(mag)

    # interp1d needs host arrays, hence the .get() copies off the GPU.
    grid_in = cp.linspace(0, size, num=size, endpoint=True).get()
    cubic = interp1d(grid_in, mag.get(), kind='cubic')

    grid_out = np.linspace(0, size, num=1000, endpoint=True)
    return cubic(grid_out)
    
# Extract features for every test file with 4 parallel workers and stack the
# per-file vectors into one 2-D array, one row per entry of `testfiles`.
test_jobs = (delayed(extract_features)(fn) for fn in tqdm(testfiles))
TEST = np.stack(Parallel(n_jobs=4)(test_jobs))
gc.collect()
TEST.shape

In [None]:
# Full training matrix: the rows of FT first, followed by the rows of FP.
TRAIN = np.concatenate( (FT, FP), axis=0 )
TRAIN.shape

In [None]:
# Label table: the rows of `traint` followed by the rows of `trainf`, with one
# 0/1 indicator column s0..s23 per species id.
id_and_species = ['recording_id','species_id']
tt = traint[id_and_species].copy()
tf = trainf[id_and_species].copy()

# Rows coming from `trainf` get species_id -1, so none of the 24 indicator
# columns below is ever set for them.
tf['species_id'] = -1

TRAIN_TAB = cudf.concat( (tt, tf) )

for i in range(24):
    col = 's'+str(i)
    TRAIN_TAB[col] = 0
    TRAIN_TAB.loc[TRAIN_TAB.species_id==i, col] = 1

TRAIN_TAB.shape

In [None]:
# Preview the first rows of the combined label table.
TRAIN_TAB.head()

In [None]:
%%time
def _sum_positive_proba(fitted, feats):
    """Add the class-1 probabilities of the three fitted models, in order."""
    first, second, third = fitted
    return first.predict_proba(feats)[:,1]+second.predict_proba(feats)[:,1]+third.predict_proba(feats)[:,1]


# Submission frame keyed by the recording id taken from each test file name.
test_ids = [f.split('/')[-1].split('.')[0] for f in testfiles]
sub = cudf.DataFrame({'recording_id': test_ids })
gkf = GroupKFold(5)

# Group the folds by recording id so rows of one recording stay in one fold.
groups = TRAIN_TAB['recording_id'].to_array()
for tgt in tqdm(range(24)):
    label_col = 's'+str(tgt)
    target = TRAIN_TAB[label_col].values

    ytrain = np.zeros(TRAIN.shape[0])  # out-of-fold predictions
    ytest = np.zeros(TEST.shape[0])    # test predictions averaged over folds
    for ind_train, ind_valid in gkf.split( TRAIN, target, groups ):
        fold_X = TRAIN[ind_train]
        fold_y = target[ind_train]

        fitted = (
            LogisticRegression( C=1, max_iter=5000 ),
            SVC(probability=True, kernel='rbf', gamma='auto'),
            KNeighborsClassifier(n_neighbors=30),
        )
        for clf in fitted:
            clf.fit( fold_X, fold_y )

        # Mean of the three models on the held-out rows.
        ytrain[ind_valid] = _sum_positive_proba(fitted, TRAIN[ind_valid])/3.
        # 15 = 3 models x 5 folds, so ytest ends up as the overall mean.
        ytest += _sum_positive_proba(fitted, TEST) / 15.

    print( 'Target AUC', tgt, roc_auc_score(target.get(), ytrain) )

    TRAIN_TAB['y'+str(tgt)] = ytrain
    sub[label_col] = ytest

In [None]:
# Preview the first rows of the submission table.
sub.head()

In [None]:
# Write the submission table to submission.csv without the index column.
sub.to_csv('submission.csv', index=False)