<img src="https://developer.nvidia.com/sites/default/files/pictures/2018/rapids/rapids-logo.png"/>

[Rapids](https://rapids.ai) is an open-source, GPU-accelerated Data Science and Machine Learning library, developed and maintained by [Nvidia](https://www.nvidia.com). It is designed to be compatible with many existing CPU tools, such as Pandas, scikit-learn, numpy, etc. It enables **massive** acceleration of many data-science and machine learning tasks, oftentimes by a factor of 100X, or even more.

Rapids is still undergoing development, and this notebook is the first use of RAPIDS in the Kaggle Docker environment. If you are interested in installing and running Rapids locally on your own machine, then you should [refer to the following instructions](https://rapids.ai/start.html).

This notebook is based on the following Giba notebook: https://www.kaggle.com/titericz/0-309-baseline-logisticregression-using-fft/output

In [None]:
import cudf
import cuml
import cupy as cp
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
from scipy.interpolate import interp1d
import gc
from cuml.linear_model import LogisticRegression
from cuml.neighbors import KNeighborsClassifier
from cuml.svm import SVC
from cuml.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold

import soundfile as sf
# Librosa Libraries
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from sklearn.metrics import roc_auc_score, label_ranking_average_precision_score

In [None]:
# Collect the audio files of both splits. glob.glob returns its matches in
# arbitrary order, so the lists are sorted to keep every downstream row order
# (feature matrices, submission rows) reproducible between runs.
trainfiles = sorted(glob.glob('../input/rfcx-species-audio-detection/train/*.flac'))
testfiles = sorted(glob.glob('../input/rfcx-species-audio-detection/test/*.flac'))
# Cell output: both file counts plus one sample path. The sample is None when
# no training file was found, instead of failing with an IndexError.
len(trainfiles), len(testfiles), (trainfiles[0] if trainfiles else None)

In [None]:
# Load the two annotation tables: train_tp.csv into `traint` and train_fp.csv
# into `trainf`.
traint = cudf.read_csv('../input/rfcx-species-audio-detection/train_tp.csv')
trainf = cudf.read_csv('../input/rfcx-species-audio-detection/train_fp.csv')

# Both tables get the same derived columns: the extent of each annotation
# along t (t_max - t_min) and along f (f_max - f_min).
for _table in (traint, trainf):
    _table['t_dif'] = _table['t_max'] - _table['t_min']
    _table['f_dif'] = _table['f_max'] - _table['f_min']

# Cell output: shapes of the two tables.
traint.shape, trainf.shape

In [None]:
# Preview the first rows of the train_tp annotation table.
traint.head()


In [None]:
# Preview the first rows of the train_fp annotation table.
trainf.head()

In [None]:
# List the distinct f_dif values (f_max - f_min) present in the train_fp table.
trainf.f_dif.unique()

In [None]:
# Load one training clip, print its sample array shape and sample rate, and
# plot the waveform as a quick sanity check.
data, samplerate = sf.read(trainfiles[0])
print( data.shape, samplerate )
# Newer librosa releases dropped librosa.display.waveplot in favour of
# waveshow; use whichever one the installed version provides so this cell runs
# on both old and new environments.
_plot_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
_plot_wave(y = data, sr = samplerate, color = "#B14D")

In [None]:
# Summary statistics of the train_tp annotation table.
traint.describe()

In [None]:
# Summary statistics of the train_fp annotation table.
trainf.describe()

In [None]:
def extract_fft(fn, n_bins=1000):
    """Summarise one audio file as a coarse FFT magnitude spectrum.

    The file is read with ``soundfile``, moved to the GPU with CuPy, and
    transformed with ``cp.fft.fft``. The magnitudes of the first half of the
    spectrum are then averaged in ``n_bins`` consecutive, equal-width groups.

    Args:
        fn: Path of the audio file to read.
        n_bins: Number of averaged groups to return. With the default of
            1000, a clip of 2,880,000 samples yields groups of 1440
            coefficients, i.e. exactly the previously hard-coded
            ``(1000, 1440)`` layout.

    Returns:
        A NumPy array of length ``n_bins`` with the mean magnitude per group.

    Raises:
        ValueError: If the half spectrum holds fewer than ``n_bins``
            coefficients, so that not every group could receive one.
    """
    data, _ = sf.read(fn)
    data = cp.array(data)

    half = len(data) // 2
    varfft = cp.abs( cp.fft.fft(data)[:half] )

    # Group width is derived from the clip instead of being fixed at 1440, so
    # clips of other lengths no longer fail in reshape.
    width = half // n_bins
    if width == 0:
        raise ValueError(
            'audio file %r is too short: %d spectrum values for %d bins'
            % (fn, half, n_bins)
        )

    # Drop the trailing coefficients that do not fill a whole group (there are
    # none when `half` is a multiple of `n_bins`).
    varfft = varfft[:n_bins * width]

    return cp.asnumpy( varfft.reshape( (n_bins, width) ).mean(axis=1) )

In [None]:
# FFT feature matrix for the train_tp annotations: one extract_fft row per
# entry of traint.recording_id, in table order.
FT = np.stack([
    extract_fft('../input/rfcx-species-audio-detection/train/' + rec_id + '.flac')
    for rec_id in tqdm(traint.recording_id.to_array())
])
gc.collect()

# Cell output: shape of the feature matrix.
FT.shape

In [None]:
# FFT feature matrix for the train_fp annotations: one extract_fft row per
# entry of trainf.recording_id, in table order.
# Timing reported by the author: about 7 min with cupy (GPU) versus about
# 40 min with numpy (CPU).
FF = np.stack([
    extract_fft('../input/rfcx-species-audio-detection/train/' + rec_id + '.flac')
    for rec_id in tqdm(trainf.recording_id.to_array())
])
gc.collect()

# Cell output: shape of the feature matrix.
FF.shape

In [None]:
# Combine True Positives and False Positives: the FT rows come first,
# followed by the FF rows.
TRAIN = np.concatenate([FT, FF], axis=0)

# The two source matrices are not needed once TRAIN exists; release them.
del FT
del FF
gc.collect()

# Cell output: shape of the combined training matrix.
TRAIN.shape

In [None]:
# FFT feature matrix for the test clips: one extract_fft row per path in
# testfiles, in list order.
TEST = np.stack([extract_fft(path) for path in tqdm(testfiles)])
gc.collect()

# Cell output: shape of the test feature matrix.
TEST.shape

In [None]:
# Show the shape of the test feature matrix again.
TEST.shape

In [None]:
# Label table aligned with TRAIN: train_tp rows first, then train_fp rows.
label_cols = ['recording_id', 'species_id']
tt = traint[label_cols].copy()
tf = trainf[label_cols].copy()

# train_fp rows get species_id -1, which matches none of the ids 0..23
# checked below, so every indicator column stays 0 for them.
tf['species_id'] = -1

TRAIN_TAB = cudf.concat([tt, tf])

# One 0/1 indicator column per species id: s0 .. s23.
for species in range(24):
    flag_col = 's' + str(species)
    TRAIN_TAB[flag_col] = 0
    TRAIN_TAB.loc[TRAIN_TAB.species_id == species, flag_col] = 1

# Cell output: shape of the label table.
TRAIN_TAB.shape

In [None]:
# Preview the first rows of the label table with its per-species indicator columns.
TRAIN_TAB.head()


In [None]:
%%time
# Train a weighted blend of four cuML classifiers for each of the 24 species
# indicator columns, using grouped cross-validation. For every species this
# produces out-of-fold predictions on TRAIN (stored in TRAIN_TAB as y<k>) and
# fold-averaged predictions on TEST (stored in `sub` as s<k>).
n_folds = 8
sub = cudf.DataFrame({'recording_id': [f.split('/')[-1].split('.')[0] for f in testfiles] })
gkf = GroupKFold(n_folds)

# float32 copies of the feature matrices, built once instead of re-casting the
# same arrays for every predict_proba call inside the loops.
TRAIN32 = TRAIN.astype(np.float32)
TEST32 = TEST.astype(np.float32)

# Blend weights, in order: LogisticRegression, SVC, KNeighborsClassifier,
# RandomForestClassifier.
blend_weights = (0.2, 0.2, 0.2, 0.4)


def _blend_proba(models, weights, X):
    """Return the weighted sum of the models' class-1 probabilities on X."""
    blended = None
    for model, weight in zip(models, weights):
        part = weight * model.predict_proba(X)[:, 1]
        blended = part if blended is None else blended + part
    return blended


# Folds are grouped by recording_id, so rows of one recording never end up on
# both sides of a train/validation split.
groups = TRAIN_TAB['recording_id'].to_array()
for tgt in tqdm(range(24)):
    target = TRAIN_TAB['s'+str(tgt)].values

    ytrain = np.zeros(TRAIN.shape[0])
    ytest = np.zeros(TEST.shape[0])
    for ind_train, ind_valid in gkf.split( TRAIN, target, groups ):
        y_fold = target[ind_train]

        model1 = LogisticRegression( C=1, max_iter=10000 )
        model1.fit( TRAIN[ind_train], y_fold )

        model2 = SVC(probability=True, kernel='rbf', gamma='auto')
        model2.fit( TRAIN[ind_train], y_fold )

        model3 = KNeighborsClassifier(n_neighbors=30)
        model3.fit( TRAIN[ind_train], y_fold )

        # As before, only the random forest is fitted on the float32 features;
        # the other three models are fitted on TRAIN as stored.
        model4 = RandomForestClassifier(n_estimators=500, max_depth=13)
        model4.fit( TRAIN32[ind_train], y_fold )

        fold_models = (model1, model2, model3, model4)

        # Out-of-fold prediction for the rows held out in this fold.
        ytrain[ind_valid] = _blend_proba(fold_models, blend_weights, TRAIN32[ind_valid])
        # Each fold adds 1/n_folds of its test prediction, so ytest ends up as
        # the mean over folds. (The original line ended in `n_folds.`, which
        # is a syntax error.)
        ytest += _blend_proba(fold_models, blend_weights, TEST32) / n_folds

    print( 'Target AUC', tgt, roc_auc_score(target.get(), ytrain) )

    TRAIN_TAB['y'+str(tgt)] = ytrain
    sub['s'+str(tgt)] = ytest

In [None]:
# Preview the first rows of the submission table.
sub.head()

In [None]:
# Write the submission table to submission.csv without the index column.
sub.to_csv('submission.csv', index=False)