<div>
    <h1 align="center">"Rainforest Connection Species Audio Detection"</h1>
    <h4 align="center">By: Somayyeh Gholami & Mehran Kazeminia</h4>
</div>

<div class="alert alert-success">  
</div>

**This notebook reuses code from the following kernel:**

https://www.kaggle.com/titericz/0-525-tabular-xgboost-gpu-fft-gpu-cuml-fast

A full description of the approach is available there. Thanks to Mr. [Giba](https://www.kaggle.com/titericz) and [RAPIDS](https://rapids.ai).

<div class="alert alert-success">  
</div>

In [None]:
import os
import gc
import time
import glob
import numpy as np 
import pandas as pd 
import xgboost as xgb
import lightgbm as lgb
import soundfile as sf

from tqdm.notebook import tqdm
from joblib import Parallel, delayed

from scipy.stats import rankdata
from scipy.interpolate import interp1d

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, label_ranking_average_precision_score

import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt

import cuml as cm
import cupy as cp


<div class="alert alert-success">  
</div>

In [None]:
# Collect the audio file paths. glob returns matches in arbitrary,
# filesystem-dependent order, so the lists are sorted to make the row order of
# everything derived from them (TEST features, submission rows) reproducible.
trainfiles = sorted( glob.glob( '../input/rfcx-species-audio-detection/train/*.flac' ) )
testfiles = sorted( glob.glob( '../input/rfcx-species-audio-detection/test/*.flac' ) )

len(trainfiles), len(testfiles), trainfiles[0]


In [None]:
# True-positive annotations, with the duration (t_dif) and frequency span
# (f_dif) of each annotated box appended as extra columns.
traint = pd.read_csv( '../input/rfcx-species-audio-detection/train_tp.csv' ).assign(
    t_dif=lambda df: df['t_max'] - df['t_min'],
    f_dif=lambda df: df['f_max'] - df['f_min'],
)

# False-positive annotations, extended with the same two derived columns.
trainf = pd.read_csv( '../input/rfcx-species-audio-detection/train_fp.csv' ).assign(
    t_dif=lambda df: df['t_max'] - df['t_min'],
    f_dif=lambda df: df['f_max'] - df['f_min'],
)

traint.shape, trainf.shape


In [None]:
# Preview the first rows of the true-positive annotation table.
traint.head()


In [None]:
# Preview the first rows of the false-positive annotation table.
trainf.head()


In [None]:
# Distinct frequency spans (f_max - f_min) among the false-positive boxes.
trainf.f_dif.unique()


In [None]:
# Number of distinct values per column in the true-positive table.
traint.nunique()


In [None]:
# Number of distinct values per column in the false-positive table.
trainf.nunique()


In [None]:
# Summary statistics of the numeric columns in the true-positive table.
traint.describe()


In [None]:
# Summary statistics of the numeric columns in the false-positive table.
trainf.describe()


<div class="alert alert-success">  
</div>

In [None]:
def extract_fft(fn, n_bins=1000):
    """Return a coarse magnitude spectrum of one audio file.

    The file is read with soundfile, transformed with a GPU FFT (cupy), and the
    magnitudes of the first half of the spectrum are averaged over ``n_bins``
    equal-width groups of consecutive FFT coefficients.

    Parameters
    ----------
    fn : str
        Path of the audio file to read.
    n_bins : int, default 1000
        Number of averaged bins in the returned vector.

    Returns
    -------
    numpy.ndarray
        Host (CPU) array of shape ``(n_bins,)``.

    Raises
    ------
    ValueError
        If the first half of the spectrum holds fewer than ``n_bins``
        coefficients, so no bin could be formed.
    """
    # NOTE(review): assumes a single-channel recording (1-D sample array);
    # confirm before feeding multi-channel files.
    data, _ = sf.read(fn)
    data = cp.array(data)

    # Keep only the first half of the spectrum.
    varfft = cp.abs( cp.fft.fft(data)[:(len(data)//2)] )

    # Coefficients per bin. For a 2,880,000-sample recording and n_bins=1000
    # this is 1440, i.e. the (1000, 1440) layout that used to be hard-coded.
    bin_width = len(varfft) // n_bins
    if bin_width == 0:
        raise ValueError(
            'recording %r is too short: %d spectrum coefficients for %d bins'
            % (fn, len(varfft), n_bins)
        )

    # Drop the trailing remainder so the spectrum splits evenly into bins.
    varfft = varfft[:n_bins * bin_width]

    return cp.asnumpy( varfft.reshape( (n_bins, bin_width) ).mean(axis=1) )


In [None]:
# Spectral features for every row of the true-positive table, in row order.
# recording_id can repeat across rows (it is the cross-validation group key
# further down), so each file is read and transformed once and then reused.
fft_by_recording = {}
FT = []
for fn in tqdm(traint.recording_id.values):
    if fn not in fft_by_recording:
        fft_by_recording[fn] = extract_fft( '../input/rfcx-species-audio-detection/train/'+fn+'.flac' )
    FT.append( fft_by_recording[fn] )
FT = np.stack(FT)  # copies the vectors, so the cache can be dropped
del fft_by_recording
gc.collect()

FT.shape


In [None]:
# Spectral features for every row of the false-positive table, in row order.
# recording_id can repeat across rows (it is the cross-validation group key
# further down), so each file is read and transformed once and then reused.
fft_by_recording = {}
FF = []
for fn in tqdm(trainf.recording_id.values):
    if fn not in fft_by_recording:
        fft_by_recording[fn] = extract_fft( '../input/rfcx-species-audio-detection/train/'+fn+'.flac' )
    FF.append( fft_by_recording[fn] )
FF = np.stack(FF)  # copies the vectors, so the cache can be dropped
del fft_by_recording
gc.collect()

FF.shape


In [None]:
# Build the training matrix: true-positive rows first, false-positive rows after,
# matching the row order used when the label table is assembled.
TRAIN = np.concatenate( (FT, FF), axis=0 )

# The per-table feature arrays are redundant once merged; release them.
del FT
del FF
gc.collect()
TRAIN.shape


In [None]:
# Spectral features of every test recording, one row per entry of testfiles.
TEST = np.stack( [extract_fft(fn) for fn in tqdm(testfiles)] )
gc.collect()

TEST.shape


In [None]:
# Label table aligned row-for-row with TRAIN: true-positive rows keep their
# species_id, false-positive rows are marked with species_id = -1.
tt = traint[['recording_id','species_id']].copy()
tf = trainf[['recording_id']].assign(species_id=-1)

TRAIN_TAB = pd.concat( (tt, tf) )

# One binary target column per species: s<i> is 1 only on rows labelled with
# species i, so every false-positive row (species_id == -1) is 0 everywhere.
species_ids = TRAIN_TAB['species_id'].to_numpy()
for i in range(24):
    TRAIN_TAB[f's{i}'] = (species_ids == i).astype('int64')

TRAIN_TAB.shape


In [None]:
# Preview the label table with its per-species binary target columns.
TRAIN_TAB.head()


In [None]:
# Standardise every feature column. The scaler statistics are estimated on the
# train and test rows together, then applied to each matrix separately.
# Note: TRAIN and TEST are overwritten, so re-running this cell rescales
# already-scaled data.
std = StandardScaler().fit( np.concatenate((TRAIN, TEST), axis=0) )

TRAIN, TEST = std.transform(TRAIN), std.transform(TEST)
gc.collect()


<div class="alert alert-success">  
</div>

In [None]:
# Blend weight of each model. A model whose weight is 0 is skipped entirely
# (neither fitted nor used for prediction), which leaves the blend unchanged.
BLEND_WEIGHTS = {'xgb': 0.85, 'logreg': 0.15, 'svc': 0.00, 'knn': 0.00}
N_SPLITS = 5


def build_models(pos_weight):
    """Return ``{name: unfitted model}`` for every model with a non-zero blend weight.

    ``pos_weight`` is forwarded to XGBoost as ``scale_pos_weight``.
    """
    # NOTE(review): tree_method='gpu_hist' / predictor='gpu_predictor' are the
    # spellings used by the xgboost release this notebook was written for;
    # confirm against the installed version.
    factories = {
        'xgb': lambda: xgb.XGBClassifier(n_estimators=1000,
                                         max_depth=4,
                                         learning_rate=0.09,
                                         verbosity=0,
                                         objective='binary:logistic',
                                         subsample=0.95,
                                         colsample_bytree=0.95,
                                         random_state=2021,
                                         tree_method='gpu_hist',
                                         predictor='gpu_predictor',
                                         n_jobs=2,
                                         scale_pos_weight=pos_weight,
                                        ),
        'logreg': lambda: cm.linear_model.LogisticRegression( C=1, max_iter=5000 ),
        'svc': lambda: cm.svm.SVC(C=1.0, class_weight='balanced', probability=True, kernel='rbf', gamma='auto'),
        'knn': lambda: cm.neighbors.KNeighborsClassifier(n_neighbors=55),
    }
    return {name: make() for name, make in factories.items() if BLEND_WEIGHTS[name] > 0}


def rank_valid_and_test(valid_pred, test_pred):
    """Rank validation and test predictions jointly and scale the ranks to (0, 1].

    Both outputs come from a single ranking of the raw predictions. Ranking the
    test predictions against validation values that were already converted to
    ranks would mix two different scales and distort the test ranks.
    """
    n_valid = len(valid_pred)
    joint = rankdata( np.concatenate((valid_pred, test_pred)) ) / (n_valid + len(test_pred))
    return joint[:n_valid], joint[n_valid:]


sub = pd.DataFrame({'recording_id': [f.split('/')[-1].split('.')[0] for f in testfiles] })
gkf = GroupKFold(N_SPLITS)

SCORE = []
# Rows of the same recording always stay in the same fold.
groups = TRAIN_TAB['recording_id'].values
n_models = len(BLEND_WEIGHTS)

# One independent binary problem per species.
for tgt in range(0,24):
    starttime = time.time()
    target = TRAIN_TAB['s'+str(tgt)].values
    # Negative/positive ratio over the whole table (not per fold).
    pos_weight = np.sum(target==0) / np.sum(target==1)

    ytrain = np.zeros(TRAIN.shape[0])   # out-of-fold blended predictions
    ytest = np.zeros(TEST.shape[0])     # fold-averaged blended test predictions

    for ind_train, ind_valid in gkf.split( TRAIN, target, groups ):
        valid_blend = np.zeros(len(ind_valid))
        test_blend = np.zeros(TEST.shape[0])

        for name, model in build_models(pos_weight).items():
            # Train using GPUs
            if name == 'xgb':
                # NOTE: early stopping monitors the same validation fold that is
                # scored below, so the reported AUC is somewhat optimistic.
                model.fit( X=TRAIN[ind_train], y=target[ind_train], eval_set=[(TRAIN[ind_valid], target[ind_valid])], eval_metric='auc', early_stopping_rounds=60, verbose=False )
            else:
                model.fit( TRAIN[ind_train], target[ind_train] )

            # Predict valid and test sets, then rank them together
            valid_rank, test_rank = rank_valid_and_test(
                model.predict_proba(TRAIN[ind_valid])[:,1],
                model.predict_proba(TEST)[:,1],
            )

            # Weighted average models
            valid_blend += BLEND_WEIGHTS[name] * valid_rank
            test_blend += BLEND_WEIGHTS[name] * test_rank

        # Dividing by the number of models is a constant rescaling kept from the
        # original blend; it does not change the ordering of the predictions.
        ytrain[ind_valid] = valid_blend / n_models
        ytest += test_blend / (n_models * N_SPLITS)

    score = roc_auc_score(target, ytrain)
    print( 'Target AUC', tgt, score, time.time()-starttime )
    SCORE.append(score)

    TRAIN_TAB['y'+str(tgt)] = ytrain
    sub['s'+str(tgt)] = ytest

print('Overall Score:', np.mean(SCORE) )


In [None]:
# Preview the submission table: recording_id plus one score column per species.
sub.head()


In [None]:
# Write the submission file to the working directory, without the row index.
sub.to_csv('submission.csv', index=False)


In [None]:
# List the working directory (presumably to confirm submission.csv was written).
!ls


<div class="alert alert-success">  
</div>