In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import gc
import time
from scipy.interpolate import interp1d
import lightgbm as lgb
import xgboost as xgb
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from scipy.stats import rankdata

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, label_ranking_average_precision_score

import soundfile as sf
# Librosa Libraries
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt

In [None]:
# glob.glob returns paths in arbitrary (filesystem-dependent) order; sorting
# makes the row order of TEST and of the submission reproducible across runs.
trainfiles = sorted(glob.glob( '../input/rfcx-species-audio-detection/train/*.flac' ))
testfiles = sorted(glob.glob( '../input/rfcx-species-audio-detection/test/*.flac' ))
len(trainfiles), len(testfiles), trainfiles[0]

In [None]:
# Load both annotation tables in one pass:
#   traint <- train_tp.csv, trainf <- train_fp.csv
traint, trainf = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/rfcx-species-audio-detection/train_tp.csv',
        '../input/rfcx-species-audio-detection/train_fp.csv',
    )
)
traint.shape, trainf.shape

In [None]:
# Preview the first rows of the table loaded from train_tp.csv.
traint.head()

In [None]:
# Preview the first rows of the table loaded from train_fp.csv.
trainf.head()

In [None]:
def extract_fft(fn, n_bins=1000):
    """Return a coarse magnitude spectrum of an audio file.

    The file is read with soundfile, the magnitude of the first half of its
    Fourier transform is taken, and the result is averaged into `n_bins`
    equal-width frequency bins.

    Parameters
    ----------
    fn : str
        Path of the audio file to read.
    n_bins : int, default 1000
        Number of output bins. With the default and a 2,880,000-sample
        recording each bin averages 1440 coefficients, which matches the
        previously hard-coded ``reshape((1000, 1440))``.

    Returns
    -------
    numpy.ndarray
        1-D array of length `n_bins` holding the mean magnitude per bin.

    Raises
    ------
    ValueError
        If `n_bins` is not positive or the recording is too short to put at
        least one coefficient in every bin.
    """
    if n_bins <= 0:
        raise ValueError('n_bins must be a positive integer')

    data, samplerate = sf.read(fn)
    data = np.asarray(data)

    # The input is real-valued, so rfft yields the same non-negative-frequency
    # coefficients as fft at roughly half the cost. rfft returns n//2 + 1
    # values; keep the first n//2 so the output matches the full-FFT slice.
    spectrum = np.abs(np.fft.rfft(data)[:len(data) // 2])

    bin_width = len(spectrum) // n_bins
    if bin_width == 0:
        raise ValueError(
            'recording %r has only %d spectrum values, fewer than n_bins=%d'
            % (fn, len(spectrum), n_bins)
        )

    # Drop the trailing coefficients that do not fill a whole bin, so that
    # recordings of any length can be reshaped instead of raising.
    usable = spectrum[:n_bins * bin_width]
    return usable.reshape((n_bins, bin_width)).mean(axis=1)

In [None]:
# One spectrum row per true-positive annotation row (traint.recording_id).
FT = np.stack([
    extract_fft(f'../input/rfcx-species-audio-detection/train/{rec_id}.flac')
    for rec_id in tqdm(traint.recording_id.values)
])
gc.collect()

FT.shape

In [None]:
# Original author's timing note: ~7 min with cupy (GPU) vs ~40 min with numpy
# (CPU), i.e. ~7x faster on GPU. The code in this notebook uses numpy.

# One spectrum row per false-positive annotation row (trainf.recording_id).
FF = np.stack([
    extract_fft(f'../input/rfcx-species-audio-detection/train/{rec_id}.flac')
    for rec_id in tqdm(trainf.recording_id.values)
])
gc.collect()

FF.shape

In [None]:
#Combine True Positives and False Positives

# Row order is true-positive features first, then false-positive features.
TRAIN = np.concatenate([FT, FF], axis=0)

# The per-table feature matrices are no longer needed once stacked.
del FT
del FF
gc.collect()
TRAIN.shape

In [None]:
# One spectrum row per test file, in the order of `testfiles`.
TEST = np.stack([extract_fft(test_path) for test_path in tqdm(testfiles)])
gc.collect()

TEST.shape

In [None]:
# Label table aligned row-for-row with TRAIN (true positives first, then
# false positives).
tt = traint[['recording_id','species_id']].copy()
tf = trainf[['recording_id','species_id']].copy()
# False-positive rows carry no positive label: species_id -1 matches none of
# the 24 target columns built below.
tf['species_id'] = -1

# ignore_index=True gives a clean 0..n-1 index; without it the two frames'
# own indices are kept and the result has duplicate index labels.
TRAIN_TAB = pd.concat( (tt, tf), ignore_index=True )

# One binary indicator column per species: s0 .. s23.
for i in range(24):
    TRAIN_TAB['s'+str(i)] = (TRAIN_TAB['species_id'] == i).astype('int64')

TRAIN_TAB.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler statistics are fitted on train and test rows together, then the
# same transform is applied to each matrix.
std = StandardScaler().fit(np.concatenate((TRAIN, TEST), axis=0))

TRAIN = std.transform(TRAIN)
TEST = std.transform(TEST)
gc.collect()

In [None]:
from sklearn.metrics import log_loss


In [None]:
def log_loss_metric(y_true, y_pred):
    """Mean binary log loss over the first 24 column pairs.

    Columns are paired by position: the k-th column of `y_true` is scored
    against the k-th column of `y_pred` (cast to float) for k = 0..23, and
    the per-column log losses are averaged.

    Parameters
    ----------
    y_true : pandas.DataFrame
        Label columns.
    y_pred : pandas.DataFrame
        Prediction columns.

    Returns
    -------
    float
        Average of the 24 per-column log losses.
    """
    pred_names = y_pred.columns
    true_names = y_true.columns
    per_target = [
        log_loss(
            y_true.loc[:, true_names[k]],
            y_pred.loc[:, pred_names[k]].astype(float),
            labels=[0, 1],
        )
        for k in range(24)
    ]
    return np.mean(per_target)

In [None]:
from sklearn.ensemble import RandomForestClassifier as RF

# Model 1: one balanced random forest per species, evaluated with grouped
# 5-fold CV so that rows sharing a recording_id stay in the same fold.
n_folds = 5
test_ids = [path.split('/')[-1].split('.')[0] for path in testfiles]
sub = pd.DataFrame({'recording_id': test_ids})
gkf = GroupKFold(n_folds)
AUC_scores = []

groups = TRAIN_TAB['recording_id'].values
for tgt in range(24):
    target = TRAIN_TAB[f's{tgt}'].values

    # Out-of-fold predictions for train rows; fold-averaged predictions for test.
    oof_pred = np.zeros(TRAIN.shape[0])
    test_pred = np.zeros(TEST.shape[0])
    for fit_idx, val_idx in gkf.split(TRAIN, target, groups):
        model = RF(
            max_depth=25,
            n_estimators=1200,
            class_weight="balanced",
            random_state=0,
            n_jobs=-1,
        )
        model.fit(TRAIN[fit_idx], target[fit_idx])

        oof_pred[val_idx] = model.predict_proba(TRAIN[val_idx])[:, 1]
        test_pred += model.predict_proba(TEST)[:, 1] / n_folds

    # Score once and reuse the value for both the running list and the log line.
    auc = roc_auc_score(target, oof_pred)
    AUC_scores.append(auc)
    print('Target AUC', tgt, auc)

    TRAIN_TAB[f'y{tgt}'] = oof_pred
    sub[f's{tgt}'] = test_pred
print(f'AUC mean:{np.mean(AUC_scores)}')


In [None]:
# Snapshot model 1's outputs before the next model overwrites `sub` and the
# y* columns of TRAIN_TAB (columns from position 26 onward).
oof1 = TRAIN_TAB.iloc[:, 26:].copy()
sub1 = sub.copy(deep=True)

In [None]:
# Columns 2..25 hold the s* labels and columns 26.. hold the y* out-of-fold
# predictions; report their mean per-target log loss.
true = TRAIN_TAB.iloc[:, 2:26].copy()
perd = TRAIN_TAB.iloc[:, 26:].copy()
oof_log_loss = log_loss_metric(true, perd)
print(f'log_loss:{oof_log_loss}')



In [None]:
from sklearn.svm import SVC

# Model 2: one balanced RBF-kernel SVC per species, using the same grouped
# 5-fold CV scheme (rows sharing a recording_id stay in the same fold).
n_folds = 5
test_ids = [path.split('/')[-1].split('.')[0] for path in testfiles]
sub = pd.DataFrame({'recording_id': test_ids})
gkf = GroupKFold(n_folds)
AUC_scores = []

groups = TRAIN_TAB['recording_id'].values
for tgt in range(24):
    target = TRAIN_TAB[f's{tgt}'].values

    # Out-of-fold predictions for train rows; fold-averaged predictions for test.
    oof_pred = np.zeros(TRAIN.shape[0])
    test_pred = np.zeros(TEST.shape[0])
    for fit_idx, val_idx in gkf.split(TRAIN, target, groups):
        model = SVC(
            C=1.0,
            class_weight='balanced',
            probability=True,
            kernel='rbf',
            gamma='auto',
        )
        model.fit(TRAIN[fit_idx], target[fit_idx])

        oof_pred[val_idx] = model.predict_proba(TRAIN[val_idx])[:, 1]
        test_pred += model.predict_proba(TEST)[:, 1] / n_folds

    # Score once and reuse the value for both the running list and the log line.
    auc = roc_auc_score(target, oof_pred)
    AUC_scores.append(auc)
    print('Target AUC', tgt, auc)

    # Overwrites the y* columns written by the previous model.
    TRAIN_TAB[f'y{tgt}'] = oof_pred
    sub[f's{tgt}'] = test_pred
print(np.mean(AUC_scores))

In [None]:
# Snapshot model 2's test predictions and out-of-fold predictions
# (the y* columns of TRAIN_TAB, from position 26 onward).
oof2 = TRAIN_TAB.iloc[:, 26:].copy()
sub2 = sub.copy(deep=True)

In [None]:
# Columns 2..25 hold the s* labels and columns 26.. hold the current y*
# out-of-fold predictions; report their mean per-target log loss.
true = TRAIN_TAB.iloc[:, 2:26].copy()
perd = TRAIN_TAB.iloc[:, 26:].copy()
oof_log_loss = log_loss_metric(true, perd)
print(f'log_loss:{oof_log_loss}')

# Model Blending Weights Optimisation


In [None]:
# `sys` is not imported anywhere else in the notebook, so import it here.
import sys

# Make the autograd package in ../input/autograd importable. The membership
# check keeps sys.path free of duplicates when the cell is re-run.
if '../input/autograd' not in sys.path:
    sys.path.append('../input/autograd')


In [None]:
# Convert the out-of-fold prediction tables to plain float64 arrays.
# np.array accepts both a DataFrame and an ndarray, so re-running this cell
# no longer fails with "'numpy.ndarray' object has no attribute 'values'".
oof1 = np.array(oof1, dtype='float64')
oof2 = np.array(oof2, dtype='float64')

In [None]:
import datetime
import pandas as pd
from time import time
import tensorflow as tf
from autograd import grad
import autograd.numpy as np
from scipy.optimize import minimize, fsolve

In [None]:
# Numpy Log Loss
def log_loss_numpy(y_pred):
    """Column-averaged binary cross-entropy of `y_pred` against global `y_true`.

    Predictions are clipped to [1e-16, 1 - 1e-16] before taking logs. The
    loss is computed per column and the column losses are averaged.

    Parameters
    ----------
    y_pred : 2-D array
        Predicted probabilities; column i is scored against ``y_true[:, i]``.

    Returns
    -------
    scalar
        Mean of the per-column log losses.
    """
    n_targets = y_pred.shape[1]
    clipped = np.clip(y_pred, 1e-16, 1 - 1e-16)
    total = 0
    for col in range(n_targets):
        labels = y_true[:, col]
        probs = clipped[:, col]
        total += -np.mean(labels * np.log(probs) + (1 - labels) * np.log(1 - probs))
    return total / n_targets

def func_numpy_metric(weights, oofs=None):
    """Penalised log loss of a weighted blend of out-of-fold predictions.

    The objective is ``log_loss_numpy(blend) + 1e-6 * (sum(weights) - 1) ** 2``
    where ``blend = sum_i weights[i] * oofs[i]``.

    Parameters
    ----------
    weights : sequence of float
        One blending weight per model.
    oofs : sequence of arrays, optional
        Out-of-fold prediction matrices to blend. Defaults to the two models
        defined in this notebook, ``(oof1, oof2)``. The previous body also
        referenced an ``oof3`` that is never defined, so every call raised
        NameError.

    Returns
    -------
    scalar
        Blend log loss plus the soft sum-to-one penalty.

    Raises
    ------
    ValueError
        If `oofs` is empty or its length differs from that of `weights`.
    """
    if oofs is None:
        oofs = (oof1, oof2)
    if len(oofs) == 0 or len(weights) != len(oofs):
        raise ValueError(
            'expected one weight per model: got %d weights for %d models'
            % (len(weights), len(oofs))
        )

    coef = 1e-6
    # Index the weights explicitly rather than iterating over them.
    oof_blend = weights[0] * oofs[0]
    for idx in range(1, len(oofs)):
        oof_blend = oof_blend + weights[idx] * oofs[idx]
    score = log_loss_numpy(oof_blend)
    # Soft constraint nudging the weights towards summing to 1.
    penalty = coef * (np.sum(weights) - 1) ** 2
    return score + penalty

In [None]:
# Label matrix read as a global by log_loss_numpy.
y_true = true.to_numpy()

In [None]:
# Report each model's out-of-fold log loss before blending.
for model_no, model_oof in enumerate((oof1, oof2), start=1):
    print(f'Model {model_no} OOF (Numpy Log Loss):', log_loss_numpy(model_oof))
print('-' * 50)


In [None]:
def Lagrange_func(params):
    """Lagrangian of the two-model blend log loss under a sum-to-one constraint.

    Parameters
    ----------
    params : sequence of three scalars
        ``(w1, w2, lambda)``: the two blending weights and the multiplier.

    Returns
    -------
    scalar
        ``log_loss_numpy(w1 * oof1 + w2 * oof2) - lambda * (w1 + w2 - 1)``.
    """
    weight_1, weight_2, multiplier = params
    blended = weight_1 * oof1 + weight_2 * oof2
    constraint = weight_1 + weight_2 - 1
    return log_loss_numpy(blended) - multiplier * constraint

In [None]:
# autograd's `grad` wraps Lagrange_func into a function that returns the
# gradient with respect to its `params` argument.
grad_L = grad(Lagrange_func)


In [None]:
def Lagrange_obj(params):
    """Residuals of the stationarity system handed to the root finder.

    Parameters
    ----------
    params : sequence of three scalars
        ``(w1, w2, lambda)``.

    Returns
    -------
    list
        ``[dL/dw1, dL/dw2, w1 + w2 - 1]``. The third gradient component from
        ``grad_L`` is discarded; the sum-to-one constraint is written out
        directly in its place.
    """
    weight_1, weight_2, _ = params
    grad_w1, grad_w2, _ = grad_L(params)
    return [grad_w1, grad_w2, weight_1 + weight_2 - 1]

In [None]:
# Solve the stationarity system for (w1, w2, lambda), starting from equal weights.
start_time = time()
initial_guess = [0.5,0.5,0.1]
w1, w2, _lambda = fsolve(Lagrange_obj, initial_guess)
elapsed = datetime.timedelta(seconds = time() - start_time)
# Characters 2..6 of the timedelta string are the MM:SS part.
print(f'[{str(elapsed)[2:7]}] Optimised Weights:', [w1, w2])
oof_b = w1 * oof1 + w2 * oof2
print('Optimised Blend OOF:', log_loss_numpy(oof_b))

In [None]:
# Check that the solver satisfied the sum-to-one constraint. The deviation is
# compared in absolute value: without abs() any sum below 1 (e.g. 0.5) would
# pass the check.
print('Check Condition (1a):', w1 + w2 )
if abs(w1 + w2 - 1) <= 1e-10:
    print('Great! The sum of all weights equals to 1!')
else:
    print('Manual adjustion is needed to modify the weights.')

In [None]:
# Blend the two models' test predictions with the optimised weights.
sub = sub1.copy()
# Every column except the first (recording_id) is a prediction column. The
# name is now `tar_cols` throughout; the original defined `tar_col` but used
# `tar_cols`, which raised NameError.
tar_cols = sub.columns[1:]
sub.loc[:, tar_cols] = sub1.loc[:, tar_cols] * w1 + sub2.loc[:, tar_cols] * w2

In [None]:
# Preview the first rows of the blended submission table.
sub.head()

In [None]:
# Write the submission to submission.csv in the working directory, without the index column.
sub.to_csv('submission.csv', index=False)

In [None]:
# List the working directory (shell command run through IPython's `!`).
!ls