This is a modified version of Bojan's Adversarial Rainforest notebook: https://www.kaggle.com/tunguz/adversarial-rainforest
- fft features are extracted from both train and test at the file level instead of using tp and fp labels for train.
- The classifier trained on this data scores a ROC AUC of **0.6372492726994334**, vs **0.8668358325958252** when using tp and fp slices to extract fft features for train.
- The lower score of ~0.6 suggests the train and test distributions do not differ greatly, which seems to be more in line with the CV vs LB scores.

In [None]:
!pip install --use-feature=2020-resolver https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/xgboost-1.3.0_SNAPSHOT%2Bdda9e1e4879118738d9f9d5094246692c0f6123c-py3-none-manylinux2010_x86_64.whl

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import glob
import os
from scipy.interpolate import interp1d
from scipy import signal
import gc
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import shap

import soundfile as sf
# Librosa Libraries
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score, accuracy_score

import xgboost

In [None]:
# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting
# keeps the row order of the extracted features stable from run to run.
train_files = sorted(glob.glob('../input/rfcx-species-audio-detection/train/*.flac'))
test_files = sorted(glob.glob('../input/rfcx-species-audio-detection/test/*.flac'))
# Notebook display: file counts plus one sample path as a sanity check.
len(train_files), len(test_files), train_files[0]

In [None]:
def extract_features(fn, n_points=1000):
    """Return a fixed-length FFT magnitude profile for one audio file.

    The whole file is read with soundfile, transformed with a single FFT,
    and the magnitude of the first half of the spectrum is resampled onto
    ``n_points`` evenly spaced positions with a cubic interpolant.

    Parameters
    ----------
    fn : str
        Path of the audio file to read.
    n_points : int, optional
        Length of the returned feature vector. Defaults to 1000, the size
        the rest of the notebook was written for.

    Returns
    -------
    numpy.ndarray
        1-D array of ``n_points`` interpolated spectrum magnitudes.
    """
    # The sample rate is returned by sf.read but not needed here.
    data, _ = sf.read(fn)
    if data.ndim > 1:
        # soundfile returns (frames, channels) for multi-channel files;
        # average the channels so the FFT runs along time, not across channels.
        data = data.mean(axis=1)

    # Keep only the first half of the spectrum and take its magnitude.
    spectrum = np.abs(np.fft.fft(data)[:len(data) // 2])
    n_bins = len(spectrum)

    # Source grid and target grid share the same [0, n_bins] extent, so the
    # interpolant is only ever evaluated inside its own domain.
    source_grid = np.linspace(0, n_bins, num=n_bins, endpoint=True)
    resample = interp1d(source_grid, spectrum, kind='cubic')
    target_grid = np.linspace(0, n_bins, num=n_points, endpoint=True)

    return resample(target_grid)

In [None]:
# Extract the FFT profile of every train file on 4 joblib workers and stack
# the per-file vectors into one 2-D array (one row per file).
train_fft_features = np.stack(
    Parallel(n_jobs=4)(
        delayed(extract_features)(path) for path in tqdm(train_files)
    )
)
# Presumably here to release the temporary per-file arrays right away.
gc.collect()

train_fft_features.shape

In [None]:
# Same extraction as for train: one FFT profile per test file, computed on
# 4 joblib workers and stacked into a 2-D array (one row per file).
test_fft_features = np.stack(
    Parallel(n_jobs=4)(
        delayed(extract_features)(path) for path in tqdm(test_files)
    )
)
# Presumably here to release the temporary per-file arrays right away.
gc.collect()

test_fft_features.shape

In [None]:
# Adversarial-validation labels: 1 for every train row, 0 for every test row.
# The zero block is sized by the test set so that the label vector has exactly
# one entry per row of the train-then-test feature stack.
target = np.hstack([np.ones(train_fft_features.shape[0]), np.zeros(test_fft_features.shape[0])])

In [None]:
# Train rows first, test rows after them, matching the order of the labels.
train_test = np.vstack((train_fft_features, test_fft_features))

In [None]:
# Row permutation that is later applied to both the features and the labels.
# A dedicated seeded generator makes the permutation identical across runs
# without touching the state of the global `random` module.
index = list(range(train_test.shape[0]))
random.Random(42).shuffle(index)

In [None]:
# Reorder the rows and their labels with the same permutation so that each
# label stays attached to its feature row.
train_test = train_test[index]
target = target[index]

In [None]:
# Hold out 33% of the combined rows for scoring the train-vs-test classifier;
# the fixed random_state pins the split for a given row order.
train, test, y_train, y_test = train_test_split(
    train_test,
    target,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Wrap both splits in xgboost DMatrix objects together with their labels.
# Note that `train` and `test` are rebound: after this cell they no longer
# refer to the raw feature arrays.
train, test = (
    xgboost.DMatrix(data=train, label=y_train),
    xgboost.DMatrix(data=test, label=y_test),
)

In [None]:
%%time
# Booster settings for the train-vs-test classifier. The last two entries
# select the GPU histogram tree method and the GPU predictor.
param = dict(
    eta=0.05,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.7,
    objective='reg:logistic',
    eval_metric='auc',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
)
# Fit 600 boosting rounds on the training split.
clf = xgboost.train(param, train, num_boost_round=600)

In [None]:
# Score the held-out rows; label 1 stands for "train", so a higher score
# means the model considers the row more train-like.
preds = clf.predict(data=test)

In [None]:
# Adversarial-validation score: an AUC near 0.5 means the model cannot tell
# train rows from test rows.
roc_auc_score(y_true=y_test, y_score=preds)

In [None]:
%%time
# Ask the booster for per-feature contributions instead of plain scores.
# The result has one column per feature plus a trailing bias column.
shap_preds = clf.predict(data=test, pred_contribs=True)

In [None]:
# Load SHAP's JavaScript support into the notebook (used by its interactive
# plots; presumably not strictly required for the summary plots below).
shap.initjs()

In [None]:
# pred_contribs output carries a trailing bias column; dropping the last
# column keeps exactly the per-feature contributions for any feature count.
shap.summary_plot(shap_preds[:, :-1])

In [None]:
# Bar view of the same contributions. The trailing bias column from
# pred_contribs is dropped, which works for any feature count.
shap.summary_plot(shap_preds[:, :-1], plot_type="bar")