- Original notebook: https://www.kaggle.com/titericz/0-525-tabular-xgboost-gpu-fft-gpu-cuml-fast/
- Changes to the original notebook
        - Training samples more carefully selected
        - One model instead of multiple models
        - Hyperparameters fine-tuned
- This model does not use data in `TFRecord` format. No data in the `tfrecords` folder is used.
- This model does not use `t_min, f_min, t_max, f_max` to extract relevant sections of the audio.
- Because the testing data does not have `songtype_id, t_min, f_min, t_max, f_max`, this model also does not use them as plain XGBoost features.

In [None]:
import copy
import gc
import glob
import os
import time

import cupy as cp
from matplotlib import pyplot
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
import soundfile as sf
from tqdm.notebook import tqdm
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance

In [None]:
# Collect the training and test recordings; sorting makes the file order deterministic.
trainfiles = glob.glob('../input/rfcx-species-audio-detection/train/*.flac')
trainfiles.sort()
testfiles = glob.glob('../input/rfcx-species-audio-detection/test/*.flac')
testfiles.sort()
# Cell output: number of train files, number of test files, and one sample path.
(len(trainfiles), len(testfiles), trainfiles[0])

In [None]:
# Load both label tables: train_tp.csv -> traint, train_fp.csv -> trainf.
traint, trainf = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/rfcx-species-audio-detection/train_tp.csv',
        '../input/rfcx-species-audio-detection/train_fp.csv',
    )
)
# Cell output: (rows, columns) of each table.
(traint.shape, trainf.shape)

### Take a look at the True Positive species labels and False Positives species labels

- This model does not use `t_min, f_min, t_max, f_max` to extract relevant sections of the audio.
- Because the testing data does not have `songtype_id, t_min, f_min, t_max, f_max`, this model also does not use them as plain XGBoost features.

In [None]:
# Preview the first rows of the table loaded from train_tp.csv.
traint.head()

In [None]:
# Preview the first rows of the table loaded from train_fp.csv.
trainf.head()

### Fourier transform function

- The half-spectrum is reshaped to (1000, 1440) to reduce the number of features: instead of using all 1,440,000 magnitudes, every 1,440 consecutive values are averaged, which leaves 1,000 features.
- In `varfft = cp.abs( cp.fft.fft(data)[:(len(data)//2)] )` we keep only the first half of the FFT output, because for a real-valued signal the second half is just a mirror image of the first.

In [None]:
# Demo on a single training clip: load it, move it to the GPU, and build the
# same 1000 averaged FFT magnitudes that extract_fft produces.
pyplot.rcParams["figure.figsize"] = (20, 10)
data, samplerate = sf.read('../input/rfcx-species-audio-detection/train/a66546dfd.flac')
data = cp.array(data)
# Keep only the first half of the spectrum magnitudes.
varfft = cp.abs(cp.fft.fft(data)[:len(data) // 2])
# Average each run of 1440 consecutive magnitudes, then copy the result back to host memory.
reshaped = cp.asnumpy(varfft.reshape((1000, 1440)).mean(axis=1))

# Report the sizes at each stage, one per line.
print(
    f"len(cp.fft.fft(data)): {len(cp.fft.fft(data))}",
    f"len(data): {len(data)}",
    f"len(data)//2: {len(data)//2}",
    f"varfft.shape: {varfft.shape}",
    f"reshaped.shape: {reshaped.shape}",
    sep="\n",
)

Before reshaping

In [None]:
# Plot every magnitude of the half-spectrum (copied from GPU to host for matplotlib).
pyplot.plot(range(0, len(varfft)), cp.asnumpy(varfft))

After reshaping

In [None]:
# Plot the 1000 averaged magnitudes produced by the reshape + mean step.
pyplot.plot(range(0, len(reshaped)), cp.asnumpy(reshaped))

In [None]:
# Drop the arrays created for the single-file demo above.
del data, varfft, reshaped

In [None]:
def extract_fft(fn, n_bins=1000):
    """
    Summarise one audio file as `n_bins` averaged FFT magnitudes.

    The file is read with soundfile and transformed on the GPU with CuPy.
    Only the first half of the spectrum is kept, because for a real-valued
    signal the second half mirrors the first. That half is cut into `n_bins`
    equal-width groups of consecutive frequency bins, and each group is
    averaged.

    Parameters
    ----------
    fn : str
        Path of the audio file to read.
    n_bins : int, default 1000
        Number of output features. With a half-spectrum of 1,440,000 values
        and the default of 1000, each feature averages 1440 values.

    Returns
    -------
    numpy.ndarray
        Host (CPU) array with `n_bins` averaged magnitudes.

    Raises
    ------
    ValueError
        If the half-spectrum has fewer than `n_bins` values.
    """
    # The sample rate returned by sf.read is not needed for the features.
    samples, _ = sf.read(fn)
    # NOTE(review): assumes a mono (1-D) signal -- confirm for multi-channel files.
    samples = cp.array(samples)
    half = len(samples) // 2
    spectrum = cp.abs(cp.fft.fft(samples)[:half])

    bin_width = half // n_bins
    if bin_width == 0:
        raise ValueError(
            f"{fn}: half-spectrum has {half} values, fewer than n_bins={n_bins}"
        )
    # Drop the few trailing values that do not fill a whole group, so clips of
    # any length reshape cleanly instead of raising.
    usable = bin_width * n_bins
    return cp.asnumpy(spectrum[:usable].reshape((n_bins, bin_width)).mean(axis=1))

### X_train

In [None]:
# Training feature matrix: one row of FFT features per training recording,
# in the same order as trainfiles.
X_train = np.stack([extract_fft(fn) for fn in tqdm(trainfiles)])
gc.collect()

X_train.shape

### X_test

In [None]:
# Test feature matrix: one row of FFT features per test recording,
# in the same order as testfiles.
X_test = np.stack([extract_fft(fn) for fn in tqdm(testfiles)])
gc.collect()

X_test.shape

### y_train

In [None]:
# Keep only the id/species columns and tag every row with the table it came from.
tt = traint[['recording_id', 'species_id']].assign(tp_and_fp="tp")
tf = trainf[['recording_id', 'species_id']].assign(tp_and_fp="fp")
# True Positive rows come first, False Positive rows second.
y_train_all_classes = pd.concat([tt, tf])

# One 0/1 indicator column per species id (s0 ... s23): 1 where species_id matches.
# False Positive rows get a 1 here too; get_unique_tp_and_tn resets those to 0.
for i in range(24):
    y_train_all_classes[f"s{i}"] = (y_train_all_classes.species_id == i).astype("int64")

y_train_all_classes.head()

A helper function

In [None]:
def get_unique_tp_and_tn(y_train_all_classes, target_col):
    """
    Build one binary label per recording for a single species column.

    Rows kept:
      * every True Positive row -- its label is 1 when the row's species is
        the target and 0 otherwise;
      * every False Positive row recorded for the target species -- its label
        is forced to 0.
    False Positive rows for other species are dropped. When a recording has
    several kept rows, the maximum label wins.

    Parameters
    ----------
    y_train_all_classes : pandas.DataFrame
        Must contain "recording_id", "tp_and_fp" (values "tp" / "fp") and
        `target_col`.
    target_col : str
        Name of the 0/1 indicator column to extract, e.g. "s19".

    Returns
    -------
    pandas.DataFrame
        Indexed by recording_id, with the single column `target_col`.
    """
    labels = y_train_all_classes[["recording_id", "tp_and_fp", target_col]]
    keep = (labels["tp_and_fp"] == "tp") | (labels[target_col] == 1)
    # Explicit copy so the correction below neither touches the caller's frame
    # nor triggers pandas' chained-assignment warning.
    df_tp_and_tn = labels.loc[keep].copy()
    # A False Positive row marks the species as absent, so its label must be 0.
    df_tp_and_tn.loc[df_tp_and_tn["tp_and_fp"] == "fp", target_col] = 0
    # Select the label column explicitly; the string "tp_and_fp" column must not
    # take part in the aggregation.
    return df_tp_and_tn.groupby("recording_id")[[target_col]].max()

# Example: per-recording labels for column "s19", rows with label 1 listed first
get_unique_tp_and_tn(y_train_all_classes, "s19").sort_values("s19", ascending=False)

### Turn into dataframe

In [None]:
X_train = pd.DataFrame(X_train)
# Index the rows by recording id so labels can be aligned with .loc later.
# The id is the file's base name without its extension; deriving it this way
# (rather than with fixed character offsets) keeps working if the input
# directory prefix changes.
recording_id = [os.path.splitext(os.path.basename(path))[0] for path in trainfiles]
X_train = X_train.set_index(pd.Index(recording_id))

# Test rows keep the default positional index, in testfiles order.
X_test = pd.DataFrame(X_test)

### Check

In [None]:
# Preview the training feature frame (rows indexed by recording id).
X_train.head()

### Train

    specify a submission file skeleton
    specify a hyperparameter dict template
    specify a RandomizedSearchCV template
    for each class
        extract the corresponding target col
        specify an edited hyperparameter dict, for the sake of scale_pos_weight
        specify an edited RandomizedSearchCV, for the sake of scale_pos_weight
        RandomizedSearchCV fit(X_train, y_train)
        RandomizedSearchCV.best_estimator_.predict_proba(X_test)
        append the prediction to the submission file as a new column
  
The following 2 parameters make `XGBClassifier` run on the GPU:
- `tree_method='gpu_hist'`
- `predictor='gpu_predictor'`

In [None]:
# Submission skeleton: one row per test recording, keyed by the file's base
# name without extension (same derivation as the training ids).
sub = pd.DataFrame({
    'recording_id': [os.path.splitext(os.path.basename(f))[0] for f in testfiles]
})

# Search space shared by all species; scale_pos_weight is added per species in
# the loop because it depends on that species' class balance.
params = {
    "n_estimators": [40, 60, 80, 100, 120],
    "max_depth": [1, 2, 3, 5, 8, 13],
    "learning_rate": [0.02, 0.04, 0.08, 0.16, 0.32],
    "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "colsample_bylevel": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "colsample_bynode": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "gamma": [0, 0.001, 0.005, 0.025, 0.125],
    "min_child_weight": [0, 1, 2, 3, 4, 5],
    "max_delta_step": [0, 1, 2, 3, 4, 5],
    "reg_alpha": [0, 0.001, 0.005, 0.025, 0.125],
    "reg_lambda": [0.9, 1, 1.1, 1.2]
}

model = XGBClassifier(
    n_jobs=-1,
    random_state=1,
    tree_method="gpu_hist",
    predictor="gpu_predictor"
)

# Template search object; a deep copy is customised for every species below.
# NOTE(review): n_jobs=-1 runs several fits at once while each fit uses the
# GPU -- confirm this does not exhaust GPU memory on the target machine.
cv = RandomizedSearchCV(
    estimator = model,
    param_distributions = params,
    n_jobs = -1,
    n_iter = 32,
    # No random_state: StratifiedKFold does not shuffle by default, so a seed
    # has no effect, and recent scikit-learn releases raise ValueError when
    # one is passed without shuffle=True. The folds stay unshuffled as before.
    cv = StratifiedKFold(n_splits=5),
    return_train_score = False,
    random_state = 1,
    refit = True,
    scoring = "average_precision"
)

# One independent binary model per species id (0 ... 23).
for tgt in range(0,24):
    # updated each loop
    starttime = time.time()
    # extract the corresponding target col
    y_train = get_unique_tp_and_tn(y_train_all_classes, 's'+str(tgt))
    # part of the scale_pos_weight can only be specified within the loop:
    # the last candidate is this species' negative/positive ratio
    params_copy = copy.deepcopy(params)
    params_copy["scale_pos_weight"] = [4, 8, 16, np.sum(y_train.values==0) / np.sum(y_train.values==1)]
    cv_copy = copy.deepcopy(cv)
    cv_copy.param_distributions = params_copy
    # fit on the recordings that have a label for this species only
    cv_copy.fit(
        X_train.loc[y_train.index],
        y_train.values.ravel()
    )
    print("==================================================")
    print(f"best_score of {tgt}: {cv_copy.best_score_}")
    print(f"best_params of {tgt}: {cv_copy.best_params_}")
    # Optional diagnostics, disabled by default:
    # plot_importance(cv_copy.best_estimator_, max_num_features = 20, title = "Feature importance" + str(tgt))
    # pyplot.scatter(range(len(cv_copy.best_estimator_.feature_importances_)), cv_copy.best_estimator_.feature_importances_)

    # predict: column 1 of predict_proba is the probability of the positive class
    sub['s'+str(tgt)] = cv_copy.best_estimator_.predict_proba(X_test)[:,1]
    print(f"{tgt} time: {time.time()-starttime}")

In [None]:
# Preview the submission frame: recording_id plus one probability column per species.
sub.head()

In [None]:
# Write the submission file without the positional index column.
sub.to_csv('submission.csv', index=False)

In [None]:
# List the working directory (e.g. to check that submission.csv exists).
!ls