<img src="https://developer.nvidia.com/sites/default/files/pictures/2018/rapids/rapids-logo.png"/>

[Rapids](https://rapids.ai) is an open-source GPU accelerated Data Science and Machine Learning library, developed and maintained by [Nvidia](https://www.nvidia.com). It is designed to be compatible with many existing CPU tools, such as Pandas, scikit-learn, numpy, etc. It enables **massive** acceleration of many data-science and machine learning tasks, oftentimes by a factor of 100X, or even more. If you are interested in installing and running Rapids locally on your own machine, then you should [refer to the following instructions](https://rapids.ai/start.html).

In [None]:
import cudf
import cuml
import cupy as cp
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
from scipy.interpolate import interp1d
import gc
from cuml.linear_model import LogisticRegression
from cuml.neighbors import KNeighborsClassifier
from cuml.svm import SVC
from cuml.ensemble import RandomForestClassifier
from cuml.preprocessing.TargetEncoder import TargetEncoder
from sklearn.model_selection import GroupKFold, KFold
from cuml.metrics import mean_squared_error

import soundfile as sf
# Librosa Libraries
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from sklearn.metrics import roc_auc_score, label_ranking_average_precision_score

In [None]:
# All three competition files live in the same Kaggle input directory. Build
# every path from a single root so the locations cannot drift apart (the
# original mixed an absolute "/kaggle/input" root with a relative "../input").
DATA_DIR = "/kaggle/input/tabular-playground-series-mar-2021"

train = cudf.read_csv(f"{DATA_DIR}/train.csv")
test = cudf.read_csv(f"{DATA_DIR}/test.csv")
sample_submission = cudf.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Labels pulled out of the frame as a raw array; the CV loops index it with
# the positional fold indices.
target = train["target"].values

# Model inputs: every column of the test frame after the first one
# (presumably the first column is a row identifier -- confirm).
columns = test.columns[1:]

# The leading 19 feature columns are the ones that get target-encoded below.
cat_features = columns[:19]

# Last expression of the cell, so the notebook displays it.
cat_features

In [None]:
# Display the first rows of the training frame as a quick sanity check.
train.head()

In [None]:
# Display the first rows of the test frame as a quick sanity check.
test.head()

In this notebook we'll deal with categorical features using Target Encoding. For the sake of consistency, target encoding needs to be applied within the cross-validation loop; otherwise, we'll easily be leaking target information to the out-of-fold rows, which can lead to serious overfitting.

We'll also start with a simple logistic regression. This is one of the simplest ML algorithms for classification, and in general can give us a good idea of what the baseline score would be for our problem.

In [None]:
# Out-of-fold predictions: one slot per training row, filled fold by fold in
# the cross-validation loop. Sized from the data rather than a hard-coded row
# count so the buffer always matches `train`.
lr_train_oof = cp.zeros((len(train),))

# Running fold-average of the test predictions. Starts as a scalar and turns
# into an array on the first `+=` inside the loop.
lr_test_preds = 0

# Last expression of the cell, so the notebook displays the buffer shape.
lr_train_oof.shape

In [None]:
%%time
# K-fold cross-validation of a logistic regression on target-encoded features.
# Writes out-of-fold probabilities into `lr_train_oof`, accumulates the
# fold-averaged test probabilities in `lr_test_preds`, and prints the ROC AUC
# of every validation fold.
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=137)

# Reset the accumulator here so that re-running this cell on its own does not
# add a second round of predictions on top of the previous one.
lr_test_preds = 0

# The split generator has no length, so pass `total` explicitly to let the
# progress bar show how many folds remain.
for train_ind, val_ind in tqdm(kf.split(train, target), total=NUM_FOLDS):
    train_df, val_df = train.iloc[train_ind][columns], train.iloc[val_ind][columns]
    train_target, val_target = target[train_ind], target[val_ind]
    # Fresh copy of the test features for every fold: the encoded columns
    # below overwrite the raw categories.
    test_df = test[columns].copy()

    # The encoder is fitted on the training fold only; validation and test
    # rows are merely transformed, so their targets never leak into the
    # encoding.
    for cat_col in cat_features:
        te = TargetEncoder()
        train_df[cat_col] = te.fit_transform(train_df[cat_col], train_target)
        val_df[cat_col] = te.transform(val_df[cat_col])
        test_df[cat_col] = te.transform(test_df[cat_col])

    model = LogisticRegression()
    model.fit(train_df, train_target)
    # Column 1 of predict_proba -- presumably the positive-class probability.
    temp_oof = model.predict_proba(val_df)[[1]].values.flatten()
    temp_test = model.predict_proba(test_df)[[1]].values.flatten()

    lr_train_oof[val_ind] = temp_oof
    lr_test_preds += temp_test / NUM_FOLDS

    # `.get()` moves the device arrays to host memory for scikit-learn.
    print(roc_auc_score(val_target.get(), temp_oof.get()))

In [None]:
# ROC AUC of the logistic regression over all out-of-fold predictions;
# `.get()` copies the device arrays to the host for scikit-learn.
lr_oof_auc = roc_auc_score(target.get(), lr_train_oof.get())
print(lr_oof_auc)

In [None]:
# Persist the logistic-regression out-of-fold and test predictions to disk.
for out_name, out_array in (
    ('lr_train_oof', lr_train_oof),
    ('lr_test_preds', lr_test_preds),
):
    cp.save(out_name, out_array)

In [None]:
%%time

# K-fold cross-validation of a k-nearest-neighbours classifier on
# target-encoded features. Produces out-of-fold probabilities
# (`knn_train_oof`), fold-averaged test probabilities (`knn_test_preds`),
# prints per-fold and overall ROC AUC, and saves both arrays to disk.

# One slot per training row; sized from the data rather than a hard-coded
# row count so the buffer always matches `train`.
knn_train_oof = cp.zeros((len(train),))
# Starts as a scalar and turns into an array on the first `+=` below.
knn_test_preds = 0

NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=137)

# The split generator has no length, so pass `total` explicitly to let the
# progress bar show how many folds remain.
for train_ind, val_ind in tqdm(kf.split(train, target), total=NUM_FOLDS):
    train_df, val_df = train.iloc[train_ind][columns], train.iloc[val_ind][columns]
    train_target, val_target = target[train_ind], target[val_ind]
    # Fresh copy of the test features for every fold: the encoded columns
    # below overwrite the raw categories.
    test_df = test[columns].copy()

    # The encoder is fitted on the training fold only; validation and test
    # rows are merely transformed, so their targets never leak into the
    # encoding.
    for cat_col in cat_features:
        te = TargetEncoder()
        train_df[cat_col] = te.fit_transform(train_df[cat_col], train_target)
        val_df[cat_col] = te.transform(val_df[cat_col])
        test_df[cat_col] = te.transform(test_df[cat_col])

    model = KNeighborsClassifier(n_neighbors=150)
    model.fit(train_df, train_target)
    # Column 1 of predict_proba -- presumably the positive-class probability.
    temp_oof = model.predict_proba(val_df)[[1]].values.flatten()
    temp_test = model.predict_proba(test_df)[[1]].values.flatten()

    knn_train_oof[val_ind] = temp_oof
    knn_test_preds += temp_test / NUM_FOLDS

    # `.get()` moves the device arrays to host memory for scikit-learn.
    print(roc_auc_score(val_target.get(), temp_oof.get()))

print('\nOverall score:', roc_auc_score(target.get(), knn_train_oof.get()))

cp.save('knn_train_oof', knn_train_oof)
cp.save('knn_test_preds', knn_test_preds)

In [None]:
# NOTE(review): bare float literal with no effect beyond being displayed;
# presumably a ROC AUC recorded from an earlier run -- confirm which model it
# refers to.
0.880144183149357

In [None]:
%%time

# K-fold cross-validation of a cuML random forest on target-encoded features.
# Produces out-of-fold probabilities (`rf_train_oof`), fold-averaged test
# probabilities (`rf_test_preds`), prints per-fold and overall ROC AUC, and
# saves both arrays to disk.

# One slot per training row; sized from the data rather than a hard-coded
# row count so the buffer always matches `train`.
rf_train_oof = cp.zeros((len(train),))
# Starts as a scalar and turns into an array on the first `+=` below.
rf_test_preds = 0

cu_rf_params = {
    'n_estimators': 2000,
    'max_depth': 12,
    'n_bins': 15,
    'n_streams': 8,
}

NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=137)

# The split generator has no length, so pass `total` explicitly to let the
# progress bar show how many folds remain.
for train_ind, val_ind in tqdm(kf.split(train, target), total=NUM_FOLDS):
    train_df, val_df = train.iloc[train_ind][columns], train.iloc[val_ind][columns]
    train_target, val_target = target[train_ind], target[val_ind]
    # Fresh copy of the test features for every fold: the encoded columns
    # below overwrite the raw categories. Selecting the feature columns
    # first avoids casting the unused leading column to float32 later.
    test_df = test[columns].copy()

    # The encoder is fitted on the training fold only; validation and test
    # rows are merely transformed, so their targets never leak into the
    # encoding.
    for cat_col in cat_features:
        te = TargetEncoder()
        train_df[cat_col] = te.fit_transform(train_df[cat_col], train_target)
        val_df[cat_col] = te.transform(val_df[cat_col])
        test_df[cat_col] = te.transform(test_df[cat_col])

    model = RandomForestClassifier(**cu_rf_params)
    # Features are cast to float32 before being handed to the forest.
    # NOTE(review): labels are cast to float32 as well; cuML documents int32
    # labels for the classifier, so this presumably relies on its automatic
    # dtype conversion -- confirm.
    model.fit(train_df.astype(np.float32), train_target.astype(np.float32))
    # Column 1 of predict_proba -- presumably the positive-class probability.
    temp_oof = model.predict_proba(val_df.astype(np.float32))[[1]].values.flatten()
    temp_test = model.predict_proba(test_df.astype(np.float32))[[1]].values.flatten()

    rf_train_oof[val_ind] = temp_oof
    rf_test_preds += temp_test / NUM_FOLDS

    # `.get()` moves the device arrays to host memory for scikit-learn.
    print(roc_auc_score(val_target.get(), temp_oof.get()))

print('\nOverall score:', roc_auc_score(target.get(), rf_train_oof.get()))

cp.save('rf_train_oof', rf_train_oof)
cp.save('rf_test_preds', rf_test_preds)

In [None]:
# Out-of-fold ROC AUC of a 60/40 blend of the KNN and logistic-regression
# predictions; the blend is computed on host copies of the arrays.
knn_lr_oof_blend = 0.6*knn_train_oof.get()+0.4*lr_train_oof.get()
print(roc_auc_score(target.get(), knn_lr_oof_blend))

In [None]:
# Out-of-fold ROC AUC of a 50/25/25 blend of the KNN, logistic-regression and
# random-forest predictions; the blend is computed on host copies of the arrays.
three_model_oof_blend = 0.5*knn_train_oof.get()+0.25*lr_train_oof.get()+0.25*rf_train_oof.get()
print(roc_auc_score(target.get(), three_model_oof_blend))

In [None]:
# Submission built from the fold-averaged logistic-regression test predictions.
lr_submission_file = 'submission_lr.csv'
sample_submission['target'] = lr_test_preds
sample_submission.to_csv(lr_submission_file, index=False)

In [None]:
# Submission built from the fold-averaged KNN test predictions.
knn_submission_file = 'submission_knn.csv'
sample_submission['target'] = knn_test_preds
sample_submission.to_csv(knn_submission_file, index=False)

In [None]:
# Submission built from the fold-averaged random-forest test predictions.
rf_submission_file = 'submission_rf.csv'
sample_submission['target'] = rf_test_preds
sample_submission.to_csv(rf_submission_file, index=False)

In [None]:
# Submission from a 60/40 blend of the KNN and logistic-regression test
# predictions (same weights as the out-of-fold blend scored earlier).
blend_0_test_preds = 0.6*knn_test_preds+0.4*lr_test_preds
sample_submission['target'] = blend_0_test_preds
sample_submission.to_csv('submission_blend_0.csv', index=False)

In [None]:
# Submission from a 50/25/25 blend of the KNN, logistic-regression and
# random-forest test predictions (same weights as the out-of-fold blend
# scored earlier).
blend_1_test_preds = 0.5*knn_test_preds+0.25*lr_test_preds+0.25*rf_test_preds
sample_submission['target'] = blend_1_test_preds
sample_submission.to_csv('submission_blend_1.csv', index=False)