In [None]:
import sys
# Copy the RAPIDS 0.14.0 archive from the notebook's input directory to the
# conda envs directory under a .tar.gz name.
!cp ../input/rapids/rapids.0.14.0 /opt/conda/envs/rapids.tar.gz
# Extract it in place; tar's verbose file listing is sent to /dev/null.
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the extracted environment's directories to the import path. Each
# statement inserts at the front, so the final search order is:
#   lib, lib/python3.7, lib/python3.7/site-packages, <previous sys.path>.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the bundled libxgboost.so into the base conda lib directory
# (presumably so the shared library can be located at load time -- confirm).
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
import time

import pandas as pd

from sklearn.preprocessing import OneHotEncoder
import sklearn.neighbors
import sklearn.svm
import sklearn.ensemble
from sklearn.model_selection import KFold

import cudf
import cuml

import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Number of cross-validation folds; bench() passes it to KFold(n_splits=...).
NFOLDS = 5
# How many times bench() repeats the timed fit for each classifier.
ITERATION = 300

# Load data

In [None]:
# Read the Titanic train and test CSVs, in that order.
train_orig_df, test_orig_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/titanic/train.csv", "../input/titanic/test.csv")
)

# Pre-process

In [None]:
# Build the modelling table without touching train_orig_df:
#   1. drop the Cabin / Ticket / Name columns,
#   2. skip the leading column by position
#      (PassengerId in the Kaggle CSV -- confirm),
#   3. one-hot encode the categorical columns,
#   4. drop every row that still has a missing value.
train_df = pd.get_dummies(
    train_orig_df.drop(["Cabin", "Ticket", "Name"], axis=1).iloc[:, 1:],
    columns=["Pclass", "Sex", "Embarked"],
).dropna()

# Target as int32 and features as float32.
y_all = train_df["Survived"].astype("int32")
X_all = train_df.drop(["Survived"], axis=1).astype("float32")

# cuDF copies of the same data for the cuML benchmark.
X_all_gpu, y_all_gpu = cudf.from_pandas(X_all), cudf.from_pandas(y_all)

# Benchmark code

In [None]:
def bench(X, y, classifiers, params):
    """Time cross-validated ``fit()`` calls for each classifier.

    Every estimator in ``classifiers`` is instantiated and configured
    ``ITERATION`` times. Each time it is fitted once per training fold of an
    unshuffled ``KFold(n_splits=NFOLDS)`` split, and the wall-clock time spent
    inside ``fit()`` is summed over the folds.

    Parameters
    ----------
    X : DataFrame-like
        Feature table; training rows are selected with ``X.iloc[train_idx]``.
    y : Series-like
        Targets; training rows are selected with ``y.iloc[train_idx]``.
    classifiers : dict
        Maps a display name to an estimator class callable without arguments.
    params : dict
        Maps the same names to keyword arguments for ``set_params``. A name
        that is missing leaves the estimator at its defaults.

    Returns
    -------
    dict
        Display name -> mean over iterations of the summed per-fold fit time,
        in seconds.
    """
    # Without shuffling the splitter is stateless, so one instance serves
    # every classifier and every iteration.
    kf = KFold(n_splits=NFOLDS)

    elapsed = {}
    for name, clf_class in classifiers.items():
        elapsed_list = []

        for _ in range(ITERATION):
            clf = clf_class()
            clf.set_params(**params.get(name, {}))

            elapsed_sum = 0
            for train_idx, _ in kf.split(X, y):
                # Slice the arguments, not the notebook-level X_all / y_all:
                # otherwise a caller passing GPU data is timed on the CPU copy.
                X_train = X.iloc[train_idx]
                y_train = y.iloc[train_idx]

                # Only fit() is timed; slicing stays outside the measurement.
                start = time.time()
                clf.fit(X_train, y_train)
                elapsed_sum += time.time() - start

            elapsed_list.append(elapsed_sum)

        elapsed[name] = pd.Series(elapsed_list).mean()
    return elapsed

# scikit-learn

In [None]:
# CPU estimators, keyed by the label that appears on the results chart.
classifiers = dict([
    ("KNN", sklearn.neighbors.KNeighborsClassifier),
    ("SVM", sklearn.svm.SVC),
    ("RandomForest", sklearn.ensemble.RandomForestClassifier),
])

# Keyword arguments per estimator; KNN keeps its defaults.
params = {label: {} for label in classifiers}
params["SVM"]["random_state"] = 47
params["RandomForest"].update({"n_estimators": 100, "random_state": 47})

elapsed_sklearn = bench(X=X_all, y=y_all, classifiers=classifiers, params=params)

# cuML

In [None]:
# cuML estimators under the same labels as the scikit-learn run.
classifiers = dict([
    ("KNN", cuml.neighbors.KNeighborsClassifier),
    ("SVM", cuml.svm.SVC),
    ("RandomForest", cuml.ensemble.RandomForestClassifier),
])

# Only the forest size is set explicitly; the rest stays at cuML defaults.
params = {label: {} for label in classifiers}
params["RandomForest"]["n_estimators"] = 100

elapsed_cuml = bench(X=X_all_gpu, y=y_all_gpu, classifiers=classifiers, params=params)

# Results

In [None]:
# Classifier labels in the order of the scikit-learn results. Both series are
# looked up by label so each pair of bars always belongs to the same
# classifier; a label missing from elapsed_cuml raises KeyError instead of
# drawing a misaligned chart.
names = list(elapsed_sklearn)
sklearn_times = [elapsed_sklearn[name] for name in names]
cuml_times = [elapsed_cuml[name] for name in names]

left = np.arange(len(names))
width = 0.3

fig = plt.figure(figsize=(6, 6))
fig.patch.set_alpha(1)  # fully opaque figure background

plt.subplot(1, 1, 1)

# Two bars per classifier: scikit-learn at `left`, cuML shifted by one width.
plt.bar(left, sklearn_times, color='b', width=width, label="scikit-learn", align="center")
plt.bar(left + width, cuml_times, color="g", width=width, label="cuML", align="center")

# Centre each tick between the two bars of its group.
plt.xticks(left + width / 2, names)
plt.legend(loc=2)
plt.ylabel("sec / iter")
plt.title("fit() performance")
plt.show()

In [None]:
# Read the Home Credit application tables (train, then test). Note that this
# rebinds `train_df`, which held the Titanic table earlier in the notebook.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/home-credit-default-risk/application_train.csv",
        "../input/home-credit-default-risk/application_test.csv",
    )
)

In [None]:
# Rebind the repetition count that bench() reads at call time, for the runs
# below (presumably lowered because this dataset is much larger -- confirm).
ITERATION = 10

In [None]:
# Categorical columns that are replaced by one-hot indicator columns below.
features_to_encode = [
    "NAME_CONTRACT_TYPE",
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "WEEKDAY_APPR_PROCESS_START",
    "ORGANIZATION_TYPE",
]

# Keep only the columns that have no missing value; train_df is left untouched.
data_df = train_df.dropna(axis=1)

# Dense one-hot matrix and its generated column labels.
encoder = OneHotEncoder(sparse=False)
encoded = encoder.fit_transform(data_df[features_to_encode])
labels = encoder.get_feature_names(features_to_encode)
encoded_df = pd.DataFrame(encoded, columns=labels)

# Append the indicator columns, then remove the raw categorical columns and
# the SK_ID_CURR column.
data_df = pd.concat([data_df, encoded_df], axis=1).drop(
    features_to_encode + ["SK_ID_CURR"], axis=1
)

# Split into target and features.
y_train = data_df["TARGET"]
X_train = data_df.drop(["TARGET"], axis=1)

In [None]:
def bench(X, y, classifiers, params, iterations=None):
    """Time repeated ``fit()`` calls on the full data for each classifier.

    Parameters
    ----------
    X, y
        Features and targets, passed unchanged to ``clf.fit(X, y)``.
    classifiers : dict
        Maps a display name to an estimator class callable without arguments.
    params : dict or None
        Maps the same names to keyword arguments for ``set_params``. A name
        that is missing (or ``params`` being empty/None) leaves the estimator
        at its defaults.
    iterations : int, optional
        Number of timed fits per classifier. Defaults to the notebook-level
        ``ITERATION`` constant, read at call time.

    Returns
    -------
    dict
        Display name -> mean wall-clock seconds of a single ``fit()`` call.
    """
    if iterations is None:
        iterations = ITERATION

    elapsed = {}
    for name, clf_class in classifiers.items():
        # Apply this classifier's own settings. The previous check was
        # inverted and unpacked the whole name->kwargs mapping, so no
        # setting was ever applied.
        clf_params = (params or {}).get(name, {})

        elapsed_list = []
        for i in range(iterations):
            print("{}: Trial {}".format(name, i))
            # Fresh estimator per trial so no fitted state carries over.
            clf = clf_class()
            clf.set_params(**clf_params)

            start = time.time()
            clf.fit(X, y)
            elapsed_list.append(time.time() - start)
        elapsed[name] = pd.Series(elapsed_list).mean()
    return elapsed

In [None]:
# CPU estimators, keyed by the label that appears on the results chart.
classifiers = dict([
    ("KNN", sklearn.neighbors.KNeighborsClassifier),
    ("RandomForest", sklearn.ensemble.RandomForestClassifier),
])

# Keyword arguments per estimator, handed to bench() alongside the classes.
params = {}
params["KNN"] = {"n_jobs": -1}
params["RandomForest"] = {"n_estimators": 100, "random_state": 47, "n_jobs": -1}

elapsed_sklearn = bench(X=X_train, y=y_train, classifiers=classifiers, params=params)

In [None]:
# cuML estimators under the same labels as the scikit-learn run.
classifiers = {
    "KNN": cuml.neighbors.KNeighborsClassifier,
    "RandomForest": cuml.ensemble.RandomForestClassifier
}

params = {
    "KNN": {},
    "RandomForest": {
        "n_estimators": 100
    }
}

# Give cuML float32 features / int32 targets as cuDF objects, the same way the
# Titanic section prepares X_all_gpu / y_all_gpu. Passing the float64 pandas
# frame directly would leave any dtype and host-to-device conversion to cuML
# inside the timed fit() (assuming this cuML version accepts pandas input at
# all -- not verified here).
X_train_gpu = cudf.from_pandas(X_train.astype("float32"))
y_train_gpu = cudf.from_pandas(y_train.astype("int32"))

elapsed_cuml = bench(X_train_gpu, y_train_gpu, classifiers, params)

In [None]:
# Classifier labels in the order of the scikit-learn results. Both series are
# looked up by label so each pair of bars always belongs to the same
# classifier; a label missing from elapsed_cuml raises KeyError instead of
# drawing a misaligned chart.
names = list(elapsed_sklearn)
sklearn_times = [elapsed_sklearn[name] for name in names]
cuml_times = [elapsed_cuml[name] for name in names]

left = np.arange(len(names))
width = 0.3

fig = plt.figure(figsize=(6, 6))
fig.patch.set_alpha(1)  # fully opaque figure background

plt.subplot(1, 1, 1)

# Two bars per classifier: scikit-learn at `left`, cuML shifted by one width.
plt.bar(left, sklearn_times, color='b', width=width, label="scikit-learn", align="center")
plt.bar(left + width, cuml_times, color="g", width=width, label="cuML", align="center")

# Centre each tick between the two bars of its group.
plt.xticks(left + width / 2, names)
plt.legend(loc=2)
plt.ylabel("sec / iter")
plt.title("fit() performance")
plt.show()