In this notebook, we will learn about:
    
- How to optimize the hyperparameters of a predictive model,
- How to define an objective function that Optuna can use,
- GPU-powered cuML classifiers.

**If you find this tutorial helpful for your studies, an upvote would be very much appreciated.**

# Imports

In [None]:
import gc

import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

import cudf
from cuml.preprocessing import train_test_split, StandardScaler
from cuml.svm import SVC
from cuml.ensemble import RandomForestClassifier

import optuna

In [None]:
# Directory holding the competition CSV files, given relative to the
# notebook's working directory; the loading cell prepends it to each file name.
BASE_DATA_DIR = "../input/tabular-playground-series-feb-2022/"
# List the directory contents to confirm the expected files are present.
!ls {BASE_DATA_DIR}

In [None]:
# Read the three competition files with cuDF, in the same order as before:
# training data, test data, and the sample submission template.
df_train, df_test, df_sub = (
    cudf.read_csv(BASE_DATA_DIR + file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Report the (rows, columns) shape of the training and test frames.
for caption, frame in (("Train shape: ", df_train), ("Test shape: ", df_test)):
    print(caption, frame.shape)

In [None]:
# Print the summary produced by DataFrame.info() for the training frame.
df_train.info()

In [None]:
# idx_2_label: the distinct target labels, copied to host memory and sorted,
# so that a label's position in the list is its integer class index.
idx_2_label = list(df_train["target"].unique().values_host)
idx_2_label.sort()
# label_2_idx: the inverse lookup (label -> integer class index).
label_2_idx = dict(zip(idx_2_label, range(len(idx_2_label))))

In [None]:
# Count rows per target class on the GPU, move the small result to pandas,
# and draw it as a bar chart to show the class balance.
target_counts = df_train["target"].value_counts().to_pandas()
target_counts.plot.bar(cmap="winter")

In [None]:
# Replace each target label with its integer class index from label_2_idx.
# NOTE: this overwrites the column in place, so the cell is not idempotent.
# Run it once per kernel session: a second run would look up the
# already-encoded integers in a mapping that is keyed by the original labels.
df_train["target"] = df_train["target"].map(label_2_idx)

# Defining the objective function

In this example, we consider only `RandomForestClassifier` as our predictive model, but feel free to use any other classifier from the cuML [documentation](https://docs.rapids.ai/api/cuml/stable/api.html#regression-and-classification).

In [None]:
def _build_classifier(trial):
    """Sample a classifier family and its hyperparameters from ``trial``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the categorical choice and the numeric
        hyperparameters.

    Returns
    -------
    An unfitted cuML estimator (``SVC`` or ``RandomForestClassifier``).

    Raises
    ------
    NotImplementedError
        If the sampled classifier name has no implementation here.
    """
    classifier = trial.suggest_categorical('classifier', ["RandomForestClassifier"]) # "SVC", "XGBoostClassifier"
    if classifier == "SVC":
        svc_c = trial.suggest_float("C", 1e-2, 1e2, log=True)
        # NOTE(review): the range used to be 1e-2..1e-2, i.e. a single point,
        # so gamma was never actually searched. It now mirrors the span used
        # for C; adjust the bounds if a different range was intended.
        svc_gamma = trial.suggest_float("gamma", 1e-2, 1e2, log=True)
        svc_kernel = trial.suggest_categorical("kernel", ["linear", "poly", "rbf"])
        return SVC(C=svc_c, kernel=svc_kernel, gamma=svc_gamma)
    if classifier == "RandomForestClassifier":
        n_estimators = trial.suggest_int("n_estimators", 50, 1000, log=True)
        max_depth = trial.suggest_int("max_depth", 10, 100, log=True)
        return RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    # Fail loudly instead of falling through with no estimator bound, which
    # previously surfaced later as a confusing error at fit time.
    raise NotImplementedError(
        "No implementation for classifier {!r}".format(classifier)
    )


def objective(trial):
    """Optuna objective: hold-out accuracy of the classifier sampled by ``trial``.

    The module-level ``df_train`` frame is split 80/20 with a fixed
    ``random_state``. The sampled classifier is fitted on the larger part and
    scored on the smaller one.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial supplying the hyperparameter suggestions.

    Returns
    -------
    The accuracy on the validation split, as computed by ``accuracy_score``.

    Raises
    ------
    NotImplementedError
        If the sampled classifier name has no implementation.
    """
    # Build the estimator first so an unsupported choice fails before any
    # data is split.
    clf = _build_classifier(trial)

    X_train, X_val = train_test_split(df_train, test_size=0.2, random_state=42)
    y_train = X_train["target"].values
    X_train = X_train.drop(["row_id", "target"], axis=1)

    # .values_host copies the column to a host NumPy array for scikit-learn's
    # accuracy_score. It replaces Series.to_array(), which cuDF deprecated.
    y_test = X_val["target"].values_host
    X_val = X_val.drop(["row_id", "target"], axis=1)

    clf.fit(X_train, y_train)
    preds = clf.predict(X_val).values_host
    score = accuracy_score(y_test, preds)
    gc.collect()
    return score

# Running the study

In [None]:
# Create and run the hyperparameter search.
# TPE is Optuna's default sampler; it is passed explicitly here only so that
# it can be seeded, which makes the sequence of suggested hyperparameters
# repeatable across kernel restarts. The scores themselves can still vary if
# model training is not deterministic.
study = optuna.create_study(
    study_name="tps-feb-2022",
    direction="maximize",  # the objective returns accuracy: higher is better
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

In [None]:
# Keep a handle on the study's best trial; later cells read its value and params.
trial = study.best_trial

In [None]:
# Summarise the search outcome: the study's best objective value, the value
# stored on the best trial, and the hyperparameters of that trial.
print("  Best accuracy: ", study.best_value)
print("  Value: {}".format(trial.value))

print("  Params: ")
param_line = "    {}: {}"
for param_name in trial.params:
    print(param_line.format(param_name, trial.params[param_name]))


### Retraining the classifier with the best params obtained in the previous step

In [None]:
# "classifier" only records which model family was sampled; it is not a
# constructor argument, so remove it before the params are unpacked into the
# estimator. pop() with a default keeps this cell safe to re-run, whereas
# `del` raises KeyError on the second execution.
trial.params.pop("classifier", None)

In [None]:
# Rebuild the random forest with the best hyperparameters. The "classifier"
# entry is filtered out here so this cell works whether or not that key has
# already been removed from trial.params.
rf_params = {name: value for name, value in trial.params.items() if name != "classifier"}
clf = RandomForestClassifier(**rf_params)

# Same split settings as in the objective (test_size=0.2, random_state=42).
X_train, X_val = train_test_split(df_train, test_size=0.2, random_state=42)
y_train = X_train["target"].values
X_train = X_train.drop(["row_id", "target"], axis=1)

# .values_host copies to a host NumPy array. It replaces Series.to_array(),
# which cuDF deprecated.
y_test = X_val["target"].values_host
X_val = X_val.drop(["row_id", "target"], axis=1)

clf.fit(X_train, y_train)
preds = clf.predict(X_val).values_host
# Report the hold-out score so the validation predictions are actually used
# and the retrained model can be compared with the study's best value.
print("Validation accuracy: ", accuracy_score(y_test, preds))

# Prediction on test data

In [None]:
# Drop the identifier column into a NEW frame rather than overwriting
# df_test. Reassigning df_test made this cell fail on a second run, because
# "row_id" was already gone.
X_test = df_test.drop(["row_id"], axis=1)
y_sub = clf.predict(X_test)
# Copy the predicted class indices to the host (.values_host replaces the
# deprecated Series.to_array()) and translate them back to the original labels.
pred_list = [idx_2_label[pred_idx] for pred_idx in y_sub.values_host]

# Submission File

In [None]:
# Fill the sample-submission frame's "target" column with the predicted
# labels and write it out without the index column.
df_sub["target"] = pred_list
df_sub.to_csv("submission.csv", index=False)