In [1]:
# Hyperparameter Optimization with RandomSearchCV (BONUS)

In [2]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RandomizedSearchCV, validation_curve

pd.set_option("display.max_columns",None)
warnings.simplefilter(action="ignore", category=Warning)

df = pd.read_csv("diabetes.csv")
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)

In [3]:
rf_model = RandomForestClassifier(random_state=17)

In [5]:
rf_random_params = {"max_depth":np.random.randint(5,50,10),
                    "max_features": [3,5,7,"auto","sqrt"],
                    "min_samples_split": np.random.randint(2,50,20),
                    "n_estimators": [int(x) for x in np.linspace(start=200, stop=1500, num=10)]}

In [7]:
rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_random_params, n_iter=100, cv=3, verbose=True,
                               random_state=42, n_jobs=1)

rf_random.fit(X,y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [8]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 22,
 'max_features': 'sqrt',
 'max_depth': 19}

In [12]:
rf_random_final = rf_model.set_params(**rf_random.best_params_, random_state=17).fit(X,y)

cv_results = cross_validate(rf_random_final, X, y, cv=5, scoring=["accuracy","f1","roc_auc"])

cv_results["test_accuracy"].mean()
# 0.7696120872591461

cv_results["test_f1"].mean()
# 0.6349765689355348

cv_results["test_roc_auc"].mean()
# 0.8361747030048916

0.8361747030048916