In [3]:
!pip install wandb
!pip install scikit-learn
!pip install pandas
!pip install numpy



In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import wandb
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

# Loading in data

In [5]:
# Load scikit-learn's bundled breast cancer dataset. The cells below read its
# .data, .target, .feature_names and .target_names attributes.
data = load_breast_cancer()

In [6]:
# Show the names of the feature columns (used as DataFrame column labels below).
print(data.feature_names)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [7]:
# Show the class names. NOTE(review): presumably the position in this array is
# the integer label used in data.target (0 -> first name, 1 -> second) - confirm.
print(data.target_names)

['malignant' 'benign']


In [8]:
# Feature matrix as a labelled DataFrame (one column per feature name) and the
# matching label vector.
X = pd.DataFrame(
    data=data.data,
    columns=data.feature_names,
)
y = data.target

In [9]:
# Hold out 20% of the samples for evaluation.
# random_state pins the shuffle so that every rerun of the notebook (and hence
# every sweep) is scored on the same held-out rows; without it the metrics of
# different sessions are not comparable.
# stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Bayes' Theorem

***
$$P(Y|X) = \frac{P(X|Y)*P(Y)}{P(X)}$$

<ul>
    <li>P(X) is the probability of observing this new evidence.</li>
    <li>P(X|Y) is the probability of observing the new evidence X given the event Y that we care about.</li>
    <li>P(Y) is the prior probability of the event Y that we care about, i.e. our initial belief before observing the evidence X.</li>
</ul>

$$P(\text{metric}|\text{hyperparameters}) = \frac{P(\text{hyperparameters}|\text{metric})*P(\text{metric})}{P(\text{hyperparameters})}$$

<ul>
    <li>P(metric | hyperparameter combination) is the probability that the metric is minimized/maximized given a particular combination of hyperparameter values.</li>
    <li>P(hyperparameter combination | metric) is the probability of a certain hyperparameter combination if the given metric is minimized/maximized.</li>
    <li>P(metric) is the prior probability that the metric is minimized/maximized, before any particular hyperparameter combination is taken into account.</li>
    <li>P(hyperparameter combination) is the probability of getting that particular hyperparameter combination.</li>
</ul>

***

# Setup Code

In [10]:
# Sweep definition handed to wandb.sweep(): search strategy, the logged metric
# to optimise, and the hyperparameter search space.
sweep_config = {
    # Search strategy.
    "method": "bayes",
    # Optimisation target; "name" must match the key logged by the objective.
    "metric": {
        "goal": "maximize",
        "name": "F1-Score",
    },
    # Search space: one categorical choice and two min/max ranges.
    "parameters": {
        "criterion": {
            "values": ["gini", "entropy", "log_loss"],
        },
        "max_depth": {
            "min": 5,
            "max": 10,
        },
        "n_estimators": {
            "min": 100,
            "max": 500,
        },
    },
}

In [11]:
def objective():
    """Train and score one random forest for a single W&B sweep trial.

    Intended to be called by ``wandb.agent``: the hyperparameters
    (``n_estimators``, ``criterion``, ``max_depth``) are read from the run's
    config, which the sweep agent populates for each trial. The model is fitted
    on ``X_train``/``y_train``, evaluated on ``X_test``/``y_test``, diagnostic
    plots are sent to W&B, and ``F1-Score``, ``Precision`` and ``Recall`` are
    logged. ``F1-Score`` is the metric name the sweep configuration optimises.

    Reads the notebook-level names ``X``, ``y``, ``X_train``, ``X_test``,
    ``y_train``, ``y_test`` and ``data``. Returns nothing.
    """
    # Start the run. The sweep definition is deliberately NOT passed as
    # ``config``: doing so would store the "method"/"metric"/"parameters" keys
    # of the sweep definition in every run's config, while the values actually
    # sampled for this trial are supplied by the sweep agent.
    with wandb.init(project="rf_hyperparam_tuning_tech_discussion_group_bayes") as run:
        params = run.config

        # Instantiate the model with this trial's hyperparameters.
        model = RandomForestClassifier(
            random_state=42,
            n_estimators=params.n_estimators,
            criterion=params.criterion,
            max_depth=params.max_depth,
        )

        # Fit on the training partition and predict the held-out partition.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)

        # Send diagnostic plots to W&B.
        wandb.sklearn.plot_learning_curve(model, X, y)
        wandb.sklearn.plot_classifier(
            model,
            X_train,
            X_test,
            y_train,
            y_test,
            y_pred,
            y_prob,
            data.target_names,
            model_name='RF',
            feature_names=data.feature_names,
        )
        # NOTE(review): the captured log output of plot_classifier suggests it
        # already logs a precision-recall curve and feature importances, so the
        # two calls below may be duplicates - confirm before removing.
        wandb.sklearn.plot_precision_recall(y_test, y_prob, data.target_names)
        wandb.sklearn.plot_feature_importances(model, data.feature_names)

        # Score the predictions.
        # NOTE(review): these scorers are called with their default positive
        # label; given the order of data.target_names that is presumably the
        # "benign" class - confirm this is the class of interest.
        f1 = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)

        # Log the results; the "F1-Score" key is what the sweep optimises.
        run.log(
            {
                "F1-Score": f1,
                "Precision": precision,
                "Recall": recall,
            }
        )

# Run Sweeps

In [12]:
# Register the sweep definition with W&B and keep the returned sweep id.
sweep_id = wandb.sweep(
    sweep=sweep_config,
    project="rf_hyperparam_tuning_tech_discussion_group_bayes",
)

# Run five trials in this process; each trial calls objective() once.
wandb.agent(sweep_id=sweep_id, function=objective, count=5)

Create sweep with ID: pir0r19p
Sweep URL: https://wandb.ai/scott-clare/rf_hyperparam_tuning_tech_discussion_group_bayes/sweeps/pir0r19p


[34m[1mwandb[0m: Agent Starting Run: x7y7ue2x with config:
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 271
[34m[1mwandb[0m: Currently logged in as: [33mscott-clare[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting RF.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


0,1
F1-Score,▁
Precision,▁
Recall,▁

0,1
F1-Score,0.97959
Precision,0.96
Recall,1.0


[34m[1mwandb[0m: Agent Starting Run: mqsuxfcv with config:
[34m[1mwandb[0m: 	criterion: entropy
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 135


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting RF.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


0,1
F1-Score,▁
Precision,▁
Recall,▁

0,1
F1-Score,0.97959
Precision,0.96
Recall,1.0


[34m[1mwandb[0m: Agent Starting Run: kt7qel07 with config:
[34m[1mwandb[0m: 	criterion: log_loss
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 364


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting RF.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


0,1
F1-Score,▁
Precision,▁
Recall,▁

0,1
F1-Score,0.97297
Precision,0.94737
Recall,1.0


[34m[1mwandb[0m: Agent Starting Run: flzl43bk with config:
[34m[1mwandb[0m: 	criterion: entropy
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	n_estimators: 176


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting RF.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


0,1
F1-Score,▁
Precision,▁
Recall,▁

0,1
F1-Score,0.97959
Precision,0.96
Recall,1.0


[34m[1mwandb[0m: Agent Starting Run: 6ipb0m3r with config:
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 129


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting RF.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


0,1
F1-Score,▁
Precision,▁
Recall,▁

0,1
F1-Score,0.97959
Precision,0.96
Recall,1.0
