In [16]:
import os

import mlflow
import mlflow.sklearn

import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


In [21]:
# Intended to keep GitPython quiet about a missing/unusable git executable
# when MLflow tries to record the source version -- confirm against GitPython docs.
os.environ["GIT_PYTHON_REFRESH"] = "quiet"

# Tracking server address. MLFLOW_TRACKING_URI takes precedence when it is set,
# so the notebook can be pointed at another server without editing this cell;
# otherwise the local server on port 5001 is used, as before.
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", 'http://localhost:5001')

mlflow.set_tracking_uri(tracking_uri)

# Optional manual experiment/run control, kept for reference (not executed):
# mlflow.set_experiment('compare_max_depth')
# mlflow.start_run(experiment_id='1')
# mlflow.end_run()

# Turn on scikit-learn autologging for the estimator fits that follow.
mlflow.sklearn.autolog()


In [27]:
# Load the wine dataset that ships with scikit-learn.
data_wine = datasets.load_wine()
# Bare last expression: echoes the whole loaded object (arrays included) as the cell output.
data_wine


{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [28]:
# Feature table: one column per wine attribute, named after the dataset's feature names.
X_all = pd.DataFrame(data=data_wine["data"], columns=data_wine["feature_names"])

# Label table: a single "target" column with the integer codes replaced by readable names.
label_names = {0: 'class_0', 1: 'class_1', 2: 'class_2'}
y_all = pd.DataFrame({"target": data_wine["target"]}).replace(label_names)

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X_all, y_all, test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Training features and labels side by side in one frame.
train = pd.concat([X_train, y_train], axis="columns", sort=False)


In [29]:
# Candidate hyper-parameters: 5 depths x 3 forest sizes x 4 split thresholds.
param_grid = dict(
    max_depth=[1, 2, 3, 5, 7],
    n_estimators=[100, 200, 500],
    min_samples_split=[2, 3, 5, 7],
)

# Exhaustive search over the grid with 5-fold cross-validation, scored by accuracy.
# n_jobs=-1 asks scikit-learn to run the fits in parallel.
base_forest = RandomForestClassifier(random_state=0)
clf = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

# Fit on the training split, passing the labels as a plain 1-D array.
train_labels = y_train["target"].to_numpy()
clf.fit(X_train, train_labels)

print("Best Model Parameter: ", clf.best_params_)


Best Model Parameter:  {'max_depth': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [25]:
# Best estimator found by the grid search.
clf_best = clf.best_estimator_

# Evaluate on the held-out split. The true labels are passed as the 1-D
# "target" column -- the same form used when fitting -- instead of a
# one-column DataFrame that the metric would have to flatten implicitly.
y_true = y_test["target"]
y_pred = clf_best.predict(X_test)
print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

     class_0       0.95      1.00      0.98        20
     class_1       1.00      0.92      0.96        24
     class_2       0.94      1.00      0.97        15

    accuracy                           0.97        59
   macro avg       0.96      0.97      0.97        59
weighted avg       0.97      0.97      0.97        59



In [26]:
# Switch scikit-learn autologging back off now that training and evaluation are done.
mlflow.sklearn.autolog(disable=True)


# References

- Background on scikit-learn and `RandomForestClassifier` with the wine dataset  
https://obgynai.com/wine-supervised-learning/
- Getting started with scikit-learn autologging in MLflow  
https://www.kaggle.com/code/harupy/scikit-learn-autologging-in-mlflow
