In [1]:
import os
import pickle

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import mlflow
import git

# Data

In [2]:
# Location of the on-disk pickle cache for the Iris dataset, relative to the
# notebook's working directory.
data_file_path = "../data/iris.pkl"

In [3]:
# Load the Iris dataset from the local pickle cache when it exists; otherwise
# fetch it from scikit-learn and write the cache for subsequent runs.
# NOTE(review): pickle.load can execute arbitrary code contained in the file.
# Only use this with a cache file that this notebook created itself.
if os.path.isfile(data_file_path):
    with open(data_file_path, 'rb') as p:
        iris = pickle.load(p)
else:
    iris = datasets.load_iris()
    # Make sure the cache directory exists so the first run on a fresh
    # checkout does not fail with FileNotFoundError.
    cache_dir = os.path.dirname(data_file_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    # Write to the same path that was checked above rather than a second
    # hard-coded copy of it, so changing data_file_path keeps both in sync.
    with open(data_file_path, 'wb') as p:
        pickle.dump(iris, p, pickle.HIGHEST_PROTOCOL)

In [4]:
# Split the loaded dataset object into the feature matrix and the label vector.
X = iris.data
y = iris.target

In [5]:
# Display the feature matrix (each row in the output holds four measurements).
# NOTE(review): this dumps the whole array; a slice such as X[:5] would be
# enough for a sanity check and keeps the notebook output short.
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [6]:
# Display the label vector (integer class ids; the output shows 0, 1 and 2).
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [7]:
# Number of labelled samples available for training.
len(y)

150

# MLflow tracking

In [8]:
# Point the MLflow client at a tracking server on localhost:5000 and select
# the experiment that all runs in this notebook are recorded under.
# NOTE(review): the tracking server must already be running at this address.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Iris classifier")
# Enable scikit-learn autologging before any estimator is fitted so the
# fit() calls further down are captured in the active run.
mlflow.sklearn.autolog()

# Look up the enclosing git repository (searching parent directories) and keep
# the hash of the commit HEAD points to; it is attached to every run as a tag.
# NOTE(review): this raises if the notebook is not inside a git checkout.
repo = git.Repo(search_parent_directories=True)
sha_commit = repo.head.object.hexsha

2024/03/07 10:46:36 INFO mlflow.tracking.fluent: Experiment with name 'Iris classifier' does not exist. Creating a new experiment.


# Models

## Support vector machine

In [14]:
# Open a new MLflow run for the SVM model and attach, in a single call, the
# current git commit hash and the name of the method being trained.
# NOTE(review): the run stays active until the mlflow.end_run() cell of this
# section; if a cell in between fails, end the run manually before re-running.
mlflow.start_run()
mlflow.set_tags({
    'mlflow.source.git.commit': sha_commit,
    'method': 'SVM',
})

In [15]:
# Support vector classifier. Only gamma is set explicitly; every other
# hyperparameter is left at the scikit-learn default.
clf_svm = svm.SVC(gamma='scale')

In [16]:
# Fit on the complete dataset (no train/test split is made in this notebook).
# With mlflow.sklearn.autolog() enabled above, this fit is recorded in the
# active MLflow run.
clf_svm.fit(X, y)

In [17]:
# Mean accuracy on the same X, y the model was fitted on. This is a training
# score, not an estimate of performance on unseen data.
clf_svm.score(X, y)

0.9733333333333334

In [18]:
# Close the SVM run so the next model is logged to its own run.
mlflow.end_run()

## Logistic regression

In [24]:
# Open a new MLflow run for the logistic-regression model and attach, in a
# single call, the current git commit hash and the method name.
# NOTE(review): the run stays active until the mlflow.end_run() cell of this
# section; if a cell in between fails, end the run manually before re-running.
mlflow.start_run()
mlflow.set_tags({
    'mlflow.source.git.commit': sha_commit,
    'method': 'Logistic regression',
})

In [25]:
# Feature standardization step, used as the first stage of the pipeline below.
scaler = StandardScaler()

In [26]:
# Logistic-regression classifier with all hyperparameters at their defaults.
logistic = LogisticRegression()

In [27]:
# Chain scaling and classification into one estimator so that the scaler is
# fitted together with the model whenever clf_logit.fit() is called.
clf_logit = Pipeline(steps=[("scaler", scaler), ("logistic", logistic)])

In [28]:
# Fit the whole pipeline on the complete dataset (no train/test split is made
# in this notebook). With mlflow.sklearn.autolog() enabled above, this fit is
# recorded in the active MLflow run.
clf_logit.fit(X, y)

In [29]:
# Mean accuracy on the same X, y the pipeline was fitted on. This is a
# training score, not an estimate of performance on unseen data.
clf_logit.score(X, y)

0.9733333333333334

In [30]:
# Close the logistic-regression run so the next model is logged to its own run.
mlflow.end_run()

## Random forest

In [19]:
# Open a new MLflow run for the random-forest model and attach, in a single
# call, the current git commit hash and the method name.
# NOTE(review): the run stays active until the mlflow.end_run() cell of this
# section; if a cell in between fails, end the run manually before re-running.
mlflow.start_run()
mlflow.set_tags({
    'mlflow.source.git.commit': sha_commit,
    'method': 'Random forest',
})

In [20]:
# Shallow random forest (trees limited to depth 2). The forest is built with
# randomness, so the seed is fixed to make the fitted model, the logged MLflow
# run and the displayed score reproducible across kernel restarts.
clf_rf = RandomForestClassifier(max_depth=2, random_state=42)

In [21]:
# Fit on the complete dataset (no train/test split is made in this notebook).
# With mlflow.sklearn.autolog() enabled above, this fit is recorded in the
# active MLflow run.
clf_rf.fit(X, y)

In [22]:
# Mean accuracy on the same X, y the forest was fitted on. This is a training
# score, not an estimate of performance on unseen data.
clf_rf.score(X, y)

0.96

In [23]:
# Close the random-forest run; no MLflow run is left active after this cell.
mlflow.end_run()