In [3]:
!conda info --envs

# conda environments:
#
                         C:\Anaconda
base                     C:\Users\vvvis\Anaconda3
mlopszoomcamp         *  C:\Users\vvvis\Anaconda3\envs\mlopszoomcamp
tensorflow               C:\Users\vvvis\Anaconda3\envs\tensorflow



In [4]:
# the name of the active env is shown with *

In [42]:
import  mlflow
import sklearn
import matplotlib.pyplot as plt

In [43]:
print(mlflow.__version__)
print(sklearn.__version__)

1.26.1
1.0.2


## preprocess the data

In [10]:
!python preprocess.py --raw_data_path taxi_data --dest_path ./output

In [11]:
#there are 4 files inside the output folder after the training

## Train

In [36]:
import mlflow


mlflow.set_tracking_uri("sqlite:///mlflow.db")

In [41]:
mlflow.set_experiment("random_forest_regressor_1")

2022/06/01 23:03:32 INFO mlflow.tracking.fluent: Experiment with name 'random_forest_regressor_1' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/5', experiment_id='5', lifecycle_stage='active', name='random_forest_regressor_1', tags={}>

In [None]:
import argparse
import os
import pickle
import mlflow
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error



def load_pickle(filename: str):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)


def run(data_path):
    mlflow.sklearn.autolog()

    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_valid, y_valid = load_pickle(os.path.join(data_path, "valid.pkl"))
    
    with mlflow.start_run():

        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_valid)

        rmse = mean_squared_error(y_valid, y_pred, squared=False)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_path",
        default="./output",
        help="the location where the processed NYC taxi trip data was saved."
    )
    args = parser.parse_args()

    run(args.data_path)

In [45]:
!python train.py --data_path ./output

2022/06/01 23:12:47 INFO mlflow.tracking.fluent: Experiment with name 'random_forest_regressor_2' does not exist. Creating a new experiment.


## tracking server locally + local artifacts and sqlite db as backend

In [48]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [50]:
!python hypo.py


  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]
  2%|2         | 1/50 [00:28<22:58, 28.14s/trial, best loss: 6.658956269343007]
  4%|4         | 2/50 [00:28<09:38, 12.06s/trial, best loss: 6.658956269343007]
  6%|6         | 3/50 [00:30<05:48,  7.42s/trial, best loss: 6.658956269343007]
  8%|8         | 4/50 [00:46<08:16, 10.79s/trial, best loss: 6.651438559376775]
 10%|#         | 5/50 [00:54<07:21,  9.81s/trial, best loss: 6.651438559376775]
 12%|#2        | 6/50 [01:23<11:51, 16.16s/trial, best loss: 6.651438559376775]
 14%|#4        | 7/50 [01:49<13:54, 19.40s/trial, best loss: 6.651438559376775]
 16%|#6        | 8/50 [01:52<09:50, 14.06s/trial, best loss: 6.651438559376775]
 18%|#8        | 9/50 [02:07<09:50, 14.41s/trial, best loss: 6.651438559376775]
 20%|##        | 10/50 [02:20<09:16, 13.92s/trial, best loss: 6.651438559376775]
 22%|##2       | 11/50 [02:31<08:28, 13.04s/trial, best loss: 6.642137287429206]
 24%|##4       | 12/50 [02:39<07:17, 11.51s/trial, best loss: 

2022/06/01 23:31:03 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.
2022/06/01 23:41:30 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt_1' does not exist. Creating a new experiment.



  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]
  2%|2         | 1/50 [00:31<25:34, 31.31s/trial, best loss: 6.658956269343007]
  4%|4         | 2/50 [00:32<10:55, 13.66s/trial, best loss: 6.658956269343007]
  6%|6         | 3/50 [00:34<06:26,  8.22s/trial, best loss: 6.658956269343007]
  8%|8         | 4/50 [00:52<09:10, 11.96s/trial, best loss: 6.651438559376775]
 10%|#         | 5/50 [01:00<07:59, 10.65s/trial, best loss: 6.651438559376775]
 12%|#2        | 6/50 [01:43<16:00, 21.83s/trial, best loss: 6.651438559376775]
 14%|#4        | 7/50 [02:19<18:56, 26.44s/trial, best loss: 6.651438559376775]
 16%|#6        | 8/50 [02:24<13:35, 19.43s/trial, best loss: 6.651438559376775]
 18%|#8        | 9/50 [02:43<13:10, 19.27s/trial, best loss: 6.651438559376775]
 20%|##        | 10/50 [02:57<11:52, 17.82s/trial, best loss: 6.651438559376775]
 22%|##2       | 11/50 [03:10<10:40, 16.42s/trial, best loss: 6.642137287429206]
 24%|##4       | 12/50 [03:20<09:07, 14.40s/trial, best loss: 

## Promote the best model to the model registry

In [54]:
from mlflow.tracking import MlflowClient
client = MlflowClient()
experiment = client.get_experiment_by_name('random-forest-hyperopt_1')
best_run = client.search_runs( experiment_ids=experiment.experiment_id  )[0]

In [60]:
best_run.info.run_id

'3f26d8f51a534fa9af1b53468f5c3fa1'

In [66]:
!python register_model.py

2022/06/02 00:31:21 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models_1' does not exist. Creating a new experiment.
Successfully registered model 'random-forest-best'.
2022/06/02 00:34:36 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: random-forest-best, version 1
Created version '1' of model 'random-forest-best'.


In [64]:
experiment = client.get_experiment_by_name('random-forest-best-models')
best_run1 = client.search_runs( experiment_ids=experiment.experiment_id ,order_by=["metrics.test_rmse DESC"] )[0]


In [65]:
best_run1

<Run: data=<RunData: metrics={'test_rmse': 6.549816636724069,
 'training_mae': 3.7785117890805413,
 'training_mse': 30.60450474365528,
 'training_r2_score': 0.7711041421908643,
 'training_rmse': 5.532133832767903,
 'training_score': 0.7711041421908643,
 'valid_rmse': 6.629728007710133}, params={'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'criterion': 'squared_error',
 'max_depth': '20',
 'max_features': 'auto',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '2',
 'min_samples_split': '3',
 'min_weight_fraction_leaf': '0.0',
 'n_estimators': '23',
 'n_jobs': 'None',
 'oob_score': 'False',
 'random_state': '42',
 'verbose': '0',
 'warm_start': 'False'}, tags={'estimator_class': 'sklearn.ensemble._forest.RandomForestRegressor',
 'estimator_name': 'RandomForestRegressor',
 'mlflow.log-model.history': '[{"run_id": "e82ea2e1fd5b44548a88c27fe1f663d0", '
                             '"artifact_path": "model", "utc_time_created": '
      