# MLOps Zoomcamp 2.2 - Getting started with MLflow

In [1]:
!pip install --upgrade pyarrow scikit-learn mlflow
# Run MLflow with "mlflow ui --backend-store-uri sqlite:///mlflow.db"

[0m

In [3]:
!mkdir models
!mkdir data
!wget -nc https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet --directory-prefix data
!wget -nc https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet --directory-prefix data
!pwd
# Run this command "mlflow ui --backend-store-uri sqlite:///mlflow.db" in the directory of pwd command

Will not apply HSTS. The HSTS database must be a regular and non-world-writable file.
ERROR: could not open HSTS store at '/home/jovyan/.wget-hsts'. HSTS will be disabled.
--2024-09-26 12:40:37--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 52.84.111.30, 52.84.111.148, 52.84.111.169, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|52.84.111.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1333519 (1.3M) [binary/octet-stream]
Saving to: ‘data/green_tripdata_2021-01.parquet’


utime(data/green_tripdata_2021-01.parquet): Operation not permitted
2024-09-26 12:40:37 (23.9 MB/s) - ‘data/green_tripdata_2021-01.parquet’ saved [1333519/1333519]

Will not apply HSTS. The HSTS database must be a regular and non-world-writable file.
ERROR: could not open HSTS store at '/home/jovyan/.wget-hsts'. HSTS will be disabled.
--2024-09-26 

In [4]:
import mlflow
# Store runs in a local SQLite file; the same URI is what `mlflow ui` is started with
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
# Select the experiment that all following runs are recorded under
mlflow.set_experiment(experiment_name="my-very-new-experiment")

2024/09/24 15:10:16 INFO mlflow.tracking.fluent: Experiment with name 'my-very-new-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/jovyan/mlruns/2', creation_time=1727190616880, experiment_id='2', last_update_time=1727190616880, lifecycle_stage='active', name='my-very-new-experiment', tags={}>

In [5]:
import pandas as pd
import seaborn as sns

def load_prep_data(filepath):
    """Load a trip-data parquet file and build the frame used for modelling.

    Parameters
    ----------
    filepath : str or path-like
        Location of the parquet file; passed to ``pd.read_parquet``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_raw, df)``. ``df_raw`` is the data as read from disk. ``df`` is an
        independent copy restricted to trips lasting 1 to 60 minutes (inclusive),
        with a ``duration`` column in minutes, the two location ID columns cast
        to ``str`` and a combined ``PU_DO`` column. Both frames carry
        ``attrs['source_path'] == filepath``.
    """
    # Preserve the raw input data in df_raw; all derived columns go on a separate frame
    df_raw = pd.read_parquet(filepath)
    df_raw.attrs['source_path'] = filepath

    # Trip duration in minutes, computed with the vectorised .dt accessor
    duration = (
        df_raw.lpep_dropoff_datetime - df_raw.lpep_pickup_datetime
    ).dt.total_seconds() / 60

    # Dropping outliers. The explicit copy makes df independent of df_raw, so the
    # column assignments below never write into a slice of another frame.
    keep = (duration >= 1) & (duration <= 60)
    df = df_raw[keep].copy()
    df['duration'] = duration[keep]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    # Tag the result last so it does not rely on attrs surviving the steps above
    df.attrs['source_path'] = filepath

    return df_raw, df

# One raw/prepared pair of frames per monthly file
df_raw_jan, df_jan = load_prep_data(filepath='./data/green_tripdata_2021-01.parquet')
df_raw_feb, df_feb = load_prep_data(filepath='./data/green_tripdata_2021-02.parquet')

In [6]:
# Show which file each prepared frame was loaded from, one path per line
print(df_jan.attrs['source_path'], df_feb.attrs['source_path'], sep='\n')

./data/green_tripdata_2021-01.parquet
./data/green_tripdata_2021-02.parquet


In [7]:
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso

try:
    from sklearn.metrics import root_mean_squared_error
except ImportError as e:
    # root_mean_squared_error was added in scikit-learn 1.4, so older installs fail here.
    # Only ImportError is handled; any other failure propagates unchanged.
    raise ImportError(
        "scikit-learn >= 1.4 is required to import root_mean_squared_error"
    ) from e

# Feature columns: DictVectorizer one-hot encodes the string-valued PU_DO pair
# and passes the numeric trip distance through unchanged
categorical = ['PU_DO']
numerical = ['trip_distance']
target = 'duration'

# January is the training frame, February the validation frame
train_dicts, val_dicts = (
    frame[categorical + numerical].to_dict(orient='records')
    for frame in (df_jan, df_feb)
)

# Fit the vectorizer on the training records only, then reuse it for validation
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

y_train, y_val = (frame[target].values for frame in (df_jan, df_feb))

# Track a single Lasso training run: tag, parameters and metric go to the active MLflow run
with mlflow.start_run():
    mlflow.set_tag("developer", "peter")
    mlflow.log_param("train-data-path", df_jan.attrs['source_path'])
    mlflow.log_param("valid-data-path", df_feb.attrs['source_path'])

    # Training a model
    alpha = 0.01
    mlflow.log_param("alpha", alpha)

    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    # The RMSE is computed on the validation data (X_val / y_val), so label it as such
    y_val_pred = lr.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_val_pred)
    print(f"Performance on validation set: {rmse}")
    mlflow.log_metric("rmse", rmse)

# Save model and dictionary vectorizer together so both can be restored for prediction
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)



Performance on train set: 11.167275941179728


# MLOps Zoomcamp 2.3 - Experiment tracking with MLflow

In [None]:
!pip install xgboost hyperopt

In [9]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
# Wrap the feature matrices and targets in the DMatrix objects passed to xgb.train
train, valid = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

def objective(params):
    """Train one XGBoost booster for a hyperopt trial and record it as an MLflow run.

    Parameters
    ----------
    params : dict
        Parameters passed to ``xgb.train`` and logged to the run.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    eval_sets = [(valid, 'validation')]

    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=eval_sets,
            early_stopping_rounds=50,
        )
        validation_rmse = root_mean_squared_error(y_val, model.predict(valid))
        mlflow.log_metric("rmse", validation_rmse)

    return dict(loss=validation_rmse, status=STATUS_OK)
# Search space sampled by hyperopt. hp.loguniform bounds are given on the log
# scale, e.g. learning_rate is drawn from exp(-3) to exp(0).
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is the deprecated alias of this squared-error objective
    'objective': 'reg:squarederror',
    'seed': 42
}

# NOTE(review): fmin is called without rstate, so the sampled trials differ between runs
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

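With the `mlflow.xgboost.autolog()` function, MLflow creates a run for each training call and records its hyperparameters, metrics, and model automatically, so `objective` needs no explicit logging calls.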

In [20]:
# DMatrix inputs for xgb.train: training data and the validation set used for evaluation
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

def objective(params):
    """Train one XGBoost booster for a hyperopt trial and return its validation RMSE.

    This version makes no MLflow calls of its own.

    Parameters
    ----------
    params : dict
        Parameters passed to ``xgb.train``.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    training_options = dict(
        params=params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50,
    )
    model = xgb.train(**training_options)
    loss = root_mean_squared_error(y_val, model.predict(valid))

    return {'loss': loss, 'status': STATUS_OK}
# Search space sampled by hyperopt. hp.loguniform bounds are given on the log
# scale, e.g. learning_rate is drawn from exp(-3) to exp(0).
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is the deprecated alias of this squared-error objective
    'objective': 'reg:squarederror',
    'seed': 42
}

# Enable automatic MLflow logging for the xgb.train calls made inside objective
mlflow.xgboost.autolog()

# Only a few trials here; each trial trains a full booster
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=3,
    trials=Trials()
)

  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?]

2024/09/24 15:33:53 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '20550ae843b64d7993ef5edd93f30933', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow




[0]	validation-rmse:8.14334                          
[1]	validation-rmse:7.04262                          
[2]	validation-rmse:6.75868                          
[3]	validation-rmse:6.66658                          
[4]	validation-rmse:6.62778                          
[5]	validation-rmse:6.61440                          
[6]	validation-rmse:6.60923                          
[7]	validation-rmse:6.60210                          
[8]	validation-rmse:6.59669                          
[9]	validation-rmse:6.59278                          
[10]	validation-rmse:6.58577                         
[11]	validation-rmse:6.57960                         
[12]	validation-rmse:6.57730                         
[13]	validation-rmse:6.57151                         
[14]	validation-rmse:6.56680                         
[15]	validation-rmse:6.56286                         
[16]	validation-rmse:6.55703                         
[17]	validation-rmse:6.54597                         
[18]	validation-rmse:6.54321






 33%|███▎      | 1/3 [00:40<01:21, 40.89s/trial, best loss: 6.433569411667345]

2024/09/24 15:34:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c79ef26116a4427c8051904147fbd0b4', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow




[0]	validation-rmse:11.05413                                                  
[1]	validation-rmse:10.11890                                                  
[2]	validation-rmse:9.36988                                                   
[3]	validation-rmse:8.77538                                                   
[4]	validation-rmse:8.30784                                                   
[5]	validation-rmse:7.94201                                                   
[6]	validation-rmse:7.65862                                                   
[7]	validation-rmse:7.43925                                                   
[8]	validation-rmse:7.26931                                                   
[9]	validation-rmse:7.13750                                                   
[10]	validation-rmse:7.03487                                                  
[11]	validation-rmse:6.95399                                                  
[12]	validation-rmse:6.89046                        






 67%|██████▋   | 2/3 [03:40<02:02, 122.62s/trial, best loss: 6.433569411667345]

2024/09/24 15:37:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '9e4f14fefb694694ba65cfacd07b68d5', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow




[0]	validation-rmse:9.27070                                                    
[1]	validation-rmse:7.84494                                                    
[2]	validation-rmse:7.20013                                                    
[3]	validation-rmse:6.91034                                                    
[4]	validation-rmse:6.77606                                                    
[5]	validation-rmse:6.70651                                                    
[6]	validation-rmse:6.67052                                                    
[7]	validation-rmse:6.64762                                                    
[8]	validation-rmse:6.63214                                                    
[9]	validation-rmse:6.62290                                                    
[10]	validation-rmse:6.61422                                                   
[11]	validation-rmse:6.60286                                                   
[12]	validation-rmse:6.59959            






100%|██████████| 3/3 [05:10<00:00, 103.66s/trial, best loss: 6.433569411667345]


# MLOps Zoomcamp 2.4 - Model management

In [11]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

# Turn autologging off: this section logs parameters, metric and model explicitly
mlflow.xgboost.autolog(disable=True)

# Fixed hyperparameters used for the final model
best_params = {
    'max_depth': 23,
    'learning_rate': 0.597550486406435,
    'reg_alpha': 0.014687315023552708,
    'reg_lambda': 0.0025774681356800634,
    'min_child_weight': 8.07657987150076,
    # 'reg:linear' is the deprecated alias of this squared-error objective
    'objective': 'reg:squarederror',
    'seed': 42
}

# Train one booster with best_params and record parameters, metric,
# preprocessor and model in a single MLflow run
with mlflow.start_run():
    mlflow.set_tag("model", "xgboost")

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )
    # Score on the validation DMatrix and log the resulting RMSE
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Save dictionary vectorizer to a local file first; log_artifact below is given that path
    with open('models/preprocessor.bin', 'wb') as f_out:
        pickle.dump(dv, f_out)

    # Log the preprocessor under "preprocessor" and the booster under "models_mlflow"
    mlflow.log_artifact("models/preprocessor.bin", artifact_path="preprocessor")
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")




[0]	validation-rmse:8.04264
[1]	validation-rmse:7.00213
[2]	validation-rmse:6.74662
[3]	validation-rmse:6.67269
[4]	validation-rmse:6.64488
[5]	validation-rmse:6.62884
[6]	validation-rmse:6.62081
[7]	validation-rmse:6.61512
[8]	validation-rmse:6.60773
[9]	validation-rmse:6.60363
[10]	validation-rmse:6.60009
[11]	validation-rmse:6.59883
[12]	validation-rmse:6.59442
[13]	validation-rmse:6.59042
[14]	validation-rmse:6.58690
[15]	validation-rmse:6.58368
[16]	validation-rmse:6.58192
[17]	validation-rmse:6.57690
[18]	validation-rmse:6.57579
[19]	validation-rmse:6.57430
[20]	validation-rmse:6.57202
[21]	validation-rmse:6.57109
[22]	validation-rmse:6.56991
[23]	validation-rmse:6.56795
[24]	validation-rmse:6.56547
[25]	validation-rmse:6.56360
[26]	validation-rmse:6.55541
[27]	validation-rmse:6.55411
[28]	validation-rmse:6.55282
[29]	validation-rmse:6.55134
[30]	validation-rmse:6.55021
[31]	validation-rmse:6.54880
[32]	validation-rmse:6.54750
[33]	validation-rmse:6.54650
[34]	validation-rmse:6.5



In [19]:
# MLflow URI of the run artifacts to import.
# NOTE(review): the run ID is hard-coded; replace it with the ID of a run in your own tracking store.
logged_model = 'runs:/dbc921ed36f84c519d9fc6c2830b4680/models_mlflow'

# FIRST way of loading models from MLflow: the native xgboost model, which predicts on a DMatrix
xgboost_model = mlflow.xgboost.load_model(model_uri=logged_model)
y_valid_pred = xgboost_model.predict(valid)
print(f"xgboost model predicted values: {y_valid_pred[:10]}")

# SECOND way of loading models from MLflow: the PyFuncModel wrapper, which predicts on the feature matrix
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)
y_valid_pred = loaded_model.predict(X_val)
print(f"PyFuncModel model predicted values: {y_valid_pred[:10]}")

xgboost model predicted values: [13.171659   5.9881744 21.653578  25.004402   9.48409   17.194054
 11.067449   8.207616   9.174636  15.484611 ]
PyFuncModel model predicted values: [13.171659   5.9881744 21.653578  25.004402   9.48409   17.194054
 11.067449   8.207616   9.174636  15.484611 ]


# MLOps Zoomcamp 2.5 - Model registry

# MLOps Zoomcamp 2.6 - MLflow in practice

The notebooks for this section are in `./02-experiment-tracking/running-mlflow-examples`.

# MLOps Zoomcamp 2.7 - MLflow: benefits, limitations and alternatives

Alternatives to MLflow include:

- Neptune
- Comet
- Weights & Biases
- ...

Comparison table: [https://neptune.ai/blog/best-ml-experiment-tracking-tools](https://neptune.ai/blog/best-ml-experiment-tracking-tools)