In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# MLflow Regression Pipeline Notebook

This notebook runs the MLflow Regression Pipeline on Databricks and inspects its results. For more information about the MLflow Regression Pipeline, including usage examples, see the [Regression Pipeline overview documentation](https://mlflow.org/docs/latest/pipelines.html#regression-pipeline) and the [Regression Pipeline API documentation](https://mlflow.org/docs/latest/python_api/mlflow.pipelines.html#module-mlflow.pipelines.regression.v1.pipeline).

In [10]:
# Standard library (only referenced by the disabled artifact-copy cell).
import os
import shutil

# Third-party.
import pandas as pd
from mlflow.pipelines import Pipeline

# Instantiate the pipeline with the "local" profile; every later cell
# drives the pipeline through this `p` handle.
p = Pipeline(profile="local")

2022/09/14 15:35:27 INFO mlflow.pipelines.pipeline: Creating MLflow Pipeline 'mlp-regression-template' with profile: 'local'


In [3]:
# Called with no step name. Presumably this clears the cached outputs of all
# pipeline steps so the runs below start fresh — TODO confirm against the
# installed MLflow Pipelines version.
p.clean()

In [4]:
# Called with no step name. Presumably this renders an overview of the whole
# pipeline rather than a single step's results — TODO confirm; no output was
# saved for this cell.
p.inspect()

In [19]:
# Run the ingest step. Per the saved log it resolves the input CSV, converts it
# to parquet, and profiles the ingested dataset.
p.run("ingest")

2022/09/14 13:28:49 INFO mlflow.pipelines.steps.ingest.datasets: Resolving input data from '['D:\\production\\mlp-regression-template\\data\\housing.csv']'
2022/09/14 13:28:49 INFO mlflow.pipelines.steps.ingest.datasets: Resolved input data to 'C:\PROGRA~1\KMSpico\temp\tmp8qs2qlvy\housing.csv'
2022/09/14 13:28:49 INFO mlflow.pipelines.steps.ingest.datasets: Converting dataset to parquet format, if necessary
Loading dataset CSV using `pandas.read_csv()` with default arguments and assumed index column 0 which may not produce the desired schema. If the schema is not correct, you can adjust it by modifying the `load_file_as_dataframe()` function in `steps/ingest.py`
2022/09/14 13:28:49 INFO mlflow.pipelines.steps.ingest: Successfully stored data in parquet format at 'C:\Users\PC\.mlflow\pipelines\409480574b814ed51ac904023c0beeedf9d46eb7c5be24d06d0c142b92ef4de4\steps\ingest\outputs\dataset.parquet'
2022/09/14 13:28:49 INFO mlflow.pipelines.steps.ingest: Profiling ingested dataset
2022/09/14

name,type
longitude,number
latitude,number
housing_median_age,integer
total_rooms,integer
total_bedrooms,number
population,integer
households,integer
median_income,number
median_house_value,integer
ocean_proximity,string


In [21]:
# Run the split step. The saved log reports hash buckets being created over the
# 20640-row input dataset.
# NOTE(review): the saved output also contains a pandas SettingWithCopyWarning
# raised by `df[column].fillna(median, inplace=True)`. That statement is not in
# this notebook (presumably it lives in the project's step code) and should be
# rewritten as an explicit assignment.
p.run("split")

2022/09/14 13:29:41 INFO mlflow.pipelines.steps.split: Creating hash buckets on input dataset containing 20640 rows consumes 0.11004066467285156 seconds.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].fillna(median, inplace=True)


In [22]:
# Run the transform step. The saved output lists two schemas: the original
# housing columns and ten float64 columns named f_00..f_09 (presumably the
# transformed feature matrix — confirm against the project's transform code).
p.run("transform")

Name,Type
longitude,float64
latitude,float64
housing_median_age,int64
total_rooms,int64
total_bedrooms,float64
population,int64
households,int64
median_income,float64
median_house_value,int64
ocean_proximity,object

Name,Type
f_00,float64
f_01,float64
f_02,float64
f_03,float64
f_04,float64
f_05,float64
f_06,float64
f_07,float64
f_08,float64
f_09,float64


In [32]:
# Run the train step. The saved output shows sklearn autologging being enabled,
# training/validation metrics, the worst-predicted rows, and a leaderboard
# against earlier runs.
# NOTE(review): the leaderboard's "Best" / "2nd Best" runs are dated 2022-08-29
# and have an RMSE of ~3.26 versus ~68569 for the latest run. They look like
# runs on a different dataset, so the ranking is not a meaningful comparison.
p.run("train")

2022/09/14 15:13:29 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  inputs = _infer_schema(model_input)
2022/09/14 15:13:41 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
  sample_weight=1 / eval_df["prediction"].values,
  avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl
2022/09/14 15:13:42 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2022/09/14 15:13:42 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


Metric,training,validation
root_mean_squared_error,68467.5,68569.1
example_count,16457.0,2106.0
max_error,770672.0,487633.0
mean_absolute_error,49666.1,50080.8
mean_absolute_percentage_error,0.284799,0.29492
mean_on_label,207328.0,203958.0
mean_squared_error,4687800000.0,4701710000.0
r2_score,0.651003,0.626006
score,0.651003,0.626006
sum_on_label,3411990000.0,429535000.0

Name,Type
longitude,double
latitude,double
housing_median_age,long
total_rooms,long
total_bedrooms,double
population,long
households,long
median_income,double
ocean_proximity,string

Name,Type
-,"Tensor('float64', (-1,))"

absolute_error,prediction,median_house_value,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
770672.0,-636272.0,134400,-117.42,33.35,14,25135,4819.0,35682,4769,2.5729,<1H OCEAN
464337.0,35664.0,500001,-118.08,34.15,28,238,58.0,142,31,0.4999,INLAND
459409.0,40592.0,500001,-120.1,38.91,33,1561,282.0,30,11,1.875,INLAND
409569.0,90432.0,500001,-117.22,33.87,16,56,7.0,39,14,2.625,INLAND
389297.0,110704.0,500001,-117.7,33.72,6,211,51.0,125,44,1.9659,<1H OCEAN
381313.0,118688.0,500001,-117.08,34.08,34,45,11.0,39,14,3.0625,INLAND
376444.0,488944.0,112500,-118.22,34.06,52,48,6.0,41,10,10.2264,<1H OCEAN
364993.0,135008.0,500001,-118.15,34.15,52,275,123.0,273,111,1.1667,<1H OCEAN
364384.0,85616.0,450000,-120.92,37.63,39,45,8.0,22,9,1.7679,INLAND
359441.0,140560.0,500001,-120.67,35.3,19,1540,715.0,1799,635,0.7025,NEAR OCEAN

Unnamed: 0,Latest,Best,2nd Best
Model Rank,4,1,1
root_mean_squared_error,68569.1,3.25884,3.25884
weighted_mean_squared_error,4.99062e+09,8.18055,8.18055
max_error,487633,53.1241,53.1241
mean_absolute_error,50080.8,1.63894,1.63894
mean_absolute_percentage_error,0.29492,0.146808,0.146808
mean_squared_error,4.70171e+09,10.62,10.62
Run Time,2022-09-14 15:13:29,2022-08-29 14:16:20,2022-08-29 14:13:04
Run ID,ddbf4b7be0c7403ab6ad147930f00895,9460b19c880447f396a289dc44d2cbcd,dd0d3e203cae462e835116a62123b246


In [31]:
# Disabled helper, kept for reference. When enabled it:
#   1. lists the sub-directories of `src` (the pipeline's mlartifacts folder),
#   2. picks the one with the most recent modification time,
#   3. copies that directory into `dest` under the same name.
# NOTE(review): both paths are machine-specific absolute paths; make them
# configurable before re-enabling. `os` and `shutil` are imported in the setup
# cell solely for this code.
# dest = 'D:/production/mlp-regression-template/metadata/mlflow/mlartifacts'


# src ='C:/Users/PC/.mlflow/pipelines/409480574b814ed51ac904023c0beeedf9d46eb7c5be24d06d0c142b92ef4de4/metadata/mlflow/mlartifacts/'
# all_subdirs = [os.path.join(src, d) for d in os.listdir(src) if os.path.isdir(os.path.join(src, d))]
# latest_subdir = max(all_subdirs, key=os.path.getmtime)

# shutil.copytree(latest_subdir, os.path.join(dest, os.path.basename(latest_subdir)))

'D:/production/mlp-regression-template/metadata/mlflow/mlartifacts\\ae2e36397af14fe88c799e23ef55293b'

In [28]:
# Run the evaluate step. The saved output reports validation/test metrics and a
# threshold table in which every metric is marked as not validated.
# NOTE(review): the saved output is timestamped 15:00, earlier than the saved
# train output (15:13), and its validation RMSE (66283.1) differs from the
# train cell's (68569.1). It therefore describes a different model; re-run the
# notebook top to bottom to refresh it.
p.run("evaluate")

2022/09/14 15:00:22 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2022/09/14 15:00:22 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


Metric,validation,test
root_mean_squared_error,66283.1,72853.329811
example_count,2106.0,2077.0
max_error,337501.0,457501.0
mean_absolute_error,42603.4,45623.871449
mean_absolute_percentage_error,0.228543,0.263068
mean_on_label,203958.0,206055.843043
mean_squared_error,4393450000.0,5307607664.555127
r2_score,0.650526,0.595893
score,0.650526,0.595893
sum_on_label,429535000.0,427977986.0

metric,greater_is_better,value,threshold,validated
root_mean_squared_error,0,72853.3,10,❌
mean_absolute_error,0,45623.9,50,❌
weighted_mean_squared_error,0,3588930000.0,20,❌


In [37]:
# Run the register step. The saved log shows version 1 of the model
# 'house_price_regressor' being created in the model registry.
p.run("register")

Successfully registered model 'house_price_regressor'.
2022/09/14 15:16:47 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: house_price_regressor, version 1
Created version '1' of model 'house_price_regressor'.


In [154]:
# Display the train step's results (metrics, schemas, worst predictions,
# leaderboard).
# NOTE(review): the saved output is stale. It shows taxi-trip columns
# (`fare_amount`, `tpep_pickup_datetime`, ...) and runs dated 2022-08-29, not
# the housing model trained above. Re-run this cell to refresh it.
p.inspect("train")

Metric,training,validation
root_mean_squared_error,3.43472,3.25884
example_count,8019.0,955.0
max_error,212.188,53.1241
mean_absolute_error,1.5208,1.63894
mean_absolute_percentage_error,0.148791,0.146808
mean_on_label,12.3563,13.0743
mean_squared_error,11.7973,10.62
r2_score,0.890054,0.908446
score,0.890054,0.908446
sum_on_label,99085.0,12486.0

Name,Type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,double
pickup_zip,integer
dropoff_zip,integer
pickup_dow,long
pickup_hour,long
trip_duration,double

Name,Type
-,"Tensor('float64', (-1,))"

absolute_error,prediction,fare_amount,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_zip,dropoff_zip,pickup_dow,pickup_hour,trip_duration
212.188444,62.811556,275.0,2016-02-12 20:55:19,2016-02-12 21:52:38,20.85,10013,7008,4,20,57.316667
51.38178,3.61822,55.0,2016-02-28 04:50:41,2016-02-28 04:52:32,0.18,10115,10027,6,4,1.85
39.557483,45.442517,85.0,2016-02-11 17:52:13,2016-02-11 18:38:17,14.46,10282,7114,3,17,46.066667
38.458072,13.541928,52.0,2016-01-26 09:04:58,2016-01-26 09:43:15,3.0,11109,10199,1,9,38.283333
36.095139,15.904861,52.0,2016-02-26 16:54:41,2016-02-26 17:06:30,4.02,11371,11367,4,16,11.816667
31.547252,13.452748,45.0,2016-02-06 23:18:10,2016-02-06 23:35:18,3.51,10013,7302,5,23,17.133333
31.402555,20.597445,52.0,2016-01-16 00:12:20,2016-01-16 00:27:06,6.1,11378,10012,5,0,14.766667
29.897227,58.102773,88.0,2016-02-11 12:47:12,2016-02-11 13:16:59,19.02,10119,10710,3,12,29.783333
28.73453,31.23453,2.5,2016-01-16 17:50:50,2016-01-16 17:51:24,9.6,10007,10007,5,17,0.566667
22.171355,24.671355,2.5,2016-01-04 10:20:18,2016-01-04 11:20:43,7.2,11370,11205,0,10,60.416667

Unnamed: 0,Latest,Best,2nd Best
Model Rank,2,1,2
root_mean_squared_error,3.25884,3.25884,3.25884
weighted_mean_squared_error,8.18055,8.18055,8.18055
max_error,53.1241,53.1241,53.1241
mean_absolute_error,1.63894,1.63894,1.63894
mean_absolute_percentage_error,0.146808,0.146808,0.146808
mean_squared_error,10.62,10.62,10.62
Run Time,2022-08-29 14:31:03,2022-08-29 14:18:29,2022-08-29 14:31:03
Run ID,02c6de8cb0cd44258e21f2224e1ec861,d54b44d5a741465d8e7ed7720efd627c,02c6de8cb0cd44258e21f2224e1ec861


In [8]:
# Fetch the "model" artifact from the pipeline. The saved output shows it is an
# MLflow pyfunc model with the sklearn flavor, loaded from `train/model`.
trained_model = p.get_artifact("model")
# Printing shows the artifact path, flavor and originating run id.
print(trained_model)

mlflow.pyfunc.loaded_model:
  artifact_path: train/model
  flavor: mlflow.sklearn
  run_id: ddbf4b7be0c7403ab6ad147930f00895



In [11]:
# Load the raw housing CSV with pandas defaults. The path is relative, so it
# depends on the kernel's current working directory.
housing_data = pd.read_csv("data/housing.csv")

In [79]:
# Select the first row; the list index `[[0]]` keeps the result a one-row
# DataFrame rather than a Series.
# NOTE(review): the raw frame presumably still carries the `median_house_value`
# label column, which is absent from the model input schema shown by the train
# step. Confirm that predict ignores it, or drop it explicitly.
sample = housing_data.iloc[[0]]

In [80]:
# Score the one-row sample with the loaded model.
predictions = trained_model.predict(sample)

In [81]:
# Bare expression so the notebook renders the prediction (an array holding one
# value in the saved output).
predictions

array([410736.])