In [1]:
import mlflow
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import plotly.express as px

In [2]:
# Experiment metadata used when the experiment is created and activated:
# the name identifies it, the description is stored as its note.
experiment_description = "an experiment to showcase mlflow in action"
experiment_name = "ryan.osullivan mlflow test"

In [3]:
# Set the logging destination to the remote MLflow instance
#mlflow.set_tracking_uri('https://xxx')

# Create the experiment only if it does not exist yet, so this cell can be re-run safely.
# An explicit lookup is used instead of catching an exception: the exception type raised for
# a duplicate name differs between tracking backends, and a blanket handler would also
# mislabel unrelated failures (e.g. connection or auth errors) as "already exists".
if mlflow.get_experiment_by_name(experiment_name) is None:
    # The description is stored under the 'mlflow.note.content' tag.
    mlflow.create_experiment(name=experiment_name, tags={'mlflow.note.content': experiment_description})
else:
    print('The experiment already exists. Activating it with set_experiment().')

# Make the experiment active for subsequent runs; as the last expression of the cell,
# the returned Experiment object is displayed as the cell output.
mlflow.set_experiment(experiment_name=experiment_name)

<Experiment: artifact_location='file:///c:/Users/ryan.sullivan/OneDrive%20-%20Entain%20Group/Documents/ryan/repo/a_data_scientists_guide_to_software_engineering/guide/04_versioning/02_mlflow/mlruns/624210934195793856', creation_time=1709726654205, experiment_id='624210934195793856', last_update_time=1709726654205, lifecycle_stage='active', name='ryan.osullivan mlflow test', tags={'mlflow.note.content': 'an experiment to showcase mlflow in action'}>

In [4]:
# Train model
# Synthetic regression problem (1000 samples, 5 features), seeded so results are reproducible.
X, y = make_regression(n_samples=1000, n_features=5, noise=1, random_state=42)
X = pd.DataFrame(X, columns=['x1', 'x2', 'x3', 'x4', 'x5'])
y = pd.DataFrame(y, columns=['y'])  # kept as a DataFrame so it can be logged as a dataset later
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Fit on the 1-D target column. Passing the single-column DataFrame makes scikit-learn raise a
# DataConversionWarning ("column-vector y was passed when a 1d array was expected"), which the
# notebook-wide warnings filter would otherwise hide.
rf.fit(X_train, y_train['y'])

# Evaluate on the held-out split.
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Histogram of the test-set predictions; logged to MLflow as a figure later on.
fig = px.histogram(x=y_pred, title='The distribution of y_pred', width=800, height=400)

In [5]:
# Set run details
run_description = 'random forest model'
run_name = 'train/initial model'

# Take a single timestamp so the date and time tags always describe the same instant
# (two separate datetime.now() calls could straddle midnight and disagree).
run_started_at = datetime.now()
run_date = run_started_at.strftime('%Y-%m-%d')
run_time = run_started_at.strftime('%H:%M:%S')

# Local notebook file to attach to the run, and the artifact folders used for each output type.
notebook_file_to_log = 'mlflow_demo.ipynb'
notebook_file_folder_name = 'notebooks'
plot_folder_name = 'plots'
plot_folder_nane = plot_folder_name  # misspelled alias kept: the run-tracking cell reads this name
model_folder_name = 'models'

In [7]:
# Track run results
# Everything logged inside the `with` block is attached to one MLflow run, which is
# ended automatically when the block exits.
with mlflow.start_run(description=run_description,
                      tags={'Date': run_date,
                            'Time': run_time},
                      run_name=run_name):

    # datasets: log every split under its own name, with a context stating how the run used it
    datasets_to_log = [
        (X_train, 'X_train', 'training'),
        (y_train, 'y_train', 'training'),
        (X_test, 'X_test', 'testing'),
        (y_test, 'y_test', 'testing'),
    ]
    for frame, dataset_name, dataset_context in datasets_to_log:
        mlflow.log_input(mlflow.data.from_pandas(frame, name=dataset_name), context=dataset_context)

    # Jupyter notebook: stored as an artifact so the code that produced the run is kept with it
    mlflow.log_artifact(local_path=notebook_file_to_log, artifact_path=notebook_file_folder_name)

    # hyperparameters
    mlflow.log_params(rf.get_params())

    # metrics
    mlflow.log_metrics({'mse': mse})

    # plot
    mlflow.log_figure(fig, artifact_file=f'{plot_folder_nane}/distribution_y_pred.html')

    # model: the signature records the input/output schema of the model. It is inferred from
    # the test features and the predictions already computed for them (y_pred), so the
    # forest is not run a second time.
    signature = mlflow.models.infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(rf, artifact_path=model_folder_name, signature=signature)