In [None]:
# Databricks notebook source
# MAGIC %md This notebook is a slightly modified version of the `MLflow Training Tutorial` from [MLflow examples](https://github.com/mlflow/mlflow/tree/master/examples/sklearn_elasticnet_wine).
# MAGIC
# MAGIC It predicts the quality of wine using [sklearn.linear_model.ElasticNet](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html).
# MAGIC This is the base code; it will be modified further during the `Databricks: Reproducible experiments with MLflow and Delta Lake` tutorial.
# MAGIC
# MAGIC Attribution
# MAGIC * The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality
# MAGIC * P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
# MAGIC * Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

In [11]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor

import logging
import mlflow
import mlflow.sklearn

In [12]:
# Point MLflow at the tracking server. The default is the local server used
# throughout this notebook; set the MLFLOW_TRACKING_URI environment variable
# to log runs somewhere else without editing the code.
mlflow.set_tracking_uri(uri=os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000"))

In [13]:
# Wine Quality Sample
def train(model_type="elasticnet", alpha=0.5, l1_ratio=0.5, n_estimators=100, max_depth=None):
    """
    Train a regression model on the red-wine quality data and log the run with MLflow.

    The CSV is downloaded, split into training and test sets, the selected
    model is fitted on the training set, and its hyperparameters, its
    RMSE / MAE / R2 on the test set, and the fitted model itself are logged
    inside a single MLflow run. The metrics are also printed.

    Parameters:
    - model_type: str, "elasticnet" or "random_forest"
    - alpha: float, ElasticNet alpha parameter (only used for "elasticnet")
    - l1_ratio: float, ElasticNet l1_ratio parameter (only used for "elasticnet")
    - n_estimators: int, number of estimators for Random Forest (only used for "random_forest")
    - max_depth: int or None, max depth of Random Forest trees (only used for "random_forest")

    Raises:
    - ValueError: if model_type is not a supported value. Raised before any
      data is downloaded and before an MLflow run is started.
    - Exception: any error raised while reading the CSV is logged and re-raised.
    """
    # Validate first so a typo never costs a download or leaves an empty,
    # failed run behind in the tracking server.
    if model_type not in ("elasticnet", "random_forest"):
        raise ValueError("Invalid model_type. Choose 'elasticnet' or 'random_forest'.")

    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)

    def eval_metrics(actual, pred):
        """Return (rmse, mae, r2) for the given true and predicted values."""
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    # Read the wine-quality csv file from the URL
    csv_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    try:
        data = pd.read_csv(csv_url, sep=';')
    except Exception as e:
        logger.exception("Unable to download training & test CSV. Error: %s", e)
        # Without the data nothing below can work; surface the real error
        # instead of failing later on an unbound variable.
        raise

    # Split the data into training and test sets. (0.75, 0.25) split.
    # The split is seeded directly rather than by reseeding NumPy's global
    # generator, so calling this function has no effect on other random code.
    train_df, test_df = train_test_split(data, random_state=40)
    train_x = train_df.drop(columns=["quality"])
    test_x = test_df.drop(columns=["quality"])
    # 1-D targets: estimators expect a vector, not a single-column frame.
    train_y = train_df["quality"]
    test_y = test_df["quality"]

    # Silence library warnings only while fitting and logging; the caller's
    # warning filters are restored when this block exits.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        with mlflow.start_run():
            if model_type == "elasticnet":
                model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
                mlflow.log_param("alpha", alpha)
                mlflow.log_param("l1_ratio", l1_ratio)
            else:
                # model_type was validated above, so this is "random_forest".
                model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
                mlflow.log_param("n_estimators", n_estimators)
                mlflow.log_param("max_depth", max_depth)

            # Train the model
            model.fit(train_x, train_y)

            # Predict and evaluate metrics
            predicted_qualities = model.predict(test_x)
            (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

            # Log metrics
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("r2", r2)

            # Log model
            mlflow.sklearn.log_model(model, "model")

            print(f"{model_type.capitalize()} model:")
            print(f"  RMSE: {rmse}")
            print(f"  MAE: {mae}")
            print(f"  R2: {r2}")

In [14]:
# Train ElasticNet: one MLflow run with alpha=0.5 and l1_ratio=0.5.
# train() prints the test-set RMSE / MAE / R2 for the run.
train(model_type="elasticnet", alpha=0.5, l1_ratio=0.5)



Elasticnet model:
  RMSE: 0.7931640229276851
  MAE: 0.6271946374319586
  R2: 0.10862644997792614
🏃 View run defiant-midge-76 at: http://127.0.0.1:5000/#/experiments/0/runs/73ed5320cfb540f4a86cb9887fdb34b6
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


In [15]:
# Train Random Forest: one MLflow run with 100 trees capped at depth 10.
# train() prints the test-set RMSE / MAE / R2 for the run.
train(model_type="random_forest", n_estimators=100, max_depth=10)



Random_forest model:
  RMSE: 0.5909938464063711
  MAE: 0.43808613147760644
  R2: 0.5051202911831713
🏃 View run bemused-boar-858 at: http://127.0.0.1:5000/#/experiments/0/runs/8e7050df61ee4fbca127501c1fc40b7b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
