In [6]:
import numpy as np
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from faker import generate_apple_sales_data_with_promo_adjustment

In [2]:
# Address of the MLflow tracking server: loopback interface, port 8080.
MLFLOW_TRACKING_URI = "http://127.0.0.1:8080"

# Route every subsequent MLflow tracking call in this notebook to that server.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [9]:
# Name given to this iteration of training.
# When no name is supplied, a unique one is auto-generated for the run.
run_name = "apples_rf_test"

# Artifact path that the trained model will be saved to.
artifact_path = "rf_apples"

# Make "Apple_Models" the active experiment; the call hands back the
# Experiment metadata, which is kept for later reference.
apple_experiment = mlflow.set_experiment("Apple_Models")

In [10]:
# Build the apple-sales dataset with the helper imported from `faker`
# (presumably a project-local module, not the PyPI Faker package — confirm),
# called with its default arguments, then preview the first rows.
# NOTE(review): the captured output shows a pandas warning about a chained
# `df["previous_days_demand"].fillna(..., inplace=True)` call; that pattern
# stops working in pandas 3.0, so it looks worth fixing at its source.
data = generate_apple_sales_data_with_promo_adjustment()
data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["previous_days_demand"].fillna(
  df["previous_days_demand"].fillna(


Unnamed: 0,date,average_temperature,rainfall,weekend,holiday,price_per_kg,promo,demand,previous_days_demand
0,2010-07-27 07:41:22.258538,30.584727,1.199291,0,0,1.726258,0,851.276659,851.276659
1,2010-07-28 07:41:22.258536,15.465069,1.037626,0,0,0.576471,0,906.836626,851.276659
2,2010-07-29 07:41:22.258534,10.786525,5.656089,0,0,2.513328,0,857.895424,906.836626
3,2010-07-30 07:41:22.258532,23.648154,12.030937,0,0,1.839225,0,848.961007,857.895424
4,2010-07-31 07:41:22.258530,13.861391,4.303812,1,0,1.531772,0,1283.128282,848.961007


In [12]:
# Build the feature matrix and the target vector. `date` is dropped as an
# irrelevant field and `demand` is the quantity being predicted, so neither
# may remain among the features.
X = data.drop(columns=["date", "demand"])
y = data["demand"]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable across re-runs.
# NOTE(review): the preview suggests one row per consecutive day and
# `previous_days_demand` looks like a lag of `demand`. A shuffled split then
# mixes past and future rows between train and validation — consider a
# chronological split (e.g. shuffle=False). Left unchanged here so metrics
# stay comparable with earlier runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters for the forest. The same dict is passed to the estimator
# and logged to MLflow, so what is recorded is exactly what was trained.
params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "bootstrap": True,
    "oob_score": False,
    "random_state": 888,
}

# Train the RandomForestRegressor on the training split
rf = RandomForestRegressor(**params)
rf.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf.predict(X_val)

# Calculate error metrics on the held-out rows
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

# Assemble the metrics we're going to write into a collection
metrics = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2}

# Initiate the MLflow run context
with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params(params)

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use and register it.
    # Only the first few validation rows are passed as the input example:
    # it is a sample of valid model input stored with the model, so the
    # whole validation split would only bloat the logged artifact.
    mlflow.sklearn.log_model(
        sk_model=rf,
        input_example=X_val.head(),
        artifact_path=artifact_path,
        registered_model_name="api_apples_rf_test",
    )

Successfully registered model 'api_apples_rf_test'.
2024/04/03 07:54:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: api_apples_rf_test, version 1
Created version '1' of model 'api_apples_rf_test'.
