# Dataset tracking in MLFlow

### Imports

In [7]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost

import mlflow
from mlflow.data.pandas_dataset import PandasDataset


In [8]:
# connect to MLFlow instance
mlflow.set_tracking_uri('http://localhost:5000')

### Loading and saving the dataset

In [9]:
# Loading the dataset
dataset_source_url = "https://raw.githubusercontent.com/mlflow/mlflow/master/tests/datasets/winequality-white.csv"
raw_data = pd.read_csv(dataset_source_url, delimiter=";")

### Making the data ready for training

In [10]:

# Extract the features and target data separately
y = raw_data["quality"]
X = raw_data.drop("quality", axis=1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=17
)

# Create a label encoder object
le = LabelEncoder()

# Fit and transform the target variable
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)


### Training the XGBoost model

In [11]:

# Fit an XGBoost binary classifier on the training data split
model = xgboost.XGBClassifier().fit(X_train, y_train_encoded)

# Build the Evaluation Dataset from the test set
y_test_pred = model.predict(X=X_test)

eval_data = X_test
eval_data["label"] = y_test

# Assign the decoded predictions to the Evaluation Dataset
eval_data["predictions"] = le.inverse_transform(y_test_pred)

### Tracking the model and dataset in an experiment

In [12]:

# Create the PandasDataset for use in mlflow evaluate
pd_dataset = mlflow.data.from_pandas(
    eval_data, predictions="predictions", targets="label"
)

mlflow.set_experiment("White Wine Quality")
# Log the Dataset, model, and execute an evaluation run using the configured Dataset
with mlflow.start_run() as run:
    mlflow.log_input(pd_dataset, context="training")

    mlflow.xgboost.log_model(
        artifact_path="white-wine-xgb", xgb_model=model, input_example=X_test
    )

    result = mlflow.evaluate(data=pd_dataset, predictions=None, model_type="classifier")

2025/04/14 14:27:59 INFO mlflow.tracking.fluent: Experiment with name 'White Wine Quality' does not exist. Creating a new experiment.
  self.get_booster().save_model(fname)
ValueError("feature_names mismatch: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'] ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'label', 'predictions']\ntraining data did not have the following fields: predictions, label")Traceback (most recent call last):
  File "/home/farbod/project/mlflow-docker-compose/.venv/lib/python3.12/site-packages/mlflow/utils/_capture_modules.py", line 166, in load_model_and_predict
    model.predict(input_example, params=params)
  File "/home/farbod/project/mlflow-docker-compose/.venv/lib/python3.12/site-packages/mlflow/xgboost/__init__.py"

🏃 View run silent-gnat-624 at: http://localhost:5000/#/experiments/2/runs/af0f85fa7f6c40b1914519badb4bb7cb
🧪 View experiment at: http://localhost:5000/#/experiments/2


<Figure size 1050x700 with 0 Axes>