# Step 1: Install and Set Up Environment

In [None]:
# Install dependencies
!pip install mlflow scikit-learn pandas seaborn jupyter

# Step 2: Import Necessary Libraries

In [1]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import seaborn as sns
from mlflow.models.signature import infer_signature
import numpy as np


# Step 3: Load Data

In [2]:
# UCI red-wine quality dataset; the file uses ';' as its field separator.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(url, delimiter=";")

# Preview the first rows.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Summarize the frame: row count, per-column non-null counts, dtypes, memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [None]:
# Target is the "quality" column; the features are every remaining column.
y = df["quality"]
X = df.drop(columns=["quality"])

# Step 4: Train-Test Split

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 5: Define Models and Hyperparameters


In [8]:
# Candidate regressors, keyed by the name later used as the MLflow run name.
models = {}
models["LinearRegression"] = LinearRegression()
models["RandomForest"] = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)
models["GradientBoosting"] = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)


# Step 6: Set Experiment

In [28]:
# Make "Wine_Quality_Prediction_exp" the active experiment for the runs below.
experiment = mlflow.set_experiment("Wine_Quality_Prediction_exp")

# Build the experiment summary first, then emit it in one print call.
experiment_report = f"""
Name              : {experiment.name}
ID                : {experiment.experiment_id}
Artifact Location : {experiment.artifact_location}
Lifecycle Stage   : {experiment.lifecycle_stage}
Creation Time     : {experiment.creation_time}
Last Update Time  : {experiment.last_update_time}
Tags              : {experiment.tags}
"""
print(experiment_report)



Name              : Wine_Quality_Prediction_exp
ID                : 674643091897476556
Artifact Location : file:///C:/Users/Sachith/Projects/mlflow/mlruns/674643091897476556
Lifecycle Stage   : active
Creation Time     : 1744362281099
Last Update Time  : 1744554449022
Tags              : {}



# Step 7: Training and Logging with MLflow

In [29]:
# Maps each model name to the ID of the MLflow run that trained it,
# so later cells can address the logged artifacts.
run_ids = {}

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name) as run:
        # Fit on the training split, then score on the held-out split.
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        r2 = r2_score(y_test, preds)

        # Parameters: the model type is always recorded; the estimator's own
        # parameters are recorded for every model except the linear baseline.
        mlflow.log_param("model_type", model_name)
        if model_name != "LinearRegression":
            mlflow.log_params(model.get_params())

        # Metrics for this run.
        mlflow.log_metrics({"rmse": rmse, "r2_score": r2})

        # Store the fitted model together with a one-row input example and an
        # input/output signature inferred from the test features and predictions.
        signature = infer_signature(X_test, preds)
        input_example = X_test.iloc[:1]
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path=model_name,
            signature=signature,
            input_example=input_example,
        )

        print(f"{model_name} - RMSE: {rmse:.4f} | R2: {r2:.4f}")

        # Recorded last, so only runs that finished logging end up in run_ids.
        run_ids[model_name] = run.info.run_id


LinearRegression - RMSE: 0.6245 | R2: 0.4032
RandomForest - RMSE: 0.6062 | R2: 0.4376
GradientBoosting - RMSE: 0.6019 | R2: 0.4456


# Step 8: Load and Use the Model from MLflow

In [15]:
# Load the RandomForest model logged by the training loop.
# The run ID is taken from `run_ids` instead of a pasted literal, so this cell
# always loads the model trained in the current session and keeps working after
# Restart & Run All or on another machine.
model_uri = f"runs:/{run_ids['RandomForest']}/RandomForest"
loaded_model = mlflow.sklearn.load_model(model_uri)

# Use the reloaded model for prediction on the held-out split.
loaded_preds = loaded_model.predict(X_test)

In [29]:
# Optional cleanup (intentionally left disabled): end an MLflow run only if one is still active.
# if mlflow.active_run() is not None:
#     mlflow.end_run()  # Only end the run if it's active
# else:
#     print("No active run to end.")


# Step 9: Register the Model

In [28]:
# Register the model in the registry
mlflow.register_model(
    model_uri=f"runs:/{run_ids['RandomForest']}/RandomForest",
    name="RandomForest"
)

Registered model 'RandomForest' already exists. Creating a new version of this model...
Created version '4' of model 'RandomForest'.


<ModelVersion: aliases=[], creation_timestamp=1744363378776, current_stage='None', description=None, last_updated_timestamp=1744363378776, name='RandomForest', run_id='e9e3e4b73cd7413faead29ffcddca9a6', run_link=None, source='file:///C:/Users/Sachith/Projects/mlflow/mlruns/674643091897476556/e9e3e4b73cd7413faead29ffcddca9a6/artifacts/RandomForest', status='READY', status_message=None, tags={}, user_id=None, version=4>

# Step 10: Tag the Model Version

In [33]:
from mlflow.tracking import MlflowClient

# Registry client; also used by the following cell.
client = MlflowClient()

# Attach the tag stage=production to version 2 of the registered model.
# NOTE(review): this assumes version 2 already exists in the registry; on a
# freshly created registry that may not be the case — confirm before running.
client.set_model_version_tag(
    name="RandomForest", version=2, key="stage", value="production"
)

# Step 11: Search and Tag Model Versions

In [34]:
# List every registered version of the model with the tags it carries
# *before* the tagging call below is made.
model_versions = client.search_model_versions("name='RandomForest'")

for version in model_versions:
    print(f"Model version {version.version} - Tags: {version.tags}")

# Version to mark as production. The same variable feeds both the registry call
# and the confirmation message so the two can never disagree.
version_to_tag = 1
client.set_model_version_tag(
    name="RandomForest",
    version=version_to_tag,
    key="stage",
    value="production",
)

print(f"Model version {version_to_tag} has been tagged as 'production'.")

Model version 4 - Tags: {}
Model version 3 - Tags: {}
Model version 2 - Tags: {'stage': 'production'}
Model version 1 - Tags: {'stage': 'production'}
Model version 1 has been tagged as 'production'.
