In [1]:
# %% [markdown]
# # End-to-End ML Workflow: Price Range Prediction
#
# This notebook performs the following steps:
# 1. Loads the dataset.
# 2. Preprocesses features and the target variable.
# 3. Splits data into training and testing sets.
# 4. Trains multiple machine learning models.
# 5. Logs parameters, metrics, and classification reports to MLflow.
# 6. Identifies the best performing model based on accuracy.
# 7. **Saves the best model and all necessary preprocessing artifacts (encoders) to .pkl files.**
#
# After running this notebook, you will have `.pkl` files ready to be used by a Streamlit application for inference.

# %% [markdown]
# ## 1. Setup and Configuration

# %%
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import dagshub
import os
import joblib
import numpy as np

# --- MLflow and DagsHub Configuration ---
# IMPORTANT: Set these environment variables in your terminal BEFORE running the notebook
# export DAGSHUB_USER="your_username"
# export DAGSHUB_TOKEN="your_token"
# export MLFLOW_TRACKING_URI="https://mlflow.dagshub.com/your_username/your_repo_name/..."

# Check if environment variables are set
if not os.environ.get("DAGSHUB_USER") or not os.environ.get("DAGSHUB_TOKEN") or not os.environ.get("MLFLOW_TRACKING_URI"):
    print("Error: DagsHub credentials (DAGSHUB_USER, DAGSHUB_TOKEN) or MLFLOW_TRACKING_URI are not set.")
    print("Please set them as environment variables in your terminal before running this notebook.")
    # Exit the notebook if credentials are not set
    # exit() # Uncomment to stop execution if variables are not set
else:
    print(f"MLflow tracking URI set to: {os.environ['MLFLOW_TRACKING_URI']}")

# Initialize DagsHub and MLflow
try:
    dagshub.init(repo_owner='sandhya-bdb', repo_name='mlflow_dagshub_new', mlflow=True)
    # Set the experiment name
    mlflow.set_experiment("Beverage Price Range Prediction")
    print("DagsHub and MLflow initialized successfully.")
except Exception as e:
    print(f"Error initializing DagsHub/MLflow: {e}")
    # exit() # Uncomment to stop execution if initialization fails

# --- Constants and File Paths ---
DATA_FILE = 'survey_results_op.csv'
TARGET_COLUMN = 'price_range'
RESPONDENT_ID_COLUMN = 'respondent_id'

# --- Filenames for saved artifacts ---
BEST_MODEL_FILENAME = 'best_price_range_model.pkl'
TARGET_ENCODER_FILENAME = 'price_range_target_encoder.pkl'
LABEL_ENCODERS_FILENAME = 'label_encoders.pkl' # Dictionary of encoders

# %% [markdown]
# ## 2. Load Data

# %%
try:
    df = pd.read_csv(DATA_FILE)
    print(f"Dataset '{DATA_FILE}' loaded successfully. Shape: {df.shape}")
    print("First 5 rows of the dataset:")
    print(df.head())
except FileNotFoundError:
    print(f"Error: The file '{DATA_FILE}' was not found. Please ensure it's in the same directory as the notebook or provide the full path.")
    exit() # Exit if data loading fails
except Exception as e:
    print(f"An error occurred during data loading: {e}")
    exit()

# %% [markdown]
# ## 3. Data Preprocessing and Feature Engineering

# %%
# Separate target variable
if TARGET_COLUMN not in df.columns:
    print(f"Error: Target column '{TARGET_COLUMN}' not found in the dataset.")
    exit()
X = df.drop(columns=[TARGET_COLUMN, RESPONDENT_ID_COLUMN], errors='ignore') # Ignore respondent_id if it exists
y = df[TARGET_COLUMN]

print(f"\nFeatures shape: {X.shape}, Target shape: {y.shape}")

# Define columns for different preprocessing steps
label_encode_cols_definitions = [
    'age_group',
    'income_levels',
    'health_concerns',
    'consume_frequency(weekly)',
    'preferable_consumption_size'
]
# Filter to include only columns that actually exist in X
label_encode_cols = [col for col in label_encode_cols_definitions if col in X.columns]

# Identify categorical columns that are NOT in label_encode_cols for One-Hot Encoding
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
one_hot_encode_cols = [col for col in categorical_cols if col not in label_encode_cols]

print(f"\nColumns for Label Encoding: {label_encode_cols}")
print(f"Columns for One-Hot Encoding: {one_hot_encode_cols}")

# --- Apply Label Encoding ---
label_encoders = {} # Dictionary to store fitted LabelEncoders for later use
for col in label_encode_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le # Save the fitted encoder
    print(f"Label encoded '{col}'. Classes: {le.classes_}")

# --- Apply One-Hot Encoding ---
if one_hot_encode_cols:
    X = pd.get_dummies(X, columns=one_hot_encode_cols, drop_first=True)
    print(f"\nApplied One-Hot Encoding. New X shape: {X.shape}")
else:
    print("\nNo columns for One-Hot Encoding were found or specified.")

# Convert boolean columns created by get_dummies to integers (optional but good practice)
for col in X.columns:
    if X[col].dtype == 'bool':
        X[col] = X[col].astype(int)
        print(f"Converted boolean column '{col}' to int.")

# --- Encode Target Variable ---
target_encoder = LabelEncoder()
y_encoded = target_encoder.fit_transform(y)
print(f"\nTarget variable '{TARGET_COLUMN}' encoded. Original classes: {target_encoder.classes_}")

# %% [markdown]
# ## 4. Data Splitting

# %%
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.25, random_state=42)
print(f"Data split into training and testing sets.")
print(f"Training set features shape: {X_train.shape}")
print(f"Testing set features shape: {X_test.shape}")

# %% [markdown]
# ## 5. Model Training, Evaluation, and MLflow Logging

# %%
# Define models
models = {
    "Gaussian Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Support Vector Machine (SVM)": SVC(probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "Light Gradient Boosting Machine (LightGBM)": LGBMClassifier(random_state=42)
}

best_model = None
best_accuracy = -1
best_model_name = ""
best_run_id = "" # To store the MLflow run ID of the best model

print("\n--- Starting Model Training and Evaluation ---")

# Iterate through each model
for model_name, model_instance in models.items():
    # Start an MLflow run for the current model
    with mlflow.start_run(run_name=f"Train_{model_name}") as run: # Capture run object
        print(f"\nTraining {model_name}...")
        current_run_id = run.info.run_id # Get the ID of the current run
        print(f"MLflow Run ID: {current_run_id}")

        # Log hyperparameters of the current model
        mlflow.log_params(model_instance.get_params())

        # Train the model on the training data
        model_instance.fit(X_train, y_train)

        # Make predictions on the test data
        y_pred = model_instance.predict(X_test)

        # Evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        class_report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        class_report_str = classification_report(y_test, y_pred, zero_division=0)

        # Log metrics to MLflow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("test_set_accuracy", accuracy)

        # Log classification report metrics
        for class_label, metrics in class_report_dict.items():
            if isinstance(metrics, dict):
                mlflow.log_metric(f"precision_{class_label}", metrics.get("precision", 0))
                mlflow.log_metric(f"recall_{class_label}", metrics.get("recall", 0))
                mlflow.log_metric(f"f1-score_{class_label}", metrics.get("f1-score", 0))
            elif isinstance(metrics, (int, float)) and class_label != 'accuracy':
                 mlflow.log_metric(f"{class_label}_avg", metrics)

        # Log classification report as a text artifact
        report_filename = f"{model_name}_classification_report.txt"
        with open(report_filename, "w") as f:
            f.write(class_report_str)
        mlflow.log_artifact(report_filename)
        os.remove(report_filename) # Clean up local file

        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  Classification Report:\n{class_report_str}")
        print("-" * 30)

        # Track the best model based on accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model_instance
            best_model_name = model_name
            best_run_id = current_run_id # Store the MLflow run ID

print("\n--- Model Training and Evaluation Complete ---")
print(f"Best model found: '{best_model_name}' with accuracy {best_accuracy:.4f} (MLflow Run ID: {best_run_id})")

# %% [markdown]
# ## 6. Save Best Model and Preprocessing Artifacts (.pkl files)

# %%
print("\n--- Saving Best Model and Preprocessing Artifacts ---")

# Save the best performing model using joblib
try:
    joblib.dump(best_model, BEST_MODEL_FILENAME)
    print(f"Best model ('{best_model_name}') saved to '{BEST_MODEL_FILENAME}'")
except Exception as e:
    print(f"Error saving best model: {e}")

# Save the target encoder using joblib
try:
    joblib.dump(target_encoder, TARGET_ENCODER_FILENAME)
    print(f"Target encoder saved to '{TARGET_ENCODER_FILENAME}'")
except Exception as e:
    print(f"Error saving target encoder: {e}")

# Save the dictionary of label encoders using joblib
try:
    joblib.dump(label_encoders, LABEL_ENCODERS_FILENAME)
    print(f"Label encoders dictionary saved to '{LABEL_ENCODERS_FILENAME}'")
except Exception as e:
    print(f"Error saving label encoders: {e}")

# --- Log Saved Artifacts to MLflow ---
# It's good practice to log the final artifacts to the MLflow run of the best model.
# We can do this by reopening the run or by logging to the currently active (if any) or experiment.
# Since the loop has finished, we can retrieve the best model's run and log.
# However, a simpler way is to log them as general artifacts for the experiment.
# For best practice, we'll log them to the MLflow run associated with the best model.

if best_run_id: # Ensure we found a best model run
    try:
        print(f"Logging saved artifacts to MLflow run ID: {best_run_id}")
        # This requires that the run is still active or we fetch it.
        # A common way is to log during the run. Since we finished the run,
        # we can log them as general artifacts under the experiment.
        # If you MUST log them to that specific run, you'd need to use mlflow.search_runs
        # to find the run and then use mlflow.artifacts.log_artifacts or similar.

        # For simplicity and to ensure they are available, we'll log them generally
        # If you need them associated with the *specific* run, modify the loop to save/log INSIDE it
        # when accuracy > best_accuracy.
        # Let's log them to the experiment level for now.
        mlflow.log_artifact(BEST_MODEL_FILENAME)
        mlflow.log_artifact(TARGET_ENCODER_FILENAME)
        mlflow.log_artifact(LABEL_ENCODERS_FILENAME)
        print("Saved artifacts logged as MLflow artifacts.")

    except Exception as e:
        print(f"Error logging artifacts to MLflow: {e}")
else:
    print("No best model found or run ID not captured, skipping artifact logging to MLflow.")


# Log metadata about the best model found as parameters
mlflow.log_param("best_model_name", best_model_name)
mlflow.log_param("best_model_accuracy", best_accuracy)
mlflow.log_param("best_model_run_id", best_run_id) # Log the run ID of the best model

print("\nMLflow tracking and artifact saving complete.")
print(f"View results in MLflow UI by running 'mlflow ui' in your terminal.")

# %% [markdown]
# ## 7. Next Steps
#
# You can now use the saved files (`best_price_range_model.pkl`, `price_range_target_encoder.pkl`, `label_encoders.pkl`) in your Streamlit application to perform predictions.
#
# **Remember to:**
# 1.  Place these `.pkl` files in the same directory as your `app.py` file, or specify their correct paths in `app.py`.
# 2.  Ensure the preprocessing logic in `app.py` exactly mirrors what was done in this notebook for accurate predictions.



MLflow tracking URI set to: https://dagshub.com/api/v1/repo-buckets/s3/sandhya-bdb


DagsHub and MLflow initialized successfully.
Dataset 'survey_results_op.csv' loaded successfully. Shape: (29956, 20)
First 5 rows of the dataset:
  respondent_id gender  zone            occupation  income_levels  \
0        R00001      M     3  Working Professional              1   
1        R00002      F     4  Working Professional              5   
2        R00003      F     1  Working Professional              5   
3        R00004      F     3  Working Professional              3   
4        R00005      M     4               Student              0   

   consume_frequency(weekly) current_brand preferable_consumption_size  \
0                          2      Newcomer             Medium (500 ml)   
1                          3   Established             Medium (500 ml)   
2                          2      Newcomer             Medium (500 ml)   
3                          3      Newcomer             Medium (500 ml)   
4                          2   Established             Medium (500 ml

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Accuracy: 0.9246
  Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.91      0.91      1930
           1       0.90      0.91      0.91      2223
           2       0.96      0.95      0.95      2430
           3       0.92      0.92      0.92       906

    accuracy                           0.92      7489
   macro avg       0.92      0.92      0.92      7489
weighted avg       0.92      0.92      0.92      7489

------------------------------
🏃 View run Train_XGBoost at: https://dagshub.com/sandhya-bdb/mlflow_dagshub_new.mlflow/#/experiments/0/runs/2b5eaa271efc4b22a8e3f82975661af0
🧪 View experiment at: https://dagshub.com/sandhya-bdb/mlflow_dagshub_new.mlflow/#/experiments/0

Training Light Gradient Boosting Machine (LightGBM)...
MLflow Run ID: f03e7d5885534cc7b7ca62bec59b1c60
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000835 seconds.
You can set `force_row_wise=true` to remove t