In [1]:
# Show the kernel's working directory at notebook start (before any chdir).
%pwd

'c:\\Arjun_Works\\SalesNexus\\research'

In [2]:
import os

os.chdir("../")

%pwd

'c:\\Arjun_Works\\SalesNexus'

<p>Next, we connect the notebook to MLflow (hosted on DagsHub) so that training runs, evaluation metrics, and experiments are tracked in one place.</p>

In [3]:
import dagshub
import mlflow

# Point MLflow at the DagsHub-hosted tracking server for this repository.
# The quick-start smoke test that logged a placeholder 'parameter name' /
# 'metric name' run has been dropped: it created a meaningless run in the
# experiment tracker on every execution of the notebook.
dagshub.init(repo_owner='phoenixarjun007', repo_name='SalesNexus', mlflow=True)

# NOTE(review): nothing in the visible cells uses pkg_resources; it was imported
# from inside the old `with mlflow.start_run()` block. Kept in case a later cell
# relies on the name - confirm and remove.
import pkg_resources


In [4]:
import os
from dataclasses import dataclass
from pathlib import Path
import pandas as pd
import numpy as np
import mlflow
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

from ml_service.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from ml_service.utils.main_utils import read_yaml, create_directories, save_json


In [5]:
@dataclass(frozen=True)
class ModelBuildingAndEvaluationConfig:
    """Immutable settings bundle for the model building and evaluation stage."""

    # File the trained model is serialized to.
    path_of_model: Path
    # CSV holding the training rows.
    input_train_file: Path
    # CSV holding the test rows.
    input_test_file: Path
    # File the evaluation metrics are written to.
    metrics_file: Path
    # Parameter mapping loaded from the params file; indexed by upper-case keys
    # such as "RANDOM_STATE" or "N_ESTIMATORS".
    all_params: dict
    # MLflow tracking URI; an empty string when no tracking server is configured.
    mlflow_uri: str

In [6]:
class ConfigurationManager:
    """Reads the project's YAML config and params files and builds stage configs."""

    def __init__(self, 
                 config_filepath: str = CONFIG_FILE_PATH,
                 params_filepath: str = PARAMS_FILE_PATH):
        """Load both YAML files and prepare the stage's root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the model parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Hand the stage's root directory to create_directories up front.
        create_directories([Path(self.config.modelBuildingAndEvaluation.root_dir)])

    def get_modelBuilding_and_evaluation_config(self) -> ModelBuildingAndEvaluationConfig:
        """Build the ModelBuildingAndEvaluationConfig from the loaded YAML settings.

        Returns:
            A config whose file locations come from the
            ``modelBuildingAndEvaluation`` section, whose ``all_params`` is the
            full params mapping, and whose ``mlflow_uri`` is the params'
            ``TRACKING_SERVER`` entry (empty string when absent).
        """
        stage = self.config.modelBuildingAndEvaluation
        tracking_uri = self.params.get("TRACKING_SERVER", "")

        settings = dict(
            path_of_model=Path(stage.model_file),
            input_train_file=Path(stage.input_train_file),
            input_test_file=Path(stage.input_test_file),
            metrics_file=Path(stage.evaluation_metrics),
            all_params=self.params,
            mlflow_uri=tracking_uri,
        )
        return ModelBuildingAndEvaluationConfig(**settings)

In [None]:
class ModelBuildingAndEvaluation:
    """Train an XGBoost regressor, evaluate it on a held-out year, and track the run in MLflow.

    The ``config`` object must expose ``input_train_file``, ``path_of_model``,
    ``metrics_file``, ``all_params`` and ``mlflow_uri``.
    """

    # Single source of truth: params key -> XGBRegressor keyword argument.
    # Both training and MLflow logging read from this mapping, so the
    # parameters recorded for a run are exactly the ones the model was built with.
    _XGB_PARAM_KEYS = {
        "RANDOM_STATE": "random_state",
        "N_ESTIMATORS": "n_estimators",
        "LEARNING_RATE": "learning_rate",
        "MAX_DEPTH": "max_depth",
        "SUBSAMPLE": "subsample",
        "COLSAMPLE_BY_TREE": "colsample_bytree",
        "REG_ALPHA": "reg_alpha",
        "REG_LAMBDA": "reg_lambda",
        "GAMMA": "gamma",
        "OBJECTIVE": "objective",
    }

    def __init__(self, config):
        """Store the stage configuration; data and model are filled in by later steps."""
        self.config = config
        self.model = None
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None

    def load_data(self, train_end_year: int = 2016, val_year: int = 2017):
        """Load the training CSV and split it by calendar year into train/validation sets.

        Rows dated up to and including ``train_end_year`` become the training
        set; rows dated in ``val_year`` become the validation set, which is
        stored in ``X_test`` / ``y_test``. Rows from any other year are unused.
        The ``sales`` column is the target; ``sales`` and ``date`` are dropped
        from the features.

        Args:
            train_end_year: Last calendar year included in the training split.
            val_year: Calendar year used as the validation split.
        """
        # Only the training file is read here: the test CSV has no part in the
        # split, and create_submission() reads it itself when it is needed.
        train_df = pd.read_csv(self.config.input_train_file)

        years = pd.to_datetime(train_df["date"]).dt.year
        features = train_df.drop(columns=["sales", "date"])
        in_train = years <= train_end_year
        in_val = years == val_year

        self.X_train = features[in_train].reset_index(drop=True)
        self.y_train = train_df.loc[in_train, "sales"].reset_index(drop=True)

        self.X_test = features[in_val].reset_index(drop=True)
        self.y_test = train_df.loc[in_val, "sales"].reset_index(drop=True)

    def _xgb_kwargs(self) -> dict:
        """Translate the configured parameters into XGBRegressor keyword arguments."""
        params = self.config.all_params
        return {kwarg: params[key] for key, kwarg in self._XGB_PARAM_KEYS.items()}

    def train_model(self):
        """Fit an XGBRegressor on the training split and persist it with joblib.

        REG_ALPHA, REG_LAMBDA and GAMMA are passed to the model as well, so the
        fitted model matches the parameters that log_into_mlflow() records.
        """
        self.model = XGBRegressor(**self._xgb_kwargs())
        self.model.fit(self.X_train, self.y_train)

        # Save trained model, creating its parent directory when missing.
        model_path = Path(self.config.path_of_model)
        model_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(self.model, model_path)

    def evaluate(self) -> dict:
        """Score the model on the validation split and save the metrics.

        Returns:
            Dict with ``rmse``, ``mae`` and ``rmsle`` as plain Python floats.
            The same dict is written to ``config.metrics_file`` via save_json.
        """
        y_pred = self.model.predict(self.X_test)
        rmse = np.sqrt(mean_squared_error(self.y_test, y_pred))
        mae = mean_absolute_error(self.y_test, y_pred)
        # Negative predictions are clipped to 0 first: log1p is undefined below -1.
        log_error = np.log1p(np.maximum(y_pred, 0)) - np.log1p(self.y_test)
        rmsle = np.sqrt(np.mean(np.square(log_error)))

        # Plain floats keep the metrics independent of NumPy scalar types.
        metrics = {"rmse": float(rmse), "mae": float(mae), "rmsle": float(rmsle)}
        save_json(self.config.metrics_file, metrics)

        return metrics

    def log_into_mlflow(self, metrics: dict):
        """Log the model parameters, metrics and fitted model to MLflow.

        Args:
            metrics: Metric name -> value mapping, as returned by evaluate().
        """
        # Only override the tracking URI when one is configured: an empty value
        # would replace a URI that was already set up (e.g. by dagshub.init).
        if self.config.mlflow_uri:
            mlflow.set_tracking_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        params = self.config.all_params
        model_params = {"MODEL_TYPE": params["MODEL_TYPE"]}
        model_params.update({key: params[key] for key in self._XGB_PARAM_KEYS})

        # The model is registered under a name unless the tracking URI uses the
        # "file" scheme.
        log_model_kwargs = {}
        if tracking_url_type_store != "file":
            log_model_kwargs["registered_model_name"] = "SalesNexus_XGBoost_Model"

        with mlflow.start_run():
            mlflow.log_params(model_params)
            mlflow.log_metrics(metrics)
            mlflow.sklearn.log_model(self.model, "model", **log_model_kwargs)

    def create_submission(self, test_file, submission_file):
        """Predict sales for ``test_file`` and write an ``id,sales`` CSV.

        Args:
            test_file: CSV with an ``id`` column plus feature columns.
            submission_file: Destination path of the submission CSV.
        """
        test_df = pd.read_csv(test_file)

        # Ensure columns match training: absent features are filled with 0 and
        # the columns are put in the training order.
        missing_cols = [col for col in self.X_train.columns if col not in test_df.columns]
        test_features = test_df.assign(**dict.fromkeys(missing_cols, 0))[self.X_train.columns]

        # Sales cannot be negative, so clip predictions at 0 (as in evaluate()).
        sales_predictions = np.maximum(self.model.predict(test_features), 0)

        submission = test_df[["id"]].copy()
        submission["sales"] = sales_predictions
        submission.to_csv(submission_file, index=False)

        print(f"✅ Submission saved! Shape: {submission.shape}")

    def run_pipeline(self):
        """Run load -> train -> evaluate -> MLflow logging and return the metrics dict."""
        self.load_data()
        self.train_model()
        metrics = self.evaluate()
        self.log_into_mlflow(metrics)
        return metrics


In [14]:
# Build the stage configuration, run train -> evaluate -> MLflow logging, then
# write the submission file. Errors propagate unchanged: the former
# `except Exception as e: raise e` wrapper handled nothing and only added a
# frame to the traceback.
config_manager = ConfigurationManager(CONFIG_FILE_PATH, PARAMS_FILE_PATH)
model_and_eval_config = config_manager.get_modelBuilding_and_evaluation_config()

process = ModelBuildingAndEvaluation(model_and_eval_config)

metrics = process.run_pipeline()
print(f"✅ Evaluation Done!\nMetrics saved to {model_and_eval_config.metrics_file}")

process.create_submission(model_and_eval_config.input_test_file, "submission_XgBoost_model2.csv")


[2025-06-27 19:16:20,070: INFO: main_utils: yaml file: config\config.yaml loaded successfully]
[2025-06-27 19:16:20,076: INFO: main_utils: yaml file: params.yaml loaded successfully]
[2025-06-27 19:16:20,079: INFO: main_utils: created directory at: artifacts\model]
[2025-06-27 19:17:59,610: INFO: main_utils: json file saved at: evaluation_metrics.json]


Registered model 'SalesNexus_XGBoost_Model' already exists. Creating a new version of this model...
2025/06/27 19:18:13 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: SalesNexus_XGBoost_Model, version 23
Created version '23' of model 'SalesNexus_XGBoost_Model'.


✅ Evaluation Done!
Metrics saved to evaluation_metrics.json
✅ Submission saved! Shape: (28512, 2)
