In [None]:
# === salary_model_pipeline.py ===

import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import mlflow.xgboost

import pickle

from fetch_salary_data import fetch_salary_data
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.base import BaseEstimator, TransformerMixin
from evidently import Report
from evidently.presets import DataDriftPreset
import json
import re

from sqlalchemy import create_engine

import uuid


# === Custom Transformers ===
class OrdinalMapTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, mappings, columns):
        self.mappings = mappings
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.columns)
        X_copy = X.copy()
        for col in self.columns:
            X_copy[col] = X_copy[col].map(self.mappings[col]).fillna(-1).astype(int)
        return X_copy.values


class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, column, target):
        self.column = column
        self.target = target

    def fit(self, X, y):
        df = X.copy()
        df[self.target] = y
        self.mapping_ = df.groupby(self.column)[self.target].mean().to_dict()
        self.default_ = df[self.target].mean()
        return self

    def transform(self, X):
        X_copy = X.copy()
        X_copy[self.column] = X_copy[self.column].map(self.mapping_).fillna(self.default_)
        return X_copy[[self.column]].values


def build_pipeline_and_train(df):
    df.dropna(inplace=True)

    target_col = 'salary_in_usd'
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # === Split to evaluate candidate models ===
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    def sanitize_name(name):
        return re.sub(r"[^\w\-/ .]", "_", name)

    report = Report([DataDriftPreset()])
    df_train_test_eval = report.run(X_train,X_test) 
    report_data = json.loads(df_train_test_eval.json())

    mlflow.set_experiment("evidently_train_vs_test")
    with mlflow.start_run():
        for i in report_data.get("metrics", []):
            metric_id = i.get("metric_id", "")
            value = i.get("value", None)

            if metric_id.startswith("Drifted"):
                mlflow.log_metric("Number_of_driftedcolumns", value['count'])
            else:
            
                clean_metric_id = sanitize_name(metric_id.lower().replace(" ", "_"))
                mlflow.log_metric(clean_metric_id, value)

            # print(value)

    # === Columns ===
    ordinal_cols = ['experience_level', 'employment_type', 'company_size']
    numeric_cols = ['years_experience', 'remote_ratio']
    target_encoded_cols = ['job_title', 'company_location', 'salary_currency']

    ordinal_mappings = {
        'experience_level': {'Intern': 0, 'Entry-level': 1, 'Mid': 2, 'Senior': 3, 'Lead': 4, 'Executive': 5, 'Unknown': -1},
        'employment_type': {'Intern': 0, 'Part-time': 1, 'Contract': 2, 'Freelance': 3, 'Full-time': 4, 'Unknown': -1},
        'company_size': {'Small': 0, 'Medium': 1, 'Large': 2, 'Unknown': -1}
    }

    # === Pipelines ===
    ordinal_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        ('ordinal_mapper', OrdinalMapTransformer(ordinal_mappings, ordinal_cols))
    ])

    target_enc_pipelines = [
        (col, Pipeline([
            ('target_encoder', TargetEncoder(column=col, target=target_col))
        ]), [col]) for col in target_encoded_cols
    ]

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('ord', ordinal_pipeline, ordinal_cols),
            ('num', numeric_pipeline, numeric_cols)
        ] + target_enc_pipelines
    )

    # === Models and Param Grids ===
    regressors = {
        'RandomForest': RandomForestRegressor(random_state=42),
        'GradientBoosting': GradientBoostingRegressor(random_state=42),
        'XGBoost': XGBRegressor(random_state=42, verbosity=0)
    }

    param_grids = {
        'RandomForest': {
            'regressor__n_estimators': [100],
            'regressor__max_depth': [10]
        },
        'GradientBoosting': {
            'regressor__n_estimators': [100],
            'regressor__learning_rate': [0.1]
        },
        'XGBoost': {
            'regressor__n_estimators': [100],
            'regressor__learning_rate': [0.1]
        }
    }

    # === MLflow Setup ===
    mlflow.set_experiment("Salary_Prediction_Productions")
    mlflow.sklearn.autolog()
    mlflow.xgboost.autolog()

    best_model_name = None
    best_rmse = float("inf")
    best_model_class = None
    best_model_params = {}

    for name, model in regressors.items():
        with mlflow.start_run(run_name=f"Candidate_{name}"):
            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('regressor', model)
            ])

            grid = GridSearchCV(pipeline, param_grids[name], cv=3, scoring='neg_root_mean_squared_error')
            grid.fit(X_train, y_train)
            y_pred = grid.predict(X_test)

            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)
            mape = mean_absolute_percentage_error(y_test, y_pred)

            mlflow.log_metric("final_rmse", rmse)
            mlflow.log_metric("final_r2", r2)
            mlflow.log_metric("final_mape", mape)

            if rmse < best_rmse:
                best_rmse = rmse
                best_model_name = name
                best_model_class = model.__class__  # Save class for retraining
                best_model_params = grid.best_params_

            print(f"\nModel: {name}")
            print(f"Best Params: {grid.best_params_}")
            print(f"RMSE: {rmse:.2f} | R²: {r2:.4f} | MAPE: {mape:.2f}%")


    # === Retrain Best Model on Full Data ===
    print(f"\n✅ Best model: {best_model_name} (RMSE: {best_rmse:.2f}) — Retraining on full dataset...")

    # Extract just the hyperparameters for regressor (strip "regressor__")
    final_model_params = {k.replace("regressor__", ""): v for k, v in best_model_params.items()}
    final_model = best_model_class(random_state=42, **final_model_params)

    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', final_model)
    ])

    with mlflow.start_run(run_name=f"Final_{best_model_name}_FullData"):
        full_pipeline.fit(X, y)
        if best_model_name == 'XGBoost':
            mlflow.xgboost.log_model(full_pipeline, artifact_path="model", registered_model_name="BestSalaryModel")
        else:
            mlflow.sklearn.log_model(full_pipeline, artifact_path="model", registered_model_name="BestSalaryModel")

        print(f"\n📦 Final model '{best_model_name}' retrained on all data and registered in MLflow.")





In [6]:
if __name__ == "__main__":
    
   
    df = fetch_salary_data("postgresql+psycopg2://postgres:admin@localhost:5432/hr-analytics-project")

    # Load the pipeline
    with open(r"Pickle\data_cleaning_pipeline.pkl", "rb") as f:
        cleaning_pipeline = pickle.load(f)

    # Clean it using the loaded pipeline
    df = cleaning_pipeline.transform(df)

    build_pipeline_and_train(df)

✅ Fetched 100000 rows.


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet













2025/07/07 11:02:56 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.





Model: RandomForest
Best Params: {'regressor__max_depth': 10, 'regressor__n_estimators': 100}
RMSE: 119962.75 | R²: 0.2882 | MAPE: 0.24%














2025/07/07 11:03:15 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.





Model: GradientBoosting
Best Params: {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 100}
RMSE: 126055.12 | R²: 0.2141 | MAPE: 0.83%














2025/07/07 11:03:27 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.





Model: XGBoost
Best Params: {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 100}
RMSE: 123427.13 | R²: 0.2465 | MAPE: 0.89%

✅ Best model: RandomForest (RMSE: 119962.75) — Retraining on full dataset...








Registered model 'BestSalaryModel' already exists. Creating a new version of this model...
Created version '6' of model 'BestSalaryModel'.



📦 Final model 'RandomForest' retrained on all data and registered in MLflow.
