In [10]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from Pipelines.Full_Preprocess_Pipeline import LeadScoringPreprocessor
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from evidently import Report
from evidently.presets import DataDriftPreset
import json
import re
from Pipelines.Feature_Engineering_Pipeline import LeadScoringFeatureEngineer
from Pipelines.Minimal_Training_Pipeline import LeadScoringMinimalCleaner

def prepare_data(df, target_col='Converted'):
    """
    Drops incomplete rows and builds the full and train/test feature/target sets.

    Parameters:
    - df (pd.DataFrame): Input DataFrame with features and the target column.
    - target_col (str): Name of the target column (default: 'Converted').

    Returns:
    - Tuple: (X, y, X_train, X_test, y_train, y_test). X and y cover every
      complete row; the train/test parts are a stratified 80/20 split of them.

    Notes:
    - Rows containing any missing value are removed from a copy, so the
      caller's DataFrame is left untouched.
    """
    # dropna() without inplace returns a new frame instead of mutating the argument.
    complete_rows = df.dropna()
    X = complete_rows.drop(columns=[target_col])
    y = complete_rows[target_col]

    # Fixed seed for reproducibility; stratify on y to keep the class ratio in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    return X, y, X_train, X_test, y_train, y_test

def check_data_drift(X_train, X_test):
    """
    Compares train and test features with Evidently and logs drift figures to MLflow.

    Parameters:
    - X_train: Reference data handed to the Evidently report.
    - X_test: Current data handed to the Evidently report.

    Notes:
    - Runs a DataDriftPreset report and parses the snapshot's JSON output.
    - Logs into the "evidently_train_vs_test" experiment, in a single run.
    - Only entries whose "result" mapping contains "dataset_drift" or
      "share_of_drifted_columns" produce metrics; all others are ignored.
    - NOTE(review): this relies on each metric entry exposing a "result"
      mapping -- confirm against the installed Evidently version's JSON layout.
    """
    drift_snapshot = Report(metrics=[DataDriftPreset()]).run(
        reference_data=X_train, current_data=X_test
    )
    parsed_report = json.loads(drift_snapshot.json())

    mlflow.set_experiment("evidently_train_vs_test")
    with mlflow.start_run():
        for entry in parsed_report.get("metrics", []):
            payload = entry.get("result", {})
            if "dataset_drift" in payload:
                # The boolean flag is stored as 0/1 because MLflow metrics are numeric.
                mlflow.log_metric("dataset_drift", int(payload["dataset_drift"]))
                mlflow.log_metric("number_of_drifted_columns", payload["number_of_drifted_columns"])
                continue
            if "share_of_drifted_columns" in payload:
                mlflow.log_metric("share_of_drifted_columns", payload["share_of_drifted_columns"])


def evaluate_candidates(X_train, y_train, X_test, y_test):
    """
    Grid-searches each candidate classifier and reports the one with the best test F1.

    Parameters:
    - X_train, y_train: Data used to fit every grid search (3-fold CV, F1 scoring).
    - X_test, y_test: Held-out data used to score the refitted best estimator.

    Returns:
    - Tuple: (best_model_name, best_f1, best_model_class, best_model_params).
      If no candidate reaches an F1 above 0.0 the tuple is (None, 0.0, None, {}).

    Notes:
    - Each candidate is wrapped in a pipeline of LeadScoringMinimalCleaner,
      LeadScoringFeatureEngineer and the classifier.
    - One MLflow run per candidate in the "Lead_Scoring_Classification"
      experiment, with sklearn and xgboost autologging enabled.
    """
    # Candidate estimator paired with its (single-point) search grid.
    candidates = {
        'RandomForest': (
            RandomForestClassifier(random_state=42),
            {'classifier__n_estimators': [100], 'classifier__max_depth': [10]},
        ),
        'GradientBoosting': (
            GradientBoostingClassifier(random_state=42),
            {'classifier__n_estimators': [100], 'classifier__learning_rate': [0.1]},
        ),
        'XGBoost': (
            XGBClassifier(random_state=42, verbosity=0, use_label_encoder=False, eval_metric='logloss'),
            {'classifier__n_estimators': [100], 'classifier__learning_rate': [0.1]},
        ),
    }

    mlflow.set_experiment("Lead_Scoring_Classification")
    mlflow.sklearn.autolog()
    mlflow.xgboost.autolog()

    # (name, f1, class, params) of the best candidate seen so far.
    winner = (None, 0.0, None, {})

    for name, (estimator, search_space) in candidates.items():
        with mlflow.start_run(run_name=f"Candidate_{name}"):
            search = GridSearchCV(
                Pipeline(steps=[
                    ('preprocessor', LeadScoringMinimalCleaner()),
                    ('Feature Engineer', LeadScoringFeatureEngineer()),
                    ('classifier', estimator),
                ]),
                search_space,
                cv=3,
                scoring='f1',
            )
            search.fit(X_train, y_train)
            predictions = search.predict(X_test)

            accuracy = accuracy_score(y_test, predictions)
            f1 = f1_score(y_test, predictions)
            precision = precision_score(y_test, predictions)
            recall = recall_score(y_test, predictions)

            held_out_scores = (
                ("accuracy", accuracy),
                ("f1_score", f1),
                ("precision", precision),
                ("recall", recall),
            )
            for metric_name, metric_value in held_out_scores:
                mlflow.log_metric(metric_name, metric_value)

            # Strict comparison: on ties the earlier candidate is kept.
            if f1 > winner[1]:
                winner = (name, f1, type(estimator), search.best_params_)

            print(f"\nModel: {name}")
            print(f"Best Params: {search.best_params_}")
            print(f"Accuracy: {accuracy:.4f} | F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")

    return winner

def retrain_best_model(X, y, best_model_name, best_model_class, best_model_params, best_f1=None):
    """
    Retrains the selected model on the full dataset and registers it in MLflow.

    Parameters:
    - X, y: Full feature set and target used for the final fit.
    - best_model_name (str): Name of the winning candidate (e.g. 'XGBoost').
    - best_model_class: Estimator class to instantiate for the final model.
    - best_model_params (dict): Best grid-search parameters; a 'classifier__'
      step prefix on the keys is stripped before instantiation.
    - best_f1 (float | None): Optional F1 of the winner, used only in the
      progress message. Previously this name was read without being defined,
      which raised NameError on every call.

    Raises:
    - ValueError: If best_model_class is None (no candidate was selected).

    Notes:
    - The fitted object is a scikit-learn Pipeline, so it is always logged
      with mlflow.sklearn.log_model, also when the classifier is XGBoost.
    - NOTE(review): candidates are evaluated with LeadScoringMinimalCleaner +
      LeadScoringFeatureEngineer while this uses LeadScoringPreprocessor --
      confirm the two are equivalent.
    """
    if best_model_class is None:
        raise ValueError("No candidate model was selected; nothing to retrain.")

    f1_note = f" (F1: {best_f1:.4f})" if best_f1 is not None else ""
    print(f"\n✅ Best model: {best_model_name}{f1_note} — Retraining on full dataset...")

    # Grid-search keys address the pipeline step; the bare estimator needs plain names.
    final_model_params = {k.replace("classifier__", ""): v for k, v in best_model_params.items()}
    if best_model_name == 'XGBoost':
        final_model_params.update({'use_label_encoder': False, 'eval_metric': 'logloss'})
    final_model = best_model_class(random_state=42, **final_model_params)

    full_pipeline = Pipeline([
        ('preprocessor', LeadScoringPreprocessor()),
        ('classifier', final_model)
    ])

    with mlflow.start_run(run_name=f"Final_{best_model_name}_FullData"):
        full_pipeline.fit(X, y)

        # The sklearn flavor serializes the whole pipeline (preprocessing + classifier).
        mlflow.sklearn.log_model(full_pipeline, artifact_path="model", registered_model_name="BestLeadScoringModel")

        print(f"\n📦 Final model '{best_model_name}' retrained and registered to MLflow.")

# Main function to orchestrate the process
def build_pipeline_and_train(df):
    """
    Runs the full workflow: prepare data, check drift, evaluate candidates, retrain.

    Parameters:
    - df (pd.DataFrame): Input DataFrame containing the features and the target.
    """
    X, y, X_train, X_test, y_train, y_test = prepare_data(df)
    check_data_drift(X_train, X_test)
    best_model_name, best_f1, best_model_class, best_model_params = evaluate_candidates(X_train, y_train, X_test, y_test)
    retrain_best_model(X, y, best_model_name, best_model_class, best_model_params, best_f1=best_f1)


In [69]:
# Load the raw lead-scoring dataset from the local Datasets folder.
df = pd.read_csv("Datasets/Lead Scoring.csv")

In [67]:
# Run the whole workflow (prepare data, drift check, candidate evaluation, retrain) on df.
build_pipeline_and_train(df)

2025/07/21 09:09:41 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f294479d74ba411bb9a7a1115e870d3d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2025/07/21 09:09:51 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f2c4a4af41ad44d7a4fa4d593126ed07', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2025/07/21 09:09:59 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '70fc3499e9dc4bdbb8d597df8b516ae2', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow

Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Newspaper Article_Yes
Feature names seen at fit time, yet now missing:
- Digital Advertisement_Yes
- What is your current occupation_Other


In [76]:
df = pd.read_csv("Datasets/Lead Scoring.csv")

In [83]:
# Preprocessing-only pipeline (no classifier): minimal cleaning followed by feature engineering.
full_pipeline = Pipeline(
    steps=[
        ('preprocessor', LeadScoringMinimalCleaner()),
        ('Feature Engineer', LeadScoringFeatureEngineer()),
    ]
)

In [84]:
# Split the frame into the feature columns (X) and the 'Converted' label (y).
X = df.drop(columns=["Converted"])
y = df["Converted"]

2025/07/21 11:41:44 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '388ca5157bb94bcd909077fd4cb486ae', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow




0,1,2
,steps,"[('preprocessor', ...), ('Feature Engineer', ...)]"
,transform_input,
,memory,
,verbose,False


In [61]:
# Apply the preprocessing pipeline to the features.
# NOTE(review): presumably requires the pipeline to have been fitted in an earlier cell -- confirm.
full_pipeline.transform(X)





Unnamed: 0,Lead Origin,Lead Source,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Specialization,Do Not Email_Yes,A free copy of Mastering The Interview_Yes,Search_Yes,...,X Education Forums_Yes,Newspaper_Yes,Digital Advertisement_Yes,Through Recommendations_Yes,What is your current occupation_Housewife,What is your current occupation_Other,What is your current occupation_Student,What is your current occupation_Unemployed,What is your current occupation_Unknown,What is your current occupation_Working Professional
0,0.311453,0.255271,0.0,0,0.00,0.235937,0.286686,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,0.311453,0.393247,5.0,674,2.50,0.376836,0.286686,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,0.361850,0.321667,2.0,1532,2.00,0.376836,0.444169,False,True,False,...,False,False,False,False,False,False,True,False,False,False
3,0.361850,0.321667,1.0,305,1.00,0.333368,0.418719,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,0.361850,0.393247,2.0,1428,1.00,0.126168,0.286686,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9235,0.361850,0.321667,8.0,1845,2.67,0.558053,0.382514,True,False,False,...,False,False,False,False,False,False,False,True,False,False
9236,0.361850,0.321667,2.0,238,2.00,0.629144,0.418719,False,True,False,...,False,False,False,False,False,False,False,True,False,False
9237,0.361850,0.321667,2.0,199,2.00,0.629144,0.444169,True,True,False,...,False,False,False,False,False,False,False,True,False,False
9238,0.361850,0.393247,3.0,499,3.00,0.629144,0.457547,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [75]:
import pickle

In [None]:
# Rebuild the preprocessing-only pipeline from fresh transformer instances.
cleaning_step = ('preprocessor', LeadScoringMinimalCleaner())
engineering_step = ('Feature Engineer', LeadScoringFeatureEngineer())
full_pipeline = Pipeline([cleaning_step, engineering_step])

# Fit both steps on the features, passing the target along to the transformers.
full_pipeline.fit(X, y)

In [87]:
import pickle

# Serialize the fitted preprocessing pipeline next to the notebook for later reuse.
with open('lead_scoring_pipeline.pkl', mode='wb') as file:
    pickle.dump(full_pipeline, file)


In [3]:
import pandas as pd
df = pd.read_csv("Datasets/Lead Scoring.csv")

In [4]:
X = df.drop(['Converted'] , axis=1)

In [6]:
# Save the first rows of the feature frame as a small sample file (the index is written as well).
X.head().to_csv("Random_Sample_Data.csv")

In [9]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install evidently

Note: you may need to restart the kernel to use updated packages.


In [24]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from Pipelines.Full_Preprocess_Pipeline import LeadScoringPreprocessor  # Assumed to handle full preprocessing (cleaning + engineering + encoding)
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from evidently import Report
from evidently.presets import DataDriftPreset
import json
import re

# Note: Removed unused imports for LeadScoringFeatureEngineer and LeadScoringMinimalCleaner; assume they are inside LeadScoringPreprocessor

def prepare_data(df: pd.DataFrame, target_col: str = 'Converted') -> tuple:
    """
    Prepares data by preprocessing the full dataset first, then splitting to avoid encoding inconsistencies.

    Parameters:
    - df (pd.DataFrame): Input DataFrame with features and target.
    - target_col (str): Name of the target column (default: 'Converted').

    Returns:
    - Tuple: (X_preprocessed, y, X_train, X_test, y_train, y_test)

    Raises:
    - ValueError: If DataFrame is empty or target column is missing.

    Notes:
    - Rows with any missing value are dropped from a copy; the caller's
      DataFrame is not modified.
    - The preprocessor is fitted on all rows before the split so every split
      sees the same output columns.
    - NOTE(review): because the preprocessor is fitted with y on all rows, any
      target-dependent transformation would see test labels (leakage) --
      confirm this trade-off is acceptable.
    """
    if df.empty:
        raise ValueError("Input DataFrame is empty.")
    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found in DataFrame.")

    # Non-inplace dropna keeps the argument intact (adjust if imputation is preferred).
    complete_rows = df.dropna()
    X = complete_rows.drop(columns=[target_col])
    y = complete_rows[target_col]  # f1/precision/recall downstream use the default positive label 1

    # Fit on the full X so the transformed columns are identical for train and test.
    preprocessor = LeadScoringPreprocessor()
    X_preprocessed = preprocessor.fit_transform(X, y)

    # Split the already-preprocessed data; stratify to keep the class ratio in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X_preprocessed, y, test_size=0.2, random_state=42, stratify=y
    )

    return X_preprocessed, y, X_train, X_test, y_train, y_test


def check_data_drift(X_train: pd.DataFrame, X_test: pd.DataFrame) -> None:
    """
    Runs an Evidently drift report (train as reference, test as current) and logs it to MLflow.

    Parameters:
    - X_train (pd.DataFrame): Reference features.
    - X_test (pd.DataFrame): Current features.

    Notes:
    - Uses DataDriftPreset and reads the snapshot's JSON output.
    - Metric names are prefixed with the entry's "metric" value, with every
      character outside word characters, '-', '/', ' ' and '.' replaced by '_'.
    - Logs into the "evidently_train_vs_test" experiment in a single run.
    - NOTE(review): entries are expected to carry a "result" mapping; entries
      without the drift keys are skipped -- confirm the installed Evidently
      version emits that layout.
    """
    drift_snapshot = Report(metrics=[DataDriftPreset()]).run(
        reference_data=X_train, current_data=X_test
    )
    parsed_report = json.loads(drift_snapshot.json())

    mlflow.set_experiment("evidently_train_vs_test")
    with mlflow.start_run():
        for entry in parsed_report.get("metrics", []):
            # Replace characters that are not allowed in the logged metric name.
            prefix = re.sub(r"[^\w\-/ .]", "_", entry.get("metric", ""))
            details = entry.get("result", {})

            if "dataset_drift" in details:
                mlflow.log_metric(f"{prefix}_dataset_drift", int(details["dataset_drift"]))
                mlflow.log_metric(f"{prefix}_number_of_drifted_columns", details["number_of_drifted_columns"])
                continue

            if "share_of_drifted_columns" in details:
                mlflow.log_metric(f"{prefix}_share_of_drifted_columns", details["share_of_drifted_columns"])


def evaluate_candidates(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series) -> tuple:
    """
    Evaluates candidate classifiers using GridSearchCV on preprocessed train data.

    Parameters:
    - X_train, y_train: Training data (preprocessed).
    - X_test, y_test: Test data (preprocessed).

    Returns:
    - Tuple: (best_model_name, best_f1, best_model_class, best_model_params).
      best_model_params uses plain estimator parameter names (no pipeline
      step prefix), so it can be passed straight to the estimator class.

    Notes:
    - Uses simplified single-point param grids; expand for production.
    - Every classifier has a grid entry; previously the 'RandomForest' entry
      was commented out, which raised KeyError on the first candidate.
    - Grid keys are prefixed with the pipeline step name before the search,
      because GridSearchCV addresses pipeline parameters as '<step>__<param>'.
    - Logs metrics to MLflow and autologs models.
    - Selects the best model by test F1; the first candidate is always
      accepted so a model is returned even when every F1 is 0.
    """
    classifiers = {
        'RandomForest': RandomForestClassifier(random_state=42),
        'GradientBoosting': GradientBoostingClassifier(random_state=42),
        'XGBoost': XGBClassifier(random_state=42, verbosity=0, use_label_encoder=False, eval_metric='logloss')
    }

    # Plain estimator parameter names; the step prefix is added just before the search.
    param_grids = {
        'RandomForest': {'n_estimators': [100], 'max_depth': [10]},
        'GradientBoosting': {'n_estimators': [100], 'learning_rate': [0.1]},
        'XGBoost': {'n_estimators': [100], 'learning_rate': [0.1]}
    }

    mlflow.set_experiment("Lead_Scoring_Classification")
    mlflow.sklearn.autolog()
    mlflow.xgboost.autolog()

    step_name = 'classifier'
    best_model_name = None
    best_f1 = 0.0
    best_model_class = None
    best_model_params = {}

    for name, model in classifiers.items():
        with mlflow.start_run(run_name=f"Candidate_{name}"):
            # No preprocessor in this pipeline since data is already preprocessed
            pipeline = Pipeline([(step_name, model)])
            search_space = {
                f"{step_name}__{param}": values
                for param, values in param_grids[name].items()
            }

            grid = GridSearchCV(pipeline, search_space, cv=3, scoring='f1')
            grid.fit(X_train, y_train)
            y_pred = grid.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)

            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("f1_score", f1)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)

            # Strip the '<step>__' prefix so callers get estimator-level names.
            tuned_params = {
                key.split("__", 1)[1]: value
                for key, value in grid.best_params_.items()
            }

            if best_model_name is None or f1 > best_f1:
                best_f1 = f1
                best_model_name = name
                best_model_class = model.__class__
                best_model_params = tuned_params

            print(f"\nModel: {name}")
            print(f"Best Params: {tuned_params}")
            print(f"Accuracy: {accuracy:.4f} | F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")

    return best_model_name, best_f1, best_model_class, best_model_params


def retrain_best_model(
    X: pd.DataFrame,
    y: pd.Series,
    best_model_name: str,
    best_model_class: object,
    best_model_params: dict,
    best_f1: "float | None" = None,
) -> None:
    """
    Retrains the best model on the full preprocessed dataset and logs to MLflow.

    Parameters:
    - X (pd.DataFrame): Full preprocessed features.
    - y (pd.Series): Full target.
    - best_model_name, best_model_class, best_model_params: From evaluate_candidates.
      A 'classifier__' prefix on parameter names is tolerated and stripped.
    - best_f1 (float | None): Optional F1 of the winner, used only in the
      progress message. Previously this name was read without being defined,
      which raised NameError on every call.

    Raises:
    - ValueError: If best_model_class is None (no candidate was selected).

    Notes:
    - No additional preprocessing needed (already done).
    - The fitted object is a scikit-learn Pipeline, so it is always logged
      with mlflow.sklearn.log_model, also when the classifier is XGBoost.
    - Registers the model in MLflow as "BestLeadScoringModel".
    """
    if best_model_class is None:
        raise ValueError("No candidate model was selected; nothing to retrain.")

    f1_note = f" (F1: {best_f1:.4f})" if best_f1 is not None else ""
    print(f"\n✅ Best model: {best_model_name}{f1_note} — Retraining on full dataset...")

    # Accept both plain and step-prefixed names; the bare estimator needs plain ones.
    final_model_params = {k.replace("classifier__", ""): v for k, v in best_model_params.items()}
    if best_model_name == 'XGBoost':
        final_model_params.update({'use_label_encoder': False, 'eval_metric': 'logloss'})
    final_model = best_model_class(random_state=42, **final_model_params)

    # Simple pipeline with just the classifier (preprocessing already done on X)
    full_pipeline = Pipeline([('classifier', final_model)])

    with mlflow.start_run(run_name=f"Final_{best_model_name}_FullData"):
        full_pipeline.fit(X, y)

        # Log the whole pipeline with the sklearn flavor.
        mlflow.sklearn.log_model(full_pipeline, artifact_path="model", registered_model_name="BestLeadScoringModel")

        print(f"\n📦 Final model '{best_model_name}' retrained and registered to MLflow.")


def build_pipeline_and_train(df: pd.DataFrame) -> None:
    """
    Orchestrates the full workflow: prepare data, check drift, evaluate, retrain.

    Parameters:
    - df (pd.DataFrame): Input DataFrame.

    Example:
    >>> df = pd.read_csv('leads_data.csv')
    >>> build_pipeline_and_train(df)
    """
    X_preprocessed, y, X_train, X_test, y_train, y_test = prepare_data(df)
    check_data_drift(X_train, X_test)
    best_model_name, best_f1, best_model_class, best_model_params = evaluate_candidates(X_train, y_train, X_test, y_test)
    retrain_best_model(X_preprocessed, y, best_model_name, best_model_class, best_model_params, best_f1=best_f1)


In [25]:
df = pd.read_csv("Datasets/Lead Scoring.csv")

In [32]:
build_pipeline_and_train(df)



ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 484, in safe_patch_function
    patch_function(call_original, *args, **kwargs)
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 182, in patch_with_managed_run
    result = patch_function(original, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\sklearn\__init__.py", line 1724, in patched_fit
    return original(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 475, in call_original
    return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 426, in call_original_fn_with_event_logging
    original_fn_result = original_fn(*og_args, **og_kwargs)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 472, in _original_fn
    original_result = original(*_og_args, **_og_kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\pipeline.py", line 661, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 484, in safe_patch_function
    patch_function(call_original, *args, **kwargs)
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 182, in patch_with_managed_run
    result = patch_function(original, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\sklearn\__init__.py", line 1724, in patched_fit
    return original(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 475, in call_original
    return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 426, in call_original_fn_with_event_logging
    original_fn_result = original_fn(*og_args, **og_kwargs)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 472, in _original_fn
    original_result = original(*_og_args, **_og_kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\ensemble\_forest.py", line 359, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\utils\validation.py", line 2971, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1368, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1053, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\utils\_array_api.py", line 757, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\pandas\core\generic.py", line 2168, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'No'

--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 403, in safe_patch_function
    return original(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\pipeline.py", line 661, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 403, in safe_patch_function
    return original(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\ensemble\_forest.py", line 359, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\utils\validation.py", line 2971, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1368, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1053, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\sklearn\utils\_array_api.py", line 757, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Minfy\Desktop\Practice\Lead_Scoring_Final_Folder\.venv\Lib\site-packages\pandas\core\generic.py", line 2168, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'No'


In [29]:
df = pd.read_csv("Datasets/Lead Scoring.csv")

In [23]:
# Count how often each value occurs in the "Digital Advertisement" column.
df["Digital Advertisement"].value_counts()

Digital Advertisement
No     9236
Yes       4
Name: count, dtype: int64

In [None]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from Pipelines.Feature_Engineering_Pipeline import LeadScoringFeatureEngineer
from Pipelines.Minimal_Training_Pipeline import LeadScoringMinimalCleaner
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from evidently import Report
from evidently.presets import DataDriftPreset
import json
import re

def prepare_data(df: pd.DataFrame, target_col: str = 'Converted') -> tuple:
    """
    Prepares data by preprocessing the full dataset first, then splitting to avoid encoding inconsistencies.

    Parameters:
    - df (pd.DataFrame): Input DataFrame with features and target.
    - target_col (str): Name of the target column (default: 'Converted').

    Returns:
    - Tuple: (X_preprocessed, y, X_train, X_test, y_train, y_test)

    Raises:
    - ValueError: If DataFrame is empty or target column is missing.

    Notes:
    - Rows with any missing value are dropped from a copy; the caller's
      DataFrame is not modified.
    - Preprocessing is LeadScoringMinimalCleaner followed by
      LeadScoringFeatureEngineer, fitted on all rows before the split.
    - NOTE(review): fitting with y on all rows lets any target-dependent
      transformation see test labels (leakage) -- confirm this is acceptable.
    """
    if df.empty:
        raise ValueError("Input DataFrame is empty.")
    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found in DataFrame.")

    # Non-inplace dropna keeps the caller's frame intact.
    complete_rows = df.dropna()
    X = complete_rows.drop(columns=[target_col])
    y = complete_rows[target_col]

    preprocessor = Pipeline([
        ("minimal Cleaner", LeadScoringMinimalCleaner()),
        ("feature engineering", LeadScoringFeatureEngineer()),
    ])
    X_preprocessed = preprocessor.fit_transform(X, y)  # Fit and transform full X

    # Stratified 80/20 split of the already-preprocessed data, fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X_preprocessed, y, test_size=0.2, random_state=42, stratify=y
    )

    return X_preprocessed, y, X_train, X_test, y_train, y_test

def check_data_drift(X_train: pd.DataFrame, X_test: pd.DataFrame) -> None:
    """
    Measures drift between train and test features with Evidently and records it in MLflow.

    Parameters:
    - X_train (pd.DataFrame): Reference features.
    - X_test (pd.DataFrame): Current features.

    Notes:
    - Logged metric names are the cleaned "metric" value of each report entry
      followed by the drift field name.
    - Entries whose "result" mapping has neither "dataset_drift" nor
      "share_of_drifted_columns" are ignored.
    """
    invalid_chars = re.compile(r"[^\w\-/ .]")

    drift_report = Report(metrics=[DataDriftPreset()])
    run_output = drift_report.run(reference_data=X_train, current_data=X_test)
    as_dict = json.loads(run_output.json())

    mlflow.set_experiment("evidently_train_vs_test")
    with mlflow.start_run():
        for item in as_dict.get("metrics", []):
            # Replace anything outside word chars, '-', '/', ' ' and '.' with '_'.
            label = invalid_chars.sub("_", item.get("metric", ""))
            outcome = item.get("result", {})

            has_dataset_flag = "dataset_drift" in outcome
            if has_dataset_flag:
                mlflow.log_metric(f"{label}_dataset_drift", int(outcome["dataset_drift"]))
                mlflow.log_metric(f"{label}_number_of_drifted_columns", outcome["number_of_drifted_columns"])
            elif "share_of_drifted_columns" in outcome:
                mlflow.log_metric(f"{label}_share_of_drifted_columns", outcome["share_of_drifted_columns"])

def evaluate_candidates(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series) -> tuple:
    """
    Grid-search each candidate classifier and pick the one with the best test F1.

    Every candidate is wrapped in a single-step pipeline, tuned with a 3-fold
    ``GridSearchCV`` (scoring='f1') on the training data, and then scored on
    the test data. Accuracy, F1, precision and recall are logged to a
    dedicated MLflow run per candidate in the "Lead_Scoring_Classification"
    experiment.

    Args:
        X_train: Preprocessed training features.
        y_train: Training labels.
        X_test: Preprocessed test features.
        y_test: Test labels.

    Returns:
        Tuple ``(best_model_name, best_f1, best_model_class, best_model_params)``
        where ``best_model_params`` still carries the ``classifier__`` prefix
        used by the grid search.

    Raises:
        ValueError: If no candidate could be evaluated (every candidate was
            skipped because it has no parameter grid).
    """
    classifiers = {
        'RandomForest': RandomForestClassifier(random_state=42),
        'GradientBoosting': GradientBoostingClassifier(random_state=42),
        'XGBoost': XGBClassifier(random_state=42, verbosity=0, use_label_encoder=False, eval_metric='logloss')
    }

    param_grids = {
        'RandomForest': {
            'classifier__n_estimators': [100],
            'classifier__max_depth': [10]
        },
        'GradientBoosting': {
            'classifier__n_estimators': [100],
            'classifier__learning_rate': [0.1]
        },
        'XGBoost': {
            'classifier__n_estimators': [100],
            'classifier__learning_rate': [0.1]
        }
    }

    mlflow.set_experiment("Lead_Scoring_Classification")
    mlflow.sklearn.autolog()
    mlflow.xgboost.autolog()

    best_model_name = None
    best_f1 = 0.0
    best_model_class = None
    best_model_params = {}

    for name, model in classifiers.items():
        # Only a missing grid is a reason to skip a candidate. Errors raised
        # while fitting or scoring are left to propagate so they are not
        # misreported as a configuration problem.
        param_grid = param_grids.get(name)
        if param_grid is None:
            print(
                f"Error for model {name}: Missing param_grid for model '{name}'. "
                "Check param_grids dictionary. Skipping..."
            )
            continue

        with mlflow.start_run(run_name=f"Candidate_{name}"):
            pipeline = Pipeline([('classifier', model)])

            grid = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1')
            grid.fit(X_train, y_train)
            y_pred = grid.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)

            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("f1_score", f1)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)

            # Accept the first evaluated candidate unconditionally so that a
            # run where every model scores F1 == 0 still yields a winner.
            if best_model_name is None or f1 > best_f1:
                best_f1 = f1
                best_model_name = name
                best_model_class = model.__class__
                best_model_params = grid.best_params_

            print(f"\nModel: {name}")
            print(f"Best Params: {grid.best_params_}")
            print(f"Accuracy: {accuracy:.4f} | F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")

    if best_model_name is None:
        raise ValueError("No models were successfully evaluated. Check param_grids and classifiers.")

    return best_model_name, best_f1, best_model_class, best_model_params

def retrain_best_model(X: pd.DataFrame, y: pd.Series, best_model_name: str, best_model_class: object, best_model_params: dict, best_f1: "float | None" = None) -> None:
    """
    Retrain the winning model on the full preprocessed dataset and register it in MLflow.

    Args:
        X: Full preprocessed feature matrix.
        y: Labels aligned with ``X``.
        best_model_name: Display name of the winning candidate; also used in
            the MLflow run name.
        best_model_class: Estimator class to instantiate.
        best_model_params: Best grid-search parameters; a leading
            ``classifier__`` on a key is stripped before the estimator is built.
        best_f1: Optional F1 score of the winning candidate, shown in the
            progress message when given.
    """
    f1_note = f" (F1: {best_f1:.4f})" if best_f1 is not None else ""
    print(f"\n✅ Best model: {best_model_name}{f1_note} — Retraining on full dataset...")

    # Strip only a leading 'classifier__' so the keys match the estimator's
    # constructor arguments.
    prefix = "classifier__"
    final_model_params = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in best_model_params.items()
    }
    if best_model_name == 'XGBoost':
        final_model_params.update({'use_label_encoder': False, 'eval_metric': 'logloss'})
    # Default the seed instead of passing it as a second keyword, so a tuned
    # 'random_state' in the params does not raise a duplicate-argument error.
    final_model_params.setdefault("random_state", 42)
    final_model = best_model_class(**final_model_params)

    full_pipeline = Pipeline([('classifier', final_model)])

    with mlflow.start_run(run_name=f"Final_{best_model_name}_FullData"):
        full_pipeline.fit(X, y)

        # The object being saved is an sklearn Pipeline (even when the wrapped
        # estimator is XGBoost), so it is logged with the sklearn flavour.
        mlflow.sklearn.log_model(full_pipeline, artifact_path="model", registered_model_name="BestLeadScoringModel")

        print(f"\n📦 Final model '{best_model_name}' retrained and registered to MLflow.")

def build_pipeline_and_train(df: pd.DataFrame) -> None:
    """
    Orchestrate the full workflow.

    Prepares and splits the data, logs a train-vs-test drift report, evaluates
    the candidate classifiers, and retrains the best one on the full
    preprocessed dataset.
    """
    X_preprocessed, y, X_train, X_test, y_train, y_test = prepare_data(df)
    check_data_drift(X_train, X_test)
    best_model_name, best_f1, best_model_class, best_model_params = evaluate_candidates(X_train, y_train, X_test, y_test)
    retrain_best_model(X_preprocessed, y, best_model_name, best_model_class, best_model_params, best_f1=best_f1)
