# Installations

In [None]:
%%writefile requirements.txt
category-encoders
optuna
xgboost
scikit-learn
optuna_fast_fanova
mlflow

Overwriting requirements.txt


In [None]:
!pip install -qU -r requirements.txt

# Imports

In [None]:
# Standard Library Imports
import gc
import os
import re
import sqlite3
import sys
from math import sqrt
from pathlib import Path

# Third-Party Imports
import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import xgboost as xgb
from category_encoders import BinaryEncoder, LeaveOneOutEncoder
from optuna.importance import get_param_importances
from optuna.pruners import HyperbandPruner
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances
)
from optuna_fast_fanova import FanovaImportanceEvaluator
from scipy import stats
from scipy.stats import ks_2samp, mstats, zscore
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import (
    f_regression,
    mutual_info_regression,
    SelectKBest,
    VarianceThreshold
)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import (
    KFold,
    RepeatedKFold,
    cross_val_score,
    train_test_split
)
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import (
    LabelEncoder,
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler
)
from sklearn.svm import OneClassSVM
from statsmodels.stats.diagnostic import linear_reset
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from xgboost import XGBRegressor
import yaml

# Constants

We define a `config.yaml` file to externalize key parameters and paths.

In [None]:
# YAML settings shared by the notebook and the helper modules
config_yaml_content = """
PATH_PARTS:
  - /content
  - drive
  - MyDrive
  - Colab
  - Machine Learning
  - Regression
  - Radancy

RANDOM_STATE: 42
COMPRESS: 9
N_JOBS: -1
TARGET: "cpa"
"""

# Absolute location of config.yaml inside the project folder
config_yaml_path = os.path.join(
    '/content',
    'drive',
    'MyDrive',
    'Colab',
    'Machine Learning',
    'Regression',
    'Radancy',
    'config.yaml'
)

# Mode "w" truncates an existing file, so no separate delete step is needed;
# the encoding is pinned so the file reads back identically on any platform
with open(config_yaml_path, "w", encoding="utf-8") as f:
  f.write(config_yaml_content)

# Load config back from disk so the notebook uses exactly what was persisted
with open(config_yaml_path, "r", encoding="utf-8") as file:
  config = yaml.safe_load(file)

# Cross-platform path assembly
PATH = Path().joinpath(*config["PATH_PARTS"])

# Other parameters
RANDOM_STATE = config["RANDOM_STATE"]
COMPRESS = config["COMPRESS"]
N_JOBS = config["N_JOBS"]
TARGET = config["TARGET"]

# Importing Custom Utilities Safely

We add the project root directory to `sys.path` to enable importing of our own modular code (e.g. ml_utils.py) which contains reusable functions.

In [None]:
sys.path.append(str(PATH))

In [None]:
from ml_utils import *

# Experiment Tracking with MLflow

We initialize MLflow with a local file-based tracking URI to systematically log and compare model experiments including parameters, metrics, artifacts and models.

In [None]:
# Point MLflow at a file-based store located at PATH/ml_experiments
mlflow.set_tracking_uri(f"file://{PATH / 'ml_experiments'}")

# Utility Functions

This defines `ml_utils.py` - a centralized, reusable module containing essential functions for the entire ML pipeline.

Rather than scattering logic across the notebook, we encapsulate key operations into well-documented, testable utilities that promote:

- **Code Reusability:**
Functions like `evaluate_and_log_metrics`, `shap_analysis` and `log_transformer_to_mlflow` can be reused across experiments and projects.

- **Experiment Tracking & Reproducibility:**
Full integration with MLflow ensures every model, parameter, metric and artifact (e.g. SHAP values, Optuna search space) is logged systematically.

- **Robust Diagnostics:**
Includes statistical checks (`VIF`, correlation analysis) to validate assumptions.

- **Safe Preprocessing Workflow:**
`apply_transformer` and `compare_features_train_validation` ensure consistent transformations across train/validation sets - preventing subtle bugs.

- **Config-Driven & Portable:**
Loads paths and settings from `config.yaml`, enabling cross-platform compatibility (e.g. local, Colab, CI/CD) without hardcoded paths.

- **Model Management:**
Functions like `load_model`, `get_best_params` and `save_object` streamline model retrieval and deployment.

In [None]:
%%writefile ml_utils.py

# Standard library imports
import json
import os
import shutil
from pathlib import Path

# Third-party imports
import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import shap
import yaml
from mlflow.tracking import MlflowClient
from scipy.stats import ks_2samp, chi2_contingency
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant

# Absolute location of the shared settings file inside the project folder
config_yaml_path = os.path.join(
    '/content', 'drive', 'MyDrive', 'Colab',
    'Machine Learning', 'Regression', 'Radancy',
    'config.yaml'
)

# Parse the YAML settings once, at import time
with open(config_yaml_path, "r") as file:
  config = yaml.safe_load(file)

# Project root, assembled from the configured path components
PATH = Path().joinpath(*config["PATH_PARTS"])

# Remaining scalar settings, unpacked in a fixed order
RANDOM_STATE, COMPRESS, N_JOBS, TARGET = (
    config[setting]
    for setting in ("RANDOM_STATE", "COMPRESS", "N_JOBS", "TARGET")
)

def calculate_vif(df, target_col):
  """
  Compute the Variance Inflation Factor (VIF) of every float64/int64 column of
  a DataFrame, leaving the target column out.

  `add_constant` is applied to the selected columns before the VIFs are
  computed, so the result can also contain a row for the added constant.

  Parameters:
  - df: pandas.DataFrame — the input DataFrame
  - target_col: str — name of the target column, dropped before the calculation

  Returns:
  - vif_data: pandas.DataFrame — columns "feature" and "VIF", sorted by VIF in
  descending order

  VIF Interpretation Guide:
  -------------------------
  VIF Value   Interpretation
  ---------   ------------------------------------------------------
  1           No multicollinearity
  1 - 5       Low to moderate multicollinearity (usually acceptable)
  > 5         Potential multicollinearity issue
  > 10        Serious multicollinearity — needs attention
  """
  predictors = df.drop(columns=[target_col])
  numeric_predictors = predictors.select_dtypes(include=['float64', 'int64'])

  # Design matrix including the intercept term
  design = add_constant(numeric_predictors)
  design_values = design.values

  vif_scores = [
      variance_inflation_factor(design_values, position)
      for position in range(design.shape[1])
  ]

  vif_data = pd.DataFrame({"feature": design.columns, "VIF": vif_scores})
  return vif_data.sort_values(by='VIF', ascending=False)
###############################################################################

def plot_and_save_correlation_matrix(
    df,
    save_path,
    filename='corr_matrix.csv',
    figsize=(12, 8),
    title='Correlation Matrix'
):
  """
  Compute the correlation matrix of a DataFrame, save it as a CSV and show it
  as an annotated heatmap.

  Only numeric columns take part in the correlation. Non-numeric columns
  (strings, dates, ...) are left out instead of making `DataFrame.corr` raise,
  which is what recent pandas versions do when such columns are present.

  Parameters:
  - df: pandas.DataFrame — the input DataFrame
  - save_path: str — directory the CSV is written to (created if missing)
  - filename: str — name of the CSV file to save
  - figsize: tuple — size of the heatmap figure
  - title: str — title of the heatmap plot

  Returns:
  - None
  """
  # Restrict to numeric columns so mixed-type frames do not raise
  corr_matrix = df.corr(numeric_only=True)

  # Persist the matrix next to the other experiment outputs
  os.makedirs(save_path, exist_ok=True)
  corr_matrix.to_csv(os.path.join(save_path, filename))

  # Plot heatmap
  plt.figure(figsize=figsize)
  sns.heatmap(
      corr_matrix,
      annot=True,
      fmt=".2f",
      cmap='coolwarm',
      square=True,
      linewidths=0.5
  )

  plt.title(title)
  plt.tight_layout()
  plt.show()
###############################################################################

def apply_transformer(df_train, df_val, transformer, target_column):
  """
  Run an already-fitted transformer over the feature columns of the training
  and validation DataFrames and re-attach the untouched target column to each.

  The transformed features and the target are joined with `pd.concat` along
  the column axis, so the transformer output is expected to be a pandas object
  that lines up with the input index.

  Parameters:
  - df_train (pd.DataFrame): Training DataFrame
  - df_val (pd.DataFrame): Validation DataFrame
  - transformer: Object exposing a `transform` method
  - target_column (str): Name of the target column to preserve

  Returns:
  tuple: Transformed (df_train, df_val)
  """
  def _transform_keep_target(frame):
    # Transform everything except the target, then put the target back
    features = frame.drop(target_column, axis=1)
    transformed_features = transformer.transform(features)
    return pd.concat([transformed_features, frame[target_column]], axis=1)

  return _transform_keep_target(df_train), _transform_keep_target(df_val)
###############################################################################

def log_transformer_to_mlflow(
    transformer,
    experiment_name,
    model_name,
    df,
    target_col=None
):
  """
  Fit a transformer on `df`, then record it in MLflow: every instance
  attribute is logged as a (stringified) parameter and the fitted object is
  logged as a model.

  Args:
  - transformer: The transformer object to fit and log.
  - experiment_name (str): Name of the MLflow experiment.
  - model_name (str): Name for the model artifact in MLflow.
  - df: DataFrame to fit the transformer.
  - target_col (str, optional): Name of the target column. When given, it is
    split off from `df` and passed to `fit` as `y`; otherwise `y` is None.
  """
  # Separate features and target only when a target column was named
  if target_col:
    features, target = df.drop(target_col, axis=1), df[target_col]
  else:
    features, target = df, None

  transformer.fit(features, target)

  mlflow.set_experiment(experiment_name)

  with mlflow.start_run():
    # A parameter that cannot be logged is reported and skipped so that the
    # remaining attributes and the model are still recorded
    for attr_name, attr_value in vars(transformer).items():
      try:
        mlflow.log_param(attr_name, str(attr_value))
      except Exception as e:
        print(f"Could not log param {attr_name}: {e}")

    mlflow.sklearn.log_model(sk_model=transformer, name=model_name)
###############################################################################

def save_object(obj, filename, path=PATH, compress=COMPRESS):
  """
  Save any object (e.g., DataFrame, estimator) to a specified path using joblib.

  Parameters:
  - obj: Object to save (e.g., pandas DataFrame, scikit-learn estimator)
  - filename (str): Name of the output file (should end with .pkl)
  - path (str or Path): Directory the file is written to
    (default: the project PATH from config.yaml)
  - compress (int): Compression level for joblib
    (default: the COMPRESS value from config.yaml)

  Returns:
  - None
  """
  # Honour the caller-supplied directory instead of always using PATH
  file_path = Path(path) / filename

  # Remove a stale copy first so the dump always starts from a clean file
  if os.path.exists(file_path):
    os.remove(file_path)

  joblib.dump(obj, file_path, compress=compress)
###############################################################################

def compare_features_train_validation(df_train, df_val):
  """
  Report which columns differ between the training and validation DataFrames.

  Preprocessing steps (interaction terms, polynomial features, dropping
  collinear features, feature selection) can leave the two frames with
  different columns. This function prints which columns df_val is missing and
  which extra columns it carries, so the frames can be aligned.

  Parameters:
  - df_train (pandas.DataFrame): Training dataset
  - df_val (pandas.DataFrame): Validation dataset

  Returns:
  - set: Columns present in df_val but not in
  df_train (to be dropped from df_val)
  """
  train_columns = set(df_train.columns)
  val_columns = set(df_val.columns)

  # Columns df_val lacks
  train_only = train_columns.difference(val_columns)
  if not train_only:
    print("No features to be added in df_val.")
  else:
    print("Add features in df_val:", train_only)

  # Columns df_val has in excess
  validation_only = val_columns.difference(train_columns)
  if not validation_only:
    print("No features to be dropped from df_val.")
  else:
    print("Drop features from df_val:", validation_only)

  return validation_only
###############################################################################

def copy_file_to_destination(source_path: str, destination_path: Path) -> None:
  """
  Copy a file to the given destination, removing whatever file already sits
  at the destination first.

  Args:
  - source_path (str): Path to the source file
  - destination_path (Path): Path to the destination file

  Returns:
  - None
  """
  target = Path(destination_path)

  # Clear out a previous copy so the destination only ever holds the new file
  if target.exists():
    target.unlink()
    print("Existing file deleted from destination.")

  shutil.copy(source_path, destination_path)
  print("File copied to destination successfully.")
###############################################################################

def log_optuna_best_trial_search_space(study):
  """
  Log the parameter distributions of the study's best trial to the active
  MLflow run as a JSON artifact. Nothing is logged when the study has no
  completed trial.

  Args:
  - study (optuna.study.Study): The Optuna study object.
  """
  completed_state = optuna.trial.TrialState.COMPLETE
  has_completed_trial = any(
      trial.state == completed_state for trial in study.trials
  )

  # Without a completed trial there is no best trial to describe
  if not has_completed_trial:
    print("No completed trials available to log search space.")
    return

  # The run id is embedded in the artifact name
  run_id = mlflow.active_run().info.run_id
  artifact_name = f"optuna_best_trial_search_space_{run_id}.json"

  # Distributions are stringified so they serialise to JSON
  search_space = {
      name: str(distribution)
      for name, distribution in study.best_trial.distributions.items()
  }
  mlflow.log_dict(search_space, artifact_name)

  print(
      "Parameter search space for the best trial logged to MLflow as "
      f"artifact: {artifact_name}"
  )
###############################################################################

def evaluate_and_log_metrics(y_val, y_pred, prefix="val"):
  """
  Score predictions on the log scale and, after undoing the log transform
  with `expm1`, on the original scale. RMSE, MAE and R² of both scales are
  logged to MLflow and printed.

  Parameters:
  -----------
  y_val : array-like
  - True target values (log-transformed).

  y_pred : array-like
  - Predicted target values (log-transformed).

  prefix : str, optional
  - Prefix for MLflow metric names (default: "val").

  Returns:
  --------
  tuple
  - (y_val_original, y_pred_original): the inputs mapped back to the original
    scale.
  """

  def _score_log_and_print(truth, predictions, metric_suffix, scale_label):
    # Compute the three scores, log them under "<prefix>_<metric>_<suffix>"
    # and echo them to stdout
    rmse = np.sqrt(mean_squared_error(truth, predictions))
    mae = mean_absolute_error(truth, predictions)
    r2 = r2_score(truth, predictions)

    for metric_name, metric_value in (("rmse", rmse), ("mae", mae), ("r2", r2)):
      mlflow.log_metric(f"{prefix}_{metric_name}_{metric_suffix}", metric_value)

    print(f"\nFinal Evaluation on Hold-Out Validation Set ({scale_label}):")
    print(f"Hold-Out Validation RMSE ({scale_label}): {rmse}")
    print(f"Hold-Out Validation MAE ({scale_label}): {mae}")
    print(f"Hold-Out Validation R² Score ({scale_label}): {r2}")

  # --- Log Scale Evaluation ---
  _score_log_and_print(y_val, y_pred, "log", "Log Scale")

  # --- Original Scale Evaluation ---
  y_pred_original = np.expm1(y_pred)
  y_val_original = np.expm1(y_val)
  _score_log_and_print(
      y_val_original, y_pred_original, "original", "Original Scale"
  )

  return y_val_original, y_pred_original
###############################################################################

def get_best_params(experiment_name):
  """
  Return the logged parameters of the run that ranks first when the active
  runs of an MLflow experiment are ordered by ascending val_rmse_original.

  Parameters:
  - experiment_name (str): Name of the MLflow experiment.

  Returns:
  - dict: Parameters of the top-ranked run.
  - None: If the experiment is not found or no runs exist.
  """
  client = MlflowClient()

  experiment = client.get_experiment_by_name(experiment_name)
  if experiment is None:
    print(f"Experiment '{experiment_name}' not found.")
    return None

  # Only the single top-ranked run is requested from the tracking store
  top_runs = client.search_runs(
      experiment_ids=[experiment.experiment_id],
      filter_string="",
      run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY,
      max_results=1,
      order_by=["metrics.val_rmse_original ASC"]
  )

  if not top_runs:
    print(f"No runs found for experiment '{experiment_name}'.")
    return None

  return top_runs[0].data.params

###############################################################################
def convert_param(param, param_type, param_name):
  """
  Convert a parameter value (typically a string read back from MLflow) to the
  requested Python type.

  Args:
  - param: The raw value to convert.
  - param_type (str): 'float', 'int' or 'bool'. Any other value returns
    `param` unchanged (categorical or other types).
  - param_name (str): Name of the parameter, used in the error message.

  Returns:
  - The converted value.

  Raises:
  - ValueError: If `param` cannot be converted to the requested type,
    including non-finite values such as 'inf' requested as 'int'.
  """
  try:
    if param_type == 'float':
      return float(param)
    if param_type == 'int':
      # Go through float so strings such as "3.0" are accepted
      return int(float(param))
    if param_type == 'bool':
      return param.lower() == 'true' if isinstance(param, str) else bool(param)
    return param  # Keep as is for categorical or other types
  except (ValueError, TypeError, OverflowError) as err:
    # OverflowError covers int(float('inf')); every conversion failure is
    # reported through the same exception type
    raise ValueError(f"Invalid value for {param_name}: {param}") from err

###############################################################################
def load_model(experiment_name, model_name, criteria='latest'):
  """
  Load a model from one run of an MLflow experiment, picking the run either
  by recency or by lowest val_rmse_original.

  Args:
  - experiment_name (str): The name of the MLflow experiment.
  - model_name (str): The name used as the artifact path during logging.
  - criteria (str): Selection criteria for the model ('latest' or 'min_rmse').
    Defaults to 'latest'.

  Returns:
  - model: The loaded model object.

  Raises:
  - ValueError: If experiment or runs are not found, or if invalid criteria is
    provided.
  """
  client = MlflowClient()

  experiment = client.get_experiment_by_name(experiment_name)
  if experiment is None:
    raise ValueError(f"Experiment '{experiment_name}' not found.")

  if criteria not in ('latest', 'min_rmse'):
    raise ValueError("Criteria must be 'latest' or 'min_rmse'.")

  # 'latest' ranks by start time, 'min_rmse' by the validation RMSE metric
  ordering = (
      "start_time DESC" if criteria == 'latest'
      else "metrics.val_rmse_original ASC"
  )

  # Only the top-ranked run is needed
  candidate_runs = client.search_runs(
      experiment_ids=[experiment.experiment_id],
      order_by=[ordering],
      max_results=1
  )

  if not candidate_runs:
    raise ValueError(f"No runs found in experiment '{experiment_name}'.")

  selected_run_id = candidate_runs[0].info.run_id

  # Resolve the model through its run-relative URI
  return mlflow.sklearn.load_model(f"runs:/{selected_run_id}/{model_name}")
###############################################################################

def shap_analysis(
    model,
    X,
    model_type: str,
    random_state: int = RANDOM_STATE,
    sample_size: int = 100
):
  """
  Compute SHAP values with the explainer matching the model family, reduce
  them to the mean absolute value per feature, log that summary to MLflow as
  the JSON parameter "shap_mean_abs_by_feature" and print it.

  Parameters:
  model : trained model
  - Final fitted model.

  X : pd.DataFrame
  - Input features used for training or evaluation.

  model_type : str
  - One of ["linear", "tree", "kernel"] depending on algorithm.

  random_state : int, optional
  - Random state used when sampling rows.

  sample_size : int, optional
  - Maximum number of rows sampled from X for the "tree" and "kernel"
    explainers; the "linear" explainer always uses all of X.

  Returns:
  mean_abs_shap : pd.Series
  - Mean absolute SHAP values per feature, sorted in descending order.

  Raises:
  ValueError
  - If model_type is not one of the supported values.
  """
  if model_type not in ("linear", "tree", "kernel"):
    raise ValueError(f"Unsupported model_type: {model_type}")

  if model_type == "linear":
    explained_rows = X
    linear_explainer = shap.LinearExplainer(
        model,
        X,
        feature_perturbation="interventional"
    )
    shap_values = linear_explainer(X).values
  else:
    # Tree and kernel explainers work on a row sample to limit computation
    explained_rows = X.sample(
        n=min(sample_size, len(X)),
        random_state=random_state
    )

    if model_type == "tree":
      shap_values = shap.TreeExplainer(model).shap_values(explained_rows)
    else:
      # The sampled rows act as both background data and explained rows
      kernel_explainer = shap.KernelExplainer(
          model.predict,
          explained_rows,
          feature_perturbation="interventional"
      )
      shap_values = kernel_explainer.shap_values(explained_rows, nsamples=100)

  shap_df = pd.DataFrame(shap_values, columns=explained_rows.columns)

  # Average magnitude of each feature's contribution, largest first
  mean_abs_shap = shap_df.abs().mean().sort_values(ascending=False)

  # Serialised to JSON so the whole ranking fits into one MLflow parameter
  mlflow.log_param(
      "shap_mean_abs_by_feature",
      json.dumps(mean_abs_shap.to_dict())
  )

  print("Mean Absolute SHAP Values:")
  print(mean_abs_shap)

  return mean_abs_shap
###############################################################################

def check_feature_distribution_mismatches(
    X_train,
    X_val,
    test_type,
    p_threshold=0.05
):
  """
  Compare every training feature against the same feature in the validation
  set with a two-sample statistical test and collect those whose p-value falls
  below the threshold.

  Features missing from the validation set are skipped with a message, and a
  feature whose test raises is reported and skipped.

  Parameters:
  - X_train (pandas.DataFrame): Training dataset features
    (all numerical or all categorical)
  - X_val (pandas.DataFrame): Validation dataset features (same type as X_train)
  - test_type (str): Statistical test to use
    ("Kolmogorov-Smirnov" for numerical, "Chi-squared" for categorical)
  - p_threshold (float): Significance level for p-value (default: 0.05)

  Returns:
  - dict: Maps each mismatched feature to {'p_value': ..., 'test': test_type}

  Raises:
  - ValueError: If test_type is not one of the two supported names.
  """
  print("Checking feature distribution mismatches:")

  if test_type not in ("Kolmogorov-Smirnov", "Chi-squared"):
    raise ValueError("test_type must be 'Kolmogorov-Smirnov' or 'Chi-squared'")

  use_chi_squared = test_type == "Chi-squared"
  mismatched_features = {}

  for col in X_train.columns:
    if col not in X_val.columns:
      print(f"Feature {col} not found in validation set, skipping.")
      continue

    try:
      if use_chi_squared:
        # Category frequencies of both sets over the union of categories
        train_counts = X_train[col].value_counts()
        val_counts = X_val[col].value_counts()
        categories = sorted(set(train_counts.index) | set(val_counts.index))

        contingency_table = np.array([
            [train_counts.get(category, 0) for category in categories],
            [val_counts.get(category, 0) for category in categories],
        ])
        _, p_value, _, _ = chi2_contingency(contingency_table)
      else:
        # Two-sample Kolmogorov-Smirnov test on the raw values
        _, p_value = ks_2samp(X_train[col], X_val[col])

      if p_value < p_threshold:
        print(
            f"Feature {col} has different distributions "
            f"({test_type} test, p-value: {p_value:.4f})"
        )
        mismatched_features[col] = {'p_value': p_value, 'test': test_type}

    except Exception as e:
      # One failing feature must not abort the scan of the others
      print(f"Error processing feature {col}: {str(e)}")
      continue

  return mismatched_features

Overwriting ml_utils.py


In [None]:
# Imported here because the notebook's import cell does not import shutil;
# otherwise it is only in scope as a side effect of `from ml_utils import *`
import shutil

source_path = 'ml_utils.py'
destination_path = PATH / 'ml_utils.py'

# Check if file exists in destination and delete it
if os.path.exists(destination_path):
  os.remove(destination_path)
  print("Existing file deleted from destination.")

# Copy the file to destination
shutil.copy(source_path, destination_path)
print("File copied to destination successfully.")

Existing file deleted from destination.
File copied to destination successfully.


# Data Loading and Preprocessing with Temporal Integrity

We define `DataLoader` as a custom scikit-learn transformer that handles:
- Loading from CSV
- Deterministic temporal splitting
- Basic cleaning and standardization

### Key Design Decisions:

- **Time-Aware Train/Val/Test Split:**  
The data is split sequentially (not randomly) by order of appearance - preserving temporal flow. This mimics real-world deployment where models predict future data based on past observations.

- **Controlled Data Ingestion:**  
`max_rows` enables fast prototyping on large datasets without memory issues - critical for iterative development.

- **Consistent Preprocessing:**  
	- Removes rows where `cost = 0`, `clicks = 0`, and `conversions = 1` to exclude invalid campaign data with applications but no associated cost or clicks
	- Fixes common data quality issues (e.g. typo `converions` to `conversions`)
	- Cleans string columns (lowercase, underscores, trim whitespace) for consistency
	- Drops irrelevant identifiers like `campaign_id` to prevent leakage

In [None]:
%%writefile data_loader.py
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class DataLoader(BaseEstimator, TransformerMixin):
  """
  Load a CSV file, split it sequentially into train/validation/test frames
  and provide basic cleaning for any of those frames.

  `fit` reads the file and performs the split in the order the rows appear in
  the file; `transform` cleans a DataFrame (date parsing and sorting, dropping
  `campaign_id`, normalising selected string columns).

  Parameters:
  - file_path: Path of the CSV file to read.
  - train_size (float): Fraction of rows assigned to the training split.
  - val_size (float): Fraction of rows assigned to the validation split; the
    rows after train + validation form the test split.
  - max_rows (int, optional): Keep only the first `max_rows` rows (after the
    invalid-row filter). None keeps all rows.
  - string_cols_to_clean (list of str, optional): Columns whose values are
    normalised in `transform`. None means no column is cleaned.
  """
  def __init__(
      self,
      file_path,
      train_size=0.7,
      val_size=0.15,
      max_rows=None,
      string_cols_to_clean=None
  ):
    self.file_path = file_path
    self.train_size = train_size
    self.val_size = val_size
    self.max_rows = max_rows
    self.string_cols_to_clean = string_cols_to_clean
    self.df_train = None
    self.df_val = None
    self.df_test = None

  def fit(self, X=None, y=None):
    """
    Read the CSV and build the train/validation/test splits.

    Args:
    - X: Ignored; the data always comes from `file_path`.
    - y: Ignored, included for compatibility.

    Returns:
    - self: Returns the instance itself.
    """
    # Load the dataset
    df_original = pd.read_csv(self.file_path)

    # Fix misspelled column if it exists
    if 'converions' in df_original.columns:
      df_original.rename(columns={"converions": "conversions"}, inplace=True)

    # Remove rows where cost and clicks are 0 and conversions is 1
    # (a row is kept as soon as any one of the three conditions is violated)
    df_original = df_original[
        (df_original['cost'] != 0)
        | (df_original['clicks'] != 0)
        | (df_original['conversions'] != 1)
    ]

    # Limit to max_rows if specified
    if self.max_rows is not None:
      df_original = df_original.head(self.max_rows)

    # Calculate split indices
    total_size = len(df_original)
    train_size = int(self.train_size * total_size)
    val_size = int(self.val_size * total_size)

    # Split the data sequentially, in the row order of the file
    self.df_train = df_original[:train_size].reset_index(drop=True)

    self.df_val = (
        df_original[train_size:train_size + val_size].reset_index(drop=True)
    )

    self.df_test = df_original[train_size + val_size:].reset_index(drop=True)
    return self

  def transform(self, X):
    """
    Return a cleaned copy of a DataFrame.

    Args:
    - X (pd.DataFrame): Frame to clean; it is not modified.

    Returns:
    - pd.DataFrame: Copy of X with `date` parsed and sorted (if present),
      `campaign_id` dropped (if present) and the configured string columns
      stripped, whitespace-collapsed to underscores and lower-cased.
    """
    df = X.copy()

    # Ensure date is in datetime format and sort
    if 'date' in df.columns:
      df['date'] = pd.to_datetime(
          df['date'],
          format='%Y-%m-%d'
      )

      df = df.sort_values(by='date')

    # Drop campaign_id if it exists
    df.drop(columns=['campaign_id'], inplace=True, errors='ignore')

    # Convert and clean selected string columns; None means nothing to clean
    for col in (self.string_cols_to_clean or []):
      if col in df.columns:
        df[col] = (
            df[col]
            .astype("string")
            .str.strip()
            .str.replace(r'\s+', '_', regex=True)
            .str.lower()
        )

    return df

  def get_train_val_test(self):
    """
    - Return the train, validation, and test DataFrames.
    - Added this method in case we wish to take a look at df_train, df_val
      and df_test before it is transformed.

    Raises:
    - ValueError: If fit() has not been called yet.
    """
    if self.df_train is None or self.df_val is None or self.df_test is None:
      raise ValueError("Must call fit() before get_train_val_test()")

    return self.df_train, self.df_val, self.df_test

Overwriting data_loader.py


In [None]:
copy_file_to_destination(
    source_path = 'data_loader.py',
    destination_path = PATH / 'data_loader.py'
)

Existing file deleted from destination.
File copied to destination successfully.


## Save Fitted Model

In [None]:
from data_loader import DataLoader

# Raw dataset inside the project folder
file_path = PATH / "data.csv"

# Text columns whose values DataLoader.transform normalises
string_cols_to_clean = [
    'category_id',
    'industry',
    'customer_id',
    'publisher',
    'market_id',
]

data_loader = DataLoader(
    file_path=file_path,
    train_size=0.7,
    max_rows=1000,
    string_cols_to_clean=string_cols_to_clean,
)

# Fitting reads the CSV and builds the three sequential splits
data_loader.fit()
df_train, df_val, df_test = data_loader.get_train_val_test()

# Record the loader and its settings in MLflow
log_transformer_to_mlflow(
    transformer=data_loader,
    experiment_name="Data_Loader",
    model_name="data_loader",
    df=df_train,
    target_col=None,
)

## Save and Delete Test Data

In [None]:
save_object(df_test, 'df_test.pkl')

In [None]:
del df_test

## Transform Train and Validation

In [None]:
df_train = data_loader.transform(df_train)
df_val = data_loader.transform(df_val)

In [None]:
df_train.shape

In [None]:
df_train.info()

In [None]:
# Check for NAs
df_train.isnull().sum()

In [None]:
df_train.info()

In [None]:
# Check unique values again
df_train['industry'].unique()

In [None]:
df_train['category_id'].unique()

In [None]:
df_train['publisher'].unique()

In [None]:
df_train['market_id'].unique()

In [None]:
# Drop duplicates if any
df_train.drop_duplicates(inplace=True)

# Target Processor

We define a custom scikit-learn transformer, `CPAProcessor`, to compute the **Cost Per Acquisition (CPA)** - a key business metric - directly from the raw `cost` and `conversions` columns.

## CPA Processor

In [None]:
%%writefile target_processor.py
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

TARGET = 'cpa'

class CPAProcessor(BaseEstimator, TransformerMixin):
  """
  Stateless transformer that derives Cost Per Acquisition (CPA) as
  cost / conversions. Conversions equal to zero are swapped for a configurable
  replacement value first, so the division never hits a zero denominator.
  """
  def __init__(self, zero_replacement=0.1):
    """
    Store the configuration.

    Args:
    - zero_replacement (float): Value substituted for zero conversions
    (default: 0.1).
    """
    self.zero_replacement = zero_replacement

  def fit(self, X, y=None):
    """
    No-op kept for scikit-learn pipeline compatibility.

    Args:
    - X (pd.DataFrame): Input DataFrame with 'cost' and 'conversions' columns.
    - y (None): Ignored, included for compatibility.

    Returns:
    - self: Returns the instance itself.
    """
    return self

  def transform(self, X):
    """
    Append the CPA column (named by the module-level TARGET) to a copy of X.

    Args:
    - X (pd.DataFrame): Input DataFrame with 'cost' and 'conversions' columns.

    Returns:
    - pd.DataFrame: Copy of X with the additional CPA column.

    Raises:
    - TypeError: If X is not a pandas DataFrame.
    - ValueError: If 'cost' or 'conversions' columns are missing.
    """
    if not isinstance(X, pd.DataFrame):
      raise TypeError("Input X must be a pandas DataFrame")

    required_columns = ('cost', 'conversions')
    if any(name not in X.columns for name in required_columns):
      raise ValueError(
          "Input DataFrame must contain 'cost' and 'conversions' columns."
      )

    # Denominator with zero conversions swapped for the replacement value
    safe_conversions = X['conversions'].replace(0, self.zero_replacement)

    # Work on a copy so the caller's frame is left untouched
    result = X.copy()
    result[TARGET] = X['cost'] / safe_conversions

    return result

## Fit and Transform Train and Validation

In [None]:
from target_processor import CPAProcessor

# Fit on the training split, then add the CPA target column to both the
# training and the validation split with the same processor instance.
cpa_processor = CPAProcessor(zero_replacement=0.1)
cpa_processor.fit(df_train)
df_train = cpa_processor.transform(df_train)
df_val = cpa_processor.transform(df_val)

# Outlier Detection

Outliers can distort model training and reduce generalization performance. To address this, several outlier detection techniques were systematically evaluated:

- **Methods tried:** `IsolationForest`, `Local Outlier Factor (LOF)`, `One-Class SVM`, `DBSCAN`, `Z-Score method`  
- **Purpose:** Identify and remove anomalous data points that could negatively affect model performance, while preserving useful information.

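Evaluation results (methods ranked by repeated K-fold cross-validated RMSE on the training set; the selected configuration is then checked on the validation set):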

1. **IsolationForest** consistently provided the best results, achieving the lowest cross-validated RMSE.  
2. **Followed by:** Local Outlier Factor (LOF), One-Class SVM, DBSCAN, and Z-Score method.  
3. The proportion of data removed as outliers was reasonable, ensuring that essential predictive signals were preserved.

**Decision:** The final model uses **IsolationForest** for outlier detection.

**Takeaway:** By systematically comparing multiple approaches and validating their impact on downstream performance, IsolationForest was selected as the outlier removal method, ensuring an empirically justified, robust, and interpretable preprocessing step.


In [None]:
# Every int64/float64 column of the training frame takes part in the
# outlier search.
numeric_cols = list(
    df_train.select_dtypes(include=['int64', 'float64']).columns
)

X_train_full = df_train[numeric_cols]
X_val = df_val[numeric_cols]

# Standardisation statistics are learned on the training split only and then
# re-used for the validation split.
scaler = StandardScaler().fit(X_train_full)
X_train_full_scaled = scaler.transform(X_train_full)
X_val_scaled = scaler.transform(X_val)

# Search budget and repeated K-fold layout
n_trials, n_splits, n_repeats = 500, 5, 3

# The Optuna study is persisted in a SQLite file under PATH, named after the
# MLflow experiment.
mlflow_experiment = "outlier_detection"
study_name = f"{mlflow_experiment}_study"
storage = f"sqlite:///{PATH}/{study_name}.db"

def _suggest_contamination(trial, prefix):
  """Suggest 'auto' or a float contamination for the `prefix` detector."""
  option = trial.suggest_categorical(
      f"{prefix}_contamination_option",
      ["auto", "float"]
  )

  if option == 'auto':
    return 'auto'

  return trial.suggest_float(f"{prefix}_contamination", 0.01, 0.2)


def _suggest_inlier_mask(trial):
  """
  Suggest an outlier detection method plus its hyperparameters and return a
  boolean NumPy array that is True for the training rows kept as inliers.

  Reads the module-level X_train_full_scaled (detector input) and
  X_train_full (raw values, used by the Z-Score method only).
  """
  outlier_method = trial.suggest_categorical(
      "outlier_method",
      [
          "IsolationForest",
          "LocalOutlierFactor",
          "OneClassSVM",
          "DBSCAN",
          "ZScore"
      ]
  )

  if outlier_method == "IsolationForest":
    contamination = _suggest_contamination(trial, "if")
    detector = IsolationForest(
        n_estimators=trial.suggest_int("if_n_estimators", 50, 300),
        max_samples=trial.suggest_float("if_max_samples", 0.1, 1.0),
        contamination=contamination,
        max_features=trial.suggest_float("if_max_features", 0.1, 1.0),
        random_state=RANDOM_STATE
    )
    mask = detector.fit_predict(X_train_full_scaled) == 1

  elif outlier_method == "LocalOutlierFactor":
    contamination = _suggest_contamination(trial, "lof")
    detector = LocalOutlierFactor(
        n_neighbors=trial.suggest_int("lof_n_neighbors", 5, 100),
        algorithm=trial.suggest_categorical(
            "lof_algorithm",
            ["auto", "ball_tree", "kd_tree", "brute"]
        ),
        leaf_size=trial.suggest_int("lof_leaf_size", 10, 100),
        metric=trial.suggest_categorical(
            "lof_metric",
            ["euclidean", "manhattan", "chebyshev", "minkowski"]
        ),
        p=trial.suggest_int("lof_p", 1, 5),
        contamination=contamination,
        n_jobs=N_JOBS
    )
    mask = detector.fit_predict(X_train_full_scaled) == 1

  elif outlier_method == "OneClassSVM":
    detector = OneClassSVM(
        kernel=trial.suggest_categorical(
            "ocsvm_kernel",
            ["linear", "poly", "rbf", "sigmoid"]
        ),
        degree=trial.suggest_int("ocsvm_degree", 2, 5),
        gamma=trial.suggest_float("ocsvm_gamma", 0.001, 1.0, log=True),
        coef0=trial.suggest_float("ocsvm_coef0", 0.0, 1.0),
        tol=trial.suggest_float("ocsvm_tol", 1e-4, 1e-2),
        nu=trial.suggest_float("ocsvm_nu", 0.01, 0.5),
        shrinking=trial.suggest_categorical("ocsvm_shrinking", [True, False]),
        max_iter=trial.suggest_int("ocsvm_max_iter", 100, 10000),
        verbose=0
    )
    mask = detector.fit_predict(X_train_full_scaled) == 1

  elif outlier_method == "DBSCAN":
    detector = DBSCAN(
        eps=trial.suggest_float("dbscan_eps", 0.1, 5.0),
        min_samples=trial.suggest_int("dbscan_min_samples", 3, 20),
        algorithm=trial.suggest_categorical(
            "dbscan_algorithm",
            ["auto", "ball_tree", "kd_tree", "brute"]
        ),
        leaf_size=trial.suggest_int("dbscan_leaf_size", 10, 100),
        metric=trial.suggest_categorical(
            "dbscan_metric",
            ["euclidean", "manhattan", "chebyshev", "minkowski"]
        ),
        p=trial.suggest_int("dbscan_p", 1, 5),
        n_jobs=N_JOBS
    )
    # DBSCAN labels noise points with -1; everything in a cluster is kept
    mask = detector.fit_predict(X_train_full_scaled) != -1

  else:  # "ZScore"
    threshold = trial.suggest_float("zscore_threshold", 2.0, 4.0)
    # Z-scores are computed on the raw (unscaled) features; the thresholds
    # are meant for the original columns, not for the standardised copy.
    z_scores = np.abs(zscore(X_train_full, axis=0))
    mask = (z_scores < threshold).all(axis=1)

  return np.asarray(mask, dtype=bool)


def _suggest_xgb_params(trial):
  """
  Suggest the XGBRegressor hyperparameters for this trial and return them as
  keyword arguments (fixed settings such as n_jobs are included).
  """
  return dict(
      objective=trial.suggest_categorical('objective', ['reg:squarederror']),
      base_score=trial.suggest_float('base_score', 0.1, 0.9),
      booster=trial.suggest_categorical('booster', ['gbtree', 'dart']),
      colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.5, 1.0),
      colsample_bynode=trial.suggest_float('colsample_bynode', 0.5, 1.0),
      colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
      # Optuna rejects low <= 0 when log=True, so the log-scaled ranges
      # start at a tiny positive value instead of 0.
      gamma=trial.suggest_float('gamma', 1e-8, 5.0, log=True),
      grow_policy=trial.suggest_categorical(
          'grow_policy',
          ['depthwise', 'lossguide']
      ),
      learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
      max_bin=trial.suggest_int('max_bin', 128, 512),
      max_cat_to_onehot=trial.suggest_int('max_cat_to_onehot', 1, 64),
      max_depth=trial.suggest_int('max_depth', 3, 10),
      max_leaves=trial.suggest_int('max_leaves', 0, 64),
      min_child_weight=trial.suggest_float('min_child_weight', 1, 10),
      n_estimators=trial.suggest_int('n_estimators', 50, 500),
      n_jobs=N_JOBS,
      num_parallel_tree=trial.suggest_int('num_parallel_tree', 1, 3),
      random_state=RANDOM_STATE,
      reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
      reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
      # NOTE(review): 'gradient_based' sampling may only be supported by
      # XGBoost's GPU hist implementation - confirm for the target hardware.
      sampling_method=trial.suggest_categorical(
          'sampling_method',
          ['uniform', 'gradient_based']
      ),
      scale_pos_weight=trial.suggest_float('scale_pos_weight', 0.5, 1.5),
      subsample=trial.suggest_float('subsample', 0.5, 1.0),
      tree_method=trial.suggest_categorical(
          'tree_method',
          ['auto', 'approx', 'hist']
      ),
      validate_parameters=True,
      verbosity=0
  )


def objective(trial):
  """
  Optuna objective: mean repeated K-fold RMSE of an XGBRegressor trained on
  the training rows that survive the suggested outlier detection method.

  The target is the log1p of the upper-winsorized TARGET column. The running
  mean RMSE is reported after every fold so the pruner can stop weak trials.

  Args:
  - trial (optuna.trial.Trial): Trial used to sample all hyperparameters.

  Returns:
  - float: Mean RMSE over all evaluated folds.

  Raises:
  - optuna.TrialPruned: If the pruner stops the trial, or if the outlier
  method keeps fewer rows than there are CV splits.
  """
  winsorize_upper = trial.suggest_float("winsorize_upper", 0.90, 1)

  # Apply winsorization to target
  y_train_full = np.log1p(
      mstats.winsorize(df_train[TARGET], limits=[0, 1-winsorize_upper])
  )

  mask = _suggest_inlier_mask(trial)

  # Sampled once per trial (not once per fold); values are identical for
  # every fold anyway.
  model_params = _suggest_xgb_params(trial)

  # A detector that rejects (almost) every row would make the K-fold split
  # fail and abort the whole study; prune such trials instead.
  if mask.sum() < n_splits:
    raise optuna.TrialPruned()

  # The target must not be used as a feature: numeric_cols is built from a
  # frame that already contains TARGET, which would leak the label.
  # NOTE(review): 'cost' and 'conversions' determine the target as well;
  # confirm whether they should also be excluded here.
  X_clean = X_train_full.drop(columns=[TARGET], errors='ignore')[mask]

  # Convert to Pandas Series
  y_clean = pd.Series(y_train_full[mask])

  # Repeated K-Fold Cross-Validation
  rkf = RepeatedKFold(
      n_splits=n_splits,
      n_repeats=n_repeats,
      random_state=RANDOM_STATE
  )

  rmse_list = []

  for fold_idx, (train_idx, val_idx) in enumerate(rkf.split(X_clean)):
    X_train, X_val_fold = X_clean.iloc[train_idx], X_clean.iloc[val_idx]
    y_train, y_val_fold = y_clean.iloc[train_idx], y_clean.iloc[val_idx]

    model = xgb.XGBRegressor(**model_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val_fold)

    # Evaluate using RMSE
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred))
    rmse_list.append(rmse)

    # Free fold-level objects before the next (potentially large) fit
    del X_train, X_val_fold, y_train, y_val_fold, y_pred, model
    gc.collect()

    trial.report(np.mean(rmse_list), step=fold_idx)

    if trial.should_prune():
      raise optuna.TrialPruned()

  return np.mean(rmse_list)

# Start MLflow experiment
mlflow.set_experiment(mlflow_experiment)

with mlflow.start_run():
  # When an existing study is re-loaded (load_if_exists=True), keep the same
  # direction and pruner settings (min_resource, max_resource,
  # reduction_factor, etc.) as in the run that created it.
  study = optuna.create_study(
      study_name=study_name,
      storage=storage,
      direction="minimize",
      pruner=optuna.pruners.HyperbandPruner(
          min_resource=max(1, n_splits),
          max_resource=n_splits * n_repeats,
          reduction_factor=2
      ),
      sampler=optuna.samplers.TPESampler(
          n_startup_trials=max(1, int(n_trials * 0.2)),
          seed=RANDOM_STATE,
          multivariate=True,
          warn_independent_sampling = False
      ),
      load_if_exists=True
  )

  study.optimize(objective, n_trials=n_trials)
  log_optuna_best_trial_search_space(study)

  # Log Optuna storage details
  mlflow.log_param("optuna_storage", storage)
  mlflow.log_param("optuna_study_name", study_name)

  # Log experiment parameters
  mlflow.log_param("n_trials", n_trials)
  mlflow.log_param("n_splits", n_splits)
  mlflow.log_param("n_repeats", n_repeats)

  # Fanova Importance is more robust than the Mean Decrease Impurity Importance
  fanova_importances = get_param_importances(
      study,
      evaluator=FanovaImportanceEvaluator(seed = RANDOM_STATE),
      target= (
          lambda t: t.value
          if t.state == optuna.trial.TrialState.COMPLETE
          else None
      ),
      normalize = True
  )

  fanova_importances = {k: round(v, 4) for k, v in fanova_importances.items()}

  # Log Fanova importances as parameters
  for param, importance in fanova_importances.items():
    mlflow.log_param(f"fanova_{param}", importance)

  print(f'Fanova Hyperparameter Importances (rounded): {fanova_importances}')

  # Best trial: winsorization limit, outlier method (whichever one won) and
  # XGBoost hyperparameters
  best_params = study.best_params

  for param, value in best_params.items():
    mlflow.log_param(param, value)

  mlflow.log_metric("best_cv_rmse", study.best_value)
  print("Best Params:", best_params)
  print("Best CV RMSE:", study.best_value)

  # Apply the best winsorization to the training target
  y_train_full = np.log1p(mstats.winsorize(
      df_train[TARGET], limits=[0, 1-best_params["winsorize_upper"]]
  ))

  # The validation target is clipped at the upper threshold computed from the
  # training data, so no validation statistics are used.
  y_val = np.log1p(np.clip(
      df_val[TARGET],
      0,
      np.percentile(df_train[TARGET], best_params["winsorize_upper"] * 100)
  ))

  # Re-create the best outlier method and apply it to the training data
  best_method = best_params["outlier_method"]

  if best_method == "IsolationForest":
    contamination = (
        best_params["if_contamination"]
        if best_params["if_contamination_option"] == "float"
        else 'auto'
    )
    detector = IsolationForest(
        n_estimators=best_params["if_n_estimators"],
        max_samples=best_params["if_max_samples"],
        contamination=contamination,
        max_features=best_params["if_max_features"],
        random_state=RANDOM_STATE
    )
    mask = detector.fit_predict(X_train_full_scaled) == 1

  elif best_method == "LocalOutlierFactor":
    contamination = (
        best_params["lof_contamination"]
        if best_params["lof_contamination_option"] == "float"
        else 'auto'
    )
    detector = LocalOutlierFactor(
        n_neighbors=best_params["lof_n_neighbors"],
        algorithm=best_params["lof_algorithm"],
        leaf_size=best_params["lof_leaf_size"],
        metric=best_params["lof_metric"],
        p=best_params["lof_p"],
        contamination=contamination,
        novelty=False,
        n_jobs=N_JOBS
    )
    mask = detector.fit_predict(X_train_full_scaled) == 1

  elif best_method == "OneClassSVM":
    detector = OneClassSVM(
        kernel=best_params["ocsvm_kernel"],
        degree=best_params["ocsvm_degree"],
        gamma=best_params["ocsvm_gamma"],
        coef0=best_params["ocsvm_coef0"],
        tol=best_params["ocsvm_tol"],
        nu=best_params["ocsvm_nu"],
        shrinking=best_params["ocsvm_shrinking"],
        max_iter=best_params["ocsvm_max_iter"],
        verbose=0
    )
    mask = detector.fit_predict(X_train_full_scaled) == 1

  elif best_method == "DBSCAN":
    detector = DBSCAN(
        eps=best_params["dbscan_eps"],
        min_samples=best_params["dbscan_min_samples"],
        algorithm=best_params["dbscan_algorithm"],
        leaf_size=best_params["dbscan_leaf_size"],
        metric=best_params["dbscan_metric"],
        p=best_params["dbscan_p"],
        n_jobs=N_JOBS
    )
    # DBSCAN labels noise points with -1
    mask = detector.fit_predict(X_train_full_scaled) != -1

  elif best_method == "ZScore":
    # Z-scores are computed on the raw (unscaled) features, as in the search
    z_scores = np.abs(zscore(X_train_full, axis=0))
    mask = (z_scores < best_params["zscore_threshold"]).all(axis=1)

  mask = np.asarray(mask, dtype=bool)
  num_outliers = int(len(mask) - np.sum(mask))

  # Log number of outliers and percentage
  mlflow.log_metric("num_outliers", num_outliers)
  print(f"Number of outliers detected: {num_outliers}")
  outlier_percentage = (num_outliers / len(X_train_full_scaled)) * 100
  mlflow.log_metric("outlier_percentage", outlier_percentage)
  print(f"Percentage of outliers removed: {outlier_percentage:.2f}%")

  # NOTE(review): this filters the global df_train in place; re-running the
  # cell, or filtering again with a transformer afterwards, removes further
  # rows - confirm this is intended.
  df_train = df_train[mask]

  # The target must not be used as a feature (numeric_cols is built from a
  # frame that already contains TARGET). Columns are sorted so that training
  # and validation frames share one explicit column order.
  # NOTE(review): 'cost' and 'conversions' determine the target as well;
  # confirm whether they should also be excluded here.
  feature_cols = sorted(col for col in X_train_full.columns if col != TARGET)
  X_final = X_train_full[mask][feature_cols]
  y_final = y_train_full[mask]

  # Only the tuned keys that XGBRegressor understands are forwarded; the
  # outlier/winsorization keys in best_params are skipped.
  xgb_param_names = set(xgb.XGBRegressor().get_params())
  valid_params = {
      k: v
      for k, v in best_params.items()
      if k in xgb_param_names
  }

  # all the constant hyperparameters should be added here manually and are not
  # part of study.best_params
  final_model = xgb.XGBRegressor(
      **valid_params,
      n_jobs=N_JOBS,
      random_state=RANDOM_STATE,
      validate_parameters=True,
      verbosity=0
  )

  # Kept as a DataFrame with sorted columns in case later cells read it; it
  # is no longer used for the prediction below.
  X_val_scaled = pd.DataFrame(X_val_scaled, columns=X_val.columns)
  X_val_scaled = X_val_scaled[sorted(X_val_scaled.columns)]

  # The model is fitted on unscaled features, so validation must be predicted
  # on unscaled features too (the scaled copy only feeds the detectors).
  X_val_features = X_val[feature_cols]

  final_model.fit(X_final, y_final)
  y_pred = final_model.predict(X_val_features)

  evaluate_and_log_metrics(y_val, y_pred, prefix="val")

## Isolation Forest Transformer

In [None]:
%%writefile isolation_forest_transformer.py
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

class IsolationForestTransformer(BaseEstimator, TransformerMixin):
  """
  Row filter that drops outliers flagged by an IsolationForest with fixed
  hyperparameters.

  The forest is trained on the standardised int64/float64 columns of the
  DataFrame passed to fit; transform keeps only the rows it predicts as
  inliers.

  Attributes:
  - contamination: Passed through to IsolationForest(contamination=...).
  - n_estimators (int): Number of trees in the IsolationForest.
  - max_samples (float): Passed through to IsolationForest(max_samples=...).
  - max_features (float): Passed through to
  IsolationForest(max_features=...).
  - random_state (int): Random seed for reproducibility.
  - scaler (StandardScaler): Scaler fitted on the numerical columns.
  - model (IsolationForest): Fitted forest, None before fit.
  - numerical_cols (list): Numerical column names seen during fit.
  - fitted (bool): Whether fit has been called.
  """

  def __init__(
      self,
      contamination,
      n_estimators,
      max_samples,
      max_features,
      random_state
  ):
    """
    Store the IsolationForest hyperparameters and create the unfitted state.

    Args:
    - contamination: Forwarded to IsolationForest.
    - n_estimators (int): Number of trees in the IsolationForest.
    - max_samples (float): Forwarded to IsolationForest.
    - max_features (float): Forwarded to IsolationForest.
    - random_state (int): Random seed for reproducibility.
    """
    # Hyperparameters
    self.contamination = contamination
    self.n_estimators = n_estimators
    self.max_samples = max_samples
    self.max_features = max_features
    self.random_state = random_state

    # Learned state, filled in by fit
    self.scaler = StandardScaler()
    self.model = None
    self.numerical_cols = None
    self.fitted = False

  def fit(self, X, y=None):
    """
    Learn the scaler and the IsolationForest from the numerical columns of X.

    Args:
    - X (pd.DataFrame): Training DataFrame; its int64/float64 columns are
    used.
    - y (None): Ignored, included for scikit-learn compatibility.

    Returns:
    - self: The fitted IsolationForestTransformer instance.
    """
    # Remember which columns were used so transform selects the same ones
    self.numerical_cols = list(
        X.select_dtypes(include=['int64', 'float64']).columns
    )

    # Fit the scaler and standardise in one step
    scaled_features = self.scaler.fit_transform(X[self.numerical_cols])

    self.model = IsolationForest(
        contamination=self.contamination,
        n_estimators=self.n_estimators,
        max_samples=self.max_samples,
        max_features=self.max_features,
        random_state=self.random_state
    )
    self.model.fit(scaled_features)

    self.fitted = True
    return self

  def transform(self, X):
    """
    Return a copy of X restricted to the rows predicted as inliers.

    Args:
    - X (pd.DataFrame): DataFrame containing the numerical columns seen
    during fit.

    Returns:
    - pd.DataFrame: Copy of the inlier rows of X (all columns are kept).

    Raises:
    - ValueError: If fit has not been called.
    """
    if not self.fitted:
      raise ValueError("Model must be fitted before transform")

    scaled_features = self.scaler.transform(X[self.numerical_cols])

    # IsolationForest.predict returns 1 for inliers and -1 for outliers
    predictions = self.model.predict(scaled_features)
    keep_rows = predictions == 1

    return X[keep_rows].copy()

## Save Fitted Model

In [None]:
from isolation_forest_transformer import IsolationForestTransformer

# Fixed IsolationForest hyperparameters.
# NOTE(review): presumably copied from the best trial of the outlier
# detection study - confirm before re-running the study.
isolation_forest_params = {
    'contamination': 0.07971598036312164,
    'n_estimators': 65,
    'max_samples': 0.9907725638374102,
    'max_features': 0.315372534187923,
    'random_state': RANDOM_STATE,
}

isolation_forest_transformer = IsolationForestTransformer(
    **isolation_forest_params
)

log_transformer_to_mlflow(
    transformer=isolation_forest_transformer,
    experiment_name="Isolation_Forest_Transformer",
    model_name="isolation_forest_transformer",
    df=df_train,
    target_col=None
)

## Transform Train

In [None]:
# Outliers are removed from the training split only; the validation/test data
# is left untouched.
df_train = isolation_forest_transformer.transform(df_train)

## Check Feature Distribution Mismatches

In [None]:
# Compare the skewed columns of the (outlier-filtered) training split with
# the same columns of the validation split.
# NOTE(review): check_feature_distribution_mismatches is defined elsewhere;
# presumably it runs the named two-sample test per column - confirm.
check_feature_distribution_mismatches(
    df_train[skewed_cols],
    df_val[skewed_cols],
    test_type="Kolmogorov-Smirnov"
)

# Temporal Feature Engineering with Leakage Prevention
This custom transformer extracts time-based features (`day_of_week`, `month`, `is_weekend`) and computes historical averages for `publisher` and `market_id` (e.g., past average clicks, popularity) - all derived **only from prior dates** to prevent data leakage.

Key design choices:

- **Temporal safety:** Rolling stats use only rows where `date < current_date`.
- **Production-ready:** Inherits from BaseEstimator/TransformerMixin for pipeline integration.
- **Robust handling:** Maps unknown categories using global averages or zero fallbacks.
- **Validation:** Checks input types, required columns, and datetime formatting.

This makes the transformer well suited to time-ordered advertising data, where realistic feature simulation is critical.

## Temporal Feature Engineer Transformer

In [None]:
%%writefile temporal_feature_engineer_transformer.py
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineerTransformer(BaseEstimator, TransformerMixin):
  """
  A class to perform temporal feature engineering on advertising data.

  transform adds day_of_week, month and is_weekend derived from date, and
  maps publisher and market_id to click statistics learned in fit.

  Attributes:
  - required_cols (list): Required columns in input DataFrame.
  - publisher_avg_clicks_map (pd.Series): Mapping of publisher to mean clicks
  over all training rows.
  - market_avg_clicks_map (pd.Series): Mapping of market_id to mean clicks
  over all training rows.
  - publisher_popularity_map (pd.Series): Mapping of publisher to the mean of
  its per-row popularity, where popularity is the mean clicks of the same
  publisher on strictly earlier dates (0 when there are none).
  - market_popularity_map (pd.Series): Same as above for market_id.
  - global_mean_clicks_ (float): Mean clicks over all training rows, used as
  the fallback for publishers/markets not seen during fit.
  - fitted (bool): Whether the model has been fitted.
  """

  def __init__(
      self,
      required_cols=['date', 'publisher', 'market_id', 'clicks']
  ):
    """Initialize the TemporalFeatureEngineer.

    Args:
    required_cols (list): List of required columns in input DataFrame.
    """
    self.required_cols = required_cols
    self.publisher_avg_clicks_map = None
    self.market_avg_clicks_map = None
    self.publisher_popularity_map = None
    self.market_popularity_map = None
    self.fitted = False

  @staticmethod
  def _past_mean_clicks(X, key):
    """
    For every row of X, compute the mean clicks of all rows that share the
    row's `key` value and have a strictly earlier date (0 if none exist).

    Vectorised replacement for a per-row scan of the whole DataFrame; it
    yields the same values up to floating-point summation order.

    Args:
    - X (pd.DataFrame): DataFrame with `key`, 'date' and 'clicks' columns.
    - key (str): Grouping column ('publisher' or 'market_id').

    Returns:
    - np.ndarray: One value per row of X, in row order.
    """
    # Click total and row count per (key, date), ordered by date within key
    per_day = (
        X.groupby([key, 'date'], sort=True, observed=True)['clicks']
        .agg(['sum', 'size'])
        .reset_index()
    )

    # Running totals within each key, shifted by one date so that a date
    # only sees strictly earlier dates (rows on the same date are excluded).
    running = (
        per_day.groupby(key, sort=False, observed=True)[['sum', 'size']]
        .cumsum()
    )
    earlier = (
        running.groupby(per_day[key], sort=False, observed=True).shift(1)
    )

    # The first date of each key has no history -> NaN -> 0
    per_day['past_mean'] = (earlier['sum'] / earlier['size']).fillna(0.0)

    # Bring the per-(key, date) value back to the individual rows; a left
    # merge keeps the row order of X. Rows with a missing key or date have no
    # match and fall back to 0.
    per_row = X[[key, 'date']].merge(
        per_day[[key, 'date', 'past_mean']],
        on=[key, 'date'],
        how='left'
    )

    return per_row['past_mean'].fillna(0.0).to_numpy()

  def fit(self, X, y=None):
    """
    Fit the feature engineer on training data, computing mappings.

    Args:
    - X (pd.DataFrame): Training DataFrame with required columns.
    - y (None): Ignored, included for scikit-learn compatibility.

    Returns:
    - self: Fitted TemporalFeatureEngineer instance.

    Raises:
    - TypeError: If X is not a pandas DataFrame.
    - ValueError: If required columns are missing or date is not datetime.
    """
    if not isinstance(X, pd.DataFrame):
      raise TypeError("Input X must be a pandas DataFrame")

    if not all(col in X.columns for col in self.required_cols):
      raise ValueError(
          f"Input DataFrame must contain all required columns:\n"
          f"{self.required_cols}"
      )

    if not pd.api.types.is_datetime64_any_dtype(X['date']):
      raise ValueError("Column 'date' must be of datetime type")

    # Global mean, used by transform as fallback for unseen categories
    self.global_mean_clicks_ = X['clicks'].mean()

    # Per-row popularity built from strictly earlier dates only
    temp_df = X.assign(
        publisher_popularity=self._past_mean_clicks(X, 'publisher'),
        market_popularity=self._past_mean_clicks(X, 'market_id')
    )

    # Compute mappings.
    # NOTE(review): the avg-clicks maps average over all training rows, not
    # only over earlier dates - confirm this is the intended definition.
    self.publisher_avg_clicks_map = (
        temp_df.groupby('publisher')['clicks']
        .mean()
    )

    self.market_avg_clicks_map = temp_df.groupby('market_id')['clicks'].mean()

    self.publisher_popularity_map = (
        temp_df.groupby('publisher')['publisher_popularity']
        .mean()
    )

    self.market_popularity_map = (
        temp_df.groupby('market_id')['market_popularity']
        .mean()
    )

    self.fitted = True
    return self

  def transform(self, X):
    """
    Transform the input DataFrame by adding temporal features and applying
    mappings.

    Args:
    - X (pd.DataFrame): Input DataFrame to transform.

    Returns:
    pd.DataFrame: Transformed DataFrame with new features and date column
    dropped.

    Raises:
    - TypeError: If X is not a pandas DataFrame.
    - ValueError: If model is not fitted or required columns are missing.
    """
    if not isinstance(X, pd.DataFrame):
      raise TypeError("Input X must be a pandas DataFrame")
    if not self.fitted:
      raise ValueError("Model must be fitted before transform")
    if not all(col in X.columns for col in self.required_cols):
      raise ValueError(f"""
      Input DataFrame must contain all required columns: {self.required_cols}
      """)

    # Create a copy to avoid modifying the input
    X_transformed = X.copy()

    # Add temporal features (day and month as names, e.g. 'Monday', 'March')
    X_transformed['day_of_week'] = (
        X_transformed['date']
        .dt
        .strftime('%A')
        .astype('string')
    )

    X_transformed['month'] = (
        X_transformed['date']
        .dt
        .strftime('%B')
        .astype('string')
    )

    X_transformed['is_weekend'] = (
        X_transformed['day_of_week']
        .isin(['Saturday', 'Sunday'])
    )

    # Fallback for unseen publishers/markets: the global training mean.
    # Instances fitted before global_mean_clicks_ existed fall back to the
    # mean of the per-publisher means, as before.
    global_mean_clicks = getattr(self, 'global_mean_clicks_', None)
    if global_mean_clicks is None:
      global_mean_clicks = self.publisher_avg_clicks_map.mean()

    # Apply mappings for average clicks and popularity
    X_transformed['publisher_avg_clicks'] = (
        X_transformed['publisher']
        .map(self.publisher_avg_clicks_map)
        .fillna(global_mean_clicks)
    )

    X_transformed['market_avg_clicks'] = (
        X_transformed['market_id']
        .map(self.market_avg_clicks_map)
        .fillna(global_mean_clicks)
    )

    X_transformed['publisher_popularity'] = (
        X_transformed['publisher']
        .map(self.publisher_popularity_map)
        .fillna(0)
    )

    X_transformed['market_popularity'] = (
        X_transformed['market_id']
        .map(self.market_popularity_map)
        .fillna(0)
    )

    # Drop date column
    X_transformed.drop(columns=['date'], inplace=True)
    return X_transformed

In [None]:
# Copy the module written by the %%writefile cell above to PATH under the
# same file name.
# NOTE(review): copy_file_to_destination is defined elsewhere; overwrite
# behaviour is not visible here - confirm.
copy_file_to_destination(
    source_path = 'temporal_feature_engineer_transformer.py',
    destination_path = PATH / 'temporal_feature_engineer_transformer.py'
)

## Save Fitted Model

In [None]:
from temporal_feature_engineer_transformer import (
    TemporalFeatureEngineerTransformer
)

# required_cols is passed explicitly although it equals the constructor
# default, so the expected input schema is visible at the call site.
temporal_feature_engineer_transformer = TemporalFeatureEngineerTransformer(
    required_cols=['date', 'publisher', 'market_id', 'clicks']
)

# NOTE(review): log_transformer_to_mlflow is defined elsewhere; presumably it
# fits the transformer on df and logs it to MLflow - confirm.
log_transformer_to_mlflow(
    transformer=temporal_feature_engineer_transformer,
    experiment_name="Temporal_Feature_Engineer_Transformer",
    model_name="temporal_feature_engineer_transformer",
    df=df_train,
    target_col=TARGET
)

## Transform Train and Validation

In [None]:
# Replace both splits with their temporally engineered versions.
# NOTE(review): apply_transformer is defined elsewhere; how it treats the
# TARGET column is not visible here - confirm.
df_train, df_val = apply_transformer(
    df_train,
    df_val,
    temporal_feature_engineer_transformer,
    TARGET
)

## Save Train and Validation Data for EDA

In [None]:
# Persist both splits as they are at this point (after temporal feature
# engineering, before irrelevant columns are dropped) for the EDA.
# NOTE(review): save_object is defined elsewhere; the target directory is
# not visible here - confirm.
save_object(df_train, 'df_train_eda.pkl')
save_object(df_val, 'df_val_eda.pkl')

# Dropping Irrelevant Columns  

Before training the model, some columns should be removed because they are either not available before campaign launch or do not provide predictive value:  

- **`cost`**, **`conversions`**, and **`clicks`**: These columns contain information that is only available **after** a campaign has been launched. Since we want to predict outcomes beforehand, they must be excluded.  
- **`customer_id`**: This is simply a random identifier with no predictive power and should therefore be dropped.  

The `DropColumnsTransformer` class below implements this logic as a scikit-learn compatible transformer.


## Drop Columns Transformer

In [None]:
%%writefile drop_columns_transformer.py
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

# - drop the columns cost, conversions and clicks as you will not have them
# before the campaign is launched.
# - customer_id does not have any predictive power as it is just a random id.
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
  """
  scikit-learn compatible transformer that removes a configured set of
  columns from a DataFrame.
  """

  def __init__(
      self,
      columns_to_drop=['cost', 'conversions', 'customer_id', 'clicks']
  ):
    """
    Store the names of the columns that should be removed.

    Parameters:
    columns_to_drop : list,
    default=['cost', 'conversions', 'customer_id', 'clicks']
    Column names to remove from the DataFrame.
    """
    self.columns_to_drop = columns_to_drop

  @staticmethod
  def _require_dataframe(X):
    """Raise ValueError unless X is a pandas DataFrame."""
    if not isinstance(X, pd.DataFrame):
      raise ValueError("Input X must be a pandas DataFrame")

  def fit(self, X, y=None):
    """
    Record which of the configured columns are present in X.

    Parameters:
    - X : pandas DataFrame, Input data.

    - y : None or array-like, default=None, Ignored. Included for compatibility.

    Returns:
    self : DropColumnsTransformer
    The instance itself, with `columns_to_drop_` set.
    """
    self._require_dataframe(X)

    # Only the configured columns that exist in the training frame are kept
    present = []
    for name in self.columns_to_drop:
      if name in X.columns:
        present.append(name)
    self.columns_to_drop_ = present

    return self

  def transform(self, X):
    """
    Return a copy of X without the columns recorded during fit.

    Parameters:
    X : pandas DataFrame
    Input data.

    Returns:
    X_transformed : pandas DataFrame
    Copy of X with the recorded columns removed.
    """
    self._require_dataframe(X)

    # Copy first so the caller's DataFrame is never touched; recorded columns
    # that are absent from X are skipped silently (errors='ignore').
    return X.copy().drop(columns=self.columns_to_drop_, errors='ignore')

In [None]:
# Copy the module written by the %%writefile cell above to PATH under the
# same file name.
# NOTE(review): copy_file_to_destination is defined elsewhere; overwrite
# behaviour is not visible here - confirm.
copy_file_to_destination(
    source_path = 'drop_columns_transformer.py',
    destination_path = PATH / 'drop_columns_transformer.py'
)

## Save Fitted Model

In [None]:
# Import the class from the module file (not from the notebook namespace).
# NOTE(review): presumably so the logged model references an importable
# module path - confirm against log_transformer_to_mlflow.
from drop_columns_transformer import DropColumnsTransformer

# The columns are passed explicitly; they equal the class default.
drop_columns_transformer = DropColumnsTransformer(
    columns_to_drop=['cost', 'conversions', 'customer_id', 'clicks']
)

# log_transformer_to_mlflow is a helper defined elsewhere in this notebook.
# NOTE(review): the later apply_transformer call uses this same instance, so
# the helper is assumed to fit it on df_train - verify.
log_transformer_to_mlflow(
    transformer=drop_columns_transformer,
    experiment_name="Drop_Columns",
    model_name="drop_columns_transformer",
    df=df_train,
    target_col=TARGET
)

## Transform Train and Validation

In [None]:
# Rebind df_train / df_val to the outputs of apply_transformer (a helper
# defined elsewhere in this notebook) for the drop-columns transformer.
# NOTE(review): TARGET is passed separately, presumably so the helper keeps
# the target column out of the transformation - confirm.
df_train, df_val = apply_transformer(
    df_train,
    df_val,
    drop_columns_transformer,
    TARGET
)

# Outlier Detection After Feature Engineering  

After feature engineering, **new columns are introduced** that were not present in the original dataset.  
It is important to re-check for outliers at this stage because:  

- Outliers should now be detected **only on the newly engineered columns**, not on the original ones.  
- If ignored, these new features may introduce **skewness or distortions** into the data distribution, which can negatively affect model training and performance.  

The following code performs outlier detection again, focusing specifically on the engineered features.


In [None]:
# Define columns for outlier detection
outlier_cols = [
    'publisher_avg_clicks',
    'market_avg_clicks',
    'publisher_popularity',
    'market_popularity'
]

# Select the numeric feature columns used for model training; only
# outlier_cols are scaled and passed to the outlier detectors.
# TARGET is excluded explicitly: it is numeric too, and leaving it in the
# feature matrix would leak the label into every model trained below.
numeric_cols = [
    col
    for col in df_train.select_dtypes(include=['int64', 'float64']).columns
    if col != TARGET
]

# Ensure outlier_cols exist in numeric_cols
outlier_cols = [col for col in outlier_cols if col in numeric_cols]

X_train_outlier = df_train[outlier_cols]
X_train_full = df_train[numeric_cols]

# The scaler is fitted on the training data only
scaler = StandardScaler()
X_train_outlier_scaled = scaler.fit_transform(X_train_outlier)
X_val = df_val[numeric_cols]

# Study size and cross-validation layout (n_splits folds, n_repeats repeats)
n_trials = 500
n_splits = 5
n_repeats = 3
mlflow_experiment = "Outlier_Detection_Again"

# SQLite storage configuration
study_name = f"{mlflow_experiment}_study"
storage = f"sqlite:///{PATH}/{study_name}.db"

def objective(trial):
  """
  Optuna objective for the post-feature-engineering outlier study.

  Each trial samples a winsorization level for the target, one outlier
  detection method with its hyperparameters, and one set of XGBoost
  hyperparameters. Rows flagged as outliers (based on the engineered
  columns) are removed, and the XGBoost model is scored with repeated
  K-fold cross-validation on the remaining rows.

  Parameters:
  trial : optuna.trial.Trial
  Trial used to sample hyperparameters and to report intermediate values.

  Returns:
  float
  Mean RMSE over all folds, measured on the log1p of the winsorized target.

  Raises:
  optuna.TrialPruned
  If the pruner stops the trial, or if the detector keeps fewer rows than
  n_splits (cross-validation would be impossible).
  """
  winsorize_upper = trial.suggest_float("winsorize_upper", 0.90, 1)

  # Apply winsorization to target
  y_train_full = np.log1p(
      mstats.winsorize(df_train[TARGET], limits=[0, 1-winsorize_upper])
  )

  outlier_method = trial.suggest_categorical(
      "outlier_method",
      [
          "IsolationForest",
          "LocalOutlierFactor",
          "OneClassSVM",
          "DBSCAN",
          "ZScore"
      ]
  )

  if outlier_method == "IsolationForest":
    # The single-choice option is still suggested because the final-model
    # code reads "if_contamination_option" from study.best_params.
    trial.suggest_categorical("if_contamination_option", ["float"])
    contamination = trial.suggest_float("if_contamination", 0.01, 0.2)

    detector = IsolationForest(
        n_estimators=trial.suggest_int("if_n_estimators", 50, 300),
        max_samples=trial.suggest_float("if_max_samples", 0.1, 1.0),
        contamination=contamination,
        max_features=trial.suggest_float("if_max_features", 0.1, 1.0),
        random_state=RANDOM_STATE
    )

    # fit_predict returns 1 for inliers and -1 for outliers
    mask = detector.fit_predict(X_train_outlier_scaled) == 1

  elif outlier_method == "LocalOutlierFactor":
    # Kept for the same reason as "if_contamination_option" above
    trial.suggest_categorical("lof_contamination_option", ["float"])
    contamination = trial.suggest_float("lof_contamination", 0.01, 0.2)

    detector = LocalOutlierFactor(
        n_neighbors=trial.suggest_int("lof_n_neighbors", 5, 150),
        algorithm=trial.suggest_categorical(
            "lof_algorithm",
            ["auto", "ball_tree", "kd_tree", "brute"]
        ),

        leaf_size=trial.suggest_int("lof_leaf_size", 10, 100),
        metric=trial.suggest_categorical(
            "lof_metric",
            ["euclidean", "manhattan", "chebyshev", "minkowski"]
        ),

        p=trial.suggest_int("lof_p", 1, 5),
        contamination=contamination,
        n_jobs=N_JOBS
    )

    mask = detector.fit_predict(X_train_outlier_scaled) == 1

  elif outlier_method == "OneClassSVM":
    detector = OneClassSVM(
        kernel=trial.suggest_categorical(
            "ocsvm_kernel",
            ["linear", "poly", "rbf", "sigmoid"]
        ),

        degree=trial.suggest_int("ocsvm_degree", 2, 5),
        gamma=trial.suggest_float("ocsvm_gamma", 0.001, 1.0, log=True),
        coef0=trial.suggest_float("ocsvm_coef0", 0.0, 1.0),
        tol=trial.suggest_float("ocsvm_tol", 1e-4, 1e-2),
        nu=trial.suggest_float("ocsvm_nu", 0.01, 0.5),
        shrinking=trial.suggest_categorical("ocsvm_shrinking", [True, False]),
        max_iter=trial.suggest_int("ocsvm_max_iter", 100, 10000),
        verbose=0
    )
    mask = detector.fit_predict(X_train_outlier_scaled) == 1

  elif outlier_method == "DBSCAN":
    detector = DBSCAN(
        eps=trial.suggest_float("dbscan_eps", 0.1, 5.0),
        min_samples=trial.suggest_int("dbscan_min_samples", 3, 20),
        algorithm=trial.suggest_categorical(
            "dbscan_algorithm",
            ["auto", "ball_tree", "kd_tree", "brute"]
        ),

        leaf_size=trial.suggest_int("dbscan_leaf_size", 10, 100),
        metric=trial.suggest_categorical(
            "dbscan_metric",
            ["euclidean", "manhattan", "chebyshev", "minkowski"]
        ),

        p=trial.suggest_int("dbscan_p", 1, 5),
        n_jobs=N_JOBS
    )

    # DBSCAN labels noise points with -1; everything else is kept
    cluster_labels = detector.fit_predict(X_train_outlier_scaled)
    mask = cluster_labels != -1

  elif outlier_method == "ZScore":
    threshold = trial.suggest_float("zscore_threshold", 2.0, 4.0)
    z_scores = np.abs(zscore(X_train_outlier, axis=0))
    mask = (z_scores < threshold).all(axis=1)

  # A detector can flag (almost) every row, e.g. DBSCAN with a very small
  # eps. Cross-validation needs at least n_splits rows, so such a trial is
  # pruned instead of crashing the whole study.
  if int(np.sum(mask)) < n_splits:
    raise optuna.TrialPruned()

  # Apply mask to full dataset
  X_clean = X_train_full[mask]
  y_clean = pd.Series(y_train_full[mask])

  # The model hyperparameters are constant within a trial, so they are
  # sampled once here instead of once per fold.
  xgb_params = dict(
      objective=trial.suggest_categorical('objective', ['reg:squarederror']),
      base_score=trial.suggest_float('base_score', 0.1, 0.9),
      booster=trial.suggest_categorical('booster', ['gbtree', 'dart']),
      colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.5, 1.0),
      colsample_bynode=trial.suggest_float('colsample_bynode', 0.5, 1.0),
      colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
      gamma=trial.suggest_float('gamma', 1e-6, 5, log=True),
      grow_policy=trial.suggest_categorical(
          'grow_policy',
          ['depthwise', 'lossguide']
      ),
      learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
      max_bin=trial.suggest_int('max_bin', 128, 512),
      max_depth=trial.suggest_int('max_depth', 3, 10),
      max_leaves=trial.suggest_int('max_leaves', 0, 64),
      min_child_weight=trial.suggest_float('min_child_weight', 1, 10),
      n_estimators=trial.suggest_int('n_estimators', 50, 500),
      n_jobs=N_JOBS,
      num_parallel_tree=trial.suggest_int('num_parallel_tree', 1, 3),
      random_state=RANDOM_STATE,
      reg_alpha=trial.suggest_float('reg_alpha', 1e-6, 1.0, log=True),
      reg_lambda=trial.suggest_float('reg_lambda', 1e-6, 1.0, log=True),
      #GPU is needed to use 'gradient_based' as sampling_method
      sampling_method='uniform',
      scale_pos_weight=trial.suggest_float('scale_pos_weight', 0.5, 1.5),
      subsample=trial.suggest_float('subsample', 0.5, 1.0),
      tree_method=trial.suggest_categorical(
          'tree_method',
          ['auto', 'approx', 'hist']
      ),
      validate_parameters=True,
      verbosity=0
  )

  # Repeated K-Fold Cross-Validation
  rkf = RepeatedKFold(
      n_splits=n_splits,
      n_repeats=n_repeats,
      random_state=RANDOM_STATE
  )

  rmse_list = []

  for fold_idx, (train_idx, val_idx) in enumerate(rkf.split(X_clean)):
    X_train, X_val_fold = X_clean.iloc[train_idx], X_clean.iloc[val_idx]
    y_train, y_val_fold = y_clean.iloc[train_idx], y_clean.iloc[val_idx]

    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val_fold)
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred))
    rmse_list.append(rmse)

    # Release fold-level objects before the next fold is built
    del X_train, X_val_fold, y_train, y_val_fold, y_pred, model
    gc.collect()

    # Report the running mean so the pruner can stop weak trials early
    trial.report(np.mean(rmse_list), step=fold_idx)

    if trial.should_prune():
      raise optuna.TrialPruned()

  return np.mean(rmse_list)

# Start MLflow experiment
mlflow.set_experiment(mlflow_experiment)

with mlflow.start_run():
  # - Ensure to use the same direction,
  # pruner (min_resource, max_resource, reduction_factor, etc.) if you are
  # loading an existing study
  study = optuna.create_study(
      study_name=study_name,
      storage=storage,
      direction="minimize",
      pruner=optuna.pruners.HyperbandPruner(
          min_resource=max(1, n_splits),
          max_resource=n_splits * n_repeats,
          reduction_factor=2
      ),
      sampler=optuna.samplers.TPESampler(
          n_startup_trials=max(1, int(n_trials * 0.2)),
          seed=RANDOM_STATE,
          multivariate=True
      ),
      load_if_exists=True
  )

  study.optimize(objective, n_trials=n_trials)
  log_optuna_best_trial_search_space(study)

  # Log Optuna storage details
  mlflow.log_param("optuna_storage", storage)
  mlflow.log_param("optuna_study_name", study_name)

  # Log experiment parameters
  mlflow.log_param("n_trials", n_trials)
  mlflow.log_param("n_splits", n_splits)
  mlflow.log_param("n_repeats", n_repeats)

  # Hyperparameter importances (fANOVA) over the completed trials
  fanova_importances = get_param_importances(
      study,
      evaluator=FanovaImportanceEvaluator(seed=RANDOM_STATE),
      target=(
          lambda t: t.value
          if t.state == optuna.trial.TrialState.COMPLETE
          else None
      ),
      normalize=True
  )

  fanova_importances = {k: round(v, 4) for k, v in fanova_importances.items()}

  for param, importance in fanova_importances.items():
    mlflow.log_param(f"fanova_{param}", importance)

  print(f'Fanova Hyperparameter Importances (rounded): {fanova_importances}')

  best_params = study.best_params

  for param, value in best_params.items():
    mlflow.log_param(param, value)

  mlflow.log_metric("best_cv_rmse", study.best_value)
  print("Best Params:", best_params)
  print("Best CV RMSE:", study.best_value)

  # Apply best winsorization and compute thresholds from training data
  y_train_full = np.log1p(mstats.winsorize(
      df_train[TARGET],
      limits=[0, 1-best_params["winsorize_upper"]]
  ))

  # Compute winsorization thresholds from training data
  # Apply fixed thresholds to validation data
  y_val = np.log1p(np.clip(
      df_val[TARGET],
      0,
      np.percentile(df_train[TARGET], best_params["winsorize_upper"] * 100)
  ))

  # Apply best outlier method. Every branch produces a boolean `mask` that
  # is True for the rows to keep.
  if best_params["outlier_method"] == "IsolationForest":
    contamination = (
        best_params["if_contamination"]
        if best_params["if_contamination_option"] == "float"
        else 'auto'
    )

    detector = IsolationForest(
        n_estimators=best_params["if_n_estimators"],
        max_samples=best_params["if_max_samples"],
        contamination=contamination,
        max_features=best_params["if_max_features"],
        random_state=RANDOM_STATE
    )

    mask = detector.fit_predict(X_train_outlier_scaled) == 1

  elif best_params["outlier_method"] == "LocalOutlierFactor":
    contamination = (
        best_params["lof_contamination"]
        if best_params["lof_contamination_option"] == "float"
        else 'auto'
    )

    detector = LocalOutlierFactor(
        n_neighbors=best_params["lof_n_neighbors"],
        algorithm=best_params["lof_algorithm"],
        leaf_size=best_params["lof_leaf_size"],
        metric=best_params["lof_metric"],
        p=best_params["lof_p"],
        contamination=contamination,
        novelty=False,
        n_jobs=N_JOBS
    )

    mask = detector.fit_predict(X_train_outlier_scaled) == 1

  elif best_params["outlier_method"] == "OneClassSVM":
    detector = OneClassSVM(
        kernel=best_params["ocsvm_kernel"],
        degree=best_params["ocsvm_degree"],
        gamma=best_params["ocsvm_gamma"],
        coef0=best_params["ocsvm_coef0"],
        tol=best_params["ocsvm_tol"],
        nu=best_params["ocsvm_nu"],
        shrinking=best_params["ocsvm_shrinking"],
        max_iter=best_params["ocsvm_max_iter"],
        verbose=0
    )

    mask = detector.fit_predict(X_train_outlier_scaled) == 1

  elif best_params["outlier_method"] == "DBSCAN":
    detector = DBSCAN(
        eps=best_params["dbscan_eps"],
        min_samples=best_params["dbscan_min_samples"],
        algorithm=best_params["dbscan_algorithm"],
        leaf_size=best_params["dbscan_leaf_size"],
        metric=best_params["dbscan_metric"],
        p=best_params["dbscan_p"],
        n_jobs=N_JOBS
    )

    # DBSCAN labels noise points with -1
    cluster_labels = detector.fit_predict(X_train_outlier_scaled)
    mask = cluster_labels != -1

  elif best_params["outlier_method"] == "ZScore":
    zscore_threshold = best_params["zscore_threshold"]
    z_scores = np.abs(zscore(X_train_outlier, axis=0))
    mask = (z_scores < zscore_threshold).all(axis=1)

  # Outliers are the rows not kept by the mask, whichever method produced it
  num_outliers = int(len(mask) - np.sum(mask))

  mlflow.log_metric("num_outliers", num_outliers)
  print(f"Number of outliers detected: {num_outliers}")
  outlier_percentage = (num_outliers / len(X_train_outlier_scaled)) * 100
  mlflow.log_metric("outlier_percentage", outlier_percentage)
  print(f"Percentage of outliers removed: {outlier_percentage:.2f}%")

  # Apply mask to full dataset
  # NOTE(review): this rebinds the notebook-level df_train to the filtered
  # rows. A later cell fits and applies ZScoreTransformer on df_train again,
  # so outliers may be removed twice - confirm this is intended.
  df_train = df_train[mask]
  X_final = X_train_full[mask]
  y_final = y_train_full[mask]

  # Keep only the tuned values that are actual XGBRegressor parameters.
  # The parameter names are collected once instead of once per key.
  xgb_param_names = set(xgb.XGBRegressor().get_params())

  valid_params = {
      k: v
      for k, v in best_params.items()
      if k in xgb_param_names
  }

  # all the constant hyperparameters should be added here manually and are not
  # part of study.best_params
  final_model = xgb.XGBRegressor(
      **valid_params,
      n_jobs=N_JOBS,
      random_state=RANDOM_STATE,
      validate_parameters=True,
      verbosity=0
  )

  # Use the same (sorted) column order for fitting and prediction
  X_final = X_final[sorted(X_final.columns)]
  X_val = X_val[sorted(X_val.columns)]
  final_model.fit(X_final, y_final)
  y_pred = final_model.predict(X_val)

  evaluate_and_log_metrics(y_val, y_pred, prefix="val")

## ZScore Transformer

In [None]:
%%writefile zscore_transformer.py
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class ZScoreTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, columns, threshold=3.0):
    """
    Initialize the ZScoreTransformer to remove outliers using Z-scores.

    Parameters:
    - columns : list
    List of column names to apply Z-score outlier detection.
    - threshold : float, optional (default=3.0)
    Z-score threshold for identifying outliers. Rows whose absolute Z-score
    reaches or exceeds this threshold in any of the specified columns are
    removed.

    Notes:
    - This transformer removes rows identified as outliers, which may result
    in different row counts between training and validation sets. Ensure this
    behavior aligns with your pipeline, as it may affect index alignment
    or downstream tasks expecting consistent row counts.
    - Columns with zero variance (standard deviation = 0) or an undefined
    standard deviation (NaN, e.g. a single-row fit) are excluded from
    outlier detection.
    - Rows holding NaN in a checked column are removed as well, because a
    NaN Z-score never compares as below the threshold.
    """
    self.columns = columns
    self.threshold = threshold
    self.means_ = None
    self.stds_ = None
    self.valid_columns_ = None

  def fit(self, X, y=None):
    """
    Fit the transformer by computing mean and std for specified columns.

    Parameters:
    - X : pandas DataFrame
    Input DataFrame containing the columns to check for outliers.
    - y : None
    Ignored. For scikit-learn compatibility.

    Returns:
    self : ZScoreTransformer
    Returns the instance itself.

    Raises:
    ValueError
    If X is not a pandas DataFrame or if specified columns are missing.
    """
    if not isinstance(X, pd.DataFrame):
      raise ValueError("Input X must be a pandas DataFrame")

    # Ensure all specified columns exist in X
    missing_cols = [col for col in self.columns if col not in X.columns]

    if missing_cols:
      raise ValueError(f"Columns {missing_cols} not found in input DataFrame")

    # Compute mean and std for specified columns (pandas sample std, ddof=1)
    self.means_ = X[self.columns].mean()
    self.stds_ = X[self.columns].std()

    # Keep only columns with a usable standard deviation. A NaN std would
    # pass a plain `!= 0` test and then turn every Z-score into NaN, which
    # would silently drop all rows in transform().
    self.valid_columns_ = [
        col
        for col in self.columns
        if pd.notna(self.stds_[col]) and self.stds_[col] != 0
    ]

    if not self.valid_columns_:
      print("""
      Warning: All specified columns have zero variance. No outlier detection
      will be performed.
      """)

    return self

  def transform(self, X):
    """
    Transform the input DataFrame by removing outliers based on Z-scores.

    Z-scores are computed with the means and standard deviations learned in
    fit(), not with statistics of X itself.

    Parameters:
    X : pandas DataFrame
    Input DataFrame to transform.

    Returns:
    X_transformed : pandas DataFrame
    DataFrame with outlier rows removed based on Z-score threshold.

    Raises:
    ValueError
    If X is not a pandas DataFrame, transformer is not fitted, or specified
    columns are missing.
    """
    if not isinstance(X, pd.DataFrame):
      raise ValueError("Input X must be a pandas DataFrame")

    if (
        self.means_ is None
        or self.stds_ is None
        or self.valid_columns_ is None
    ):
      raise ValueError("Transformer not fitted. Call fit() first.")

    # Ensure all specified columns exist in X
    missing_cols = [col for col in self.columns if col not in X.columns]

    if missing_cols:
      raise ValueError(f"Columns {missing_cols} not found in input DataFrame")

    if not self.valid_columns_:
      # If no valid columns, return copy of input DataFrame
      return X.copy()

    # Calculate Z-scores for valid columns
    z_scores = np.abs(
        (X[self.valid_columns_] - self.means_[self.valid_columns_])
        / self.stds_[self.valid_columns_]
    )

    # Create mask where all Z-scores are below threshold
    mask = (z_scores < self.threshold).all(axis=1)

    # Return DataFrame with non-outlier rows
    return X[mask].copy()

## Save Fitted Model

In [None]:
# Import the class from the module file written by the %%writefile cell
from zscore_transformer import ZScoreTransformer

# Same four engineered columns as in the outlier study above.
# NOTE(review): the threshold looks like the tuned "zscore_threshold" of
# that study's best trial - confirm. The study computes Z-scores with
# scipy's zscore, while this transformer uses pandas .std().
zscore_transformer = ZScoreTransformer(
    columns=[
        'publisher_avg_clicks',
        'market_avg_clicks',
        'publisher_popularity',
        'market_popularity'
    ],
    threshold=3.3248732323726267
)

# log_transformer_to_mlflow is a helper defined elsewhere in this notebook.
# NOTE(review): the next cell calls transform() on this instance, so the
# helper is assumed to fit it on df_train - verify.
log_transformer_to_mlflow(
    transformer=zscore_transformer,
    experiment_name="ZScore_Transformer",
    model_name="zscore_transformer",
    df=df_train,
    target_col=None
)

## Transform Train

In [None]:
# We do not remove outliers from the validation/test data
# Only df_train is filtered here, so its row count may shrink.
df_train = zscore_transformer.transform(df_train)

## Check Feature Distribution Mismatches

In [None]:
# Compare train and validation distributions of the columns in skewed_cols.
# check_feature_distribution_mismatches is a helper defined elsewhere in
# this notebook.
# NOTE(review): skewed_cols must be a column selector defined in an earlier
# cell; a later cell rebinds the same name to a Series of skewness values.
check_feature_distribution_mismatches(
    df_train[skewed_cols],
    df_val[skewed_cols],
    test_type="Kolmogorov-Smirnov"
)

# Skewed Features

Some numerical features in the dataset were **highly skewed**, which can negatively impact model performance, especially for tree-based and linear models.

To address this, several transformations were evaluated:

- **Methods tried:** `Yeo-Johnson`, `Logarithmic (log)`, `Square Root (sqrt)`, `Box-Cox`, `Reciprocal`, `None`
- **Purpose:** Reduce skewness in the data to improve model stability and predictive performance.

Evaluation on the validation set:

1. **Log transformation** produced the best results.
2. **Followed by:** Yeo-Johnson, Box-Cox, Square Root (sqrt) and Reciprocal.  

**Decision:** The final model used the **log transformation** on skewed features as it consistently reduced skewness and improved cross-validated RMSE.  

**Takeaway:** This approach ensures that features are closer to a normal distribution, which can improve model convergence, stability and overall predictive performance.


In [None]:
# Collect the int64/float64 columns of df_train; the next cell plots their
# distributions. The result is a pandas Index (no .tolist() here).
numeric_cols = (
    df_train
    .select_dtypes(include=['int64', 'float64'])
    .columns
)

In [None]:
# Plot distributions for continuous features
# One histogram with a KDE overlay per numeric column, each in its own figure
for col in numeric_cols:
  sns.histplot(df_train[col], kde=True)
  plt.title(f"Distribution of {col}")
  plt.show()

In [None]:
# Numeric feature columns (int64/float64) with the target removed, so the
# target is neither transformed nor used as a feature.
numeric_cols = (
    df_train
    .select_dtypes(include=['int64', 'float64'])
    .columns
    .drop([TARGET])
    .tolist()
)

print(f'Numerical columns: {numeric_cols}')
# Absolute skewness per numeric column.
# NOTE(review): this rebinds skewed_cols to a Series of skewness values; an
# earlier cell uses skewed_cols as a column selector (df_train[skewed_cols]),
# so re-running that cell after this one may not behave as intended.
skewed_cols = df_train[numeric_cols].skew().abs()
# Columns whose absolute skewness exceeds 0.5 are candidates for transformation
cols_to_transform = skewed_cols[skewed_cols > 0.5].index.tolist()
print(f'Skewed columns to transform: {cols_to_transform}')
X_train_full = df_train[numeric_cols]
X_val = df_val[numeric_cols]

# Study size and cross-validation layout (n_splits folds, n_repeats repeats)
n_trials = 500
n_splits = 5
n_repeats = 3
mlflow_experiment = "skewness_detection"

# SQLite storage configuration
study_name = f"{mlflow_experiment}_study"
storage = f"sqlite:///{PATH}/{study_name}.db"

def objective(trial):
  """
  Optuna objective for the skewness-transformation study.

  Each trial samples a transformation method for the skewed columns, a
  winsorization level for the target, and one set of XGBoost
  hyperparameters. The transformed training data is scored with repeated
  K-fold cross-validation.

  Parameters:
  trial : optuna.trial.Trial
  Trial used to sample hyperparameters and to report intermediate values.
  For the shift-based methods, the applied shift is stored in the trial's
  user attribute 'shift'.

  Returns:
  float
  Mean RMSE over all folds, measured on the log1p of the winsorized target.

  Raises:
  optuna.TrialPruned
  If the pruner decides to stop the trial early.
  """
  # Tune transformation method
  transform_method = trial.suggest_categorical(
      'transform_method',
      ['yeo-johnson', 'log', 'sqrt', 'box-cox', 'reciprocal', 'none']
  )

  yeo_johnson_standardize = (
      trial.suggest_categorical('yeo_johnson_standardize', [True, False])
      if transform_method == 'yeo-johnson'
      else False
  )

  box_cox_standardize = (
      trial.suggest_categorical('box_cox_standardize', [True, False])
      if transform_method == 'box-cox'
      else False
  )

  # Tune winsorization limits
  winsorize_upper = trial.suggest_float('winsorize_upper', 0.9, 1)

  # Apply winsorization to target
  y_train_full = np.log1p(
      mstats.winsorize(df_train[TARGET], limits=(0, 1-winsorize_upper))
  )

  # Apply transformation to training data
  X_train_transformed = X_train_full.copy()
  transformed_columns = [f'{col}_transformed' for col in cols_to_transform]

  if transform_method == 'yeo-johnson':
    skewness_transformer = PowerTransformer(
        method='yeo-johnson',
        standardize=yeo_johnson_standardize
    )

    X_train_transformed[transformed_columns] = (
        skewness_transformer
        .fit_transform(X_train_full[cols_to_transform])
    )

  elif transform_method == 'log':
    # One global shift (smallest value over all skewed columns) keeps the
    # log1p argument non-negative.
    min_value = X_train_full[cols_to_transform].min().min()
    shift = abs(min_value) + 1e-10 if min_value <= 0 else 0

    X_train_transformed[transformed_columns] = np.log1p(
        X_train_full[cols_to_transform] + shift
    )

    trial.set_user_attr('shift', shift)

  elif transform_method == 'sqrt':
    # sqrt accepts zero, so the shift is only needed for negative values
    min_value = X_train_full[cols_to_transform].min().min()
    shift = abs(min_value) if min_value < 0 else 0

    X_train_transformed[transformed_columns] = np.sqrt(
        X_train_full[cols_to_transform] + shift
    )

    trial.set_user_attr('shift', shift)

  elif transform_method == 'box-cox':
    # Box-Cox needs strictly positive input, hence the small epsilon
    min_value = X_train_full[cols_to_transform].min().min()
    shift = abs(min_value) + 1e-10 if min_value <= 0 else 0

    skewness_transformer = PowerTransformer(
        method='box-cox',
        standardize=box_cox_standardize
    )

    X_train_transformed[transformed_columns] = (
        skewness_transformer
        .fit_transform(X_train_full[cols_to_transform] + shift)
    )

    trial.set_user_attr('shift', shift)

  elif transform_method == 'reciprocal':
    shift = (
        1e-10
        if X_train_full[cols_to_transform].eq(0).any().any()
        else 0
    )

    # The extra 1e-10 in the denominator is applied whether or not a shift
    # was needed.
    X_train_transformed[transformed_columns] = (
        1 / (X_train_full[cols_to_transform] + shift + 1e-10)
    )

    trial.set_user_attr('shift', shift)

  # The original columns are replaced by their transformed versions
  if transform_method != 'none':
    X_train_transformed.drop(columns=cols_to_transform, inplace=True)

  # The model hyperparameters are constant within a trial, so they are
  # sampled once here instead of once per fold.
  xgb_params = dict(
      objective=trial.suggest_categorical(
          'objective',
          ['reg:squarederror']
      ),
      booster=trial.suggest_categorical('booster', ['gbtree', 'dart']),
      colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.2, 0.8),
      colsample_bynode=trial.suggest_float('colsample_bynode', 0.2, 0.8),
      colsample_bytree=trial.suggest_float('colsample_bytree', 0.2, 0.8),
      gamma=trial.suggest_float('gamma', 1e-3, 30, log=True),
      grow_policy=trial.suggest_categorical(
          'grow_policy',
          ['depthwise', 'lossguide']
      ),
      learning_rate=trial.suggest_float(
          'learning_rate',
          0.001,
          0.2,
          log=True
      ),
      max_bin=trial.suggest_int('max_bin', 128, 512),
      max_depth=trial.suggest_int('max_depth', 2, 6),
      min_child_weight=trial.suggest_float('min_child_weight', 1, 20),
      n_estimators=trial.suggest_int('n_estimators', 100, 3000),
      n_jobs=N_JOBS,
      random_state=RANDOM_STATE,
      sampling_method=trial.suggest_categorical(
          'sampling_method',
          ["uniform"]
      ),
      subsample=trial.suggest_float('subsample', 0.2, 0.8),
      tree_method=trial.suggest_categorical(
          'tree_method',
          ["auto", "hist"]
      ),
      base_score=trial.suggest_float('base_score', 0.1, 0.9),
      max_leaves=trial.suggest_int('max_leaves', 0, 16),
      num_parallel_tree=trial.suggest_int('num_parallel_tree', 1, 3),
      reg_alpha=trial.suggest_float('reg_alpha', 1e-6, 20, log=True),
      reg_lambda=trial.suggest_float('reg_lambda', 1e-6, 20, log=True),
      scale_pos_weight=trial.suggest_float('scale_pos_weight', 0.5, 1.5),
      validate_parameters=True,
      verbosity=0
  )

  # Repeated K-Fold Cross-Validation
  rkf = RepeatedKFold(
      n_splits=n_splits,
      n_repeats=n_repeats,
      random_state=RANDOM_STATE
  )

  rmse_list = []

  for fold_idx, (train_idx, val_idx) in enumerate(
      rkf.split(X_train_transformed)
  ):
    X_train, X_val_fold = (
        X_train_transformed.iloc[train_idx],
        X_train_transformed.iloc[val_idx]
    )

    y_train, y_val_fold = y_train_full[train_idx], y_train_full[val_idx]

    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val_fold)
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred))
    rmse_list.append(rmse)

    # Release fold-level objects before the next fold is built
    del X_train, X_val_fold, y_train, y_val_fold, y_pred, model
    gc.collect()

    # Report the running mean so the pruner can stop weak trials early
    trial.report(np.mean(rmse_list), step=fold_idx)

    if trial.should_prune():
      raise optuna.TrialPruned()

  return np.mean(rmse_list)

# Start MLflow experiment
mlflow.set_experiment(mlflow_experiment)

with mlflow.start_run():
  # - Ensure to use the same direction,
  # pruner (min_resource, max_resource, reduction_factor, etc.) if you are
  # loading an existing study
  study = optuna.create_study(
      study_name=study_name,
      storage=storage,
      direction="minimize",
      pruner=optuna.pruners.HyperbandPruner(
          min_resource=max(1, n_splits),
          max_resource=n_splits * n_repeats,
          reduction_factor=2
      ),
      sampler=optuna.samplers.TPESampler(
          n_startup_trials=max(1, int(n_trials * 0.2)),
          seed=RANDOM_STATE,
          multivariate=True
      ),
      load_if_exists=True
  )

  study.optimize(objective, n_trials=n_trials)
  log_optuna_best_trial_search_space(study)

  # Log Optuna storage details
  mlflow.log_param("optuna_storage", storage)
  mlflow.log_param("optuna_study_name", study_name)

  # Log experiment parameters
  mlflow.log_param("n_trials", n_trials)
  mlflow.log_param("n_splits", n_splits)
  mlflow.log_param("n_repeats", n_repeats)

  # Log cols_to_transform
  mlflow.log_param("cols_to_transform", ",".join(cols_to_transform))

  # Hyperparameter importances (fANOVA) over the completed trials
  fanova_importances = get_param_importances(
      study,
      evaluator=FanovaImportanceEvaluator(seed=RANDOM_STATE),
      target=(
          lambda t: t.value
          if t.state == optuna.trial.TrialState.COMPLETE
          else None
      ),
      normalize=True
  )

  fanova_importances = {k: round(v, 4) for k, v in fanova_importances.items()}

  for param, importance in fanova_importances.items():
    mlflow.log_param(f"fanova_{param}", importance)

  print(f'Fanova Hyperparameter Importances (rounded): {fanova_importances}')

  # Get best parameters and trial attributes
  best_params = study.best_params
  best_trial = study.best_trial

  for param, value in best_params.items():
    mlflow.log_param(param, value)

  mlflow.log_metric("best_cv_rmse", study.best_value)

  print("Best Params:", best_params)
  print("Best CV RMSE:", study.best_value)

  # Apply best winsorization and transformation to final training and validation sets
  y_train_full = np.log1p(mstats.winsorize(
      df_train[TARGET],
      limits=(0, 1-best_params['winsorize_upper'])
  ))

  # Compute winsorization thresholds from training data
  # Apply fixed thresholds to validation data
  y_val = np.log1p(np.clip(
      df_val[TARGET],
      0,
      np.percentile(df_train[TARGET], best_params['winsorize_upper'] * 100)
  ))

  # Apply best transformation to training data
  best_transform_method = best_params['transform_method']

  standardize = (
      best_params['yeo_johnson_standardize']
      if best_transform_method == 'yeo-johnson'
      else best_params.get('box_cox_standardize', False)
      if best_transform_method == 'box-cox'
      else False
  )

  X_final = X_train_full.copy()
  transformed_columns = [f'{col}_transformed' for col in cols_to_transform]
  # Shift recorded by the objective for the shift-based methods (0 otherwise)
  shift = best_trial.user_attrs.get('shift', 0)

  if best_transform_method == 'yeo-johnson':
    skewness_transformer = PowerTransformer(
        method='yeo-johnson',
        standardize=standardize
    )

    X_final[transformed_columns] = (
        skewness_transformer.fit_transform(X_train_full[cols_to_transform])
    )

  elif best_transform_method == 'log':
    X_final[transformed_columns] = (
        np.log1p(X_train_full[cols_to_transform] + shift)
    )

  elif best_transform_method == 'sqrt':
    X_final[transformed_columns] = (
        np.sqrt(X_train_full[cols_to_transform] + shift)
    )

  elif best_transform_method == 'box-cox':
    skewness_transformer = PowerTransformer(
        method='box-cox',
        standardize=standardize
    )

    X_final[transformed_columns] = (
        skewness_transformer
        .fit_transform(X_train_full[cols_to_transform] + shift)
    )

  elif best_transform_method == 'reciprocal':
    X_final[transformed_columns] = (
        1 / (X_train_full[cols_to_transform] + shift + 1e-10)
    )

  if best_transform_method != 'none':
    X_final.drop(columns=cols_to_transform, inplace=True)

  # Apply transformation to validation data, reusing the transformer and
  # shift obtained from the training data
  X_val_transformed = X_val.copy()
  if best_transform_method == 'yeo-johnson':
    X_val_transformed[transformed_columns] = (
        skewness_transformer.transform(X_val[cols_to_transform])
    )

  elif best_transform_method == 'log':
    X_val_transformed[transformed_columns] = (
        np.log1p(X_val[cols_to_transform] + shift)
    )

  elif best_transform_method == 'sqrt':
    X_val_transformed[transformed_columns] = (
        np.sqrt(X_val[cols_to_transform] + shift)
    )

  elif best_transform_method == 'box-cox':
    X_val_transformed[transformed_columns] = (
        skewness_transformer.transform(X_val[cols_to_transform] + shift)
    )

  elif best_transform_method == 'reciprocal':
    X_val_transformed[transformed_columns] = (
        1 / (X_val[cols_to_transform] + shift + 1e-10)
    )

  # Mirror the training branch: the original columns are dropped only when
  # they were replaced. With 'none' they remain the model's features, so
  # dropping them here would leave validation without those columns.
  if best_transform_method != 'none':
    X_val_transformed.drop(columns=cols_to_transform, inplace=True)

  # Train final model with best XGBoost parameters.
  # The parameter names are collected once instead of once per key.
  xgb_param_names = set(xgb.XGBRegressor().get_params())

  filtered_params = {
      k: v
      for k, v in best_params.items()
      if k in xgb_param_names
  }

  # all the constant hyperparameters should be added here manually and are not
  # part of study.best_params
  final_model = xgb.XGBRegressor(
      **filtered_params,
      n_jobs=N_JOBS,
      random_state=RANDOM_STATE,
      validate_parameters=True,
      verbosity=0
  )

  # Use the same (sorted) column order for fitting and prediction
  X_final = X_final[sorted(X_final.columns)]
  final_model.fit(X_final, y_train_full)
  X_val_transformed = X_val_transformed[sorted(X_val_transformed.columns)]
  y_pred = final_model.predict(X_val_transformed)

  evaluate_and_log_metrics(y_val, y_pred, prefix="val")

## Skew Log Transformer

In [None]:
%%writefile log_transformer.py
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class LogTransformer(BaseEstimator, TransformerMixin):
  """
  Replace skewed numeric columns with a shifted log1p version of themselves.

  fit() records which int64/float64 columns have an absolute skewness above
  `skew_threshold` together with one shift shared by all of them. transform()
  replaces every such column `col` by `col_transformed = log1p(col + shift)`.
  """

  def __init__(self, skew_threshold=0.5):
    """
    Initialize the transformer.

    Parameters:
    - skew_threshold : float, default=0.5
    Columns whose absolute skewness is strictly greater than this value are
    transformed.
    """
    self.skew_threshold = skew_threshold
    # Learned state: everything below stays None until fit() has been called
    self.numeric_cols_ = None
    self.cols_to_transform_ = None
    self.shift_ = None
    self.transformed_columns_ = None

  def fit(self, X, y=None):
    """
    Learn which columns are skewed and which shift makes them loggable.

    Parameters:
    - X : pandas DataFrame
    Input data to fit.
    - y : None
    Ignored. Included for scikit-learn compatibility.

    Returns:
    self : LogTransformer
    Returns the instance itself.

    Raises:
    ValueError if X is not a pandas DataFrame.
    """
    if not isinstance(X, pd.DataFrame):
      raise ValueError("Input X must be a pandas DataFrame.")

    # Only int64 / float64 columns are candidates
    numeric_frame = X.select_dtypes(include=['int64', 'float64'])
    self.numeric_cols_ = numeric_frame.columns.tolist()

    # Nothing numeric: record an empty fit and stop early
    if not self.numeric_cols_:
      self.cols_to_transform_ = []
      self.shift_ = 0
      self.transformed_columns_ = []
      return self

    # Absolute skewness per numeric column; keep the ones above the threshold
    abs_skew = X[self.numeric_cols_].skew().abs()
    above_threshold = abs_skew > self.skew_threshold
    self.cols_to_transform_ = abs_skew[above_threshold].index.tolist()

    # One shift for all selected columns: when the smallest value among them is
    # <= 0, shift so that this value ends up at 1e-10
    if not self.cols_to_transform_:
      self.shift_ = 0
    else:
      lowest = X[self.cols_to_transform_].min().min()
      if lowest <= 0:
        self.shift_ = abs(lowest) + 1e-10
      else:
        self.shift_ = 0

    # Names the transformed columns will carry
    self.transformed_columns_ = [
        f'{name}_transformed' for name in self.cols_to_transform_
    ]

    return self

  def transform(self, X):
    """
    Apply log1p(col + shift) to the columns selected during fit.

    Parameters:
    - X : pandas DataFrame
    Input data to transform.

    Returns:
    X_transformed : pandas DataFrame
    Copy of X in which every selected column that is present in X has been
    dropped and replaced by a `<col>_transformed` column.

    Raises:
    ValueError if X is not a pandas DataFrame or fit() has not been called.
    """
    if not isinstance(X, pd.DataFrame):
      raise ValueError("Input X must be a pandas DataFrame.")

    if self.cols_to_transform_ is None:
      raise ValueError("Transformer has not been fitted. Call fit() first.")

    result = X.copy()

    # Selected columns that are missing from X are skipped silently
    present = [name for name in self.cols_to_transform_ if name in X.columns]
    if not present:
      return result

    new_names = [f'{name}_transformed' for name in present]
    result[new_names] = np.log1p(X[present] + self.shift_)

    # The raw skewed columns are superseded by their transformed versions
    result.drop(columns=present, inplace=True)

    return result

In [None]:
# Copy the module written by the %%writefile cell into PATH.
# NOTE(review): copy_file_to_destination is defined elsewhere in the notebook;
# confirm whether it overwrites an existing destination file.
copy_file_to_destination(
    source_path = 'log_transformer.py',
    destination_path = PATH / 'log_transformer.py'
)

## Save Fitted Model

In [None]:
from log_transformer import LogTransformer

# Default configuration: skew_threshold=0.5
log_transformer = LogTransformer()

# NOTE(review): log_transformer_to_mlflow is defined outside this section;
# presumably it fits the transformer on df_train (without TARGET) and logs it
# to MLflow under the given experiment/model names - confirm.
log_transformer_to_mlflow(
    transformer=log_transformer,
    experiment_name="Log_Transformer",
    model_name="log_transformer",
    df=df_train,
    target_col=TARGET
)

## Transform Train and Test

In [None]:
# Rebind both frames to the values returned by apply_transformer.
# NOTE(review): apply_transformer is defined elsewhere; presumably it applies
# the fitted transformer to the feature columns and leaves TARGET untouched -
# confirm.
df_train, df_val = apply_transformer(
    df_train,
    df_val,
    log_transformer,
    TARGET
)

## Check Feature Distribution Mismatches

In [None]:
# Compare the train and validation distributions of the skewed features.
# NOTE(review): LogTransformer.transform drops every transformed column and
# adds `<col>_transformed` instead, so this lookup only works if skewed_cols
# (defined elsewhere) already holds the post-transform names - verify.
check_feature_distribution_mismatches(
    df_train[skewed_cols],
    df_val[skewed_cols],
    test_type="Kolmogorov-Smirnov"
)

# Boolean to String

## Boolean To String Transformer

In [None]:
%%writefile boolean_to_string_transformer.py
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class BooleanToStringTransformer(BaseEstimator, TransformerMixin):
  """
  Cast the configured column(s) to the pandas 'string' dtype.

  The transformer is stateless: fit() learns nothing and transform() works on
  a copy, so the input frame is never modified.
  """

  def __init__(self, column_name):
    """
    Parameters:
    - column_name : label (or list of labels)
    Column(s) of the input frame to cast to 'string'.
    """
    self.column_name = column_name

  def fit(self, X, y=None):
    """
    No-op; present for scikit-learn compatibility.

    Returns:
    self : BooleanToStringTransformer
    """
    return self

  def transform(self, X):
    """
    Return a copy of X with `column_name` converted to the 'string' dtype.
    """
    target = self.column_name
    converted = X.copy()
    converted[target] = converted[target].astype('string')
    return converted

In [None]:
# Copy the module written by the %%writefile cell into PATH.
# NOTE(review): copy_file_to_destination is defined elsewhere in the notebook;
# confirm whether it overwrites an existing destination file.
copy_file_to_destination(
    source_path = 'boolean_to_string_transformer.py',
    destination_path = PATH / 'boolean_to_string_transformer.py'
)

## Save Fitted Model

In [None]:
from boolean_to_string_transformer import BooleanToStringTransformer

# Cast 'is_weekend' to the 'string' dtype; the encoding section below selects
# its categorical columns with select_dtypes(include=['string'])
boolean_to_string_transformer = BooleanToStringTransformer(
    column_name = 'is_weekend'
)

# NOTE(review): log_transformer_to_mlflow is defined outside this section;
# presumably it fits the transformer on df_train (without TARGET) and logs it
# to MLflow under the given experiment/model names - confirm.
log_transformer_to_mlflow(
    transformer=boolean_to_string_transformer,
    experiment_name="Boolean_To_String",
    model_name="boolean_to_string_transformer",
    df=df_train,
    target_col=TARGET
)

## Transform Train and Test

In [None]:
# Rebind both frames to the values returned by apply_transformer.
# NOTE(review): apply_transformer is defined elsewhere; presumably it applies
# the transformer to the feature columns and leaves TARGET untouched - confirm.
df_train, df_val = apply_transformer(
    df_train,
    df_val,
    boolean_to_string_transformer,
    TARGET
)

# Encoding Categorical Features

Several categorical encoding techniques were evaluated to handle the non-numerical features:

- **Methods tried:** `Target Encoding`, `Leave-One-Out (LOO) Encoding`, `Binary Encoding`, `Frequency Encoding`, `Label Encoding`  
- **Purpose:** Convert categorical variables into numerical representations while preserving predictive information and avoiding data leakage.

Evaluation on the validation set:

1. **Target Encoding** performed the best, providing the lowest cross-validated RMSE.  
2. **Followed by:** Leave-One-Out (LOO) Encoding, Binary Encoding, Frequency Encoding and Label Encoding.

**Decision:** The final model uses **Target Encoding** for categorical variables.

**Takeaway:** This method captures the relationship between categorical features and the target variable more effectively than the other encodings, which improves model performance, especially in regression tasks.


In [None]:
# Note: Weight of Evidence (WoE) encoding is best suited to binary
# classification problems. It is not used here because this is a regression
# problem.

# Every column with the pandas 'string' dtype is treated as categorical
cols_to_encode = df_train.select_dtypes(include=['string']).columns.tolist()
print(f'Categorical columns: {cols_to_encode}')

# Feature matrices without the target column
X_train_full = df_train.drop(TARGET, axis=1)
X_val = df_val.drop(TARGET, axis=1)

# Optuna budget and the RepeatedKFold layout used inside the objective
n_trials = 500
n_splits = 5
n_repeats = 3
mlflow_experiment = "Encoding"

# Number of inner folds for k-fold target encoding
inner_k_splits = 5

# SQLite storage configuration: the study is persisted to a .db file under PATH
study_name = f"{mlflow_experiment}_study"
storage = f"sqlite:///{PATH}/{study_name}.db"

def frequency_encode(df, columns, df_val=None, noise_level=0.0):
  """
  Replace categorical columns by the relative frequency of each category.

  Frequencies are always learned from `df`. Each column `col` in `columns` is
  dropped and replaced by `col_encoded`.

  Parameters:
  - df : pandas DataFrame
  Training data; the source of the category frequencies.
  - columns : list
  Names of the columns to encode.
  - df_val : pandas DataFrame, default=None
  Optional second frame encoded with the training frequencies. Categories not
  seen in `df` receive the mean of the training frequencies.
  - noise_level : float, default=0.0
  When > 0, Gaussian noise with this standard deviation (drawn from the global
  NumPy random state) is added to the training encodings, which are then
  winsorized at the 1% level on both tails. `df_val` never receives noise.

  Returns:
  (df_encoded, df_val_encoded) : tuple
  Encoded copies of the inputs; the second element is None without `df_val`.
  """
  has_val = df_val is not None
  train_out = df.copy()
  val_out = df_val.copy() if has_val else None

  for name in columns:
    encoded_name = f'{name}_encoded'

    # Share of rows per category, from the training frame only
    shares = df[name].value_counts(normalize=True)

    train_out[encoded_name] = df[name].map(shares)

    if noise_level > 0:
      jitter = np.random.normal(0, noise_level, len(train_out))
      train_out[encoded_name] += jitter

      # winsorize is fed a plain float64 ndarray
      noisy = train_out[encoded_name].to_numpy(dtype=np.float64)
      train_out[encoded_name] = mstats.winsorize(noisy, limits=(0.01, 0.01))

    # The raw categorical column is superseded by its encoding
    train_out = train_out.drop(columns=[name])

    if has_val:
      val_out[encoded_name] = df_val[name].map(shares).fillna(shares.mean())
      val_out = val_out.drop(columns=[name])

  return train_out, val_out

def kfold_target_encode(
    X_train,
    y_train,
    X_val_fold,
    columns,
    smoothing,
    df_val=None,
    n_splits=5
):
  """
  Apply k-fold target encoding to specified columns to prevent data leakage.

  Training rows receive out-of-fold encodings: the rows are split into
  `n_splits` inner folds and each fold is encoded with smoothed category means
  computed on the remaining folds. `X_val_fold` and `df_val` are encoded with
  smoothed category means computed on all of `X_train`. Categories without a
  learned encoding fall back to the mean of `y_train`. Each column `col` is
  dropped and replaced by `col_encoded`.

  Parameters:
  - X_train : pandas DataFrame
  Training features.
  - y_train : pandas Series
  Training target. It is grouped by columns of `X_train`, so it should carry
  the same index as `X_train`.
  - X_val_fold : pandas DataFrame
  Held-out fold to encode with the full-training statistics.
  - columns : list
  Names of the categorical columns to encode.
  - smoothing : float
  Weight of the global mean in the smoothed category mean.
  - df_val : pandas DataFrame, default=None
  Optional extra frame encoded like `X_val_fold`.
  - n_splits : int, default=5
  Number of inner folds.

  Returns:
  (X_train_encoded, X_val_encoded, df_val_encoded) : tuple
  Encoded copies of the inputs; the last element is None without `df_val`.

  Raises:
  ValueError if a column is missing from any of the frames or `df_val` is
  empty.
  """
  global_mean = y_train.mean()

  # Validate input columns
  for col in columns:
    if col not in X_train.columns:
      raise ValueError(f"Column '{col}' not found in X_train")
    if col not in X_val_fold.columns:
      raise ValueError(f"Column '{col}' not found in X_val_fold")
    if df_val is not None and col not in df_val.columns:
      raise ValueError(f"Column '{col}' not found in df_val")

  # Check for empty validation set before doing any copying
  if df_val is not None and len(df_val) == 0:
    raise ValueError("Validation set (df_val) is empty")

  def smoothed_category_means(categories, target):
    # Category mean pulled towards the global mean; `smoothing` acts as the
    # number of pseudo-observations carrying the global mean
    target_means = target.groupby(categories).mean()
    category_counts = categories.value_counts()
    return (
        (category_counts * target_means + smoothing * global_mean)
        / (category_counts + smoothing)
    )

  X_train_encoded = X_train.copy()
  X_val_encoded = X_val_fold.copy()
  df_val_encoded = df_val.copy() if df_val is not None else None

  # The splitter does not depend on the column, so it is built once.
  # Assumes RANDOM_STATE is an int seed, in which case every split() call
  # yields the same folds.
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

  for col in columns:
    encoded_name = f'{col}_encoded'

    # Out-of-fold encodings for the training rows, filled fold by fold
    train_encodings = np.zeros(len(X_train))

    for train_idx, val_idx in kf.split(X_train):
      X_inner_train = X_train.iloc[train_idx]
      X_inner_val = X_train.iloc[val_idx]
      y_inner_train = y_train.iloc[train_idx]

      weighted_mean = smoothed_category_means(
          X_inner_train[col],
          y_inner_train
      )

      # Map to inner validation fold
      train_encodings[val_idx] = (
          X_inner_val[col]
          .map(weighted_mean)
          .fillna(global_mean)
      )

    X_train_encoded[encoded_name] = (
        pd.Series(train_encodings, index=X_train.index)
        .fillna(global_mean)
    )

    # Held-out data is encoded with statistics from the whole training set
    weighted_mean = smoothed_category_means(X_train[col], y_train)

    X_val_encoded[encoded_name] = (
        X_val_fold[col]
        .map(weighted_mean)
        .fillna(global_mean)
    )

    if df_val is not None:
      df_val_encoded[encoded_name] = (
          df_val[col]
          .map(weighted_mean)
          .fillna(global_mean)
      )

    # Drop original categorical column
    X_train_encoded = X_train_encoded.drop(columns=[col])
    X_val_encoded = X_val_encoded.drop(columns=[col])

    if df_val is not None:
      df_val_encoded = df_val_encoded.drop(columns=[col])

  return X_train_encoded, X_val_encoded, df_val_encoded

def loo_encode(X_train, y_train, X_val_fold, columns, df_val=None, sigma=0.0):
  """
  Encode `columns` with category_encoders' LeaveOneOutEncoder.

  The encoder is fitted on (`X_train`, `y_train`) and then applied, without a
  target, to `X_val_fold` and, when given, to `df_val`.

  Parameters:
  - X_train, y_train : training features and target.
  - X_val_fold : held-out fold to encode.
  - columns : list of column names to encode.
  - df_val : optional extra frame to encode, default=None.
  - sigma : float, default=0.0
  Forwarded to LeaveOneOutEncoder as its noise parameter.

  Returns:
  (X_train_encoded, X_val_encoded, df_val_encoded) : tuple
  The last element is None when `df_val` is not given.
  """
  encoder = LeaveOneOutEncoder(
      cols=columns,
      sigma=sigma,
      random_state=RANDOM_STATE
  )

  train_out = encoder.fit_transform(X_train, y_train)
  fold_out = encoder.transform(X_val_fold)

  if df_val is None:
    return train_out, fold_out, None

  return train_out, fold_out, encoder.transform(df_val)

def binary_encode(X_train, columns, df_val=None):
  """
  Encode `columns` with category_encoders' BinaryEncoder.

  The encoder is fitted on `X_train` and, when `df_val` is given, applied to
  it as well.

  Returns:
  (X_train_encoded, df_val_encoded) : tuple
  The second element is None when `df_val` is not given.
  """
  encoder = BinaryEncoder(cols=columns, return_df=True)
  train_out = encoder.fit_transform(X_train)

  if df_val is None:
    return train_out, None

  return train_out, encoder.transform(df_val)

def label_encode(X_train, columns, df_val=None):
  """
  Apply Label Encoding to specified columns.

  One LabelEncoder is fitted per column on `X_train`. Each column `col` is
  dropped and replaced by `col_encoded`. Values of `df_val` that were not seen
  during fitting are encoded as -1.

  Parameters:
  - X_train : pandas DataFrame
  Training data used to fit the encoders.
  - columns : list
  Names of the columns to encode.
  - df_val : pandas DataFrame, default=None
  Optional frame encoded with the fitted encoders.

  Returns:
  (X_train_encoded, df_val_encoded, label_encoders) : tuple
  Encoded copies of the inputs (the second is None without `df_val`) and a
  dict mapping each column name to its fitted LabelEncoder.
  """
  X_train_encoded = X_train.copy()
  df_val_encoded = df_val.copy() if df_val is not None else None
  label_encoders = {}

  for col in columns:
    encoded_name = f'{col}_encoded'

    le = LabelEncoder()
    X_train_encoded[encoded_name] = le.fit_transform(X_train[col])
    label_encoders[col] = le
    X_train_encoded = X_train_encoded.drop(columns=[col])

    if df_val is not None:
      # A class's label is its position in the sorted classes_ array, so one
      # dict lookup per value replaces a transform() call plus a linear
      # membership scan for every row
      class_to_code = {cls: code for code, cls in enumerate(le.classes_)}

      # Handle unseen categories in validation set: they map to NaN, then -1
      df_val_encoded[encoded_name] = (
          df_val[col]
          .map(class_to_code)
          .fillna(-1)
          .astype('int64')
      )

      df_val_encoded = df_val_encoded.drop(columns=[col])

  return X_train_encoded, df_val_encoded, label_encoders

def objective(trial):
  """
  Optuna objective for the encoding study.

  Samples an upper winsorization limit for the target, a categorical encoding
  method (plus its parameter, if it has one) and XGBoost hyperparameters, then
  returns the mean RMSE over the RepeatedKFold splits of X_train_full. The
  target is winsorized and log1p-transformed, so the RMSE is on the log1p
  scale. The running mean is reported after every fold so the study's pruner
  can stop the trial early.

  Reads the notebook-level names df_train, TARGET, X_train_full,
  cols_to_encode, n_splits, n_repeats, inner_k_splits, N_JOBS and
  RANDOM_STATE.
  """
  # Tune winsorization limits
  winsorize_upper = trial.suggest_float('winsorize_upper', 0.9, 1)

  # Only the upper tail is winsorized (lower limit 0), then log1p is applied
  y_train_full = np.log1p(mstats.winsorize(
      df_train[TARGET].to_numpy(dtype=np.float64),
      limits=(0, 1-winsorize_upper)
  ))

  # Choose encoding method
  encoding_method = trial.suggest_categorical(
      'encoding_method',
      ['frequency', 'target', 'loo', 'binary', 'label']
  )

  # Tune encoding parameters
  if encoding_method == 'target':
    smoothing = trial.suggest_float('target_smoothing', 0.1, 10.0, log=True)
  elif encoding_method == 'frequency':
    noise_level = trial.suggest_float('freq_noise_level', 0.0, 0.1)
  elif encoding_method == 'loo':
    sigma = trial.suggest_float('loo_sigma', 0.0, 0.1)  # Noise for LOO
  else:
    # binary and label encoding have no tunable parameters
    pass

  # Repeated K-Fold Cross-Validation
  rkf = RepeatedKFold(
      n_splits=n_splits,
      n_repeats=n_repeats,
      random_state=RANDOM_STATE
  )

  rmse_list = []

  for fold_idx, (train_idx, val_idx) in enumerate(rkf.split(X_train_full)):
    X_train, X_val_fold = X_train_full.iloc[train_idx], X_train_full.iloc[val_idx]
    y_train, y_val_fold = y_train_full[train_idx], y_train_full[val_idx]

    # Apply encoding based on method; every encoder is fitted on the training
    # part of the fold only
    if encoding_method == 'frequency':
      X_train_encoded, X_val_encoded = frequency_encode(
          X_train,
          cols_to_encode,
          X_val_fold,
          noise_level=noise_level
      )
    elif encoding_method == 'target':
      # Give the target the index of X_train so the encoder can group it by
      # the feature columns
      y_train_series = pd.Series(y_train, index=X_train.index, name=TARGET)

      X_train_encoded, X_val_encoded, _ = kfold_target_encode(
          X_train,
          y_train_series,
          X_val_fold,
          cols_to_encode,
          smoothing=smoothing,
          n_splits=inner_k_splits
      )
    elif encoding_method == 'loo':
      y_train_series = pd.Series(y_train, index=X_train.index, name=TARGET)

      X_train_encoded, X_val_encoded, _ = loo_encode(
          X_train,
          y_train_series,
          X_val_fold,
          cols_to_encode,
          sigma=sigma
      )
    elif encoding_method == 'binary':
      X_train_encoded, X_val_encoded = binary_encode(
          X_train,
          cols_to_encode,
          X_val_fold
      )
    elif encoding_method == 'label':
      X_train_encoded, X_val_encoded, _ = label_encode(
          X_train,
          cols_to_encode,
          X_val_fold
      )

    # NOTE(review): the suggest_* calls below sit inside the fold loop; Optuna
    # is expected to return the already-sampled value when a parameter name is
    # suggested again within the same trial - confirm.
    model = xgb.XGBRegressor(
        objective=trial.suggest_categorical('objective', ['reg:squarederror']),
        booster=trial.suggest_categorical('booster', ['gbtree', 'dart']),
        colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.2, 0.8),
        colsample_bynode=trial.suggest_float('colsample_bynode', 0.2, 0.8),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.2, 0.8),
        gamma=trial.suggest_float('gamma', 1e-1, 30, log=True),
        grow_policy=trial.suggest_categorical(
            'grow_policy',
            ['depthwise', 'lossguide']
        ),
        learning_rate=trial.suggest_float(
            'learning_rate',
            0.001,
            0.2,
            log=True
        ),
        max_bin=trial.suggest_int('max_bin', 128, 512),
        max_depth=trial.suggest_int('max_depth', 2, 6),
        min_child_weight=trial.suggest_float('min_child_weight', 1, 20),
        n_estimators=trial.suggest_int('n_estimators', 100, 3000),
        n_jobs=N_JOBS,
        random_state=RANDOM_STATE,
        sampling_method=trial.suggest_categorical(
            'sampling_method',
            ["uniform"]
        ),
        subsample=trial.suggest_float('subsample', 0.2, 0.8),
        tree_method=trial.suggest_categorical('tree_method', ["auto", "hist"]),
        base_score=trial.suggest_float('base_score', 0.1, 0.9),
        max_leaves=trial.suggest_int('max_leaves', 0, 16),
        num_parallel_tree=trial.suggest_int('num_parallel_tree', 1, 3),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-1, 20, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-1, 20, log=True),
        scale_pos_weight=trial.suggest_float('scale_pos_weight', 0.5, 1.5),
        validate_parameters=True,
        verbosity=0
    )

    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_val_encoded)
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred))
    rmse_list.append(rmse)

    # Release the per-fold objects before the next fold allocates its own
    del X_train, X_val_fold, y_train, y_val_fold, y_pred, model
    gc.collect()

    # Report the running mean RMSE; the step is the 0-based split index
    trial.report(np.mean(rmse_list), step=fold_idx)

    if trial.should_prune():
      raise optuna.TrialPruned()

  return np.mean(rmse_list)

# Start MLflow experiment
mlflow.set_experiment(mlflow_experiment)

with mlflow.start_run():
  # When loading an existing study (load_if_exists=True), make sure to use the
  # same direction and pruner settings (min_resource, max_resource,
  # reduction_factor, etc.) it was created with
  study = optuna.create_study(
      study_name=study_name,
      storage=storage,
      direction="minimize",
      pruner=optuna.pruners.HyperbandPruner(
          min_resource=max(1, n_splits),
          max_resource=n_splits * n_repeats,
          reduction_factor=2
      ),
      sampler=optuna.samplers.TPESampler(
          n_startup_trials=max(1, int(n_trials * 0.2)),
          seed=RANDOM_STATE,
          multivariate=True
      ),
      load_if_exists=True
  )

  study.optimize(objective, n_trials=n_trials)

  # NOTE(review): helper defined elsewhere in the notebook - confirm what it
  # logs
  log_optuna_best_trial_search_space(study)

  # Log Optuna storage details
  mlflow.log_param("optuna_storage", storage)
  mlflow.log_param("optuna_study_name", study_name)

  # Log experiment parameters
  mlflow.log_param("n_trials", n_trials)
  mlflow.log_param("n_splits", n_splits)
  mlflow.log_param("n_repeats", n_repeats)

  # Log cols_to_encode
  mlflow.log_param("cols_to_encode", ",".join(cols_to_encode))

  # fANOVA importance of every hyperparameter for the objective value
  fanova_importances = get_param_importances(
      study,
      evaluator=FanovaImportanceEvaluator(seed=RANDOM_STATE),
      target=(
          lambda t: t.value
          if t.state == optuna.trial.TrialState.COMPLETE
          else None
      ),
      normalize=True
  )

  fanova_importances = {k: round(v, 4) for k, v in fanova_importances.items()}

  for param, importance in fanova_importances.items():
    mlflow.log_param(f"fanova_{param}", importance)

  print(f'Fanova Hyperparameter Importances (rounded): {fanova_importances}')

  # Get best parameters and trial attributes
  best_params = study.best_params
  best_trial = study.best_trial

  for param, value in best_params.items():
    mlflow.log_param(param, value)

  mlflow.log_metric("best_cv_rmse", study.best_value)

  print("Best Params:", best_params)
  print("Best CV RMSE:", study.best_value)

  # Apply best winsorization and transformation to final training and validation
  # sets
  y_train_full = np.log1p(mstats.winsorize(
      df_train[TARGET].to_numpy(dtype=np.float64),
      limits=(0, 1-best_params['winsorize_upper'])
  ))

  # Compute winsorization thresholds from training data
  # Apply fixed thresholds to validation data: values are clipped to
  # [0, training percentile] before log1p
  y_val = np.log1p(np.clip(
      df_val[TARGET],
      0,
      np.percentile(df_train[TARGET], best_params['winsorize_upper'] * 100)
  ))

  # Apply best encoding to final training and validation sets
  if best_params['encoding_method'] == 'frequency':
    X_final, X_val_encoded = frequency_encode(
        X_train_full,
        cols_to_encode,
        X_val,
        noise_level=best_params['freq_noise_level']
    )

  elif best_params['encoding_method'] == 'target':
    y_train_series = pd.Series(
        y_train_full,
        index=X_train_full.index,
        name=TARGET
    )

    # X_train_full is also passed as X_val_fold only to satisfy the signature;
    # that second return value is discarded. The validation frame goes in as
    # df_val.
    X_final, _, X_val_encoded = kfold_target_encode(
        X_train_full,
        y_train_series,
        X_train_full,
        cols_to_encode,
        smoothing=best_params['target_smoothing'],
        df_val=X_val,
        n_splits=inner_k_splits
    )

  elif best_params['encoding_method'] == 'loo':
    y_train_series = pd.Series(
        y_train_full,
        index=X_train_full.index,
        name=TARGET
    )

    # Same calling pattern as the target-encoding branch above
    X_final, _, X_val_encoded = loo_encode(
        X_train_full,
        y_train_series,
        X_train_full,
        cols_to_encode,
        df_val=X_val,
        sigma=best_params['loo_sigma']
    )

  elif best_params['encoding_method'] == 'binary':
    X_final, X_val_encoded = binary_encode(
        X_train_full,
        cols_to_encode,
        X_val
    )

  elif best_params['encoding_method'] == 'label':
    X_final, X_val_encoded, _ = label_encode(
        X_train_full,
        cols_to_encode,
        X_val
    )

  # Train final model with best XGBoost parameters: keep only the entries of
  # best_params that are XGBRegressor parameters (this drops winsorize_upper,
  # encoding_method and the encoder settings)
  filtered_params = {
      k: v for k, v in best_params.items()
      if k in xgb.XGBRegressor().get_params()
  }

  # all the constant hyperparameters should be added here manually and are not
  # part of study.best_params
  final_model = xgb.XGBRegressor(
      **filtered_params,
      n_jobs=N_JOBS,
      random_state=RANDOM_STATE,
      validate_parameters=True,
      verbosity=0
  )

  # Sort the columns of both frames by name so training and prediction see the
  # features in the same order
  X_final = X_final[sorted(X_final.columns)]
  final_model.fit(X_final, y_train_full)
  X_val_encoded = X_val_encoded[sorted(X_val_encoded.columns)]
  y_pred = final_model.predict(X_val_encoded)

  # NOTE(review): y_val and y_pred are both on the log1p scale here; confirm
  # that evaluate_and_log_metrics (defined elsewhere) expects that
  evaluate_and_log_metrics(y_val, y_pred, prefix="val")

## Target Encoder Transformer

In [None]:
%%writefile target_encoder_transformer.py
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, TransformerMixin

RANDOM_STATE = 42

class TargetEncoderTransformer(BaseEstimator, TransformerMixin):
  """
  Scikit-learn compatible class for k-fold target encoding.

  transform() has two modes:
  - with `y`: rows receive out-of-fold encodings, so a row's own target never
    contributes to its encoded value (used by fit_transform on training data);
  - without `y`: rows are encoded with the statistics stored by fit().

  Each column `col` listed in `columns` is dropped and replaced by
  `col_encoded`.
  """
  def __init__(
      self,
      columns,
      smoothing=1.0,
      n_splits=5,
      random_state=RANDOM_STATE
  ):
    """
    Initialize the TargetEncoderTransformer.

    Parameters:
    - columns : list
    List of column names to apply target encoding.
    - smoothing : float, default=1.0
    Smoothing parameter to balance category mean and global mean.
    - n_splits : int, default=5
    Number of folds for k-fold encoding.
    - random_state : int, default=RANDOM_STATE (42)
    Random state for KFold reproducibility.
    """
    self.columns = columns
    self.smoothing = smoothing
    self.n_splits = n_splits
    self.random_state = random_state
    # Learned state; global_mean_ stays None until fit() is called
    self.global_mean_ = None
    self.target_means_ = {}
    self.category_counts_ = {}

  def _smoothed_means(self, category_counts, target_means, prior):
    """
    Blend per-category target means with `prior`.

    `smoothing` acts as the number of pseudo-observations that carry the
    prior, so rare categories are pulled towards it more strongly.
    """
    return (
        (category_counts * target_means + self.smoothing * prior)
        / (category_counts + self.smoothing)
    )

  def fit(self, X, y):
    """
    Fit the encoder using training data.

    Stores the global target mean plus, per column, the target mean and the
    row count of every category. These are the statistics transform() uses
    when it is called without `y`.

    Parameters:
    - X : pandas DataFrame
    Training data with categorical columns.
    - y : pandas Series or numpy array
    Target variable. A Series is grouped by columns of X, so it should carry
    the same index as X; an array is given X's index.

    Returns:
    self : TargetEncoderTransformer
    Fitted encoder.

    Raises:
    ValueError if X or y has the wrong type or a column is missing from X.
    """

    # Validate inputs
    if not isinstance(X, pd.DataFrame):
      raise ValueError("X must be a pandas DataFrame")
    if not isinstance(y, (pd.Series, np.ndarray)):
      raise ValueError("y must be a pandas Series or numpy array")

    y = pd.Series(y, index=X.index) if isinstance(y, np.ndarray) else y

    for col in self.columns:
      if col not in X.columns:
        raise ValueError(f"Column '{col}' not found in X")

    # Store global mean for test data encoding
    self.global_mean_ = np.mean(y)

    # Start from empty dicts so that refitting (e.g. after `columns` changed)
    # never keeps statistics from a previous fit
    self.target_means_ = {}
    self.category_counts_ = {}

    # Store target means and counts for each column (for test data)
    for col in self.columns:
      self.target_means_[col] = y.groupby(X[col]).mean()
      self.category_counts_[col] = X[col].value_counts()

    return self

  def transform(self, X, y=None):
    """
    Transform data using fitted encoder.

    Parameters:
    - X : pandas DataFrame
    Data to transform.
    - y : pandas Series or numpy array, default=None
    Target variable. When given, X is encoded out-of-fold with statistics
    computed from X and y themselves (training mode). When None, the
    statistics stored by fit() are used (inference mode).

    Returns:
    X_encoded : pandas DataFrame
    Transformed data with encoded columns.

    Raises:
    ValueError if X or y has the wrong type, a column is missing from X, or
    the encoder is used in inference mode before fit().
    """

    if not isinstance(X, pd.DataFrame):
      raise ValueError("X must be a pandas DataFrame")

    for col in self.columns:
      if col not in X.columns:
        raise ValueError(f"Column '{col}' not found in X")

    X_encoded = X.copy()

    # Training mode: out-of-fold encodings
    if y is not None:
      if not isinstance(y, (pd.Series, np.ndarray)):
        raise ValueError("y must be a pandas Series or numpy array")

      y = pd.Series(y, index=X.index) if isinstance(y, np.ndarray) else y

      # One positional buffer per column, filled fold by fold
      train_encodings = {col: np.zeros(len(X)) for col in self.columns}

      kf = KFold(
          n_splits=self.n_splits,
          shuffle=True,
          random_state=self.random_state
      )

      for train_idx, val_idx in kf.split(X):
        X_inner_train, X_inner_val = X.iloc[train_idx], X.iloc[val_idx]
        y_inner_train = y.iloc[train_idx]

        # Compute fold-specific global mean
        fold_global_mean = np.mean(y_inner_train)

        for col in self.columns:
          weighted_mean = self._smoothed_means(
              X_inner_train[col].value_counts(),
              y_inner_train.groupby(X_inner_train[col]).mean(),
              fold_global_mean
          )

          # Categories absent from the inner training folds fall back to the
          # fold's global mean
          train_encodings[col][val_idx] = (
              X_inner_val[col]
              .map(weighted_mean)
              .fillna(fold_global_mean)
          )

      # Assign each finished column once and by position. A label-based
      # `.loc[X.index[val_idx], ...]` write per fold breaks when X has a
      # non-unique index.
      for col in self.columns:
        X_encoded[f'{col}_encoded'] = train_encodings[col]
    else:
      # Inference mode: needs the statistics stored by fit()
      if self.global_mean_ is None:
        raise ValueError("Transformer has not been fitted. Call fit() first.")

      for col in self.columns:
        weighted_mean = self._smoothed_means(
            self.category_counts_[col],
            self.target_means_[col],
            self.global_mean_
        )

        # Categories not seen during fit fall back to the global mean
        X_encoded[f'{col}_encoded'] = (
            X[col]
            .map(weighted_mean)
            .fillna(self.global_mean_)
        )

    # Drop original categorical columns
    X_encoded = X_encoded.drop(columns=self.columns)

    return X_encoded

  def fit_transform(self, X, y):
    """
    Fit the encoder and transform the training data.

    The training data is encoded out-of-fold (transform is called with `y`).

    Parameters:
    - X : pandas DataFrame
    Training data with categorical columns.
    - y : pandas Series or numpy array
    Target variable.

    Returns:
    X_encoded : pandas DataFrame
    Transformed training data.
    """

    return self.fit(X, y).transform(X, y)

In [None]:
# Copy the module written by the %%writefile cell into PATH.
# NOTE(review): copy_file_to_destination is defined elsewhere in the notebook;
# confirm whether it overwrites an existing destination file.
copy_file_to_destination(
    source_path = 'target_encoder_transformer.py',
    destination_path = PATH / 'target_encoder_transformer.py'
)

## Save Fitted Model

In [None]:
from target_encoder_transformer import TargetEncoderTransformer

# Encode every 'string'-dtype column of df_train.
# NOTE(review): the smoothing value is hard-coded; presumably it is the best
# `target_smoothing` found by the "Encoding" Optuna study - confirm and keep
# the two in sync.
target_encoder_transformer = TargetEncoderTransformer(
    columns = df_train.select_dtypes(include=['string']).columns.tolist(),
    smoothing=0.2922108633966015
)

# NOTE(review): log_transformer_to_mlflow is defined outside this section;
# presumably it fits the transformer on df_train (using TARGET as y) and logs
# it to MLflow under the given experiment/model names - confirm.
log_transformer_to_mlflow(
    transformer=target_encoder_transformer,
    experiment_name="Target_Encoder_Transformer",
    model_name="target_encoder_transformer",
    df=df_train,
    target_col=TARGET
)

## Transform Train and Test

In [None]:
# Rebind both frames to the values returned by apply_transformer.
# NOTE(review): apply_transformer is defined elsewhere. TargetEncoderTransformer
# encodes out-of-fold only when transform() receives y, so confirm that the
# helper passes y for df_train and omits it for df_val.
df_train, df_val = apply_transformer(
    df_train,
    df_val,
    target_encoder_transformer,
    TARGET
)

## Check Feature Distribution Mismatches

In [None]:
# TargetEncoderTransformer.transform drops every column in cols_to_encode and
# adds `<col>_encoded` in its place, so the comparison has to select the
# encoded names; indexing with the original names raises KeyError once the
# frames have been transformed.
encoded_feature_cols = [f'{col}_encoded' for col in cols_to_encode]

# The encoded features are numeric, so the two-sample Kolmogorov-Smirnov
# option is used instead of the categorical Chi-squared one.
# NOTE(review): check_feature_distribution_mismatches is defined elsewhere -
# confirm this is the intended test for target-encoded columns.
check_feature_distribution_mismatches(
    df_train[encoded_feature_cols],
    df_val[encoded_feature_cols],
    test_type="Kolmogorov-Smirnov"
)

# Scaling Numerical Features

Multiple scaling techniques were tested to transform the numerical features:

- **Methods tried:** `StandardScaler`, `RobustScaler`, `MinMaxScaler`, `MaxAbsScaler`, `QuantileTransformer`, `Normalizer`, `log1p`  
- **Purpose:** Ensure numerical features are on comparable scales, reduce the effect of outliers and improve model convergence.

Evaluation on validation data:

1. **StandardScaler** achieved the best performance in terms of cross-validated RMSE.  
2. **Followed by:** RobustScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer, Normalizer, and log1p transformation.

**Decision:** The final model uses **StandardScaler** for scaling numerical features.

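**Takeaway:** This approach standardizes features to have zero mean and unit variance, which is generally effective for tree-based and gradient-boosting models in regression tasks (although such models are largely insensitive to monotonic feature scaling, so the gains from scaling are usually modest).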

In [None]:
# Columns to scale: every int64/float64 column except the target.
# NOTE: drop(columns=[TARGET]) raises KeyError if TARGET is not one of the
# selected int64/float64 columns.
columns_to_scale = (
    df_train
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=[TARGET])
    .columns
    .tolist()
)

print(f'Columns to scale: {columns_to_scale}')

# Feature matrices without the target column
X_train_full = df_train.drop(TARGET, axis=1)
X_val = df_val.drop(TARGET, axis=1)

# Optuna budget and the RepeatedKFold layout used inside the objective
n_trials = 500
n_splits = 5
n_repeats = 3
mlflow_experiment = "Scaling"

# SQLite storage configuration: the study is persisted to a .db file under PATH
study_name = f"{mlflow_experiment}_study"
storage = f"sqlite:///{PATH}/{study_name}.db"

def apply_log1p(X, columns, clip_negative=False):
  """
  Replace each listed column by log1p of its values, floored at zero.

  Every column `col` in `columns` is dropped and replaced by `col_scaled`,
  holding log1p(max(value, 0)). The input frame is not modified.

  Parameters:
  - X : pandas DataFrame
  Input data.
  - columns : list
  Names of the columns to transform.
  - clip_negative : bool, default=False
  Currently has no effect: negative values are floored at zero regardless of
  this flag.

  Returns:
  X_transformed : pandas DataFrame
  Copy of X with the listed columns replaced.
  """
  result = X.copy()

  for name in columns:
    # Negative values are raised to zero so log1p is always defined
    floored = np.maximum(X[name].to_numpy(), 0)
    result[f"{name}_scaled"] = np.log1p(floored)
    result = result.drop(columns=[name])

  return result

def objective(trial):
  """
  Optuna objective for the scaling study.

  For one trial it samples the upper winsorization limit of the target, a
  scaling method (plus that method's parameters) and the XGBoost
  hyperparameters, then scores the combination with repeated K-fold
  cross-validation on the training data.

  Reads the notebook globals df_train, TARGET, X_train_full,
  columns_to_scale, n_splits, n_repeats, RANDOM_STATE and N_JOBS.

  Parameters:
  - trial : optuna.trial.Trial
    Trial used to sample parameters, report intermediate values and query
    the pruner.

  Returns:
  Mean RMSE over all evaluated folds, measured on the log1p-transformed,
  winsorized target.

  Raises:
  optuna.TrialPruned when the pruner asks to stop after a fold.
  """
  # Tune winsorization limits
  winsorize_upper = trial.suggest_float('winsorize_upper', 0.9, 1)

  # NOTE(review): the target is winsorized once on the full training set,
  # before the folds are split, so the cap also depends on held-out rows.
  y_train_full = np.log1p(mstats.winsorize(
      df_train[TARGET].to_numpy(dtype=np.float64),
      limits=(0, 1-winsorize_upper)
  ))

  # Choose scaling method
  scaling_method = trial.suggest_categorical(
      'scaling_method',
      [
          'standard',
          'robust',
          'minmax',
          'maxabs',
          'quantile',
          'normalizer',
          'log1p'
      ]
  )

  # Tune scaler-specific parameters ('standard' and 'maxabs' have none)
  scaler_params = {}

  if scaling_method == 'robust':
    # Symmetric quantile range: (q, 100 - q)
    quantile_lower = trial.suggest_float('robust_quantile_lower', 10, 40)
    scaler_params['quantile_range'] = (quantile_lower, 100 - quantile_lower)

    scaler_params['unit_variance'] = trial.suggest_categorical(
        'robust_unit_variance',
         [True, False]
    )

  elif scaling_method == 'minmax':
    # Output range always has width 1: (min, min + 1)
    feature_range_min = trial.suggest_float('minmax_feature_range_min', -1, 0)

    scaler_params['feature_range'] = (
        feature_range_min,
        feature_range_min + 1
    )

    scaler_params['clip'] = trial.suggest_categorical(
        'minmax_clip',
          [True, False]
    )

  elif scaling_method == 'quantile':
    scaler_params['n_quantiles'] = trial.suggest_int(
        'quantile_n_quantiles', 100, 1000
    )

    scaler_params['output_distribution'] = trial.suggest_categorical(
        'quantile_output_distribution', ['uniform', 'normal']
    )

    scaler_params['subsample'] = trial.suggest_int(
        'quantile_subsample', int(1e4), int(1e6)
    )

  elif scaling_method == 'normalizer':
    scaler_params['norm'] = trial.suggest_categorical(
        'normalizer_norm', ['l1', 'l2', 'max']
    )

  elif scaling_method == 'log1p':
    # NOTE(review): this value is forwarded to apply_log1p, whose body clips
    # negatives unconditionally, so as written it does not change the result.
    scaler_params['clip_negative'] = trial.suggest_categorical(
        'log1p_clip_negative', [True, False]
    )

  # Repeated K-Fold Cross-Validation
  rkf = RepeatedKFold(
      n_splits=n_splits,
      n_repeats=n_repeats,
      random_state=RANDOM_STATE
  )

  rmse_list = []

  # Update columns_to_scale with scaled names for model training
  scaled_columns = [f"{col}_scaled" for col in columns_to_scale]

  for fold_idx, (train_idx, val_idx) in enumerate(rkf.split(X_train_full)):
    X_train, X_val_fold = (
        X_train_full.iloc[train_idx],
        X_train_full.iloc[val_idx]
    )
    y_train, y_val_fold = y_train_full[train_idx], y_train_full[val_idx]

    # Apply selected scaler; these copies are used unchanged when there are
    # no columns to scale.
    X_train_scaled = X_train.copy()
    X_val_fold_scaled = X_val_fold.copy()

    if columns_to_scale and scaling_method != 'log1p':
      # Initialize scaler for this fold
      if scaling_method == 'standard':
        scaler = StandardScaler()

      elif scaling_method == 'robust':
        scaler = RobustScaler(**scaler_params)

      elif scaling_method == 'minmax':
        scaler = MinMaxScaler(**scaler_params)

      elif scaling_method == 'maxabs':
        scaler = MaxAbsScaler()

      elif scaling_method == 'quantile':
        scaler = QuantileTransformer(**scaler_params, random_state=RANDOM_STATE)

      elif scaling_method == 'normalizer':
        scaler = Normalizer(**scaler_params)

      # Apply scaling and rename columns: the scaler is fitted on the fold's
      # training part only, and the scaled columns are appended after the
      # untouched ones in both frames.
      scaled_data = scaler.fit_transform(X_train[columns_to_scale])
      X_train_scaled = X_train_scaled.drop(columns=columns_to_scale)
      X_train_scaled[scaled_columns] = scaled_data

      scaled_data_val = scaler.transform(X_val_fold[columns_to_scale])
      X_val_fold_scaled = X_val_fold_scaled.drop(columns=columns_to_scale)
      X_val_fold_scaled[scaled_columns] = scaled_data_val

    elif columns_to_scale and scaling_method == 'log1p':
      X_train_scaled = apply_log1p(
          X_train, columns_to_scale, scaler_params.get('clip_negative', False)
      )
      X_val_fold_scaled = apply_log1p(
          X_val_fold, columns_to_scale, scaler_params.get('clip_negative', False)
      )

    # The suggest_* calls below use the same parameter names on every fold;
    # single-option categoricals ('objective', 'sampling_method') only record
    # the constant in the trial's parameters.
    model = xgb.XGBRegressor(
        objective=trial.suggest_categorical('objective', ['reg:squarederror']),
        booster=trial.suggest_categorical('booster', ['gbtree', 'dart']),
        colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.2, 0.8),
        colsample_bynode=trial.suggest_float('colsample_bynode', 0.2, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.2, 0.8),
        gamma=trial.suggest_float('gamma', 1e-8, 30, log=True),
        grow_policy=trial.suggest_categorical(
            'grow_policy',
            ['depthwise', 'lossguide']
        ),
        learning_rate=trial.suggest_float(
            'learning_rate',
            0.001,
            0.2,
            log=True
        ),
        max_bin=trial.suggest_int('max_bin', 128, 512),
        max_depth=trial.suggest_int('max_depth', 2, 6),
        min_child_weight=trial.suggest_float('min_child_weight', 1, 20),
        n_estimators=trial.suggest_int('n_estimators', 100, 3000),
        n_jobs=N_JOBS,
        random_state=RANDOM_STATE,
        sampling_method=trial.suggest_categorical(
            'sampling_method',
            ["uniform"]
        ),
        subsample=trial.suggest_float('subsample', 0.2, 0.8),
        tree_method=trial.suggest_categorical('tree_method', ["auto", "hist"]),
        base_score=trial.suggest_float('base_score', 0.1, 0.9),
        max_leaves=trial.suggest_int('max_leaves', 0, 16),
        num_parallel_tree=trial.suggest_int('num_parallel_tree', 1, 3),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 20, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 20, log=True),
        scale_pos_weight=trial.suggest_float('scale_pos_weight', 0.5, 1.5),
        validate_parameters=True,
        verbosity=0
    )

    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_fold_scaled)
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred))
    rmse_list.append(rmse)

    # Release the fold's data before the next iteration
    del X_train, X_val_fold, y_train, y_val_fold, y_pred, model
    gc.collect()

    # Report the running mean RMSE so the pruner can stop weak trials early
    trial.report(np.mean(rmse_list), step=fold_idx)

    if trial.should_prune():
      raise optuna.TrialPruned()

  return np.mean(rmse_list)

# Start MLflow experiment
mlflow.set_experiment(mlflow_experiment)

# Everything below runs inside a single MLflow run: the Optuna search, the
# logging of its results, and the refit/evaluation with the best trial.
with mlflow.start_run():
  # - Ensure to use the same direction,
  # pruner (min_resource, max_resource, reduction_factor, etc.) if you are
  # loading an existing study
  study = optuna.create_study(
      study_name=study_name,
      storage=storage,
      direction="minimize",
      pruner=optuna.pruners.HyperbandPruner(
          min_resource=max(1, n_splits),
          max_resource=n_splits * n_repeats,
          reduction_factor=2
      ),
      sampler=optuna.samplers.TPESampler(
          n_startup_trials=max(1, int(n_trials * 0.2)),
          seed=RANDOM_STATE,
          multivariate=True
      ),
      load_if_exists=True
  )

  study.optimize(objective, n_trials=n_trials)
  # Helper defined elsewhere in the notebook
  log_optuna_best_trial_search_space(study)

  # Log Optuna storage details
  mlflow.log_param("optuna_storage", storage)
  mlflow.log_param("optuna_study_name", study_name)

  # Log experiment parameters
  mlflow.log_param("n_trials", n_trials)
  mlflow.log_param("n_splits", n_splits)
  mlflow.log_param("n_repeats", n_repeats)

  # Log cols_to_scale
  mlflow.log_param("columns_to_scale", ",".join(columns_to_scale))

  # fANOVA importance of each searched parameter, normalized
  fanova_importances = get_param_importances(
      study,
      evaluator=FanovaImportanceEvaluator(seed=RANDOM_STATE),
      target=(
          lambda t: t.value
          if t.state == optuna.trial.TrialState.COMPLETE
          else None
      ),
      normalize=True
  )

  fanova_importances = {k: round(v, 4) for k, v in fanova_importances.items()}

  for param, importance in fanova_importances.items():
    mlflow.log_param(f"fanova_{param}", importance)

  print(f'Fanova Hyperparameter Importances (rounded): {fanova_importances}')

  # Get best parameters and trial attributes
  best_params = study.best_params
  best_trial = study.best_trial

  for param, value in best_params.items():
    mlflow.log_param(param, value)

  mlflow.log_metric("best_cv_rmse", study.best_value)

  print("Best Params:", best_params)
  print("Best CV RMSE:", study.best_value)

  # Apply best winsorization and transformation to final training and
  # validation sets
  y_train_full = np.log1p(mstats.winsorize(
      df_train[TARGET].to_numpy(dtype=np.float64),
      limits=(0, 1-best_params['winsorize_upper'])
  ))

  # Compute winsorization thresholds from training data
  # Apply fixed thresholds to validation data (the cap is the training
  # percentile at winsorize_upper; the floor is 0)
  y_val = np.log1p(np.clip(
      df_val[TARGET],
      0,
      np.percentile(df_train[TARGET], best_params['winsorize_upper'] * 100)
  ))

  # Apply best scaling method to final training and validation sets; this
  # rebuilds the scaler exactly as objective() does for the chosen method.
  scaling_method = best_params['scaling_method']
  scaler_params = {}

  if scaling_method == 'robust':
    quantile_lower = best_params['robust_quantile_lower']
    scaler_params['quantile_range'] = (quantile_lower, 100 - quantile_lower)
    scaler_params['unit_variance'] = best_params['robust_unit_variance']
    scaler = RobustScaler(**scaler_params)

  elif scaling_method == 'minmax':
    feature_range_min = best_params['minmax_feature_range_min']
    scaler_params['feature_range'] = (feature_range_min, feature_range_min + 1)
    scaler_params['clip'] = best_params['minmax_clip']
    scaler = MinMaxScaler(**scaler_params)

  elif scaling_method == 'quantile':
    scaler_params['n_quantiles'] = best_params['quantile_n_quantiles']

    scaler_params['output_distribution'] = best_params[
        'quantile_output_distribution'
    ]

    scaler_params['subsample'] = best_params['quantile_subsample']
    scaler = QuantileTransformer(**scaler_params, random_state=RANDOM_STATE)

  elif scaling_method == 'normalizer':
    scaler_params['norm'] = best_params['normalizer_norm']
    scaler = Normalizer(**scaler_params)

  elif scaling_method == 'standard':
    scaler = StandardScaler()

  elif scaling_method == 'maxabs':
    scaler = MaxAbsScaler()

  elif scaling_method == 'log1p':
    # No scaler object is created for log1p; apply_log1p is used instead
    scaler_params['clip_negative'] = best_params['log1p_clip_negative']

  X_final = X_train_full.copy()
  X_val_scaled = X_val.copy()

  scaled_columns = [f"{col}_scaled" for col in columns_to_scale]

  if columns_to_scale and scaling_method != 'log1p':
    # Apply scaling and rename columns (fit on train, transform validation)
    scaled_data = scaler.fit_transform(X_train_full[columns_to_scale])
    X_final = X_final.drop(columns=columns_to_scale)
    X_final[scaled_columns] = scaled_data

    scaled_data_validation = scaler.transform(X_val[columns_to_scale])
    X_val_scaled = X_val_scaled.drop(columns=columns_to_scale)
    X_val_scaled[scaled_columns] = scaled_data_validation

  elif columns_to_scale and scaling_method == 'log1p':
    X_final = apply_log1p(
        X_train_full,
        columns_to_scale,
        scaler_params.get('clip_negative', False)
    )

    X_val_scaled = apply_log1p(
        X_val, columns_to_scale, scaler_params.get('clip_negative', False)
    )

  # Train final model with best XGBoost parameters: keep only the best_params
  # keys that are XGBRegressor constructor parameters.
  # NOTE(review): XGBRegressor() is instantiated once per key inside this
  # comprehension.
  filtered_params = {
      k: v for k, v in best_params.items()
      if k in xgb.XGBRegressor().get_params()
  }

  # all the constant hyperparameters should be added here manually and are not
  # part of study.best_params
  final_model = xgb.XGBRegressor(
      **filtered_params,
      n_jobs=N_JOBS,
      random_state=RANDOM_STATE,
      validate_parameters=True,
      verbosity=0
  )

  # Sort columns by name so train and validation share the same column order
  X_final = X_final[sorted(X_final.columns)]
  final_model.fit(X_final, y_train_full)
  X_val_scaled = X_val_scaled[sorted(X_val_scaled.columns)]
  y_pred = final_model.predict(X_val_scaled)

  # Helper defined elsewhere in the notebook
  evaluate_and_log_metrics(y_val, y_pred, prefix="val")

## Scaler Transformer

In [None]:
%%writefile standard_scaler_transformer.py
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class StandardScalerTransformer(BaseEstimator, TransformerMixin):
  """
  Wrap StandardScaler so that transform returns a DataFrame whose columns
  carry a '_scaled' suffix.

  Attributes set by fit:
  - feature_names_ : list of column names seen during fit, or None when the
    transformer was fitted on a numpy array
  - n_features_in_ : number of features seen during fit
  """

  def __init__(self):
    self.scaler = StandardScaler()

  @staticmethod
  def _to_2d_array(X):
    """
    Validate the container type and return the values as a numpy array,
    reshaping a 1D array into a single column.

    Raises:
    ValueError if X is neither a pandas DataFrame nor a numpy array.
    """
    if not isinstance(X, (pd.DataFrame, np.ndarray)):
      raise ValueError("Input must be a pandas DataFrame or numpy array")

    values = X.values if isinstance(X, pd.DataFrame) else np.asarray(X)

    # StandardScaler expects 2D input
    if values.ndim == 1:
      values = values.reshape(-1, 1)

    return values

  def fit(self, X, y=None):
    """
    Fit the scaler to the data.

    Parameters:
    - X : pandas DataFrame or numpy array, shape (n_samples, n_features)
    Training data to fit the scaler
    - y : None
    Ignored. For compatibility with scikit-learn pipeline.

    Returns:
    self : object
    Returns self.

    Raises:
    ValueError if X is neither a pandas DataFrame nor a numpy array.
    """
    values = self._to_2d_array(X)

    # Column names are only available for DataFrame input
    self.feature_names_ = (
        X.columns.tolist() if isinstance(X, pd.DataFrame) else None
    )

    # Store number of features for validation in transform
    self.n_features_in_ = values.shape[1] if values.ndim > 1 else 1

    self.scaler.fit(values)
    return self

  def transform(self, X):
    """
    Transform the data using the fitted scaler and return as DataFrame
    with '_scaled' appended to column names.

    Parameters:
    X : pandas DataFrame or numpy array, shape (n_samples, n_features)
    Data to transform

    Returns:
    X_transformed : pandas DataFrame
    Scaled data. Columns are named '<name>_scaled' using the names seen
    during fit, or 'feature_<i>_scaled' when fitted on a numpy array. The
    index of a DataFrame input is preserved.

    Raises:
    ValueError if the transformer is not fitted, if X has an unsupported
    type, or if its number of features differs from the one seen during fit.
    """
    # Check if fitted
    if not hasattr(self.scaler, 'scale_'):
      raise ValueError(
          "This StandardScalerTransformer instance is not fitted yet. "
          "Call 'fit' first."
      )

    values = self._to_2d_array(X)

    # Validate number of features
    n_features = values.shape[1] if values.ndim > 1 else 1

    if n_features != self.n_features_in_:
      raise ValueError(
          f"X has {n_features} features, but StandardScalerTransformer "
          f"was fitted with {self.n_features_in_} features"
      )

    X_scaled = self.scaler.transform(values)

    # Create new column names with '_scaled' suffix
    if self.feature_names_ is not None:
      scaled_columns = [f"{col}_scaled" for col in self.feature_names_]
    else:
      scaled_columns = [f"feature_{i}_scaled" for i in range(X_scaled.shape[1])]

    # Return as DataFrame with preserved index
    return pd.DataFrame(
        X_scaled,
        columns=scaled_columns,
        index=X.index if isinstance(X, pd.DataFrame) else None
    )

In [None]:
# Copy the module written by the previous cell into PATH (helper defined
# elsewhere in the notebook).
copy_file_to_destination(
    source_path = 'standard_scaler_transformer.py',
    destination_path = PATH / 'standard_scaler_transformer.py'
)

## Save Fitted Model

In [None]:
# Import the class from the module file written above
from standard_scaler_transformer import StandardScalerTransformer

standard_scaler_transformer = StandardScalerTransformer()

# log_transformer_to_mlflow is defined elsewhere in the notebook; presumably
# it fits the transformer on df_train (excluding target_col) and logs it under
# the given experiment/model name - confirm against its definition.
log_transformer_to_mlflow(
    transformer=standard_scaler_transformer,
    experiment_name="Standard_Scaler_Transformer",
    model_name="standard_scaler_transformer",
    df=df_train,
    target_col=TARGET
)

## Transform Train and Validation

In [None]:
# Replace both frames with the output of the notebook helper apply_transformer
# (defined elsewhere), using the scaler transformer created above.
df_train, df_val = apply_transformer(
    df_train,
    df_val,
    standard_scaler_transformer,
    TARGET
)

## Check Feature Distribution Mismatches

In [None]:
# Compare train vs. validation distributions of the skewed columns with
# test_type="Kolmogorov-Smirnov" (helper defined elsewhere in the notebook).
# NOTE(review): StandardScalerTransformer.transform appends '_scaled' to the
# column names - verify that skewed_cols still matches df_train's columns.
check_feature_distribution_mismatches(
    df_train[skewed_cols],
    df_val[skewed_cols],
    test_type="Kolmogorov-Smirnov"
)

# Kurtosis Analysis for Feature Distribution Evaluation

We compute kurtosis (using Fisher's definition) to assess the "tailedness" of numerical features after scaling:

- **Kurtosis ≈ 0:** Normal-like tails (ideal)
- **Kurtosis > 0:** Heavy tails (outliers present)
- **Kurtosis < 0:** Light tails (fewer extremes)

While perfectly normal kurtosis is not required, high values (e.g. >3) may indicate outlier sensitivity that could hurt model performance - especially for algorithms like linear models or SVMs.

This analysis helps **diagnose potential issues** and guides decisions on further transformation (e.g., Box-Cox, Yeo-Johnson) if needed to improve robustness.

**Note:** We do not force kurtosis to zero because real-world data rarely follows perfect distributions. Instead, we use this as a diagnostic tool, not a strict rule.

In [None]:
# Kurtosis of every numeric feature (target excluded), computed column by
# column with scipy.stats.kurtosis using its default arguments.
kurtosis_values = (
    df_train
    .drop(TARGET, axis=1)
    .select_dtypes(include=[np.number])
    .apply(stats.kurtosis)
)

print(kurtosis_values.round(2))

# Interaction Features Based on Correlation and Domain Logic

We create multiplicative interaction terms between key transformed features to capture **potential synergies** between variables that may not be evident in isolation.

Selection criteria:

- **Moderate-to-high correlation:** Suggests meaningful relationships worth exploring.
- **Domain relevance:** Interactions reflect real-world dynamics - e.g. a popular publisher with high average clicks may have a compounding effect on performance.
- **Post-scaling consistency:** All features are already scaled/transformed, ensuring interaction terms are numerically stable and suitable for models sensitive to scale.

These interactions help models learn **nonlinear patterns** without requiring complex algorithms - especially valuable if using linear models or boosting with shallow trees.

**Note:** We avoid exhaustive pairwise combinations to prevent overfitting and maintain interpretability. Only theoretically and statistically justified pairs are included.

In [None]:
# Feature pairs for the interaction terms were selected based on the
# correlation between the two features of each pair.

# Create interaction terms for the selected pairs: each new column is the
# element-wise product of two scaled columns of df_train (training data only).
df_train['pub_popularity_x_pub_clicks'] = (
    df_train['publisher_popularity_transformed_scaled'] *
    df_train['publisher_avg_clicks_transformed_scaled']
)

df_train['market_clicks_x_market_popularity'] = (
    df_train['market_avg_clicks_transformed_scaled'] *
    df_train['market_popularity_transformed_scaled']
)

df_train['pub_clicks_x_pub_encoded'] = (
    df_train['publisher_avg_clicks_transformed_scaled'] *
    df_train['publisher_encoded_scaled']
)

df_train['market_popularity_x_market_id'] = (
    df_train['market_popularity_transformed_scaled'] *
    df_train['market_id_encoded_scaled']
)

df_train['market_clicks_x_market_id'] = (
    df_train['market_avg_clicks_transformed_scaled'] *
    df_train['market_id_encoded_scaled']
)

# Verify the updated DataFrame
print("Shape of updated df_train:", df_train.shape)
print("Columns in updated df_train:", df_train.columns.tolist())

# Polynomial Features

Diagnosing Non-Linearity with **Ramsey's RESET Test**

To assess whether **non-linear relationships** exist between features and the target, we apply Ramsey's RESET test on a baseline linear model.

Why this matters:
- Linear models assume a straight-line relationship but real-world data often violates this.
- RESET test checks if adding powers of predicted values improves fit - a significant result (low p-value) indicates **missed non-linearity**.

This is not about adding polynomial features directly - it is a **diagnostic tool** to validate the need for more complex modeling strategies.

In [None]:
# Baseline linear model for the RESET diagnostics: regress the target on all
# remaining columns of df_train.
X = df_train.drop(TARGET, axis=1)
y = df_train[TARGET]

# Add intercept
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

In [None]:
# Run RESET test
# power=2: tests for quadratic terms
reset_result = linear_reset(model, power=2)

# Significant p-value suggests non-linearity
print(reset_result.summary())

In [None]:
# Run RESET test
# power=3: per the statsmodels documentation an integer power includes the
# powers 2..power, so this tests quadratic and cubic terms
reset_result = linear_reset(model, power=3)

# Significant p-value suggests non-linearity
print(reset_result.summary())

In [None]:
# Run RESET test
# power=4: per the statsmodels documentation an integer power includes the
# powers 2..power, so this tests quadratic, cubic and quartic terms
reset_result = linear_reset(model, power=4)

# Significant p-value suggests non-linearity
print(reset_result.summary())

# Handling Multicollinearity with VIF and Correlation Analysis

To ensure model stability, interpretability, and numerical reliability, we diagnose and address **multicollinearity** using two complementary methods:

1. **Variance Inflation Factor (VIF)**: Quantifies how much feature variance is inflated due to correlation with other features.
   - VIF > 5: Investigate
   - VIF > 10: Strong multicollinearity - action needed

2. **Correlation Matrix Heatmap**: Visualizes pairwise correlations to identify redundant features.

- **Strategy for Feature Removal:**

  When two (or more) features are highly correlated:
  - We compare their **individual correlation with the target**.
  - Retain the feature more strongly associated with the target.
  - Drop the weaker one to reduce redundancy without losing predictive signal.

- **Actions Taken:**
  - Iteratively removed `pub_clicks_x_pub_encoded`, `market_popularity_x_market_id` and `market_clicks_x_market_id` based on high VIF/correlation and lower relevance to target.
  - Re-evaluated VIF and correlation after each removal to ensure progressive improvement.

This **target-aware, iterative approach** balances multicollinearity reduction with preservation of predictive power - critical for robust modeling, especially in regression and inference settings.

In [None]:
# VIF check including all five interaction terms (helper defined elsewhere in
# the notebook; presumably excludes TARGET from the computation - confirm).
calculate_vif(df_train, TARGET)

In [None]:
# Correlation heatmap of df_train; the helper (defined elsewhere) receives
# PATH, presumably as the location where the figure is saved.
plot_and_save_correlation_matrix(df_train, PATH)

In [None]:
# - market_clicks_x_market_id is highly correlated with
# market_clicks_x_market_popularity and pub_clicks_x_pub_encoded. Dropped the
# column pub_clicks_x_pub_encoded as it has the lowest correlation with
# the target column.

# - market_clicks_x_market_popularity is highly correlated with
# market_popularity_x_market_id and market_clicks_x_market_id. Dropped the
# column market_popularity_x_market_id as it has the lowest correlation with
# the target column.

df_train.drop(
    columns=['pub_clicks_x_pub_encoded', 'market_popularity_x_market_id'],
    inplace = True
)

In [None]:
# Re-check VIF after dropping pub_clicks_x_pub_encoded and
# market_popularity_x_market_id.
calculate_vif(df_train, TARGET)

In [None]:
# Re-plot the correlation matrix after the first round of removals.
plot_and_save_correlation_matrix(df_train, PATH)

In [None]:
# - market_clicks_x_market_id is highly correlated with
# market_clicks_x_market_popularity. Dropped the column
# market_clicks_x_market_id as it has the lowest correlation with the
# target column.

# NOTE(review): only market_clicks_x_market_id is dropped here.
# market_clicks_x_market_popularity is kept, and
# market_popularity_x_market_id was already removed by the previous drop.

df_train.drop(
    columns=['market_clicks_x_market_id'],
    inplace = True
)

In [None]:
# Re-check VIF after dropping market_clicks_x_market_id.
calculate_vif(df_train, TARGET)

# Feature Selection: Balancing Predictive Power, Stability and Simplicity

We apply a multi-stage, diagnostic-driven strategy to refine the feature set and improve model generalization:

- **Correlation with Target:**  
We first assess each feature's absolute Pearson correlation with the target (`cpa`) to prioritize those with stronger linear signal.

- **High Pairwise Correlation (Redundancy Removal):**  
Features with correlation > 0.8 are considered redundant. When a pair is highly correlated, we retain the one more strongly linked to the target, minimizing information loss while reducing multicollinearity.

> Removed:  
> - `publisher_popularity_transformed_scaled` (vs. `publisher_avg_clicks`)  
> - `market_avg_clicks_transformed_scaled` (vs. `market_popularity`)

- **Low Variance Check:**  
Features with variance < 0.01 contribute little to model discrimination and can destabilize some algorithms.  
We remove such features to simplify the model without sacrificing predictive power.

- **Final Diagnostics:**  
After filtering, we re-check:
  - **VIF** - Ensure multicollinearity is under control
  - **Correlation matrix** - Confirm no high-correlation pairs remain

This iterative, evidence-based approach ensures our final feature set is:
- Predictive  
- Clean (no redundancy)  
- Robust to overfitting and numerical instability

It reflects a balance between statistical rigor and practical modeling - not just throwing all features at a model, but curating them intentionally.

## Pearson Correlation with Target

In [None]:
# Overview: absolute correlation of every feature with the target (pandas
# corrwith default method), strongest first. The expression is the cell's
# last statement, so the notebook displays the resulting Series.
(
    df_train.drop(TARGET, axis=1)
    .corrwith(df_train[TARGET])
    .abs()
    .sort_values(ascending=False)
)

## Highly Correlated Features

In [None]:
# Check whether any pair of features has a correlation above 0.8
plot_and_save_correlation_matrix(df_train, PATH)

In [None]:
# - publisher_popularity_transformed_scaled and
# publisher_avg_clicks_transformed_scaled have a correlation of
# 0.843127458401863. Removing publisher_popularity_transformed_scaled as it
# has the lower correlation with the target column.

# - market_popularity_transformed_scaled and
# market_avg_clicks_transformed_scaled have a correlation of
# 0.804201534180658. Removing market_avg_clicks_transformed_scaled as it has
# the lower correlation with the target column.

# The same two pairs appear again with the column order reversed; both
# columns to remove have already been selected above, so nothing further is
# dropped for them.

df_train.drop(
    columns=[
        'publisher_popularity_transformed_scaled',
        'market_avg_clicks_transformed_scaled'
    ],
    inplace = True
)

In [None]:
# Check again whether any pair of features still has a correlation above 0.8
plot_and_save_correlation_matrix(df_train, PATH)

## Feature Variance

In [None]:
# Calculate variance for all numeric features, excluding TARGET
variances = df_train.drop(TARGET, axis=1).var(numeric_only=True)

# Sort variances in descending order
sorted_variances = variances.sort_values(ascending=False)

# Print variance for each feature, rounded to 4 decimals
print("Variance of each feature (in descending order):")

for feature, variance in sorted_variances.items():
  print(f"{feature}: {variance:.4f}")

In [None]:
# - Remove features with variance < 0.01
# - In general, features with variance < 0.05 can be dropped if that
# improves the model performance.

threshold = 0.01

X = df_train.drop(TARGET, axis=1)
selector = VarianceThreshold(threshold=threshold)
# The transformed array itself is not used afterwards; the fitted selector's
# support mask identifies the columns to drop.
X_var = selector.fit_transform(X)
dropped_features_var = X.columns[~selector.get_support()].tolist()

if dropped_features_var:
  df_train.drop(columns=dropped_features_var, inplace=True)
  print("Features dropped due to low variance: ", dropped_features_var)
else:
  print("No features dropped due to low variance.")

## Multicollinearity and Correlation (Final Check)

In [None]:
# Check multicollinearity and correlation one last time before finalising
# the features
calculate_vif(df_train, TARGET)

In [None]:
# Final correlation matrix of the selected features.
plot_and_save_correlation_matrix(df_train, PATH)

# Align Train and Validation Data Features

In [None]:
# Compare the columns of the two frames (helper defined elsewhere in the
# notebook). Judging by the variable name, the result presumably lists the
# features present only in the validation frame - confirm.
validation_only = compare_features_train_validation(df_train, df_val)

## Interaction Transformer

In [None]:
%%writefile interaction_transformer.py
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class InteractionTransformer(BaseEstimator, TransformerMixin):
  """
  Add multiplicative interaction columns to a DataFrame and then drop a
  configured set of columns.
  """

  def __init__(self, interaction_groups=None, columns_to_drop=None):
    """
    Initialize the transformer with interaction groups and columns to drop.

    Parameters:
    - interaction_groups : list of lists, or None
      List of lists, each containing two input column names and the desired
      interaction term name
      e.g., [['col1', 'col2', 'col1_x_col2'], ['col3', 'col4', 'col3_x_col4']]
      None means no interaction terms are created.
    - columns_to_drop : list, or None
      List of column names to drop from the DataFrame
      None means no columns are dropped.
    """
    # Parameters are stored exactly as passed. Replacing None with a new list
    # here breaks sklearn.base.clone, which requires get_params() to return
    # the very objects given to the constructor; None is resolved in
    # transform instead.
    self.interaction_groups = interaction_groups
    self.columns_to_drop = columns_to_drop

  def fit(self, X, y=None):
    """
    Fit the transformer (no-op, included for scikit-learn compatibility).
    """
    return self

  def transform(self, X):
    """
    Create interaction terms and drop specified columns.

    Parameters:
    - X : pandas DataFrame
      Input DataFrame; it is not modified.

    Returns:
    pandas DataFrame with interaction terms added and specified columns dropped

    Raises:
    KeyError if an input column of an interaction group is missing from X.
    """
    interaction_groups = (
        [] if self.interaction_groups is None else self.interaction_groups
    )
    requested_drops = (
        [] if self.columns_to_drop is None else self.columns_to_drop
    )

    X_transformed = X.copy()

    # Create interaction terms before dropping, so that a column can feed an
    # interaction and still be removed from the output
    for col1, col2, interaction_name in interaction_groups:
      X_transformed[interaction_name] = (
          X_transformed[col1] * X_transformed[col2]
      )

    # Drop specified columns; names that are not present are ignored
    columns_to_drop = [
        col for col in requested_drops if col in X_transformed.columns
    ]

    if columns_to_drop:
      X_transformed = X_transformed.drop(columns=columns_to_drop)

    return X_transformed

In [None]:
# Copy the module written by the previous cell into PATH (helper defined
# elsewhere in the notebook).
copy_file_to_destination(
    source_path = 'interaction_transformer.py',
    destination_path = PATH / 'interaction_transformer.py'
)

## Save Fitted Model

In [None]:
# Import the class from the module file written above
from interaction_transformer import InteractionTransformer

# Only the two interaction terms that are still present in df_train after the
# multicollinearity clean-up are recreated: [left column, right column, name].
interaction_groups = [
    [
        'publisher_popularity_transformed_scaled',
        'publisher_avg_clicks_transformed_scaled',
        'pub_popularity_x_pub_clicks'
    ],
    [
        'market_avg_clicks_transformed_scaled',
        'market_popularity_transformed_scaled',
        'market_clicks_x_market_popularity'
    ]
]

# Source columns removed from df_train as highly correlated; the transformer
# drops them after the interaction terms have been built.
columns_to_drop = [
    'publisher_popularity_transformed_scaled',
    'market_avg_clicks_transformed_scaled'
]

interaction_transformer = InteractionTransformer(
    interaction_groups = interaction_groups, columns_to_drop = columns_to_drop
)

# NOTE(review): df_train no longer contains the two columns listed in
# columns_to_drop (they were dropped from it earlier). If this helper calls
# transform on df, the interaction step would raise KeyError - verify.
log_transformer_to_mlflow(
    transformer=interaction_transformer,
    experiment_name="Interaction_Transformer",
    model_name="interaction_transformer",
    df=df_train,
    target_col=TARGET
)

## Transform Validation

In [None]:
# Apply the interaction transformer to the validation features only, then
# re-attach the target column (pd.concat along axis=1 aligns on the index).
df_val = pd.concat(
      [
          interaction_transformer.transform(df_val.drop(TARGET, axis=1)),
          df_val[TARGET]
      ],
      axis=1
  )

In [None]:
# Re-run the train/validation feature comparison after transforming df_val;
# the return value is discarded.
_ = compare_features_train_validation(df_train, df_val)

## Column Selector for Test Data during Inference

In [None]:
%%writefile column_selector.py
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

# Custom transformer to select specific columns
class ColumnSelector(BaseEstimator, TransformerMixin):
  """
  Pipeline step that keeps only a configured selection of columns.

  Parameters:
  - columns : column label or list of column labels
    Passed unchanged to X[...] in transform.
  """

  def __init__(self, columns):
    self.columns = columns

  def fit(self, X, y=None):
    """
    Nothing is learned from the data; returns self for pipeline chaining.
    """
    return self

  def transform(self, X):
    """
    Return the configured columns of X, obtained by indexing X with them.
    """
    selected = X[self.columns]
    return selected

In [None]:
# Copy the module written by the previous cell into PATH (helper defined
# elsewhere in the notebook).
copy_file_to_destination(
    source_path = 'column_selector.py',
    destination_path = PATH / 'column_selector.py'
)

# Save Train and Validation Data

In [None]:
# Persist the processed training and validation frames using the notebook
# helper save_object (defined elsewhere).
save_object(obj=df_train, filename='df_train.pkl')
save_object(obj=df_val, filename='df_val.pkl')