# Imports and Dataset

In [48]:
import numpy as np  # Need numpy 1.23.1
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import time
import json

from scipy.stats import uniform, randint
from shaphypetune import BoostSearch, BoostRFE, BoostRFA, BoostBoruta
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.datasets import make_classification, make_regression
import xgboost as xgb
from xgboost import XGBRegressor
from hyperopt import hp
from hyperopt import Trials
from lightgbm import *
import shap

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the pre-built training table from disk.
# The bare last expression displays (n_rows, n_columns) as the cell output.

train_df = pd.read_csv(filepath_or_buffer='./Data/train_final.csv')
train_df.shape

(201917, 2841)

In [3]:
# Re-express timestamp-like text columns as hours elapsed since a fixed
# reference date, then discard whatever text columns are left over.

ref = datetime(2017, 1, 1)
object_cols = train_df.select_dtypes(include=['O']).columns

# Only object columns whose name mentions 'latest' or 'earliest' are parsed as dates.
for col in (c for c in object_cols if 'latest' in c or 'earliest' in c):
    elapsed = pd.to_datetime(train_df[col]) - ref
    # Seconds -> hours; the float result replaces the original text column.
    train_df[col] = elapsed.dt.total_seconds() / 3600.0

# Any column still typed as object at this point was not converted above: drop it.

leftover_object_cols = train_df.select_dtypes(include=['O']).columns
train_df = train_df.drop(columns=leftover_object_cols)

In [4]:
# Round every float feature to 3 decimal places (the target is left untouched)

feature_cols = [name for name in train_df.columns if name != 'target']

for col in feature_cols:
    if train_df[col].dtype in ['float', 'float32', 'float64']:
        train_df[col] = train_df[col].round(3)

# Shrink 64-bit columns to 32-bit when the cast keeps the column's extremes intact.
# NOTE(review): only the minimum and maximum are compared before/after the cast;
# values in between are not checked.

narrower_dtype = {'float64': 'float32', 'int64': 'int32'}

for col in feature_cols:
    wide = train_df[col]
    target_dtype = narrower_dtype.get(str(wide.dtype))
    if target_dtype is None:
        # Neither float64 nor int64: nothing to shrink.
        continue
    narrowed = wide.astype(target_dtype)
    if (wide.max() == narrowed.max()) and (wide.min() == narrowed.min()):
        train_df[col] = narrowed



In [5]:
# Check memory usage: print the frame summary (row count, column count,
# per-dtype column tally and total memory footprint) after the rounding
# and downcasting steps above.

train_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201917 entries, 0 to 201916
Columns: 2815 entries, feature_1 to subsector_max_spent
dtypes: bool(15), float32(1885), float64(639), int32(276)
memory usage: 2.6 GB


In [34]:
# Split the rows 60/20/20 into train / validation / test and standardise the features.
# 'dev' is the 80% portion that is later split again into train + validation.

X_dev_df, X_test_df, y_dev, y_test = train_test_split(
    train_df.drop(columns=['target']),
    train_df['target'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
# 0.25 of the 80% dev portion == 20% of all rows held out for validation.
X_train_df, X_val_df, y_train, y_val = train_test_split(
    X_dev_df, y_dev, test_size=0.25, random_state=42
)

X_columns = X_train_df.columns

# The scaler is fitted on the training rows only, then applied to every split.
scaler = StandardScaler().fit(X_train_df)
X_train, X_dev, X_val, X_test = (
    scaler.transform(frame) for frame in (X_train_df, X_dev_df, X_val_df, X_test_df)
)

# Replace the unscaled frames with their scaled counterparts (same column labels).
# NOTE(review): the rebuilt frames get a default RangeIndex, while y_* keep the
# row labels inherited from train_df.

X_train_df, X_dev_df, X_val_df, X_test_df = (
    pd.DataFrame(scaled, columns=X_columns) for scaled in (X_train, X_dev, X_val, X_test)
)


# Baseline Model

In [38]:
# Baseline: XGBoost regressor with default hyper-parameters, fitted on all
# scaled dev rows and every feature, scored by RMSE on the held-out test split.

xgb_baseline = XGBRegressor(tree_method='gpu_hist')
xgb_baseline.fit(X_dev, y_dev)

baseline_test_pred = xgb_baseline.predict(X_test)
baseline_test_mse = mean_squared_error(y_test, baseline_test_pred)
xgb_baseline_rmse = np.sqrt(baseline_test_mse)

print(f'XGB RMSE Baseline: {xgb_baseline_rmse}')

XGB RMSE Baseline: 3.7897541823325707


In [44]:
# Registry of feature-selection results. Later cells add one entry per method
# (e.g. 'boruta_shap', 'rfe_shap') holding (feature names, importances) and
# dump the whole dictionary to ./models/features.txt.
# Re-running this cell discards every entry collected so far.

selected_features = {}

# SHAP Feature Selection

#### Boruta

In [None]:
# Boruta feature selection using SHAP-based importances

xgb_boruta_shap = XGBRegressor(
    tree_method='gpu_hist',
    n_estimators=150,
    n_jobs=-1,
    random_state=0,
    verbosity=0,
)

# NOTE(review): train_importance=False presumably makes the importances come
# from the eval_set rather than the training rows -- confirm in the shap-hypetune docs.
model_boruta_shap = BoostBoruta(
    xgb_boruta_shap,
    max_iter=200,
    perc=100,
    importance_type='shap_importances',
    train_importance=False,
)

# Options passed through fit(): the validation split is the eval set, scored with RMSE.
boruta_fit_params = {
    'eval_set': [(X_val, y_val)],
    'early_stopping_rounds': 5,
    'verbose': 0,
    'eval_metric': 'rmse',
}
model_boruta_shap.fit(X_train, y_train, **boruta_fit_params)

In [2]:
# Importance chart for the features kept by Boruta

features = X_columns[model_boruta_shap.support_]
importances = model_boruta_shap.estimator_.feature_importances_

importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values('Importance')

# One colour per bar, drawn from the viridis palette.
bar_colors = sns.color_palette("viridis", len(importance_df['Feature']))

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'], color=bar_colors)
ax.set_xlabel('Importance', fontsize=14)
ax.set_ylabel('Feature', fontsize=14)
ax.set_title('Feature Importance Plot', fontsize=16)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=8)
plt.show()

# Test RMSE of the selector's own estimator (fitted on the train split only)

boruta_test_pred = model_boruta_shap.predict(X_test)
xgb_boruta_shap_train_rmse = np.sqrt(mean_squared_error(y_test, boruta_test_pred))
print(f'XGB RMSE Boruta SHAP (trained on train): {xgb_boruta_shap_train_rmse}')

# Test RMSE of a fresh model refitted on the full dev split, selected features only

xgb_boruta_shap_dev = XGBRegressor(
    tree_method='gpu_hist',
    n_estimators=150,
    n_jobs=-1,
    random_state=0,
    verbosity=0,
)
xgb_boruta_shap_dev.fit(X_dev_df[features], y_dev)

boruta_dev_test_pred = xgb_boruta_shap_dev.predict(X_test_df[features])
xgb_boruta_shap_dev_rmse = np.sqrt(mean_squared_error(y_test, boruta_dev_test_pred))
print(f'XGB RMSE Boruta SHAP (trained on dev): {xgb_boruta_shap_dev_rmse}')

In [None]:
# Save model, feature names and importances

model_boruta_shap.estimator_.save_model("model_boruta_shap.json")

# `features` is a pandas Index, which offers .to_list(); `importances` is the
# NumPy array returned by feature_importances_, which only offers .tolist()
# (calling .to_list() on it raises AttributeError). .tolist() also converts
# the NumPy floats into plain Python floats that json can serialise.
selected_features['boruta_shap'] = (features.to_list(), np.asarray(importances).tolist())

# The whole registry is rewritten on every save, so the file always mirrors
# the current contents of `selected_features`.
with open('./models/features.txt', 'w') as file:
    json.dump(selected_features, file)

#### Recursive Feature Elimination (RFE)

In [None]:
# Feature elimination (BoostRFE) using SHAP-based importances

xgb_rfe_shap = XGBRegressor(
    tree_method='gpu_hist',
    n_estimators=150,
    n_jobs=-1,
    random_state=0,
    verbosity=0,
)

# NOTE(review): step=50 presumably removes 50 features per round until
# min_features_to_select is reached -- confirm in the shap-hypetune docs.
model_rfe_shap = BoostRFE(
    xgb_rfe_shap,
    min_features_to_select=10,
    step=50,
    importance_type='shap_importances',
    train_importance=False,
)

# Options passed through fit(): the validation split is the eval set, scored with RMSE.
rfe_fit_params = {
    'eval_set': [(X_val, y_val)],
    'early_stopping_rounds': 5,
    'verbose': 0,
    'eval_metric': 'rmse',
}
model_rfe_shap.fit(X_train, y_train, **rfe_fit_params)

In [None]:
# Importance chart for the features kept by RFE

features = X_columns[model_rfe_shap.support_]
importances = model_rfe_shap.estimator_.feature_importances_

importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values('Importance')

# One colour per bar, drawn from the viridis palette.
bar_colors = sns.color_palette("viridis", len(importance_df['Feature']))

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'], color=bar_colors)
ax.set_xlabel('Importance', fontsize=14)
ax.set_ylabel('Feature', fontsize=14)
ax.set_title('Feature Importance Plot', fontsize=16)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=8)
plt.show()

# Test RMSE of the selector's own estimator (fitted on the train split only)

rfe_test_pred = model_rfe_shap.predict(X_test)
xgb_rfe_shap_train_rmse = np.sqrt(mean_squared_error(y_test, rfe_test_pred))
print(f'XGB RMSE RFE SHAP (trained on train): {xgb_rfe_shap_train_rmse}')

# Test RMSE of a fresh model refitted on the full dev split, selected features only

xgb_rfe_shap_dev = XGBRegressor(
    tree_method='gpu_hist',
    n_estimators=150,
    n_jobs=-1,
    random_state=0,
    verbosity=0,
)
xgb_rfe_shap_dev.fit(X_dev_df[features], y_dev)

rfe_dev_test_pred = xgb_rfe_shap_dev.predict(X_test_df[features])
xgb_rfe_shap_dev_rmse = np.sqrt(mean_squared_error(y_test, rfe_dev_test_pred))
print(f'XGB RMSE RFE SHAP (trained on dev): {xgb_rfe_shap_dev_rmse}')

In [1]:
# Save model, feature names and importances

model_rfe_shap.estimator_.save_model("model_rfe_shap.json")

# `features` is a pandas Index, which offers .to_list(); `importances` is the
# NumPy array returned by feature_importances_, which only offers .tolist()
# (calling .to_list() on it raises AttributeError). .tolist() also converts
# the NumPy floats into plain Python floats that json can serialise.
selected_features['rfe_shap'] = (features.to_list(), np.asarray(importances).tolist())

# The whole registry is rewritten on every save, so the file always mirrors
# the current contents of `selected_features`.
with open('./models/features.txt', 'w') as file:
    json.dump(selected_features, file)

#### Recursive Feature Addition (RFA)