# Feature Engineering + Optimized XGBoost

The notebook illustrates how to apply feature engineering and feature selection techniques and how to build an XGBoost model with optimized hyper-parameters to predict a pet's "Pawpularity score" based only on the tabular data from a csv file.

This model could be used as a supplement to the image regression model.

In [None]:
import os
import random
from itertools import combinations

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Tabular data file paths
# Both CSV files are read with pd.read_csv further down in the notebook.
TRAIN_DATA_PATH = '../input/petfinder-pawpularity-score/train.csv'
TEST_DATA_PATH = '../input/petfinder-pawpularity-score/test.csv'

In [None]:
# Name of the regression target column in the train data
TARGET_NAME = 'Pawpularity'
# Fraction of the train rows held out for validation (test_size of train_test_split)
VAL_SIZE = 0.15
# Seed shared by set_seed(), train_test_split and the Optuna sampler
SEED = 5
# Value passed as early_stopping_rounds when fitting the XGBoost models
EARLY_ROUNDS = 50

## Functions

In [None]:
def set_seed(seed=42):
    """Seed the random number sources used for reproducibility.

    Seeds NumPy's global generator and Python's ``random`` module, then
    stores the seed as text in the ``PYTHONHASHSEED`` environment variable.
    :param seed: Random seed
    :return: None
    """
    # Same seed for every in-process generator, NumPy first
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


def set_display():
    """Configure the global display options for charts and pd.DataFrames.

    Applies the 'fivethirtyeight' plot style with a 12 x 8 default figure and
    font size 14, lets pandas show every row and column, and formats floats
    with four decimal places.
    """
    # Plots: the style sheet is applied first so the explicit settings win
    plt.style.use('fivethirtyeight')
    plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 14})
    # DataFrames: no truncation of rows or columns, fixed-precision floats
    for option_name in ('display.max_columns', 'display.max_rows'):
        pd.set_option(option_name, None)
    pd.set_option('display.float_format', '{:.4f}'.format)


def get_features(df: pd.DataFrame) -> list:
    """Return the names of the input feature columns of a DataFrame.

    Every column is treated as a feature except the 'Id' column and the
    target column named by TARGET_NAME; column order is preserved.
    :param df: DataFrame containing features, Ids and possibly target values
    :return: List of input features
    """
    non_feature_columns = ('Id', TARGET_NAME)
    return [name for name in df.columns if name not in non_feature_columns]


def add_features(df: pd.DataFrame, feature_names=None) -> pd.DataFrame:
    """Function adds new features to the DataFrame
    by averaging existing features.

    Adds "features_sum" (row sum of the base features divided by their
    count), one column per feature pair and one per feature triplet, each
    holding the arithmetic mean of the combined columns. All new columns are
    attached in a single concatenation instead of being inserted one by one,
    which avoids a heavily fragmented frame (and the pandas
    PerformanceWarning) when hundreds of columns are added.
    The input DataFrame is not modified.
    :param df: Original DataFrame containing all base feature columns
    :param feature_names: Base feature columns to combine. When omitted,
        the variable "features" defined outside the scope of this function
        is used.
    :return: New DataFrame with the original columns followed by the added ones
    :raises ValueError: If the list of base features is empty
    """
    names = list(features if feature_names is None else feature_names)
    if not names:
        raise ValueError('add_features requires at least one base feature')

    # Normalized sum of all original features
    new_columns = {'features_sum': df[names].sum(axis=1) / len(names)}

    # Feature pairs (normalized)
    for first, second in combinations(names, 2):
        new_columns[f'{first}_{second}'] = (df[first] + df[second]) / 2

    # Feature triplets (normalized)
    for first, second, third in combinations(names, 3):
        new_columns[f'{first}_{second}_{third}'] = (
            df[first] + df[second] + df[third]) / 3

    engineered = pd.DataFrame(new_columns)
    # Replace, rather than duplicate, any column that already exists
    # (e.g. when the function is applied to its own output).
    base = df.drop(columns=[name for name in engineered.columns
                            if name in df.columns])
    return pd.concat([base, engineered], axis=1)


def rmse(y_true, y_pred) -> float:
    """Function calculates Root Mean Squared Error
    for predicted and actual values.

    Both inputs are converted to float arrays first, so values are compared
    by position: two pandas Series with different indexes are not aligned
    by label (which would yield NaN), and plain Python lists are accepted.
    :param y_true: Actual values (array-like)
    :param y_pred: Predicted values (array-like or scalar)
    :return: RMSE value
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean(np.square(actual - predicted))))


def objective(trial):
    """Function performs trials of parameter optimization
    for XGBoost model.

    Samples one set of hyper-parameters from the trial, fits an XGBRegressor
    on the train split with early stopping on the validation split and
    scores its predictions for the validation split. Reads "train_x",
    "train_y", "valid_x" and "valid_y" defined outside the scope of this
    function.
    :param trial: optuna trial object
    :return: RMSE score
    """
    # The fitted regressor is published as the global "model", so the one
    # from the most recent trial stays reachable after the function returns.
    global model
    params = {
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'objective': 'reg:squarederror',
        'booster': 'gbtree',
        # "step" is passed by keyword, like in the other suggest_* calls
        'n_estimators': trial.suggest_int('n_estimators', 250, 10_000, step=250),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 100),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 100),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 100, step=5),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.95),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0)
    }

    fit_params = dict(eval_set=[(valid_x, valid_y)], eval_metric='rmse',
                      early_stopping_rounds=EARLY_ROUNDS, verbose=False)

    # Lets Optuna stop an unpromising trial during training, based on the
    # RMSE reported for the single eval_set entry.
    pruning_callback = optuna.integration.XGBoostPruningCallback(
        trial, 'validation_0-rmse')

    model = XGBRegressor(**params)
    model.fit(train_x, train_y, **fit_params, callbacks=[pruning_callback])
    y_pred = model.predict(valid_x)
    val_rmse = rmse(valid_y, y_pred)

    return val_rmse

## Exploratory Data Analysis

In [None]:
# Seed the random number generators and apply the plot / DataFrame display
# settings before any data is loaded or drawn.
set_seed(SEED)
set_display()

In [None]:
# Train data set
# Load the train CSV, report its (rows, columns) shape and preview the first rows.
data_train = pd.read_csv(TRAIN_DATA_PATH)
print(f'Train data shape: {data_train.shape}')
data_train.head()

In [None]:
# Test data set
# Load the test CSV, report its (rows, columns) shape and preview the first rows.
data_test = pd.read_csv(TEST_DATA_PATH)
print(f'Test data shape: {data_test.shape}')
data_test.head()

In [None]:
# Distribution of the target values
# Summary statistics of the target column: range, mean, median and standard deviation.
print(f'Target values: {data_train[TARGET_NAME].min()} - {data_train[TARGET_NAME].max()}\n'
      f'Mean value: {data_train[TARGET_NAME].mean()}\n'
      f'Median value: {data_train[TARGET_NAME].median()}\n'
      f'Standard deviation: {data_train[TARGET_NAME].std()}')

# Histogram with a KDE curve; vertical lines mark the mean (orange) and the median (green).
sns.histplot(data=data_train, x=TARGET_NAME, kde=True)
plt.axvline(data_train[TARGET_NAME].mean(), c='orange', ls='-', lw=3, label='Mean')
plt.axvline(data_train[TARGET_NAME].median(), c='green', ls='-', lw=3, label='Median')
plt.legend()
plt.title('Pawpularity Score')
plt.tight_layout()
plt.show()

Target values are unevenly distributed. The train set contains a small portion of samples describing images of pets that are very popular (score near 100). Most of the pets are scored below the mean "Pawpularity score".

In [None]:
# Binary features correlation with the target.
# Only numeric columns are correlated: DataFrame.corr() raises on non-numeric
# columns (e.g. a string identifier) in pandas >= 2.0 instead of silently
# dropping them, so they are excluded explicitly.
correlation = data_train.select_dtypes(include='number').corr()
ax = sns.heatmap(correlation, center=0, annot=True, cmap='RdBu_r', fmt='0.3f')
# Widen the y-limits by half a cell at both ends. Presumably a workaround for
# matplotlib releases that cut off the first and last heatmap rows --
# TODO confirm it is still needed with the installed version.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.yticks(rotation=0)
plt.title('Correlation Matrix')
plt.show()

# Correlation of every numeric column with the target, sorted ascending
# (most negative first).
correlation[TARGET_NAME].sort_values()

All binary features have very low correlation with the target value. The most negatively correlated feature is "Blur", and the most positively correlated feature is "Group".

Some feature pairs are correlated with each other, for example "Blur" and "Eyes", "Face" and "Eyes", "Info" and "Collage", "Occlusion" and "Human". We will test the hypothesis that by combining all or some of the features we can obtain more meaningful parameters for predicting the "Pawpularity score".

## Feature Engineering

We cross the original features to produce new aggregate parameters, which would demonstrate higher correlation with the target.

We apply several basic approaches:
- Simple sum of all binary features divided by the number of features. Assumption: the higher the aggregated score - the more popular the pet should be.
- Normalized sum of feature pairs and triplets.

XGBoost does not require input features to be normalized, but we keep new features scaled to the range between 0 and 1 in case we fit any other models on this data in the future.

In [None]:
# The engineered features are sums of the originals, so every input has to
# point in the same direction first: columns that correlate negatively with
# the target are inverted (0 <-> 1) in both the train and the test data.
target_correlation = correlation[TARGET_NAME]
neg_features = target_correlation[target_correlation < 0].index.to_list()
data_train[neg_features] = (data_train[neg_features] + 1) % 2
data_test[neg_features] = (data_test[neg_features] + 1) % 2

In [None]:
# List of original input features
# (every column of the train data except 'Id' and the target)
features = get_features(data_train)

# Add new features
# add_features() reads the global "features" set just above, so the same
# base columns are combined for the train and the test data.
# NOTE(review): this cell is not idempotent -- running it a second time derives
# "features" from the already extended train frame.
data_train = add_features(data_train)
data_test = add_features(data_test)

In [None]:
# Check correlation of the new features with the target.
# Only numeric columns are correlated: DataFrame.corr() raises on non-numeric
# columns (e.g. a string identifier) in pandas >= 2.0.
correlation = data_train.select_dtypes(include='number').corr()
# Sorted ascending, so the most positively correlated columns come last.
correlation[TARGET_NAME].sort_values()

From the sorted feature-target correlation matrix we can see that feature engineering in fact produced new parameters that demonstrate higher correlation with the target compared to single features from the original data set.

## Feature Selection

After the feature engineering we have a large set of features. Some of them are more useful than others. We need to discard the less informative features to make the training faster and avoid noise in the data. For this purpose we will train a base XGBoost model and use feature importance to select useful parameters in the train set.

In [None]:
# Updated input features
# (now includes the engineered columns added to the train data)
features = get_features(data_train)

# Split the data into train and validation sets
y = data_train[TARGET_NAME]
x = data_train[features]

# Shuffled hold-out split; VAL_SIZE is the validation fraction and SEED fixes
# the shuffle.
train_x, valid_x, train_y, valid_y = train_test_split(
    x, y, test_size=VAL_SIZE, shuffle=True, random_state=SEED)
print(f'Train data shape: {train_x.shape}\n'
      f'Validation data shape: {valid_x.shape}')

In [None]:
# Train the base model
# All hyper-parameters other than the four set here are left at the library
# defaults; this model is only used to rank the features by importance.
xgb_model = XGBRegressor(tree_method='gpu_hist', predictor='gpu_predictor',
                         objective='reg:squarederror', booster='gbtree')

# The validation split is the evaluation set monitored for early stopping.
# NOTE(review): eval_metric and early_stopping_rounds are passed to fit();
# newer XGBoost releases expect them on the constructor -- confirm against
# the installed version.
xgb_model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
              eval_metric='rmse', early_stopping_rounds=EARLY_ROUNDS)

In [None]:
# Check the feature importance.
# Pair every input feature with the importance the base model assigned to it
# and order the table from least to most important.
importance = pd.DataFrame({
    'features': features,
    'importance': xgb_model.feature_importances_
}).sort_values(by='importance')

# One horizontal bar per feature; bars are positioned by rank, not by name,
# because there are too many features to label.
bar_positions = list(range(len(importance)))
plt.barh(bar_positions, importance['importance'])
plt.title('XGBoost Feature Importance')
plt.show()

A significant number of input features have zero importance to the model. We will filter out insignificant features by applying a threshold of 0.005.

In [None]:
# Select informative features.
# Keep only the rows of the importance table that reach the threshold, then
# redraw the chart with the feature names on the y-axis and save it to
# features.png.
threshold = 0.005
importance = importance[importance['importance'] >= threshold]
plt.figure(figsize=(12, 16))
plt.barh(importance['features'], importance['importance'])
plt.title('XGBoost Feature Importance')
plt.savefig('features.png', dpi=300)
plt.show()

In [None]:
# Split the data using only selected features.
# "features" is narrowed to the columns that passed the importance threshold;
# it is also the column list used for inference on the test data.
features = importance['features'].to_list()
x = data_train[features]

# Same target, validation fraction and seed as the earlier split.
train_x, valid_x, train_y, valid_y = train_test_split(
    x, y, test_size=VAL_SIZE, shuffle=True, random_state=SEED)
print(f'Train data shape: {train_x.shape}\n'
      f'Validation data shape: {valid_x.shape}')

## Optimization of Hyper-Parameters

We will try 200 various combinations of hyper-parameters defined in the objective() function using early stopping to prevent the model from overfitting and pruning callback to stop unpromising trials early and save time.

In [None]:
# Search for optimal parameters
# The TPE sampler is seeded with SEED; the direction is 'minimize' because
# objective() returns the validation RMSE.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=SEED),
    direction='minimize',
    study_name='xgb')

In [None]:
# Run 200 trials; each trial fits one XGBoost model inside objective().
study.optimize(objective, n_trials=200)

In [None]:
# Hyper-parameters of the best trial and the RMSE it achieved.
xgb_params = study.best_params
print('XGBoost best RMSE:', study.best_value)
print('Optimal parameters:')
# One tab-indented "name: value" line per tuned parameter
for key, value in xgb_params.items():
    print(f'\t{key}: {value}')

In [None]:
# Retrain the model with the best parameters
# The four fixed settings are the same ones objective() uses; the tuned
# values are unpacked from xgb_params.
xgb_model = XGBRegressor(tree_method='gpu_hist', predictor='gpu_predictor',
                         objective='reg:squarederror', booster='gbtree',
                         **xgb_params)

# Same train / validation data and early stopping as in the search, but
# without the Optuna pruning callback.
xgb_model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
              eval_metric='rmse', early_stopping_rounds=EARLY_ROUNDS)

# NOTE(review): best_score is presumably the validation RMSE at the best
# early-stopping iteration -- verify against the XGBoost documentation.
print('Validation RMSE:', xgb_model.best_score)

## Inference

In [None]:
# Predict values for the test set.
# Only the selected feature columns are passed to the tuned model; the
# predictions are stored in the test frame under the target name.
data_test[TARGET_NAME] = xgb_model.predict(data_test[features])

# The submission file holds each test Id together with its predicted score.
submission = data_test[['Id', TARGET_NAME]]
submission.to_csv('submission.csv', index=False)
submission.head()