# Age Prediction

## 1. Setup

### 1.1. Library Imports

In [None]:
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel as C
from sklearn.gaussian_process.kernels import RationalQuadratic, WhiteKernel
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

### 1.2. Configuration

In [None]:
# Both CSV inputs are resolved relative to ROOT / "data".
ROOT = Path(".")

PATH_TO_DATA = ROOT.joinpath("data")
PATH_TO_X = PATH_TO_DATA.joinpath("X_train.csv")
PATH_TO_Y = PATH_TO_DATA.joinpath("y_train.csv")


##################################################
# Reproducibility
##################################################
# A single seed drives Python's global RNG and a dedicated NumPy RandomState
# that is handed to every estimator taking a `random_state`.
SEED = 4
RS_NUMPY = np.random.RandomState(SEED)
random.seed(SEED)

## 2. Data Preparation

### 2.1. Dataset Creation

In [None]:
##################################################
# Load the data
##################################################
# The first CSV column is used as the row index for both features and target.
X = pd.read_csv(PATH_TO_X, index_col=0)
y = pd.read_csv(PATH_TO_Y, index_col=0)


##################################################
# Sanity checks
##################################################
# Check for non-empty DataFrames.
assert not X.empty, "X is empty."
assert not y.empty, "y is empty."

# Check for matching row numbers.
assert X.shape[0] == y.shape[0], "X and y have different numbers of rows."

# Check for no NaN values in target variable.
assert y.isnull().sum().sum() == 0, "y contains NaN values."

# Check for non-negative target variable.
# Reducing over the whole array keeps this check meaningful even if y holds
# more than one column (a per-column `.all().item()` would raise instead).
assert (y >= 0).to_numpy().all(), "y contains negative values."


# Identify columns that carry no information: a single distinct non-NaN value
# (constant) or no non-NaN value at all (entirely missing). Entirely missing
# columns have no median, so they would still contain NaN after a median fill.
distinct_counts = X.nunique(dropna=True)
constant_columns = distinct_counts[distinct_counts <= 1].index.to_list()

print(f"Constant columns: {constant_columns}")


# Identify duplicated columns (every occurrence after the first one is flagged).
duplicated_columns = X.columns[X.T.duplicated()].to_list()

print(f"Duplicated columns: {duplicated_columns}")


# Drop constant and duplicated columns.
# dict.fromkeys de-duplicates while preserving a deterministic order.
columns_to_drop = list(dict.fromkeys(constant_columns + duplicated_columns))
X = X.drop(columns=columns_to_drop)


##################################################
# Data Split: Train and Test Dataset Creation
##################################################
# 10% of the rows are held out for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=RS_NUMPY
)

### 2.2. Outlier Detection

In [None]:
# Isolation Forest cannot process NaN values directly, so temporary copies of
# both splits are imputed first. The per-feature median of the training split
# is used for both copies, since the median is robust against outliers.
median_estimator = X_train.median(axis=0, skipna=True)

X_train_temp = X_train.fillna(value=median_estimator, axis=0)
X_test_temp = X_test.fillna(value=median_estimator, axis=0)


# Fit the detector on the training split, then reuse it to label the test split.
iso = IsolationForest(random_state=RS_NUMPY, contamination=0.04)

# Predictions are -1 for outliers; everything else is kept.
outliers_pred_train = iso.fit_predict(X_train_temp.to_numpy())
non_outliers_mask_train = np.not_equal(outliers_pred_train, -1)

outliers_pred_test = iso.predict(X_test_temp.to_numpy())
non_outliers_mask_test = np.not_equal(outliers_pred_test, -1)

n_outliers_train = np.count_nonzero(outliers_pred_train == -1)
n_outliers_test = np.count_nonzero(outliers_pred_test == -1)
print(f"Detected outliers by Isolation Forest: {n_outliers_train + n_outliers_test}")


# Keep only the rows that were not flagged, in features and targets alike.
X_train = X_train.loc[non_outliers_mask_train]
y_train = y_train.loc[non_outliers_mask_train]
X_test = X_test.loc[non_outliers_mask_test]
y_test = y_test.loc[non_outliers_mask_test]

### 2.3. Feature Selection

In [None]:
def plot_correlation_coefficients(
    X: pd.DataFrame, y: pd.Series, threshold_corr: float = 0.12
) -> list:
    """Plot absolute feature/target correlations and list weakly correlated features.

    Each column of ``X`` is correlated with ``y`` via ``Series.corr``. The
    absolute values are shown as a bar chart sorted in descending order, and the
    labels of all columns whose absolute correlation does not exceed
    ``threshold_corr`` are returned.

    Args:
        X: Feature table; one correlation is computed per column.
        y: Target values to correlate every feature with.
        threshold_corr: Features with an absolute correlation less than or
            equal to this value are reported for removal. The default of 0.12
            was chosen by analysing the plot.

    Returns:
        Column labels of the weakly correlated features, ordered by descending
        absolute correlation. Columns whose correlation is NaN never satisfy
        the ``<=`` comparison and are therefore not included.
    """
    correlations = X.apply(lambda x: x.corr(y))
    abs_correlations = correlations.abs().sort_values(ascending=False)

    plt.figure(figsize=(15, 10))
    plt.bar(range(len(abs_correlations)), abs_correlations.values)

    # Hide ticks: with one bar per feature the x labels would be unreadable.
    plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
    plt.grid(axis="y", linestyle="--", alpha=0.7)

    plt.xlabel("Features")
    plt.ylabel("Absolute Correlation with Target")
    plt.title("Feature Correlations with Target")
    plt.tight_layout()
    plt.show()

    weakly_correlated = abs_correlations <= threshold_corr
    return abs_correlations[weakly_correlated].index.to_list()


# The weakly correlated features are determined on the training split only;
# the same columns are then removed from both splits so their layouts match.
columns_to_drop = plot_correlation_coefficients(X_train, y_train.squeeze())
print(f"Number of features dropped: {len(columns_to_drop)}")

X_train = X_train.drop(columns_to_drop, axis="columns")
X_test = X_test.drop(columns_to_drop, axis="columns")

### 2.4. Imputation of Missing Values

In [None]:
# Remaining missing values are filled by a k-Nearest Neighbors imputer that
# uses `n_neighbors` neighbours with distance-based weighting.
n_neighbors = 5
imputer = KNNImputer(weights="distance", n_neighbors=n_neighbors)

# Fit on the training split only, then impute both splits with the fitted imputer.
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

### 2.5. Normalization

In [None]:
# Scaling statistics are learned from the training split only and then applied
# unchanged to both splits.
scaler = RobustScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 3. Training Process

In [None]:
# Kernel = constant amplitude * Rational Quadratic + white noise.
# Every hyperparameter bound is "fixed", so the values below are used as given.
signal_kernel = C(10.3**2, constant_value_bounds="fixed") * RationalQuadratic(
    length_scale=7.25,
    length_scale_bounds="fixed",
    alpha=0.83,
    alpha_bounds="fixed",
)
noise_kernel = WhiteKernel(noise_level=1.77e-07, noise_level_bounds="fixed")
kernel = signal_kernel + noise_kernel


gp = GaussianProcessRegressor(
    kernel=kernel,
    alpha=0.01,
    n_restarts_optimizer=0,
    random_state=RS_NUMPY,
)

# Fit the GP model on mean-centred targets; the training mean is kept in
# `target_mean` so it can be added back to the predictions.
target_mean = y_train.mean().item()
centered_targets = y_train.squeeze() - target_mean
gp.fit(X_train, centered_targets)

## 4. Evaluation

In [None]:
# Predict on the held-out split and add the training target mean back, since
# the model was fitted on mean-centred targets.
y_test_pred = target_mean + gp.predict(X_test)

# Coefficient of determination on the held-out split.
score_test = r2_score(y_true=y_test, y_pred=y_test_pred)
print(f"R2 test set: {score_test}")