# QRT ENS Challenge Data 2023 - Benchmark

Version 1 - Boosting, Feature engeneering & XGBoost 

## Librairies

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from joblib import dump
from tqdm import tqdm

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import adfuller

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

import warnings
warnings.filterwarnings('ignore')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Chargement des données

- `X_train` et `X_test` ont  $35$ colonnes qui représentent les même variables explicatives mais sur des périodes de temps différentes. 

- `X_train` et `Y_train` partagent la même colonne `ID` - chaque ligne a un ID unique associé à un jour et à un pays. 

- La variable cible `TARGET` de `Y_train` correspond à la variation de prix journalière des futures sur l'électricité (maturité 24h).

- **On notera que certaines colonnes ont des valeurs manquantes**.


In [2]:
# After downloading the X_train/X_test/Y_train .csv files in your working directory:

X_train = pd.read_csv('../data/raw/X_train.csv')
Y_train = pd.read_csv('../data/raw/y_train.csv')
X_test = pd.read_csv('../data/raw/X_test_final.csv')

In [3]:
X_train.head()

Unnamed: 0,ID,DAY_ID,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,DE_FR_EXCHANGE,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_NET_IMPORT,...,FR_RESIDUAL_LOAD,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET
0,1054,206,FR,0.210099,-0.427458,-0.606523,0.606523,,0.69286,,...,-0.444661,-0.17268,-0.556356,-0.790823,-0.28316,-1.06907,-0.063404,0.339041,0.124552,-0.002445
1,2049,501,FR,-0.022399,-1.003452,-0.022063,0.022063,-0.57352,-1.130838,0.57352,...,-1.183194,-1.2403,-0.770457,1.522331,0.828412,0.437419,1.831241,-0.659091,0.047114,-0.490365
2,1924,687,FR,1.395035,1.978665,1.021305,-1.021305,-0.622021,-1.682587,0.622021,...,1.947273,-0.4807,-0.313338,0.431134,0.487608,0.684884,0.114836,0.535974,0.743338,0.204952
3,297,720,DE,-0.983324,-0.849198,-0.839586,0.839586,-0.27087,0.56323,0.27087,...,-0.976974,-1.114838,-0.50757,-0.499409,-0.236249,0.350938,-0.417514,0.911652,-0.296168,1.073948
4,1101,818,FR,0.143807,-0.617038,-0.92499,0.92499,,0.990324,,...,-0.526267,-0.541465,-0.42455,-1.088158,-1.01156,0.614338,0.729495,0.245109,1.526606,2.614378


In [4]:
Y_train.head()

Unnamed: 0,ID,TARGET
0,1054,0.028313
1,2049,-0.112516
2,1924,-0.18084
3,297,-0.260356
4,1101,-0.071733


In [5]:
X_train["TARGET"] = Y_train["TARGET"]

## Feature engineering
The main goal here is to reconstruct some of the lost time dimension to create stationary features.

In general we had added statistics, technical indicators, seasonality, clusters and bag of features.

In [6]:
def slope(y):
    return np.polyfit(range(len(y)), y, 1)[0] if len(y) > 0 else np.nan

In [7]:
def add_rolling_statistics(df, variables, windows, countries=['DE_', 'FR_']):
    # Define a slope calculation function
    def slope(y):
        return np.polyfit(range(len(y)), y, 1)[0] if len(y) > 0 else np.nan

    # Calculate rolling statistics for each variable and window
    for var in variables:
        for window in windows:
            for country in countries:
                df[f'{country}{var}_MEAN_{window}D'] = df[f'{country}{var}'].rolling(window=window).mean()
                df[f'{country}{var}_STD_{window}D'] = df[f'{country}{var}'].rolling(window=window).std()
                df[f'{country}{var}_MEDIAN_{window}D'] = df[f'{country}{var}'].rolling(window=window).median()
                df[f'{country}{var}_MIN_{window}D'] = df[f'{country}{var}'].rolling(window=window).min()
                df[f'{country}{var}_MAX_{window}D'] = df[f'{country}{var}'].rolling(window=window).max()
                # Apply the slope function to the rolling window
                df[f'{country}{var}_SLOPE_{window}D'] = df[f'{country}{var}'].rolling(window=window).apply(slope, raw=True)
    return df

In [8]:
def add_seasonality_features(df):
    days_in_year = 365.25
    df['SIN_YEAR'] = np.sin(2 * np.pi * df['DAY_ID'] / days_in_year)
    df['COS_YEAR'] = np.cos(2 * np.pi * df['DAY_ID'] / days_in_year)
    
    days_in_week = 7
    df['SIN_WEEK'] = np.sin(2 * np.pi * df['DAY_ID'] / days_in_week)
    df['COS_WEEK'] = np.cos(2 * np.pi * df['DAY_ID'] / days_in_week)

    df['SEASON'] = pd.cut(df['DAY_ID'] % 365, bins=[0, 79, 172, 264, 365], labels=[0, 1, 2, 3], right=False).astype(int)
    return df

In [9]:
def add_energy_source_ratios_and_effects(df):
    for country in ['DE_', 'FR_']:
        for energy_source in ['GAS', 'COAL', 'HYDRO', 'NUCLEAR', 'SOLAR', 'WINDPOW']:
            total_energy = df[f'{country}GAS'] + df[f'{country}COAL'] + df[f'{country}HYDRO'] + \
                           df[f'{country}NUCLEAR'] + df[f'{country}SOLAR'] + df[f'{country}WINDPOW']
            df[f'{country}{energy_source}_RATIO'] = df[f'{country}{energy_source}'] / total_energy

        df[f'{country}WIND_SOLAR'] = df[f'{country}WINDPOW'] + df[f'{country}SOLAR']
        df[f'{country}TEMP_EFFECT'] = df[f'{country}TEMP'] * df[f'{country}CONSUMPTION']
        df[f'{country}WIND_EFFECT'] = df[f'{country}WIND'] * df[f'{country}WINDPOW']
        df[f'{country}SOLAR_EFFECT'] = (df[f'{country}SOLAR'] / df[f'{country}TEMP']).replace([np.inf, -np.inf], np.nan).fillna(0)
    
    return df

In [10]:
def add_market_features(df):
    for commodity in ['GAS_RET', 'COAL_RET', 'CARBON_RET']:
        df[f'{commodity}_VOLATILITY_7D'] = df[commodity].rolling(window=7).std()
        df[f'{commodity}_VOLATILITY_30D'] = df[commodity].rolling(window=30).std()
        df[f'{commodity}_EMA_30D'] = df[commodity].ewm(span=30, adjust=False).mean()
    
    return df

In [11]:
def add_custom_features(df):
    # Temporal window and variables for rolling statistics
    windows = [7, 30]
    variables = ['CONSUMPTION', 'GAS', 'COAL', 'HYDRO', 'NUCLEAR', 'SOLAR', 'WINDPOW', 'TEMP', 'RAIN', 'WIND']

    # Calculate rolling statistics and other features
    df = add_rolling_statistics(df, variables, windows)

    # Seasonality Features
    df = add_seasonality_features(df)

    # Energy Source Ratios and Effects
    df = add_energy_source_ratios_and_effects(df)

    # Market Volatility and Moving Averages
    df = add_market_features(df)

    # Ensure all missing data are filled if any new were created
    df.fillna(method='bfill', inplace=True)

    return df

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel

def feature_engineering(X):
    # Copy the original dataframe to avoid modifying it directly
    df = X.copy()
    
    # Define the country codes
    country_codes = {'DE': 0, 'FR': 1}
    
    # Map country codes to numerical values
    df['COUNTRY'] = df['COUNTRY'].map(country_codes)

    imputer = SimpleImputer(strategy='median')
    df[df.columns] = imputer.fit_transform(df)

    df = add_custom_features(df)
    
    return df

#### *Apply the feature engineering to the df*

In [13]:
df = feature_engineering(X_train)

In [14]:
def select_features_based_on_correlation(dataframe, target_column, multicollinear_threshold=0.7, correlation_threshold=0.05):
    # Calculate the Spearman correlation matrix
    corr_matrix = dataframe.corr(method='spearman')
    
    # Identify features that are highly correlated with each other
    # (excluding the target variable correlation)
    high_corr_var = np.where(corr_matrix > multicollinear_threshold)
    high_corr_var = [(corr_matrix.index[x], corr_matrix.columns[y]) 
                     for x, y in zip(*high_corr_var) 
                     if x != y and x < y]
    
    # Extract the names of columns to drop based on multicollinearity
    multicollinear_features = set([item for sublist in high_corr_var for item in sublist])
    
    # Identify features that have a low correlation with the target variable
    low_corr_with_target = corr_matrix[target_column][abs(corr_matrix[target_column]) < correlation_threshold].index.tolist()
    
    # Combine features to drop due to multicollinearity and low correlation with target
    features_to_drop = multicollinear_features.union(low_corr_with_target)
    
    # Determine the final list of features to keep
    features_to_keep = [feature for feature in dataframe.columns if feature not in features_to_drop and feature != target_column]
    
    return features_to_keep

In [15]:
def select_top_features(features, max_features):
    # Sélectionnez les max_features les plus importants si le nombre de caractéristiques est plus grand que max_features
    if len(features) > max_features:
        return features[:max_features]
    else:
        return features

selected_features = []

# Supposons que select_features_based_on_correlation renvoie des caractéristiques triées par leur importance
refined_features = select_features_based_on_correlation(df, 'TARGET', 0.7, 0.05)
# Sélectionnez uniquement les 15 caractéristiques les plus importantes
top_features = select_top_features(refined_features, 30)
df_reduced = df[top_features + ['TARGET']]
print(f"Selected features for df: {len(df_reduced.columns)}")


Selected features for df: 31


In [16]:
print(top_features)

['DE_FR_EXCHANGE', 'FR_DE_EXCHANGE', 'DE_NET_IMPORT', 'DE_GAS', 'DE_HYDRO', 'FR_HYDRO', 'FR_WINDPOW', 'DE_RESIDUAL_LOAD', 'FR_RAIN', 'GAS_RET', 'CARBON_RET', 'DE_CONSUMPTION_MAX_7D', 'DE_GAS_MIN_30D', 'DE_COAL_SLOPE_7D', 'DE_HYDRO_MIN_7D', 'DE_HYDRO_SLOPE_7D', 'DE_HYDRO_SLOPE_30D', 'FR_HYDRO_MIN_30D', 'DE_WINDPOW_SLOPE_7D', 'FR_WINDPOW_SLOPE_7D', 'DE_WINDPOW_SLOPE_30D', 'FR_WINDPOW_SLOPE_30D', 'DE_RAIN_MIN_7D', 'DE_RAIN_SLOPE_30D', 'DE_HYDRO_RATIO', 'DE_WINDPOW_RATIO', 'DE_WIND_SOLAR', 'DE_SOLAR_EFFECT', 'FR_WIND_SOLAR', 'GAS_RET_VOLATILITY_7D']


## Model and training

#### Use of GridSearch to find optimal hyperparameters

In [17]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from scipy.stats import spearmanr
import numpy as np
import itertools

def train_and_evaluate_catboost(X, y):
    # Define the parameter grid to search over
    param_grid = {
        'depth': [4, 6],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [200],
        'l2_leaf_reg': [3, 7]
    }

    # Determine all combinations
    keys, values = zip(*param_grid.items())
    combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

    # Set up K-Fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    best_score = -np.inf
    best_params = None

    # Iterate over combinations
    for params in combinations:
        scores = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]

            model = CatBoostRegressor(silent=True, **params)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            
            score = spearmanr(y_test, predictions).correlation
            scores.append(score)

        mean_score = np.mean(scores)
        if mean_score > best_score:
            best_score = mean_score
            best_params = params

        print(f"Params: {params}, Mean Spearman Correlation: {mean_score}")

    print(f"Best Params: {best_params}, Best Mean Spearman Correlation: {best_score}")
    return best_params

In [19]:
# Assuming 'TARGET' is the name of your target variable in each dataset
print(f"Training CatBoost model...")
X = df_reduced.drop('TARGET', axis=1)
y = df_reduced['TARGET'].rank()
best_params = train_and_evaluate_catboost(X, y)

Training CatBoost model...
Params: {'depth': 4, 'learning_rate': 0.01, 'iterations': 200, 'l2_leaf_reg': 3}, Mean Spearman Correlation: 0.23960420048054543
Params: {'depth': 4, 'learning_rate': 0.01, 'iterations': 200, 'l2_leaf_reg': 7}, Mean Spearman Correlation: 0.24312894416374314
Params: {'depth': 4, 'learning_rate': 0.05, 'iterations': 200, 'l2_leaf_reg': 3}, Mean Spearman Correlation: 0.22590630507205808
Params: {'depth': 4, 'learning_rate': 0.05, 'iterations': 200, 'l2_leaf_reg': 7}, Mean Spearman Correlation: 0.23961778842645948
Params: {'depth': 4, 'learning_rate': 0.1, 'iterations': 200, 'l2_leaf_reg': 3}, Mean Spearman Correlation: 0.19153053767545994
Params: {'depth': 4, 'learning_rate': 0.1, 'iterations': 200, 'l2_leaf_reg': 7}, Mean Spearman Correlation: 0.19032374269872535
Params: {'depth': 6, 'learning_rate': 0.01, 'iterations': 200, 'l2_leaf_reg': 3}, Mean Spearman Correlation: 0.248288950880117
Params: {'depth': 6, 'learning_rate': 0.01, 'iterations': 200, 'l2_leaf_re

In [20]:
# Retrain with best params on whole dataset
final_model = CatBoostRegressor(silent=True, **best_params)
final_model.fit(X, y)

<catboost.core.CatBoostRegressor at 0x296d0147890>

## Prepare submission

In [21]:
X_test_final = pd.read_csv('../data/raw/X_test_final.csv')
df_test = feature_engineering(X_test_final)

In [22]:
df_test_reduced = df_test[top_features]

In [23]:
predictions = final_model.predict(df_test_reduced)

In [24]:
Y_test_submission = X_test_final[['ID']].copy()
Y_test_submission["TARGET"] = predictions


In [27]:
Y_test_submission.to_csv('submission_catboost.csv', index=False)