In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr

In [None]:
# Load data
# Raw frames: X_train and X_test get engineered feature columns added in place
# further down the notebook.
X_train = pd.read_csv('X_train.csv')
Y_train = pd.read_csv('Y_train.csv')
X_test = pd.read_csv('X_test.csv')

# Independent working copies used to build the per-country modelling frames.
# Copying the frames already in memory avoids parsing each CSV a second time,
# while still keeping later in-place edits of X_train out of these copies.
X_train_clean = X_train.copy()
Y_train_clean = Y_train.copy()


In [None]:
# Build one feature frame and one target frame per country.
# Feature rows are picked by their COUNTRY value; target rows are the Y rows
# whose ID occurs in the matching feature frame.
# NOTE(review): the target frames keep Y's own row order, so pairing them with the
# feature frames by position assumes both CSVs list IDs in the same order — confirm.
X_train_fr = X_train_clean.loc[X_train_clean['COUNTRY'].eq('FR')]
X_train_de = X_train_clean.loc[X_train_clean['COUNTRY'].eq('DE')]
Y_train_fr = Y_train_clean.loc[Y_train_clean["ID"].isin(X_train_fr["ID"])]
Y_train_de = Y_train_clean.loc[Y_train_clean["ID"].isin(X_train_de["ID"])]


In [None]:

# Remove the COUNTRY, DAY_ID and ID columns so that only model inputs remain in
# the per-country feature frames.
# Note: this cell rebinds the frames, so running it twice raises a KeyError
# because the columns are already gone.
X_train_fr = X_train_fr.drop(columns=['COUNTRY', 'DAY_ID', 'ID'])
X_train_de = X_train_de.drop(columns=['COUNTRY', 'DAY_ID', 'ID'])


In [None]:
# Fill NaN values with the per-column median of the same country's frame.
# The frames are reassigned rather than calling `fillna(..., inplace=True)` on a
# selected column: that chained in-place form raises a FutureWarning in pandas 2.x
# and, with copy-on-write enabled (the pandas 3 default), leaves the frame unchanged.
X_train_fr = X_train_fr.fillna(X_train_fr.median())
X_train_de = X_train_de.fillna(X_train_de.median())

In [None]:
# Hold out 20% of each country's rows for validation (fixed seed for repeatability).
# Only the TARGET column of the target frames is used as the label.
(
    X_train_fr_split,
    X_val_fr_split,
    Y_train_fr_split,
    Y_val_fr_split,
) = train_test_split(
    X_train_fr,
    Y_train_fr['TARGET'],
    test_size=0.2,
    random_state=42,
)
(
    X_train_de_split,
    X_val_de_split,
    Y_train_de_split,
    Y_val_de_split,
) = train_test_split(
    X_train_de,
    Y_train_de['TARGET'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feature engineering, applied identically to the raw training and test frames.
# NOTE(review): the new columns are added to X_train / X_test only; the per-country
# training frames were derived from X_train_clean and do not receive them.
for frame in (X_train, X_test):
    # 1. Date-related features derived from DAY_ID.
    # NOTE(review): if DAY_ID holds integer day counters rather than dates,
    # pd.to_datetime will read them as offsets from the Unix epoch — confirm.
    frame['DAY_ID'] = pd.to_datetime(frame['DAY_ID'])
    frame['day_of_week'] = frame['DAY_ID'].dt.dayofweek
    frame['month'] = frame['DAY_ID'].dt.month

    # 2. Interaction feature: product of the DE_GAS and DE_COAL columns.
    frame['gas_coal_interaction'] = frame['DE_GAS'] * frame['DE_COAL']

    # 3. Rolling average of DE_TEMP over three consecutive rows, in the frame's
    # current row order; the first two rows have no full window and come out NaN.
    frame['rolling_temp_mean'] = frame['DE_TEMP'].rolling(window=3).mean()

In [None]:
# Base estimator shared by the plain fits and by the grid searches.
rf_model = RandomForestRegressor(random_state=42)

# Initialize k-fold cross-validation (5 shuffled folds, fixed seed).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Search space for GridSearchCV. Every key must be a RandomForestRegressor
# constructor argument, otherwise the search fails with "Invalid parameter".
# The boosting-specific `learning_rate` has no random-forest counterpart and is
# omitted; `min_samples_leaf` takes the place of `min_child_weight` as the
# leaf-size regulariser.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
    'min_samples_leaf': [1, 3, 5],
}

In [None]:
# Random forest for Germany: fit on the training split, then score both splits.
rf_model.fit(X_train_de_split, Y_train_de_split)

output_train_rf_de = rf_model.predict(X_train_de_split)
output_val_rf_de = rf_model.predict(X_val_de_split)

# Spearman rank correlation between predictions and targets.
# The `xgb` in these variable names is kept so existing references keep working;
# the model being scored is the random forest fitted above.
correlation_train_xgb_de = spearmanr(output_train_rf_de, Y_train_de_split).correlation
correlation_val_xgb_de = spearmanr(output_val_rf_de, Y_val_de_split).correlation

# Report under the model that was actually trained (previously labelled XGBoost).
print(f"Spearman's correlation for Random Forest on training set (Germany): {correlation_train_xgb_de}")
print(f"Spearman's correlation for Random Forest on validation set (Germany): {correlation_val_xgb_de}")

In [None]:
# Hyperparameter tuning for Germany using GridSearchCV (5-fold, scored by negative MSE).
# The label is the TARGET column only: Y_train_de also carries an ID column, and
# passing the whole frame would make the forest learn ID as a second output.
grid_search_de = GridSearchCV(rf_model, param_grid, cv=kf, scoring='neg_mean_squared_error')
grid_search_de.fit(X_train_de, Y_train_de['TARGET'])

# Best hyperparameters for Germany
best_params_de = grid_search_de.best_params_
best_rf_model_de = grid_search_de.best_estimator_

# Model evaluation on the Germany training set (in-sample, so optimistic).
output_de = best_rf_model_de.predict(X_train_de)

# Comparing 1-D predictions with the 1-D TARGET series yields a scalar
# correlation, which the `{:.2f}` format below requires.
spearman_de = spearmanr(output_de, Y_train_de['TARGET']).correlation

print('Spearman correlation for tuned Random Forest on training set Germany: {:.2f}'.format(spearman_de))

In [None]:
# Random forest for France: refit the shared estimator on the French training
# split, then score both splits.
rf_model.fit(X_train_fr_split, Y_train_fr_split)

output_train_rf_fr = rf_model.predict(X_train_fr_split)
output_val_rf_fr = rf_model.predict(X_val_fr_split)

# Spearman rank correlation between predictions and targets.
# The `xgb` in these variable names is kept so existing references keep working;
# the model being scored is the random forest fitted above.
correlation_train_xgb_fr = spearmanr(output_train_rf_fr, Y_train_fr_split).correlation
correlation_val_xgb_fr = spearmanr(output_val_rf_fr, Y_val_fr_split).correlation

# Report under the model that was actually trained (previously labelled XGBoost).
print(f"Spearman's correlation for Random Forest on training set (France): {correlation_train_xgb_fr}")
print(f"Spearman's correlation for Random Forest on validation set (France): {correlation_val_xgb_fr}")

In [None]:
# Hyperparameter tuning for France using GridSearchCV (5-fold, scored by negative MSE).
# The label is the TARGET column only: Y_train_fr also carries an ID column, and
# passing the whole frame would make the forest learn ID as a second output.
grid_search_fr = GridSearchCV(rf_model, param_grid, cv=kf, scoring='neg_mean_squared_error')
grid_search_fr.fit(X_train_fr, Y_train_fr['TARGET'])

# Best hyperparameters for France
best_params_fr = grid_search_fr.best_params_
best_rf_model_fr = grid_search_fr.best_estimator_

# Model evaluation on the France training set (in-sample, so optimistic).
output_fr = best_rf_model_fr.predict(X_train_fr)

# Comparing 1-D predictions with the 1-D TARGET series yields a scalar
# correlation, which the `{:.2f}` format below requires.
spearman_fr = spearmanr(output_fr, Y_train_fr['TARGET']).correlation

# Print the French score (the German value was reported here before).
print('Spearman correlation for tuned Random Forest on training set France: {:.2f}'.format(spearman_fr))

In [None]:
# Final predictions for the test set, one tuned model per country.
# For each country:
#   * ID is captured from the country's rows before the feature columns are
#     selected, so it is still available for the submission frame;
#   * the feature frame is restricted to the columns (and column order) of the
#     training frame, so extra columns present on X_test are not passed to a
#     model that never saw them;
#   * NaNs are filled with the training-frame medians, mirroring the imputation
#     applied to the training data.
test_rows_de = X_test[X_test['COUNTRY'] == 'DE']
Y_test_submission_de = test_rows_de[['ID']].copy()
X_test_de = test_rows_de[X_train_de.columns].fillna(X_train_de.median())
Y_test_submission_de['TARGET'] = best_rf_model_de.predict(X_test_de)

test_rows_fr = X_test[X_test['COUNTRY'] == 'FR']
Y_test_submission_fr = test_rows_fr[['ID']].copy()
X_test_fr = test_rows_fr[X_train_fr.columns].fillna(X_train_fr.median())
Y_test_submission_fr['TARGET'] = best_rf_model_fr.predict(X_test_fr)

# Combine both countries' predictions for the final submission file
# (the German frame was concatenated with itself before, dropping France).
Y_test_submission = pd.concat([Y_test_submission_de, Y_test_submission_fr], axis=0)