In [33]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)


#data:
train_a = pd.read_parquet('A/train_targets.parquet')
train_b = pd.read_parquet('B/train_targets.parquet')
train_c = pd.read_parquet('C/train_targets.parquet')

X_train_estimated_a = pd.read_parquet('A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('C/X_train_estimated.parquet')

X_train_observed_a = pd.read_parquet('A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('C/X_train_observed.parquet')

X_test_estimated_a = pd.read_parquet('A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('C/X_test_estimated.parquet')


def preprocess_data(targets, observed, estimated):
    """
    Preprocess the data by resampling, merging with targets, and dropping unnecessary columns.
    
    Parameters:
    - targets: Target dataframe with 'time' and target values.
    - observed: Dataframe with observed features.
    - estimated: Dataframe with estimated features.
    
    Returns:
    - A preprocessed dataframe ready for training.
    """

    # Ensure the datetime columns are in datetime format
    targets['time'] = pd.to_datetime(targets['time'])
    observed['date_forecast'] = pd.to_datetime(observed['date_forecast'])
    estimated['date_forecast'] = pd.to_datetime(estimated['date_forecast'])
    estimated['date_calc'] = pd.to_datetime(estimated['date_calc'])
    
    # Resample observed and estimated data to 1 hour using mean() as aggregator
    observed_resampled = observed.set_index('date_forecast').resample('1H').mean().reset_index()
    estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().reset_index()
    
    # Merge the observed and estimated data
    weather_data = pd.concat([observed_resampled, estimated_resampled])
    
    # Merge with target values
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')

    categorical_features = merged_data.columns[3:].tolist()

    
    return merged_data, categorical_features


X_train_a, cat_features_a = preprocess_data(train_a, X_train_observed_a, X_train_estimated_a)
X_train_b, cat_features_b = preprocess_data(train_b, X_train_observed_b, X_train_estimated_b)
X_train_c, cat_features_c = preprocess_data(train_c, X_train_observed_c, X_train_estimated_c)

#print(X_train_a)
print(train_a)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_a, train_a['pv_measurement'].values, test_size=0.2, random_state=42)

# Initialize CatBoostRegressor model
model = CatBoostRegressor(loss_function='MAE', learning_rate=0.1, verbose=200)

# Train model
model.fit(X_train, y_train)

# Validate model
predictions = model.predict(X_val)
mae = mean_absolute_error(y_val, predictions)

print(f"Mean Absolute Error on validation set: {mae:.2f}")


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().reset_index()
  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().reset_index()


                     time  pv_measurement
0     2019-06-02 22:00:00            0.00
1     2019-06-02 23:00:00            0.00
2     2019-06-03 00:00:00            0.00
3     2019-06-03 01:00:00            0.00
4     2019-06-03 02:00:00           19.36
...                   ...             ...
34080 2023-04-30 19:00:00            9.02
34081 2023-04-30 20:00:00            0.00
34082 2023-04-30 21:00:00            0.00
34083 2023-04-30 22:00:00            0.00
34084 2023-04-30 23:00:00            0.00

[34085 rows x 2 columns]
Fitting 3 folds for each of 9 candidates, totalling 27 fits


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().reset_index()


[CV] END ..........depth=6, l2_leaf_reg=1, learning_rate=0.3; total time=  23.1s
[CV] END ..........depth=6, l2_leaf_reg=1, learning_rate=0.1; total time=  23.1s
[CV] END ..........depth=6, l2_leaf_reg=1, learning_rate=0.1; total time=  23.1s
[CV] END ..........depth=6, l2_leaf_reg=1, learning_rate=0.1; total time=  23.3s
[CV] END ..........depth=6, l2_leaf_reg=1, learning_rate=0.3; total time=  23.4s
[CV] END .........depth=6, l2_leaf_reg=1, learning_rate=0.03; total time=  23.4s
[CV] END .........depth=6, l2_leaf_reg=1, learning_rate=0.03; total time=  23.4s
[CV] END .........depth=6, l2_leaf_reg=1, learning_rate=0.03; total time=  23.7s
[CV] END ..........depth=6, l2_leaf_reg=1, learning_rate=0.3; total time=  22.4s
[CV] END ..........depth=6, l2_leaf_reg=3, learning_rate=0.1; total time=  22.7s
[CV] END ..........depth=6, l2_leaf_reg=3, learning_rate=0.1; total time=  22.7s
[CV] END .........depth=6, l2_leaf_reg=3, learning_rate=0.03; total time=  23.2s
[CV] END .........depth=6, l