In [57]:
import pandas as pd
import numpy as np

### Feature generation

In [58]:
df_final = pd.read_csv('../datasets/df_combined.csv', parse_dates=["time"])

df_final['time'] = pd.to_datetime(df_final['time'], utc=True)

print(df_final['time'].dtype)


datetime64[ns, UTC]


In [59]:
# df_final.set_index("time", inplace=True)

df_final['hour'] = df_final['time'].dt.hour
df_final['weekday'] = df_final['time'].dt.weekday
df_final['month'] = df_final['time'].dt.month 

In [60]:
# # Ensure 'hour' and 'weekday' columns are created
# df_final['hour'] = df_final['time'].dt.hour
# df_final['weekday'] = df_final['time'].dt.weekday

# Define conditions and values for 'business hour' and 'weekday_category' in one go

# Business hour conditions and values
business_conditions = [
    ((df_final['hour'] > 8) & (df_final['hour'] < 14)) | ((df_final['hour'] > 17) & (df_final['hour'] < 21)),
    (df_final['hour'] >= 14) & (df_final['hour'] <= 17)
]
business_values = [2, 1]

# Weekday category conditions and values
weekday_conditions = [
    (df_final['weekday'] == 6),  # Sunday
    (df_final['weekday'] == 5)   # Saturday
]
weekday_values = [2, 1]

# Assign 'business hour' and 'weekday_category' using np.select
df_final['business hour'] = np.select(business_conditions, business_values, default=0)
df_final['weekday_category'] = np.select(weekday_conditions, weekday_values, default=0)


In [61]:
# Define the list of cities
# cities = ['Barcelona', 'Bilbao', 'Madrid', 'Seville', 'Valencia']

population =  5179243 + 987000 + 6155116 + 1305342 + 1645342

weights = [
5179243 / population,
987000 / population,
6155116 / population,
1305342 / population,
1645342 / population
]
# Calculate the temperature range for each city in a vectorized way
for label in range(5):
    # Compute the absolute difference and store it in a new column
    df_final['temp_range_{}'.format(label)] = abs(df_final['temp_max_{}'.format(label)] - df_final['temp_min_{}'.format(label)])


cities_weights = {label: weight for label, weight in enumerate(weights)}

df_final['temp_weighted'] = sum(df_final[f'temp_{label}'] * weight for label, weight in enumerate(weights))

print(df_final.columns)

Index(['time', 'generation biomass', 'generation fossil brown coal/lignite',
       'generation fossil gas', 'generation fossil hard coal',
       'generation fossil oil', 'generation hydro pumped storage consumption',
       'generation hydro run-of-river and poundage',
       'generation hydro water reservoir', 'generation nuclear',
       'generation other', 'generation other renewable', 'generation solar',
       'generation waste', 'generation wind onshore', 'total load actual',
       'price day ahead', 'price actual', 'temp_0', 'temp_min_0', 'temp_max_0',
       'pressure_0', 'humidity_0', 'wind_speed_0', 'wind_deg_0', 'rain_1h_0',
       'snow_3h_0', 'clouds_all_0', 'weather_id_0', 'temp_1', 'temp_min_1',
       'temp_max_1', 'pressure_1', 'humidity_1', 'wind_speed_1', 'wind_deg_1',
       'rain_1h_1', 'snow_3h_1', 'clouds_all_1', 'weather_id_1', 'temp_2',
       'temp_min_2', 'temp_max_2', 'pressure_2', 'humidity_2', 'wind_speed_2',
       'wind_deg_2', 'rain_1h_2', 'snow_3h_2

### Feature selection

In [77]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [71]:
df_final = df_final.drop(['snow_3h_0', 'snow_3h_3'], axis=1)

KeyError: "['snow_3h_0', 'snow_3h_3'] not found in axis"

In [72]:
def multivariate_data(dataset, target, start_index, end_index, 
                      history_size, target_size, step, single_step=False):
    data = []
    labels = []

    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size


    for i in range(start_index, end_index):
        indices = range(i - history_size, i, step)
        data.appennd(dataset[indices])

        if single_step:
            labels.append(target[i + target_size])
        else:
            labels.append(target[i:i + target_size])

    return np.array(data), np.array(labels)

In [73]:
ds_size = len(df_final)

train_ratio = 0.7
cv_ratio = 0.2
test_ratio = 0.1

train_end_idx = int(train_ratio * ds_size)
cv_end_idx = train_end_idx + int(cv_ratio * ds_size)
test_end_idx = cv_end_idx + int(test_ratio * ds_size)

# Ensure that the indices do not exceed the dataset size (useful for edge cases)
cv_end_idx = min(cv_end_idx, ds_size)
test_end_idx = min(test_end_idx, ds_size)

In [74]:
X = df_final[df_final.columns.drop('price actual')].values
y = df_final['price actual'].values

y = y.reshape(-1, 1)

In [75]:
scaler_X = MinMaxScaler(feature_range=(0, 1))
scaler_y = MinMaxScaler(feature_range=(0, 1))

In [76]:
scaler_X.fit(X[:train_end_idx])
scaler_y.fit(y[:train_end_idx])

TypeError: float() argument must be a string or a real number, not 'Timestamp'

In [None]:
X_norm = scaler_X.transform(X)
y_norm = scaler_y.transform(y)

In [None]:
pca = PCA()
X_pca = pca.fit(X_norm[:train_end_idx])

In [None]:
num_components = len(pca.explained_variance_ratio_)
plt.figure(figsize=(10, 6))
plt.bar(np.arange(num_components), pca.explained_variance_ratio_)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Principal component')
plt.ylabel('Explained variance')
plt.show()

In [None]:
pca = PCA(n_components=0.80)
pca.fit(X_norm[:train_end_idx])
X_pca = pca.transform(X_norm)

In [None]:
X_pca.shape

In [None]:
dataset_norm = np.concatenate((X_pca, y_norm), axis=1)

past_history = 24
future_target = 0

In [None]:
X_train, y_train = multivariate_data(dataset_norm, dataset_norm[:, -1],
                                     0, train_end_idx, past_history, 
                                     future_target, step=1, single_step=True)

In [None]:
X_val, y_val = multivariate_data(dataset_norm, dataset_norm[:, -1],
                                 train_end_idx, cv_end_idx, past_history, 
                                 future_target, step=1, single_step=True)

In [None]:
X_test, y_test = multivariate_data(dataset_norm, dataset_norm[:, -1],
                                   cv_end_idx, test_end_idx, past_history, 
                                   future_target, step=1, single_step=True)

In [None]:
batch_size = 32
buffer_size = 1000

In [None]:
train = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train = train.cache().shuffle(buffer_size).batch(batch_size).prefetch(1)

validation = tf.data.Dataset.from_tensor_slices((X_val, y_val))
validation = validation.batch(batch_size).prefetch(1)

In [None]:
# Define some common parameters

input_shape = X_train.shape[-2:]
loss = tf.keras.losses.MeanSquaredError()
metric = [tf.keras.metrics.RootMeanSquaredError()]
lr_schedule = tf.keras.callbacks.LearningRateScheduler(
              lambda epoch: 1e-4 * 10**(epoch / 10))
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10)

In [None]:
y_test = y_test.reshape(-1, 1)
y_test_inv = scaler_y.inverse_transform(y_test)