### Libraries, paths, and set-up

In [1]:
import pandas as pd
import os
import pickle
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder

# Changing directory
os.chdir('/Users/manotas/Documents/GitHub-Repos/ML-Energy-Colombia')

# Local libraries
from src.utils.utils import CyclicalDateTimeFeatures, datetimer
from src.data.window import create_windows, split_features_targets


# Setting storage paths
storage_path = 'data/processed/'

# Loading the full data
try:
    fulldata = pd.read_csv(os.path.join(storage_path, 'fulldata.csv'))
except FileNotFoundError:
    print("The file does not exist")

fulldata = datetimer(fulldata)

### Train, test split

In [2]:
# Identifying all object columns except 'plant' and 'agent'
object_columns = [col for col in fulldata.columns if fulldata[col].dtype == 'object' and col not in ['plant', 'agent']]

# Identifying all non-object columns
non_object_columns = [col for col in fulldata.columns if fulldata[col].dtype != 'object']

# Rearranging fulldata so object columns (except 'plant' and 'agent') come last, followed by 'plant' and 'agent'
fulldata = fulldata[non_object_columns + object_columns + ['plant', 'agent']]

# Creating a mask based on the datetime condition
mask = fulldata['datetime'] < '2022-01-01'

# Splitting features and target variable
X = fulldata.drop(['daily_ask'], axis=1)  # Drop the target variable to separate features
Y = fulldata[['daily_ask']]  # Target variable (kept as a DataFrame to facilitate windowing)

# Creating a mask based on the datetime condition
mask = fulldata['datetime'] < '2022-01-01'

# Applying the mask to split the features and target variable
X_train = X[mask]
X_test = X[~mask]
Y_train = Y[mask]
Y_test = Y[~mask]

# Saving datetime to index afterwards
train_dt = X_train['datetime'].copy()
test_dt = X_test['datetime'].copy()

### Preprocessing and transformation

In [3]:
# Dynamically identifying categorical and numerical columns, excluding the target variable 'daily_ask'
categorical_columns = [col for col in X_train.columns if X_train[col].dtype == 'object' and col not in ['plant','agent']]
numerical_columns = [col for col in X_train.columns if X_train[col].dtype != 'object' and col not in ['datetime']]  # Assuming 'datetime' needs special handling or is excluded

# Combining preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cyclic', CyclicalDateTimeFeatures(), ['datetime']),
        ('num', StandardScaler(), numerical_columns),
        ('cat', CatBoostEncoder(), categorical_columns)
    ],
    remainder='passthrough')

# Creating the preprocessing pipeline
encoding_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Now fitting this pipeline to the training data and transform both sets
encoding_pipeline.fit(X_train, Y_train)  # Fit to the training data
X_train_transformed = encoding_pipeline.transform(X_train)  # Transform training data
X_test_transformed = encoding_pipeline.transform(X_test)  # Transform testing data


In [10]:
# Names of the cyclical features generated by CyclicalDateTimeFeatures transformer
cyclical_features = ['hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'dayofweek_sin', 'dayofweek_cos', 'month_sin', 'month_cos']

# Original order of columns, replacing 'datetime' with the cyclical features, 
# and ensuring 'plant' and 'agent' are correctly positioned
new_order = cyclical_features + [col for col in numerical_columns + categorical_columns if col not in ['plant', 'agent']] + ['plant'] + ['agent']

# Converting transformed arrays back into DataFrames with the new column order
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=new_order, index=X_train.index)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=new_order, index=X_test.index)

# Reapplying the datetime information as the index
X_train_transformed_df.index = pd.to_datetime(train_dt)
X_test_transformed_df.index = pd.to_datetime(test_dt)

# Ensuring indices are unique
X_train_transformed_df = X_train_transformed_df.loc[~X_train_transformed_df.index.duplicated(keep='first')]
X_test_transformed_df = X_test_transformed_df.loc[~X_test_transformed_df.index.duplicated(keep='first')]
Y_train = Y_train.loc[~Y_train.index.duplicated(keep='first')]
Y_test = Y_test.loc[~Y_test.index.duplicated(keep='first')]

# Combining `X_train` and `Y_train` for windowing
X_train_transformed_df['datetime'] = train_dt  # Adding datetime column back
X_test_transformed_df['datetime'] = test_dt    # Adding datetime column back

train_data = pd.concat([X_train_transformed_df, Y_train], axis=1)
test_data = pd.concat([X_test_transformed_df, Y_test], axis=1)

### Defining window data
We define a formula to build specific-sized windows of data for plants and datetimes

In [12]:
# Create windows for training and test sets
window_size = 24  # Assuming 24 hours

train_windows_n = create_windows(train_data, window_size, overlap=False)
test_windows_n = create_windows(test_data, window_size, overlap=False)

train_windows_o = create_windows(train_data, window_size, overlap=False)
test_windows_o = create_windows(test_data, window_size, overlap=False)

Processing plants: 100%|██████████| 70/70 [00:23<00:00,  2.95it/s]
Processing plants: 100%|██████████| 69/69 [00:02<00:00, 32.72it/s]
Processing plants: 100%|██████████| 70/70 [00:29<00:00,  2.37it/s]
Processing plants: 100%|██████████| 69/69 [00:03<00:00, 18.36it/s]


In [23]:
# Applying the splitting function
X_train_windows_n, Y_train_windows_n = split_features_targets(train_windows_n, 'daily_ask')
X_test_windows_n, Y_test_windows_n = split_features_targets(test_windows_n, 'daily_ask')

# Applying the splitting function
X_train_windows_o, Y_train_windows_o = split_features_targets(train_windows_o, 'daily_ask')
X_test_windows_o, Y_test_windows_o = split_features_targets(test_windows_o, 'daily_ask')

### Saving window data

In [24]:
# Saving the non-overlapping windows list
with open(os.path.join(storage_path, 'train_non_overlapping_windows.pkl'), 'wb') as f:
    pickle.dump((X_train_windows_n, Y_train_windows_n), f)

with open(os.path.join(storage_path, 'test_non_overlapping_windows.pkl'), 'wb') as f:
    pickle.dump((X_test_windows_n, Y_test_windows_n), f)

with open(os.path.join(storage_path, 'train_overlapping_windows.pkl'), 'wb') as f:
    pickle.dump((X_train_windows_o, Y_train_windows_o), f)

with open(os.path.join(storage_path, 'test_overlapping_windows.pkl'), 'wb') as f:
    pickle.dump((X_test_windows_o, Y_test_windows_o), f)