### Libraries, paths, and set-up

In [32]:
import pandas as pd
import os
import warnings
import pickle
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
import src.utils.utils
import src.data.window

# Changing directory to the project root so the relative data paths below resolve.
# The root can be overridden with the ML_ENERGY_ROOT environment variable; when it is
# not set, the original local checkout path is used.
project_root = os.environ.get(
    'ML_ENERGY_ROOT',
    '/Users/manotas/Documents/GitHub-Repos/ML-Energy-Colombia',
)
os.chdir(project_root)

# Setting storage paths
# `storing_path` is the name read by this cell and by every later cell of the notebook,
# so it must be defined here (defining only `storage_path` raises a NameError below).
storing_path = 'data/processed/'
storage_path = storing_path  # alias so both spellings point at the same folder

# Loading the full data
fulldata = pd.read_csv(os.path.join(storing_path, 'fulldata.csv'))

### Train, test split

In [3]:
# Partitioning the columns by dtype in a single pass:
#   - non-object columns keep their relative order and go first,
#   - object columns other than 'plant' and 'agent' follow.
object_columns = []
non_object_columns = []
for column_name in fulldata.columns:
    if fulldata[column_name].dtype != 'object':
        non_object_columns.append(column_name)
    elif column_name not in ['plant', 'agent']:
        object_columns.append(column_name)

# Reordering fulldata: non-object columns, then the remaining object columns,
# and finally the two identifier columns 'plant' and 'agent'
column_layout = non_object_columns + object_columns + ['plant', 'agent']
fulldata = fulldata[column_layout]

# Rows dated before 2022-01-01 form the training set; everything else is the test set
mask = fulldata['datetime'] < '2022-01-01'

# Separating the target ('daily_ask') from the features
Y = fulldata['daily_ask']
X = fulldata.drop(columns=['daily_ask'])

# Splitting features and target with the same boolean mask
X_train, X_test = X[mask], X[~mask]
Y_train, Y_test = Y[mask], Y[~mask]

# Keeping a copy of each datetime column so it can be restored as the index later
train_dt = X_train['datetime'].copy()
test_dt = X_test['datetime'].copy()

### Preprocessing and transformation

In [4]:
# Dynamically identifying categorical and numerical columns.
# 'datetime' is excluded from BOTH lists: it is consumed by the 'cyclic' transformer below,
# and listing it as categorical as well would make the encoder target-encode every timestamp
# on top of the cyclical features.
# 'plant' and 'agent' are excluded from the categorical list so that they reach the output
# unchanged through remainder='passthrough'.
categorical_columns = [col for col in X_train.columns if X_train[col].dtype == 'object' and col not in ['datetime', 'plant', 'agent']]
numerical_columns = [col for col in X_train.columns if X_train[col].dtype != 'object' and col not in ['datetime']]

# Combining preprocessing steps.
# Output column order is: cyclic features, scaled numerical columns, encoded categorical
# columns, then the passthrough columns.
# NOTE(review): CyclicalDateTimeFeatures is not imported by name in the import cell
# (only `import src.utils.utils` is) - confirm where it is defined and import it explicitly.
preprocessor = ColumnTransformer(
    transformers=[
        ('cyclic', CyclicalDateTimeFeatures(), ['datetime']),
        ('num', StandardScaler(), numerical_columns),
        ('cat', CatBoostEncoder(), categorical_columns)
    ],
    remainder='passthrough')

# Creating the preprocessing pipeline
encoding_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fitting on the training data only (the target is needed by the supervised encoder),
# then applying the fitted pipeline to both sets
# NOTE(review): the training rows are transformed without passing the target; confirm this
# is the intended way to apply CatBoostEncoder to training data.
encoding_pipeline.fit(X_train, Y_train)  # Fit to the training data
X_train_transformed = encoding_pipeline.transform(X_train)  # Transform training data
X_test_transformed = encoding_pipeline.transform(X_test)  # Transform testing data


In [6]:
# Column names for the features produced by the CyclicalDateTimeFeatures transformer
cyclical_features = [
    'hour_sin', 'hour_cos',
    'day_sin', 'day_cos',
    'dayofweek_sin', 'dayofweek_cos',
    'month_sin', 'month_cos',
]

# Column layout of the transformed arrays: the cyclical features take the place of
# 'datetime', then the numerical and categorical columns, with 'plant' and 'agent' last
encoded_columns = [
    col
    for col in numerical_columns + categorical_columns
    if col not in ['plant', 'agent']
]
new_order = [*cyclical_features, *encoded_columns, 'plant', 'agent']

# Wrapping the transformed arrays back into DataFrames, labelled with the new column
# layout and indexed directly by the datetimes saved before the transformation
X_train_transformed_df = pd.DataFrame(
    X_train_transformed,
    columns=new_order,
    index=pd.Index(pd.to_datetime(train_dt)),
)
X_test_transformed_df = pd.DataFrame(
    X_test_transformed,
    columns=new_order,
    index=pd.Index(pd.to_datetime(test_dt)),
)


### Saving transformed data

In [23]:
# Persisting the transformed DataFrames for ease of access.
# Each frame gets its datetime index materialised as a 'datetime' column and is then
# indexed by the ('datetime', 'plant') pair before being written to CSV.
X_train = (
    X_train_transformed_df
    .assign(datetime=X_train_transformed_df.index)
    .set_index(['datetime', 'plant'])
)
X_train.to_csv(os.path.join(storing_path, 'X_train.csv'))

X_test = (
    X_test_transformed_df
    .assign(datetime=X_test_transformed_df.index)
    .set_index(['datetime', 'plant'])
)
X_test.to_csv(os.path.join(storing_path, 'X_test.csv'))

In [18]:
# Reading the saved train/test CSVs back from disk.
# No index column is specified, so 'datetime' and 'plant' come back as ordinary columns.
train_csv = os.path.join(storing_path, 'X_train.csv')
test_csv = os.path.join(storing_path, 'X_test.csv')
X_train = pd.read_csv(train_csv)
X_test = pd.read_csv(test_csv)

### Defining window data
We use the `create_windows` function to build fixed-size windows of data across plants and datetimes.

In [23]:
# Building two window collections from the training features with the same window size:
# one with overlap disabled and one with overlap enabled.
# NOTE(review): create_windows is not imported by name in the import cell (only
# `import src.data.window` is) - confirm its origin and import it explicitly.
# NOTE(review): both calls are slow (the recorded progress bars show roughly 17 and 6 minutes);
# consider caching the results instead of recomputing on every run.
window_size = 24  # for example, a window size of 24 hours
n_ovr_windows = create_windows(X_train, window_size, overlap=False)
ovr_windows = create_windows(X_train, window_size, overlap=True)

Processing plants: 100%|██████████| 70/70 [16:52<00:00, 14.46s/it]
Processing plants: 100%|██████████| 70/70 [05:55<00:00,  5.08s/it]


### Saving window data

In [None]:
# Pickling both window collections, one file per collection:
# first the non-overlapping windows, then the overlapping ones
window_dumps = {
    'non_overlapping_windows.pkl': n_ovr_windows,
    'overlapping_windows.pkl': ovr_windows,
}
for pkl_name, window_list in window_dumps.items():
    with open(os.path.join(storing_path, pkl_name), 'wb') as f:
        pickle.dump(window_list, f)