### Libraries, paths, and set-up

In [1]:
import pandas as pd
import numpy as np
import os
os.chdir('..')
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from src.models.metrics import *
from src.utils.utils import *
from src.data.window import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from joblib import dump, load
import pickle
from src.utils.utils import *

storing_path = 'data/processed/'

if not os.path.exists('models/data'):
    os.mkdir('models/data')

fulldata = pd.read_csv(storing_path + 'fulldata.csv')
fulldata = datetimer(fulldata)


### Train, test split

In [2]:
# Identify all object columns except 'plant' and 'agent'
object_columns = [col for col in fulldata.columns if fulldata[col].dtype == 'object' and col not in ['plant', 'agent']]

# Identify all non-object columns
non_object_columns = [col for col in fulldata.columns if fulldata[col].dtype != 'object']

# Rearrange fulldata so object columns (except 'plant' and 'agent') come last, followed by 'plant' and 'agent'
fulldata = fulldata[non_object_columns + object_columns + ['plant', 'agent']]

# Create a mask based on the datetime condition
mask = fulldata['datetime'] < '2022-01-01'

# Splitting features and target variable
X = fulldata.drop(['daily_ask'], axis=1)  # Drop the target variable to separate features
Y = fulldata['daily_ask']  # Target variable

# Applying the mask to split the features
X_train = X[mask]
X_test = X[~mask]

# Applying the mask to split the target variable
Y_train = Y[mask]
Y_test = Y[~mask]

# Saving datetime to index afterwards
train_dt = X_train['datetime'].copy()
test_dt = X_test['datetime'].copy()

In [3]:

# Dynamically identify categorical and numerical columns, excluding the target variable 'daily_ask'
categorical_columns = [col for col in X_train.columns if X_train[col].dtype == 'object' and col not in ['plant','agent']]
numerical_columns = [col for col in X_train.columns if X_train[col].dtype != 'object' and col not in ['datetime']]  # Assuming 'datetime' needs special handling or is excluded

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cyclic', CyclicalDateTimeFeatures(), ['datetime']),
        ('num', StandardScaler(), numerical_columns),
        ('cat', CatBoostEncoder(), categorical_columns)
    ],
    remainder='passthrough')

# Create the preprocessing pipeline
encoding_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Now fit this pipeline to your training data and transform both training and testing sets
encoding_pipeline.fit(X_train, Y_train)  # Fit to the training data
X_train_transformed = encoding_pipeline.transform(X_train)  # Transform training data
X_test_transformed = encoding_pipeline.transform(X_test)  # Transform testing data


In [4]:
fulldata

Unnamed: 0,daily_ask,datetime,supply_hourly,demand_hourly,hourly_bid,hourly_mc,total_supply_agent,total_supply_t,residual_supply,rsi_agent,...,lerner,comp_markup,comp_lerner,soi,heat_rate,netcapacity_kW,technology,fuel,plant,agent
0,200.00000,2010-01-01,419000.0,5189693.95,124.687267,115.0,882000.0,11699000.0,10817000.0,2.084323,...,0.425000,75.312733,0.376564,-1.1,4.1780,427000.0,Hydro,AGUA,ALBAN,CELSIA COLOMBIA S.A. E.S.P.
1,402.26000,2010-01-01,60000.0,5189693.95,124.687267,115.0,1176000.0,11699000.0,10523000.0,2.027673,...,0.714115,277.572733,0.690033,-1.1,10.4846,60000.0,Thermal,GAS,BARRANQUILLA 3,TERMOBARRANQUILLA S.A. EMPRESA DE SERVICIOS PU...
2,217.66900,2010-01-01,60000.0,5189693.95,124.687267,115.0,1176000.0,11699000.0,10523000.0,2.027673,...,0.471675,92.981733,0.427170,-1.1,11.6696,60000.0,Thermal,GAS,BARRANQUILLA 4,TERMOBARRANQUILLA S.A. EMPRESA DE SERVICIOS PU...
3,306.00000,2010-01-01,534000.0,5189693.95,124.687267,115.0,2199000.0,11699000.0,9500000.0,1.830551,...,0.624183,181.312733,0.592525,-1.1,0.5895,540000.0,Hydro,AGUA,BETANIA,ENEL COLOMBIA SA ESP
4,385.00000,2010-01-01,60000.0,5189693.95,124.687267,115.0,2199000.0,11699000.0,9500000.0,1.830551,...,0.701299,260.312733,0.676137,-1.1,11.8104,60000.0,Thermal,GAS,CARTAGENA 2,ENEL COLOMBIA SA ESP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3554306,1495.00000,2022-12-31,71000.0,6772385.63,518.980000,520.0,71000.0,14148000.0,14077000.0,2.078588,...,0.652174,976.020000,0.652856,2.1,8.0900,88000.0,Thermal,GAS,TERMONORTE,TERMONORTE S.A.S. E.S.P.
3554307,649.63400,2022-12-31,19000.0,6772385.63,518.980000,520.0,128000.0,14148000.0,14020000.0,2.070172,...,0.199549,130.654000,0.201119,2.1,7.2600,19000.0,Thermal,ACPM,TERMOPROYECTOS,PROELECTRICA S.A.S E.S.P.
3554308,1297.28500,2022-12-31,353000.0,6772385.63,518.980000,520.0,3349000.0,14148000.0,10799000.0,1.594564,...,0.599163,778.305000,0.599949,2.1,6.5670,428000.0,Thermal,GAS,TERMOSIERRA CC,EMPRESAS PUBLICAS DE MEDELLIN E.S.P.
3554309,800.00003,2022-12-31,241000.0,6772385.63,518.980000,520.0,241000.0,14148000.0,13907000.0,2.053486,...,0.350000,281.020030,0.351275,2.1,6.5573,241000.0,Thermal,ACPM,TERMOVALLE CC,PRIME TERMOVALLE S.A.S EMPRESA DE SERVICIOS PU...


In [8]:
# Names of the cyclical features generated by your CyclicalDateTimeFeatures transformer
cyclical_features = ['hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'dayofweek_sin', 'dayofweek_cos', 'month_sin', 'month_cos']

# Original order of columns, replacing 'datetime' with the cyclical features, 
# and ensuring 'plant' and 'agent' are correctly positioned
new_order = cyclical_features + [col for col in numerical_columns + categorical_columns if col not in ['plant', 'agent']] + ['plant'] + ['agent']

# Convert transformed arrays back into DataFrames with the new column order
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=new_order, index=X_train.index)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=new_order, index=X_test.index)

# Reapply the datetime information as the index
X_train_transformed_df.index = pd.to_datetime(train_dt)
X_test_transformed_df.index = pd.to_datetime(test_dt)


In [9]:
X_train_transformed_df

Unnamed: 0_level_0,hour_sin,hour_cos,day_sin,day_cos,dayofweek_sin,dayofweek_cos,month_sin,month_cos,supply_hourly,demand_hourly,...,lerner,comp_markup,comp_lerner,soi,heat_rate,netcapacity_kW,technology,fuel,plant,agent
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01 00:00:00,0.0,1.0,0.201299,0.97953,-0.433884,-0.900969,0.5,0.866025,0.957971,-1.32155,...,-0.429662,-0.659373,-0.301899,-1.642586,-1.05271,0.76608,286.787751,286.787751,ALBAN,CELSIA COLOMBIA S.A. E.S.P.
2010-01-01 00:00:00,0.0,1.0,0.201299,0.97953,-0.433884,-0.900969,0.5,0.866025,-0.590272,-1.32155,...,0.613478,0.106214,0.625266,-1.642586,0.632325,-0.692622,450.037985,550.476375,BARRANQUILLA 3,TERMOBARRANQUILLA S.A. EMPRESA DE SERVICIOS PU...
2010-01-01 00:00:00,0.0,1.0,0.201299,0.97953,-0.433884,-0.900969,0.5,0.866025,-0.590272,-1.32155,...,-0.261257,-0.592493,-0.152218,-1.642586,0.94894,-0.692622,450.037985,550.476375,BARRANQUILLA 4,TERMOBARRANQUILLA S.A. EMPRESA DE SERVICIOS PU...
2010-01-01 00:00:00,0.0,1.0,0.201299,0.97953,-0.433884,-0.900969,0.5,0.866025,1.453926,-1.32155,...,0.288999,-0.258146,0.336861,-1.642586,-2.011507,1.215217,286.787751,286.787751,BETANIA,ENEL COLOMBIA SA ESP
2010-01-01 00:00:00,0.0,1.0,0.201299,0.97953,-0.433884,-0.900969,0.5,0.866025,-0.590272,-1.32155,...,0.567236,0.040882,0.584164,-1.642586,0.98656,-0.692622,450.037985,550.476375,CARTAGENA 2,ENEL COLOMBIA SA ESP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-31 23:00:00,-0.258819,0.965926,-0.0,1.0,-0.433884,-0.900969,-0.0,1.0,-0.728276,-0.200479,...,-0.442776,-0.56396,-0.390463,1.330865,1.04347,-0.819811,450.037985,550.476375,TERMOYOPAL 2,TERMOYOPAL GENERACION 2 S.A.S E.S.P.
2021-12-31 23:00:00,-0.258819,0.965926,-0.0,1.0,-0.433884,-0.900969,-0.0,1.0,-0.693775,-0.200479,...,-1.364996,-0.900283,-1.244125,1.330865,0.887621,-0.788014,450.037985,178.801397,ZIPAEMG 2,ENEL COLOMBIA SA ESP
2021-12-31 23:00:00,-0.258819,0.965926,-0.0,1.0,-0.433884,-0.900969,-0.0,1.0,-0.849031,-0.200479,...,-1.19217,-0.853903,-1.084147,1.330865,0.957704,-0.680698,450.037985,178.801397,ZIPAEMG 3,ENEL COLOMBIA SA ESP
2021-12-31 23:00:00,-0.258819,0.965926,-0.0,1.0,-0.433884,-0.900969,-0.0,1.0,-0.611835,-0.200479,...,-1.195764,-0.854925,-1.087474,1.330865,1.304377,-0.676723,450.037985,178.801397,ZIPAEMG 4,ENEL COLOMBIA SA ESP


### Defining window data
We define a formula to build specific-sized windows of data for plants and datetimes

In [7]:
# Define the window size
window_size = 7

# Apply the window function to the train and test data
train_ow = create_windows(train_df, window_size)
test_ow = create_windows(test_df, window_size)

train_nw = create_windows_no_overlap(train_df, window_size)
test_nw = create_windows_no_overlap(test_df, window_size)

NameError: name 'train_df' is not defined

In [None]:
train_df.columns

Index(['plant', 'daily_ask', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos',
       'dayofweek_sin', 'dayofweek_cos', 'month_sin', 'month_cos',
       'supply_hourly', 'demand_hourly', 'hourly_bid', 'hourly_mc',
       'total_supply_agent', 'total_supply_t', 'residual_supply', 'rsi_agent',
       'market_share_agent', 'markup', 'lerner', 'comp_markup', 'comp_lerner',
       'soi', 'heat_rate', 'agent', 'technology', 'fuel', 'netcapacity_kW'],
      dtype='object')

In [None]:
pd.set_option('display.max_columns', None)
train_df.head(1)

Unnamed: 0,plant,daily_ask,hour_sin,hour_cos,day_sin,day_cos,dayofweek_sin,dayofweek_cos,month_sin,month_cos,supply_hourly,demand_hourly,hourly_bid,hourly_mc,total_supply_agent,total_supply_t,residual_supply,rsi_agent,market_share_agent,markup,lerner,comp_markup,comp_lerner,soi,heat_rate,agent,technology,fuel,netcapacity_kW
0,0.0,1.0,0.201299,0.97953,-0.433884,-0.900969,0.5,0.866025,0.957971,-1.32155,-0.282891,-0.238093,-0.384757,-0.905918,-1.642586,-1.05271,0.76608,407.071221,407.071221,ALBAN,200.0,10817000.0,2.084323,0.075391,85.0,0.425,75.312733,0.376564,CELSIA COLOMBIA S.A. E.S.P.


In [None]:
train_data.head(1)

Unnamed: 0,plant,daily_ask,datetime,supply_hourly,demand_hourly,hourly_bid,hourly_mc,total_supply_agent,total_supply_t,residual_supply,rsi_agent,market_share_agent,markup,lerner,comp_markup,comp_lerner,soi,heat_rate,agent,technology,fuel,netcapacity_kW
0,ALBAN,200.0,2010-01-01,419000.0,5189693.95,124.687267,115.0,882000.0,11699000.0,10817000.0,2.084323,0.075391,85.0,0.425,75.312733,0.376564,-1.1,4.178,CELSIA COLOMBIA S.A. E.S.P.,Hydro,AGUA,427000.0


In [None]:
train_data

Unnamed: 0,plant,daily_ask,datetime,supply_hourly,demand_hourly,hourly_bid,hourly_mc,total_supply_agent,total_supply_t,residual_supply,...,markup,lerner,comp_markup,comp_lerner,soi,heat_rate,agent,technology,fuel,netcapacity_kW
0,ALBAN,200.000,2010-01-01 00:00:00,419000.0,5189693.95,124.687267,115.000,882000.0,11699000.0,10817000.0,...,85.000,0.425000,75.312733,0.376564,-1.1,4.1780,CELSIA COLOMBIA S.A. E.S.P.,Hydro,AGUA,427000.0
1,BARRANQUILLA 3,402.260,2010-01-01 00:00:00,60000.0,5189693.95,124.687267,115.000,1176000.0,11699000.0,10523000.0,...,287.260,0.714115,277.572733,0.690033,-1.1,10.4846,TERMOBARRANQUILLA S.A. EMPRESA DE SERVICIOS PU...,Thermal,GAS,60000.0
2,BARRANQUILLA 4,217.669,2010-01-01 00:00:00,60000.0,5189693.95,124.687267,115.000,1176000.0,11699000.0,10523000.0,...,102.669,0.471675,92.981733,0.427170,-1.1,11.6696,TERMOBARRANQUILLA S.A. EMPRESA DE SERVICIOS PU...,Thermal,GAS,60000.0
3,BETANIA,306.000,2010-01-01 00:00:00,534000.0,5189693.95,124.687267,115.000,2199000.0,11699000.0,9500000.0,...,191.000,0.624183,181.312733,0.592525,-1.1,0.5895,ENEL COLOMBIA SA ESP,Hydro,AGUA,540000.0
4,CARTAGENA 2,385.000,2010-01-01 00:00:00,60000.0,5189693.95,124.687267,115.000,2199000.0,11699000.0,9500000.0,...,270.000,0.701299,260.312733,0.676137,-1.1,11.8104,ENEL COLOMBIA SA ESP,Thermal,GAS,60000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3230357,TERMOYOPAL 2,290.000,2021-12-31 23:00:00,28000.0,6590748.81,189.480000,167.804,186000.0,14771000.0,14585000.0,...,122.196,0.421366,100.520000,0.346621,1.5,12.0234,TERMOYOPAL GENERACION 2 S.A.S E.S.P.,Thermal,GAS,28000.0
3230358,ZIPAEMG 2,201.147,2021-12-31 23:00:00,36000.0,6590748.81,189.480000,167.804,3066000.0,14771000.0,11705000.0,...,33.343,0.165764,11.667000,0.058002,1.5,11.4401,ENEL COLOMBIA SA ESP,Thermal,CARBON,36000.0
3230359,ZIPAEMG 3,213.400,2021-12-31 23:00:00,0.0,6590748.81,189.480000,167.804,3066000.0,14771000.0,11705000.0,...,45.596,0.213664,23.920000,0.112090,1.5,11.7024,ENEL COLOMBIA SA ESP,Thermal,CARBON,63000.0
3230360,ZIPAEMG 4,213.130,2021-12-31 23:00:00,55000.0,6590748.81,189.480000,167.804,3066000.0,14771000.0,11705000.0,...,45.326,0.212668,23.650000,0.110965,1.5,12.9999,ENEL COLOMBIA SA ESP,Thermal,CARBON,64000.0


In [None]:
# Inspect the first few windows
for i in range(3):
    print(f"Window {i+1}")
    print(train_now[i])
    print("\n")

Window 1


NameError: name 'train_now' is not defined

### Saving constructed variables for ease of loading

In [None]:
train_df.to_csv('models/data/train_df.csv')
test_df.to_csv('models/data/test_df.csv')

In [None]:
# Save the pipeline
dump(pipeline, '/Users/manotas/Desktop/models/pipeline.joblib')

['/Users/manotas/Desktop/models/pipeline.joblib']