### Libraries, paths, and set-up

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import os
os.chdir('/Users/manotas/Documents/GitHub-Repos/ML-Energy-Colombia')
import warnings
from src.data.loader import data_loader
from src.utils.utils import *
warnings.filterwarnings('ignore')

# Output folder for every processed file written by this notebook, relative to the
# working directory set above. The trailing slash matters: file names are appended
# to this value by plain string concatenation.
storing_path = 'data/processed/'

# makedirs(exist_ok=True) also creates a missing parent `data/` folder and is a
# no-op when the folder already exists, so the cell is safe to re-run.
os.makedirs(storing_path, exist_ok=True)

In [2]:
# Processed dataframes
# `data_loader` is a project helper imported from src.data.loader; presumably it
# returns one DataFrame per requested dataset name, in the order given — TODO confirm.
askprice, supply = data_loader('askprice', 'supply')

### Handling supply data
By market design, ask prices are submitted once per day and apply to every hour of that day. Here we increase the granularity to hourly data by repeating each day's ask price across all of its hours.

In [3]:
# Not every registered plant sells into the main power grid: some only sustain a
# public or private entity, yet by law they still have to be registered. They do not
# bid in the open day-ahead market, so we won't predict for them.
# NOTE(review): the comparison below is made on `agent_code`, so the printed sets
# hold agent codes rather than plant names — confirm that this is intended.
supply_plants = set(supply['agent_code'])
askprice_plants = set(askprice['agent_code'])

# Codes present in supply but missing from askprice
only_in_supply = supply_plants.difference(askprice_plants)
# Codes present in askprice but missing from supply
only_in_askprice = askprice_plants.difference(supply_plants)

print("Plants only in supply: ", only_in_supply)
print("Plants only in askprice: ", only_in_askprice)

Plants only in supply:  {'ESUG', 'HDTG', 'RPEG', 'ABAG', 'AAGG', 'DEPG', 'GNCG', 'HLAG', 'CEEG', 'EEPG', 'EMEG', 'RTAG', 'AXEG', 'CHZG', 'NTCG', 'PECG', 'TMMG', 'ADCG', 'MCAG', 'EGPG', 'CCGG', 'EGEG', 'GSAG', 'CETG', 'GPEG', 'GLMG', 'ENUG', 'GNYG', 'HCCG', 'NRCG', 'IACG', 'DLRG', 'TRPG', 'RENG', 'HBCG', 'SPRG', 'EMSG', 'PCYG', 'BDJG', 'ERCG', 'GPYG', 'ENGG', 'SFEG', 'GDEG', 'CDNG', 'GALG', 'CMXG', 'HZEG', 'GCEG', 'HEMG', 'GCYG'}
Plants only in askprice:  set()


In [4]:
# Expand the daily ask-price table to one row per plant per hour and store it.
# Work on a copy so the loaded `askprice` frame stays untouched. `datetimer` is a
# project helper; presumably it parses the `date` column to datetime64 — TODO confirm.
askprice_c = askprice.copy()
askprice_c = datetimer(askprice_c, col_name='date')

# Hourly scaffold: every plant x every hour of the observed period.
# The range is extended to 23:00 of the last day; ending at `date.max()` itself
# (midnight when bids are stamped daily) would leave the final day with one hour only.
# The step is given as a Timedelta because the 'H' string alias is deprecated in
# recent pandas releases.
first_hour = pd.Timestamp(askprice_c['date'].min())
last_hour = pd.Timestamp(askprice_c['date'].max()).normalize() + pd.Timedelta(hours=23)
all_hours = pd.date_range(start=first_hour, end=last_hour, freq=pd.Timedelta(hours=1))
all_plants = askprice_c['plant'].unique()
index = pd.MultiIndex.from_product([all_plants, all_hours], names=['plant', 'date'])
all_data = pd.DataFrame(index=index).reset_index()

# Left merge: only the timestamp that carries the daily bid finds a match, every
# other hour of the day starts out as NaN.
askprice_hourly = pd.merge(all_data, askprice_c, on=['plant', 'date'], how='left')
askprice_hourly['datetime'] = askprice_hourly['date']

# Repeat each day's bid over the remaining hours of that day: forward-fill inside
# every (plant, calendar day) group. GroupBy.ffill is vectorised and does not rely
# on `apply` handing the grouping columns to the callback.
askprice_hourly['date'] = askprice_hourly['date'].dt.date
bid_cols = [col for col in askprice_hourly.columns if col not in ('plant', 'date', 'datetime')]
askprice_hourly[bid_cols] = askprice_hourly.groupby(['plant', 'date'])[bid_cols].ffill()

# Sorting by plant and time, reindexing
askprice_hourly = askprice_hourly.sort_values(by=['plant', 'datetime']).reset_index(drop=True)

# Plants that did not offer on a given day are still NaN at this point.
# agent_code can be recovered from the original data; for the price points we
# assume they didn't offer (0).
# NOTE(review): if a plant appears with more than one agent_code, the last one in
# `askprice_c` wins in this mapping.
plant_agent_dict = askprice_c.dropna(subset=['agent_code']).set_index('plant')['agent_code'].to_dict()
askprice_hourly['agent_code'] = askprice_hourly['agent_code'].fillna(
    askprice_hourly['plant'].map(plant_agent_dict)
)

# Filling the remaining (monetary) NaN values and dropping the helper day column
askprice_hourly = askprice_hourly.fillna(0).drop(columns=['date']).reset_index(drop=True)

# Storing
askprice_hourly.to_csv(storing_path + 'h_supplyprice.csv', index=False)

In [5]:
# Build the complete supply table: all prices and all supply offerings
# (availability), for every plant at every date.
# `datetimer` is a project helper, called here with its default column settings.
supply = datetimer(supply)

# Left join on the hourly ask-price table keeps all of its rows; values with no
# matching supply record are set to 0.
fullsupply = pd.merge(
    askprice_hourly,
    supply,
    how='left',
    on=['plant', 'agent_code', 'datetime'],
).fillna(0)

# Persisted to disk for ease of coding
fullsupply.to_csv(storing_path + 'fullsupply.csv', index=False)