### Libraries, paths, and set-up

In [1]:
import pandas as pd
import numpy as np
import os
os.chdir('/Users/manotas/Documents/GitHub-Repos/ML-Energy-Colombia')
from tabulate import tabulate
from scipy.stats import skew, kurtosis, jarque_bera
from statsmodels.tsa.stattools import adfuller, kpss
from src.data.loader import data_loader
from tqdm import tqdm 
import warnings
warnings.filterwarnings('ignore')

# Make sure the output directory for the LaTeX tables exists.
# makedirs also creates the parent 'reports' directory when it is missing
# (os.mkdir would raise FileNotFoundError in that case), and exist_ok=True
# makes the call a no-op when the directory is already there, so the cell
# can be re-run safely.
os.makedirs('reports/tables', exist_ok=True)

In [26]:
# Load the processed dataframes; the names are bound in the order in which
# data_loader() returns its ten objects.
(
    soi,
    capacity,
    askprice,
    bidprice,
    resource,
    demand,
    supply,
    price_fuel,
    mcost,
    generation,
) = data_loader()

In [27]:
# Drop the listed columns from each frame before aggregating.
# NOTE(review): presumably these are identifier/label columns that cannot be
# averaged by the daily resample - confirm against the loader.
supply = supply.drop(['agent_code', 'plant'], axis=1)
demand = demand.drop(['distributor_code'], axis=1)
generation = generation.drop(
    ['plant', 'technology', 'fuel', 'agent_code', 'dispatch_type'],
    axis=1,
)


In [28]:
# Frames that are aggregated to daily means before computing statistics.
dataframes = {'bidprice': bidprice, 'demand': demand, 'supply': supply, 'mcost': mcost, 'generation': generation}


def _to_daily(frame):
    """Return the daily mean of `frame`, indexed by its `datetime` values.

    The input frame is not modified. If `frame` has no `datetime` column
    (for example because it was already converted by a previous run of this
    cell) it is resampled on its existing index, which makes the cell safe
    to re-run.
    """
    if 'datetime' in frame.columns:
        # assign/set_index return new objects, so the caller's frame keeps
        # its original 'datetime' column.
        frame = frame.assign(datetime=pd.to_datetime(frame['datetime'])).set_index('datetime')
    return frame.resample('D').mean()


for name, df in dataframes.items():
    dataframes[name] = _to_daily(df)

# Rebind the module-level names explicitly (instead of globals().update) so
# it is visible which variables now hold the daily frames.
bidprice = dataframes['bidprice']
demand = dataframes['demand']
supply = dataframes['supply']
mcost = dataframes['mcost']
generation = dataframes['generation']


### Descriptive statistics
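The custom function below computes descriptive statistics (mean, standard deviation, skewness, kurtosis) and the p-values of the Jarque-Bera, ADF and KPSS tests for a single column of a dataframe. The cells that follow apply it to each series and export the results as LaTeX tables for later use.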

In [51]:
def ts_summary(df, col, tf=True):
    """Summarise one column of `df` with moments and test p-values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the series.
    col : str
        Name of the column to summarise.
    tf : bool, default True
        If True, the statistics are computed on the first difference of the
        natural log of the series; if False, on the series itself.

    Returns
    -------
    dict or None
        Dictionary with the keys 'Mean', 'SD', 'Skewness', 'Kurtosis',
        'JB Test_pval', 'ADF Test_pval' and 'KPSS Test_pval'. Returns None
        (after printing a message) when `col` is not a column of `df`.

    Notes
    -----
    'Kurtosis' uses scipy's default definition, i.e. excess kurtosis
    (a normal distribution gives 0).
    """
    if col not in df.columns:
        print(f'Column {col} not found in dataframe')
        return None

    # Drop missing values of this column only. Dropping whole rows of `df`
    # would also discard valid observations of `col` whenever some other
    # column happens to be missing.
    col_data = df[col].dropna()

    if tf:
        # The log is undefined for non-positive values: mask them as NaN
        # rather than filling with 0, which would fabricate a log-level and
        # create spurious jumps in the differenced series. A NaN level makes
        # both adjacent differences NaN, and those are dropped.
        log_level = np.log(col_data.where(col_data > 0))
        col_data = log_level.diff().dropna()

    return {
        'Mean': col_data.mean(),
        'SD': col_data.std(),
        'Skewness': skew(col_data),
        'Kurtosis': kurtosis(col_data),
        'JB Test_pval': jarque_bera(col_data)[1],
        'ADF Test_pval': adfuller(col_data)[1],
        # regression='c': null hypothesis of stationarity around a constant
        'KPSS Test_pval': kpss(col_data, regression='c')[1]
    }

In [38]:
# Map each column name to the dataframe that is passed with it to ts_summary
data_dict = {
    'capacity_kW': capacity,
    'daily_ask': askprice,
    'soi': soi,
    'hourly_bid': bidprice,
    'hourly_mc': mcost,
    'generation_hourly': generation,
    'demand_hourly': demand,
    'supply_hourly':supply,
}

# Statistics of the series in levels (tf=False), one dict per column
results = [
    ts_summary(frame, label, tf=False)
    for label, frame in tqdm(data_dict.items())
]
results_df = pd.DataFrame(results, index=list(data_dict))

# Render the summary frame as a LaTeX table, using the column names as headers
latex_table = tabulate(results_df, headers='keys', tablefmt='latex')

# Save the table to a .tex file
with open('reports/tables/descstats_lvl.tex', 'w') as tex_out:
    tex_out.write(latex_table)



100%|██████████| 8/8 [01:04<00:00,  8.10s/it]


In [52]:
# Dictionary of datasets and corresponding columns
data_dict = {
    'capacity_kW': capacity,
    'daily_ask': askprice,
    'soi': soi,
    'hourly_bid': bidprice,
    'hourly_mc': mcost,
    'generation_hourly': generation,
    'demand_hourly': demand,
    'supply_hourly':supply,
}

# Applying the function to each specified column of each dataframe,
# this time on the log-differenced series (tf=True)
resultstf = [ts_summary(df, col, tf=True) for col, df in tqdm(data_dict.items())]
# Build the frame from `resultstf`: using the levels list `results` here
# would write a copy of the levels table into descstats_tf.tex.
resultstf_df = pd.DataFrame(resultstf, index=data_dict.keys())

# Converting the results dataframe to a LaTeX table
latex_tabletf = tabulate(resultstf_df, tablefmt='latex', headers='keys')

# Writing the table to a .tex file
with open('reports/tables/descstats_tf.tex', 'w') as file:
    file.write(latex_tabletf)






[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




100%|██████████| 8/8 [01:06<00:00,  8.25s/it]
