# Here we load, transform and save data, into workable formats for the models

In [2]:
import pandas as pd
import numpy as np
import glob
import os

# File paths for marginalpdbc and precious_pibcic files
marginalpdbc_files = glob.glob('../data_23/marginalpdbc*.csv')
precious_files = glob.glob('../data_23/precios_pibci*.csv')

print(f"Found {len(marginalpdbc_files)} marginalpdbc files")
print(f"Found {len(precious_files)} precious files")

# Function to encode time (day, month, hour) as sine and cosine
def encode_time(value, max_value):
    value_sin = np.sin(2 * np.pi * value / max_value)
    value_cos = np.cos(2 * np.pi * value / max_value)
    return value_sin, value_cos

# Process marginalpdbc files
marginal_data = []
for file in marginalpdbc_files:
    print(f"Processing marginalpdbc file: {file}")
    try:
        data = pd.read_csv(file, delimiter=';', header=None, skiprows=1, usecols=range(6), encoding='latin1').iloc[:-1, :]
        data.columns = ['Year', 'Month', 'Day', 'Hour', 'Price1', 'Unused']
        data = data[['Year', 'Month', 'Day', 'Hour', 'Price1']].dropna()

        # Convert columns to numeric
        data['Year'] = pd.to_numeric(data['Year'], errors='coerce')
        data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
        data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
        data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
        data['Price1'] = pd.to_numeric(data['Price1'], errors='coerce')
        data = data.dropna(subset=['Year', 'Month', 'Day', 'Hour', 'Price1'])

        # Encode time features
        data['Hour_Sin'], data['Hour_Cos'] = zip(*data['Hour'].apply(lambda x: encode_time(x, 24)))
        data['Day_Sin'], data['Day_Cos'] = zip(*data['Day'].apply(lambda x: encode_time(x, 31)))
        data['Month_Sin'], data['Month_Cos'] = zip(*data['Month'].apply(lambda x: encode_time(x, 12)))

        # Scale Year
        data['Year_Scaled'] = (data['Year'] - 2018) * 0.1 + 0.1

        marginal_data.append(data)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Combine all marginalpdbc data
if marginal_data:
    marginal_data = pd.concat(marginal_data, ignore_index=True)
    print(f"Processed {len(marginal_data)} rows from marginalpdbc files")
else:
    print("No valid marginalpdbc data processed.")

# Process precious_pibcic files
precious_data = []
for file in precious_files:
    print(f"Processing precious file: {file}")
    try:
        data = pd.read_csv(file, delimiter=';', skiprows=2, encoding='latin1')  # Skip first two metadata rows
        data = data.rename(columns=lambda x: x.strip())  # Normalize column names
        data = data.rename(columns={
            'Año': 'Year',
            'Mes': 'Month',
            'Día': 'Day',
            'Hora': 'Hour',
            'MedioES': 'Price2'
        })[['Year', 'Month', 'Day', 'Hour', 'Price2']].dropna()

        # Convert columns to numeric
        data['Year'] = pd.to_numeric(data['Year'], errors='coerce')
        data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
        data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
        data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
        data['Price2'] = pd.to_numeric(data['Price2'].str.replace(',', '.'), errors='coerce')  # Handle decimal commas
        data = data.dropna(subset=['Year', 'Month', 'Day', 'Hour', 'Price2'])

        precious_data.append(data)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Combine all precious_pibcic data
if precious_data:
    precious_data = pd.concat(precious_data, ignore_index=True)
    print(f"Processed {len(precious_data)} rows from precious files")
else:
    print("No valid precious data processed.")

# Merge marginalpdbc and precious_pibcic data on Year, Month, Day, and Hour
combined_data = pd.merge(
    marginal_data,
    precious_data,
    on=['Year', 'Month', 'Day', 'Hour'],
    how='inner',
    suffixes=('_marginal', '_precious')
)

# Save to a CSV file
output_file = '../TrainingData/trainingdata23.csv'
combined_data.to_csv(output_file, index=False)
print(f"Data has been processed and saved to '{output_file}'")

# Debug: Preview the combined dataset
print(combined_data.head())


Found 365 marginalpdbc files
Found 365 precious files
Processing marginalpdbc file: ../data_23/marginalpdbc_20230421.1.csv
Processing marginalpdbc file: ../data_23/marginalpdbc_20230307.1.csv
Processing marginalpdbc file: ../data_23/marginalpdbc_20230610.1.csv
Processing marginalpdbc file: ../data_23/marginalpdbc_20231224.1.csv
Processing marginalpdbc file: ../data_23/marginalpdbc_20230228.1.csv
Processing marginalpdbc file: ../data_23/marginalpdbc_20230420.1.csv
Processing marginalpdbc file: ../data_23/marginalpdbc_20230111.1.csv
Processing marginalpdbc file: ../data_23/marginalpdbc_20230921.1.csv
Processing marginalpdbc file: ../data_23/marginalpdbc_20230714.1.csv
Processing marginalpdbc file: ../data_23/marginalpdbc_20231201.1.csv
Processing marginalpdbc file: ../data_23/marginalpdbc_20230519.1.csv
Processing marginalpdbc file: ../data_23/marginalpdbc_20230808.1.csv
Processing marginalpdbc file: ../data_23/marginalpdbc_20230611.1.csv
Processing marginalpdbc file: ../data_23/marginal