# Here we load, transform and save data, into workable formats for the models

### Here we load DA, ID and Gas prices and write them to a single file

In [None]:
import pandas as pd
import numpy as np
import glob
import os

#print working directory
print(os.getcwd())

raw_data_dir = '/home/teitur/DTU/electricproject/deeplearning/raw_data'

# Lets print how many files are in the raw_data directory

marginalpdbc_files = glob.glob(os.path.join(raw_data_dir, 'marginalpdbc*.csv'))
precious_files = glob.glob(os.path.join(raw_data_dir, 'precios_pibcic*.csv'))

print(f"Found {len(marginalpdbc_files)} marginalpdbc files")
print(f"Found {len(precious_files)} precious files")

# Lets do the same for the gas prices in the data_gas directory
gas_price_dir = '/home/teitur/DTU/electricproject/deeplearning/DATA/data_gas'
gas_files = glob.glob(os.path.join(gas_price_dir, 'MIBGAS_Data_*.csv'))

print(f"Found {len(gas_files)} gas price files")      

# Function to encode time (day, month, hour) as sine and cosine
def encode_time(value, max_value):
    value_sin = np.sin(2 * np.pi * value / max_value)
    value_cos = np.cos(2 * np.pi * value / max_value)
    return value_sin, value_cos


# Load and preprocess all gas price data
all_gas_data = []
print(f"Found {len(gas_files)} gas price files")

for gas_file in gas_files:
    print(f"Processing file: {gas_file}")
    try:
        gas_data = pd.read_csv(gas_file)
        gas_data.columns = gas_data.columns.str.strip().str.replace('\n', ' ', regex=False)  # Normalize column names
        gas_data = gas_data[['Trading Day', 'Price [EUR/MWh]']].rename(columns={
            'Trading Day': 'Trading_Day',
            'Price [EUR/MWh]': 'Gas_Price'
        })
        gas_data['Trading_Day'] = pd.to_datetime(gas_data['Trading_Day'])
        gas_data['Year'] = gas_data['Trading_Day'].dt.year
        gas_data['Month'] = gas_data['Trading_Day'].dt.month
        gas_data['Day'] = gas_data['Trading_Day'].dt.day
        all_gas_data.append(gas_data[['Year', 'Month', 'Day', 'Gas_Price']])
    except Exception as e:
        print(f"Error processing file {gas_file}: {e}")

# Combine all gas price data
gas_data = pd.concat(all_gas_data, ignore_index=True)
gas_data = gas_data.drop_duplicates(subset=['Year', 'Month', 'Day'])


# Forward-fill missing Gas_Price values
gas_data = gas_data.sort_values(by=['Year', 'Month', 'Day'])
gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='ffill')
gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='bfill')  # Backfill if forward-fill doesn't work

print(f"Combined gas data rows after filling: {len(gas_data)}")
print(gas_data.head())

# Process marginalpdbc files and merge all data
marginal_data = []

for file in marginalpdbc_files:
    print(f"Processing file: {file}")
    data = pd.read_csv(file, delimiter=';', header=None, skiprows=1, usecols=range(6), encoding='latin1').iloc[:-1, :]
    data.columns = ['Year', 'Month', 'Day', 'Hour', 'Price1', 'Price2']
    data = data[['Year', 'Month', 'Day', 'Hour', 'Price1']].dropna()
    data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
    data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
    data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
    data = data.dropna(subset=['Hour', 'Month', 'Day'])
    
    # Encode time features
    data['Hour_Sin'], data['Hour_Cos'] = zip(*data['Hour'].apply(lambda x: encode_time(x, 24)))
    data['Day_Sin'], data['Day_Cos'] = zip(*data['Day'].apply(lambda x: encode_time(x, 31)))
    data['Month_Sin'], data['Month_Cos'] = zip(*data['Month'].apply(lambda x: encode_time(x, 12)))

    # Ensure Year is numeric and scaled
    data['Year'] = data['Year'].astype(int)
    data['Year_Scaled'] = (data['Year'] - 2018) * 0.1 + 0.1

    # Merge with gas price data
    merged_data = pd.merge(data, gas_data, on=['Year', 'Month', 'Day'], how='left')

    marginal_data.append(merged_data)

# Combine all marginal data
marginal_data = pd.concat(marginal_data, ignore_index=True)
marginal_data = marginal_data.drop_duplicates(subset=['Year', 'Month', 'Day', 'Hour'])

# Process precious files and merge with marginal data
precious_data = []

for file in precious_files:
    print(f"Processing file: {file}")
    try:
        data = pd.read_csv(file, delimiter=';', header=None, skiprows=1, usecols=range(6), encoding='latin1').iloc[:-1, :]
        data.columns = ['Year', 'Month', 'Day', 'Hour', 'Price_Precious', 'Other_Column']
        data = data[['Year', 'Month', 'Day', 'Hour', 'Price_Precious']].dropna()
        data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
        data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
        data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
        # Replace commas with dots and convert to numeric
        data['Price_Precious'] = pd.to_numeric(data['Price_Precious'].str.replace(',', '.'), errors='coerce')
        data = data.dropna(subset=['Hour', 'Month', 'Day'])

        # Encode time features
        data['Hour_Sin'], data['Hour_Cos'] = zip(*data['Hour'].apply(lambda x: encode_time(x, 24)))
        data['Day_Sin'], data['Day_Cos'] = zip(*data['Day'].apply(lambda x: encode_time(x, 31)))
        data['Month_Sin'], data['Month_Cos'] = zip(*data['Month'].apply(lambda x: encode_time(x, 12)))

        # Ensure Year is numeric and scaled
        data['Year'] = data['Year'].astype(int)
        data['Year_Scaled'] = (data['Year'] - 2018) * 0.1 + 0.1

        # Merge with gas price data
        merged_data = pd.merge(data, gas_data, on=['Year', 'Month', 'Day'], how='left')

        precious_data.append(merged_data)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Combine all precious data
precious_data = pd.concat(precious_data, ignore_index=True)
precious_data = precious_data.drop_duplicates(subset=['Year', 'Month', 'Day', 'Hour'])

# Merge precious data with marginal data using inner join to ensure consistency
final_data = pd.merge(
    marginal_data,
    precious_data,
    on=[
        'Year', 'Month', 'Day', 'Hour', 'Hour_Sin', 'Hour_Cos',
        'Day_Sin', 'Day_Cos', 'Month_Sin', 'Month_Cos', 'Year_Scaled',
        'Gas_Price'
    ],
    how='inner'  # Use 'inner' join to keep only consistent timestamps
)

# Rename columns for clarity
final_data = final_data.rename(columns={
    'Price1': 'DA',  # Day Ahead prices
    'Price_Precious': 'ID'  # Intra Day prices
})

# Add a new column for the price difference
final_data['Diff'] = final_data['DA'] - final_data['ID']

# Save two versions of the data

# 1. Version with original time columns
columns_with_time = [
    'Year', 'Month', 'Day', 'Hour', 'DA', 'ID', 'Diff',
    'Hour_Sin', 'Hour_Cos', 'Day_Sin', 'Day_Cos', 'Month_Sin', 
    'Month_Cos', 'Year_Scaled', 'Gas_Price'
]
data_with_time = final_data[columns_with_time]
data_with_time = data_with_time.sort_values(by=['Year', 'Month', 'Day', 'Hour'])
output_file_with_time = '/home/teitur/DTU/electricproject/deeplearning/DATAProcessing/Processed/final_data_with_time.csv'
data_with_time.to_csv(output_file_with_time, index=False)
print(f"Data with time columns has been processed and saved to '{output_file_with_time}'")

# 2. Version with only encoded values
columns_encoded_only = [
    'DA', 'ID', 'Diff', 'Hour_Sin', 'Hour_Cos', 
    'Day_Sin', 'Day_Cos', 'Month_Sin', 'Month_Cos', 
    'Year_Scaled', 'Gas_Price'
]
data_encoded_only = final_data[columns_encoded_only]
output_file_encoded_only = '/home/teitur/DTU/electricproject/deeplearning/DATAProcessing/Processed/final_data_encoded_only.csv'
data_encoded_only.to_csv(output_file_encoded_only, index=False)
print(f"Encoded data only has been processed and saved to '{output_file_encoded_only}'")

# Print sample outputs for verification
print("Sample data with time columns:")
print(data_with_time.head())
print("Sample encoded-only data:")
print(data_encoded_only.head())






Found 365 marginalpdbc files
Found 365 precious files
Processed 8760 rows from marginalpdbc files
Processed 8760 rows from precious files
Data has been processed and saved to '../TrainingData/trainingdata23.csv'
   Year  Month   Day  Hour  Price1  Hour_Sin  Hour_Cos   Day_Sin   Day_Cos  \
0  2023    4.0  21.0   1.0  122.50  0.258819  0.965926 -0.897805 -0.440394   
1  2023    4.0  21.0   2.0  112.39  0.500000  0.866025 -0.897805 -0.440394   
2  2023    4.0  21.0   3.0  117.60  0.707107  0.707107 -0.897805 -0.440394   
3  2023    4.0  21.0   4.0  119.84  0.866025  0.500000 -0.897805 -0.440394   
4  2023    4.0  21.0   5.0  115.01  0.965926  0.258819 -0.897805 -0.440394   

   Month_Sin  Month_Cos  Year_Scaled  Price2  
0   0.866025       -0.5          0.6  118.92  
1   0.866025       -0.5          0.6  106.69  
2   0.866025       -0.5          0.6  113.43  
3   0.866025       -0.5          0.6  113.48  
4   0.866025       -0.5          0.6  108.63  


## Here we create the training data, from both the 2023 and 2024 data.

In [2]:
import pandas as pd
import numpy as np
import glob
import os

# File paths for marginalpdbc and precious_pibcic files
marginalpdbc_files = glob.glob('../data_24/marginalpdbc*.*')  # Matches both .csv and .1 files
precious_files = glob.glob('../data_24/precios_pibcic*.*')    # Matches both .csv and .1 files
# Append data_23 file paths
marginalpdbc_files += glob.glob('../data_23/marginalpdbc*.*')  # Add 2023 marginal files
precious_files += glob.glob('../data_23/precios_pibcic*.*')    # Add 2023 precious files


print(f"Found {len(marginalpdbc_files)} marginalpdbc files")
print(f"Found {len(precious_files)} precious files")

# Function to encode time (day, month, hour) as sine and cosine
def encode_time(value, max_value):
    value_sin = np.sin(2 * np.pi * value / max_value)
    value_cos = np.cos(2 * np.pi * value / max_value)
    return value_sin, value_cos

# Process marginalpdbc files
marginal_data = []
for file in marginalpdbc_files:
    try:
        # Handle .1 files (same as .csv logic)
        data = pd.read_csv(file, delimiter=';', header=None, skiprows=1, usecols=range(6), encoding='latin1').iloc[:-1, :]
        data.columns = ['Year', 'Month', 'Day', 'Hour', 'Price1', 'Unused']
        data = data[['Year', 'Month', 'Day', 'Hour', 'Price1']].dropna()

        # Convert columns to numeric
        data['Year'] = pd.to_numeric(data['Year'], errors='coerce')
        data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
        data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
        data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
        data['Price1'] = pd.to_numeric(data['Price1'], errors='coerce')
        data = data.dropna(subset=['Year', 'Month', 'Day', 'Hour', 'Price1'])

        # Encode time features
        data['Hour_Sin'], data['Hour_Cos'] = zip(*data['Hour'].apply(lambda x: encode_time(x, 24)))
        data['Day_Sin'], data['Day_Cos'] = zip(*data['Day'].apply(lambda x: encode_time(x, 31)))
        data['Month_Sin'], data['Month_Cos'] = zip(*data['Month'].apply(lambda x: encode_time(x, 12)))

        # Scale Year
        data['Year_Scaled'] = (data['Year'] - 2018) * 0.1 + 0.1

        marginal_data.append(data)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Combine all marginalpdbc data
if marginal_data:
    marginal_data = pd.concat(marginal_data, ignore_index=True)
    print(f"Processed {len(marginal_data)} rows from marginalpdbc files")
else:
    print("No valid marginalpdbc data processed.")

# Process precious_pibcic files
precious_data = []
for file in precious_files:
    try:
        # Handle .1 files (same as .csv logic)
        data = pd.read_csv(file, delimiter=';', skiprows=2, encoding='latin1')  # Skip first two metadata rows
        data = data.rename(columns=lambda x: x.strip())  # Normalize column names
        data = data.rename(columns={
            'Año': 'Year',
            'Mes': 'Month',
            'Día': 'Day',
            'Hora': 'Hour',
            'MedioES': 'Price2'
        })[['Year', 'Month', 'Day', 'Hour', 'Price2']].dropna()

        # Convert columns to numeric
        data['Year'] = pd.to_numeric(data['Year'], errors='coerce')
        data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
        data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
        data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
        data['Price2'] = pd.to_numeric(data['Price2'].str.replace(',', '.'), errors='coerce')  # Handle decimal commas
        data = data.dropna(subset=['Year', 'Month', 'Day', 'Hour', 'Price2'])

        precious_data.append(data)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Combine all precious_pibcic data
if precious_data:
    precious_data = pd.concat(precious_data, ignore_index=True)
    print(f"Processed {len(precious_data)} rows from precious files")
else:
    print("No valid precious data processed.")

# Merge marginalpdbc and precious_pibcic data on Year, Month, Day, and Hour
combined_data = pd.merge(
    marginal_data,
    precious_data,
    on=['Year', 'Month', 'Day', 'Hour'],
    how='inner',
    suffixes=('_marginal', '_precious')
)

# Save to a CSV file
output_file = '../TrainingData/trainingdata_23_24.csv'
combined_data.to_csv(output_file, index=False)
print(f"Data has been processed and saved to '{output_file}'")

# Debug: Preview the combined dataset
print(combined_data.head())


Found 700 marginalpdbc files
Found 700 precious files
Processed 16800 rows from marginalpdbc files
Processed 16800 rows from precious files
Data has been processed and saved to '../TrainingData/trainingdata_23_24.csv'
   Year  Month   Day  Hour  Price1  Hour_Sin  Hour_Cos   Day_Sin   Day_Cos  \
0  2024    3.0  24.0   1.0    0.16  0.258819  0.965926 -0.988468  0.151428   
1  2024    3.0  24.0   2.0    0.00  0.500000  0.866025 -0.988468  0.151428   
2  2024    3.0  24.0   3.0    0.00  0.707107  0.707107 -0.988468  0.151428   
3  2024    3.0  24.0   4.0    0.00  0.866025  0.500000 -0.988468  0.151428   
4  2024    3.0  24.0   5.0    0.00  0.965926  0.258819 -0.988468  0.151428   

   Month_Sin     Month_Cos  Year_Scaled  Price2  
0        1.0  6.123234e-17          0.7    0.22  
1        1.0  6.123234e-17          0.7    2.25  
2        1.0  6.123234e-17          0.7   -1.39  
3        1.0  6.123234e-17          0.7    0.90  
4        1.0  6.123234e-17          0.7    1.79  


In [5]:
import pandas as pd
import numpy as np
import glob
import os

# File paths for marginalpdbc and precious_pibcic files
marginalpdbc_files = glob.glob('../data_24/marginalpdbc*.1.csv') + glob.glob('../data_24/marginalpdbc*.1')  # Matches .1.csv and .1 files
precious_files = glob.glob('../data_24/precios_pibcic*.1.csv') + glob.glob('../data_24/precios_pibcic*.1')  # Matches .1.csv and .1 files

# Append data_23 file paths
marginalpdbc_files += glob.glob('../data_23/marginalpdbc*.1.csv') + glob.glob('../data_23/marginalpdbc*.1')  # Matches .1.csv and .1 files
precious_files += glob.glob('../data_23/precios_pibcic*.1.csv') + glob.glob('../data_23/precios_pibcic*.1')  # Matches .1.csv and .1 files

# Add additional file paths from data_more
marginalpdbc_files += glob.glob('../data_more/marginalpdbc_202*.1.csv') + glob.glob('../data_more/marginalpdbc_202*.1')  # Matches .1.csv and .1 files
precious_files += glob.glob('../data_more/precios_pibcic_202*.1.csv') + glob.glob('../data_more/precios_pibcic_202*.1')  # Matches .1.csv and .1 files

print(f"Found {len(marginalpdbc_files)} marginalpdbc files")
print(f"Found {len(precious_files)} precious files")



# Function to encode time (day, month, hour) as sine and cosine
def encode_time(value, max_value):
    value_sin = np.sin(2 * np.pi * value / max_value)
    value_cos = np.cos(2 * np.pi * value / max_value)
    return value_sin, value_cos

# Process marginalpdbc files
marginal_data = []
for file in marginalpdbc_files:
    try:
        # Handle .1 files (same as .csv logic)
        data = pd.read_csv(file, delimiter=';', header=None, skiprows=1, usecols=range(6), encoding='latin1').iloc[:-1, :]
        data.columns = ['Year', 'Month', 'Day', 'Hour', 'Price1', 'Unused']
        data = data[['Year', 'Month', 'Day', 'Hour', 'Price1']].dropna()

        # Convert columns to numeric
        data['Year'] = pd.to_numeric(data['Year'], errors='coerce')
        data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
        data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
        data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
        data['Price1'] = pd.to_numeric(data['Price1'], errors='coerce')
        data = data.dropna(subset=['Year', 'Month', 'Day', 'Hour', 'Price1'])

        # Encode time features
        data['Hour_Sin'], data['Hour_Cos'] = zip(*data['Hour'].apply(lambda x: encode_time(x, 24)))
        data['Day_Sin'], data['Day_Cos'] = zip(*data['Day'].apply(lambda x: encode_time(x, 31)))
        data['Month_Sin'], data['Month_Cos'] = zip(*data['Month'].apply(lambda x: encode_time(x, 12)))

        # Scale Year
        data['Year_Scaled'] = (data['Year'] - 2018) * 0.1 + 0.1

        marginal_data.append(data)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Combine all marginalpdbc data
if marginal_data:
    marginal_data = pd.concat(marginal_data, ignore_index=True)
    print(f"Processed {len(marginal_data)} rows from marginalpdbc files")
else:
    print("No valid marginalpdbc data processed.")

# Process precious_pibcic files
precious_data = []
for file in precious_files:
    try:
        # Handle .1 files (same as .csv logic)
        data = pd.read_csv(file, delimiter=';', skiprows=2, encoding='latin1')  # Skip first two metadata rows
        data = data.rename(columns=lambda x: x.strip())  # Normalize column names
        data = data.rename(columns={
            'Año': 'Year',
            'Mes': 'Month',
            'Día': 'Day',
            'Hora': 'Hour',
            'MedioES': 'Price2'
        })[['Year', 'Month', 'Day', 'Hour', 'Price2']].dropna()

        # Convert columns to numeric
        data['Year'] = pd.to_numeric(data['Year'], errors='coerce')
        data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
        data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
        data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
        data['Price2'] = pd.to_numeric(data['Price2'].str.replace(',', '.'), errors='coerce')  # Handle decimal commas
        data = data.dropna(subset=['Year', 'Month', 'Day', 'Hour', 'Price2'])

        precious_data.append(data)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Combine all precious_pibcic data
if precious_data:
    precious_data = pd.concat(precious_data, ignore_index=True)
    print(f"Processed {len(precious_data)} rows from precious files")
else:
    print("No valid precious data processed.")

# Merge marginalpdbc and precious_pibcic data on Year, Month, Day, and Hour
combined_data = pd.merge(
    marginal_data,
    precious_data,
    on=['Year', 'Month', 'Day', 'Hour'],
    how='inner',
    suffixes=('_marginal', '_precious')
)

# Save to a CSV file
output_file = '../TrainingData/trainingdata_20_24.csv'
combined_data.to_csv(output_file, index=False)
print(f"Data has been processed and saved to '{output_file}'")

# Debug: Preview the combined dataset
print(combined_data.head())


Found 1795 marginalpdbc files
Found 1796 precious files
Processed 43080 rows from marginalpdbc files
Processed 43104 rows from precious files
Data has been processed and saved to '../TrainingData/trainingdata_20_24.csv'
   Year  Month   Day  Hour  Price1  Hour_Sin  Hour_Cos   Day_Sin   Day_Cos  \
0  2024    3.0  24.0   1.0    0.16  0.258819  0.965926 -0.988468  0.151428   
1  2024    3.0  24.0   2.0    0.00  0.500000  0.866025 -0.988468  0.151428   
2  2024    3.0  24.0   3.0    0.00  0.707107  0.707107 -0.988468  0.151428   
3  2024    3.0  24.0   4.0    0.00  0.866025  0.500000 -0.988468  0.151428   
4  2024    3.0  24.0   5.0    0.00  0.965926  0.258819 -0.988468  0.151428   

   Month_Sin     Month_Cos  Year_Scaled  Price2  
0        1.0  6.123234e-17          0.7    0.22  
1        1.0  6.123234e-17          0.7    2.25  
2        1.0  6.123234e-17          0.7   -1.39  
3        1.0  6.123234e-17          0.7    0.90  
4        1.0  6.123234e-17          0.7    1.79  


In [12]:
import pandas as pd
import numpy as np
import glob
import os


#print working directory
print(os.getcwd())

raw_data_dir = '/home/teitur/DTU/electricproject/deeplearning/raw_data'

# Lets print how many files are in the raw_data directory

marginalpdbc_files = glob.glob(os.path.join(raw_data_dir, 'marginalpdbc*.csv'))
precious_files = glob.glob(os.path.join(raw_data_dir, 'precios_pibcic*.csv'))

print(f"Found {len(marginalpdbc_files)} marginalpdbc files")
print(f"Found {len(precious_files)} precious files")

# Lets do the same for the gas prices in the data_gas directory
gas_price_dir = '/home/teitur/DTU/electricproject/deeplearning/DATA/data_gas'
gas_files = glob.glob(os.path.join(gas_price_dir, 'MIBGAS_Data_*.csv'))

print(f"Found {len(gas_files)} gas price files")      

# Function to encode time (day, month, hour) as sine and cosine
def encode_time(value, max_value):
    value_sin = np.sin(2 * np.pi * value / max_value)
    value_cos = np.cos(2 * np.pi * value / max_value)
    return value_sin, value_cos


# Load and preprocess all gas price data
all_gas_data = []
print(f"Found {len(gas_files)} gas price files")

for gas_file in gas_files:
    print(f"Processing file: {gas_file}")
    try:
        gas_data = pd.read_csv(gas_file)
        gas_data.columns = gas_data.columns.str.strip().str.replace('\n', ' ', regex=False)  # Normalize column names
        gas_data = gas_data[['Trading Day', 'Price [EUR/MWh]']].rename(columns={
            'Trading Day': 'Trading_Day',
            'Price [EUR/MWh]': 'Gas_Price'
        })
        gas_data['Trading_Day'] = pd.to_datetime(gas_data['Trading_Day'])
        gas_data['Year'] = gas_data['Trading_Day'].dt.year
        gas_data['Month'] = gas_data['Trading_Day'].dt.month
        gas_data['Day'] = gas_data['Trading_Day'].dt.day
        all_gas_data.append(gas_data[['Year', 'Month', 'Day', 'Gas_Price']])
    except Exception as e:
        print(f"Error processing file {gas_file}: {e}")

# Combine all gas price data
gas_data = pd.concat(all_gas_data, ignore_index=True)

# Forward-fill missing Gas_Price values
gas_data = gas_data.sort_values(by=['Year', 'Month', 'Day'])
gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='ffill')
gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='bfill')  # Backfill if forward-fill doesn't work

print(f"Combined gas data rows after filling: {len(gas_data)}")
print(gas_data.head())

# Process marginalpdbc files and merge all data
marginal_data = []

for file in marginalpdbc_files:
    print(f"Processing file: {file}")
    data = pd.read_csv(file, delimiter=';', header=None, skiprows=1, usecols=range(6), encoding='latin1').iloc[:-1, :]
    data.columns = ['Year', 'Month', 'Day', 'Hour', 'Price1', 'Price2']
    data = data[['Year', 'Month', 'Day', 'Hour', 'Price1']].dropna()
    data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
    data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
    data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
    data = data.dropna(subset=['Hour', 'Month', 'Day'])
    
    # Encode time features
    data['Hour_Sin'], data['Hour_Cos'] = zip(*data['Hour'].apply(lambda x: encode_time(x, 24)))
    data['Day_Sin'], data['Day_Cos'] = zip(*data['Day'].apply(lambda x: encode_time(x, 31)))
    data['Month_Sin'], data['Month_Cos'] = zip(*data['Month'].apply(lambda x: encode_time(x, 12)))

    # Ensure Year is numeric and scaled
    data['Year'] = data['Year'].astype(int)
    data['Year_Scaled'] = (data['Year'] - 2018) * 0.1 + 0.1

    # Merge with gas price data
    merged_data = pd.merge(data, gas_data, on=['Year', 'Month', 'Day'], how='left')

    marginal_data.append(merged_data)

# Combine all marginal data
marginal_data = pd.concat(marginal_data, ignore_index=True)

# Select desired columns and save to a CSV file
columns_to_keep = [
    'Price1', 'Hour_Sin', 'Hour_Cos', 'Day_Sin', 'Day_Cos', 
    'Month_Sin', 'Month_Cos', 'Year_Scaled', 'Gas_Price'
]
marginal_data = marginal_data[columns_to_keep]

output_file = '/home/teitur/DTU/electricproject/deeplearning/DATAProcessing/Processed/data__gas.csv'
marginal_data.to_csv(output_file, index=False)
print(f"Data has been processed and saved to '{output_file}'")

print(marginal_data.head())



/home/teitur/DTU/electricproject/deeplearning/DATAProcessing
Found 365 marginalpdbc files
Found 365 precious files
Found 6 gas price files
Found 6 gas price files
Processing file: /home/teitur/DTU/electricproject/deeplearning/DATA/data_gas/MIBGAS_Data_2023.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/DATA/data_gas/MIBGAS_Data_2021.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/DATA/data_gas/MIBGAS_Data_2018.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/DATA/data_gas/MIBGAS_Data_2020.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/DATA/data_gas/MIBGAS_Data_2019.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/DATA/data_gas/MIBGAS_Data_2022.csv
Combined gas data rows after filling: 2515
      Year  Month  Day  Gas_Price
1421  2018      1    1      24.50
1420  2018      1    2      23.95
1419  2018      1    3      21.90
1418  2018      1    4      19.63
1417  2018      1    5      19.85
Pr

  gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='ffill')
  gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='bfill')  # Backfill if forward-fill doesn't work


Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230410.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20231109.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230804.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230825.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230502.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20231024.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20231025.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230514.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230301.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230506.1.csv


In [32]:
import pandas as pd
import numpy as np
import glob
import os


#print working directory
print(os.getcwd())

raw_data_dir = '/home/teitur/DTU/electricproject/deeplearning/raw_data'

# Lets print how many files are in the raw_data directory

marginalpdbc_files = glob.glob(os.path.join(raw_data_dir, 'marginalpdbc*.csv'))
precious_files = glob.glob(os.path.join(raw_data_dir, 'precios_pibcic*.csv'))

print(f"Found {len(marginalpdbc_files)} marginalpdbc files")
print(f"Found {len(precious_files)} precious files")

# Lets do the same for the gas prices in the data_gas directory
gas_price_dir = '/home/teitur/DTU/electricproject/deeplearning/DATA/data_gas'
gas_files = glob.glob(os.path.join(gas_price_dir, 'MIBGAS_Data_*.csv'))

print(f"Found {len(gas_files)} gas price files")      

# Function to encode time (day, month, hour) as sine and cosine
def encode_time(value, max_value):
    value_sin = np.sin(2 * np.pi * value / max_value)
    value_cos = np.cos(2 * np.pi * value / max_value)
    return value_sin, value_cos


# Load and preprocess all gas price data
all_gas_data = []
print(f"Found {len(gas_files)} gas price files")

for gas_file in gas_files:
    print(f"Processing file: {gas_file}")
    try:
        gas_data = pd.read_csv(gas_file)
        gas_data.columns = gas_data.columns.str.strip().str.replace('\n', ' ', regex=False)  # Normalize column names
        gas_data = gas_data[['Trading Day', 'Price [EUR/MWh]']].rename(columns={
            'Trading Day': 'Trading_Day',
            'Price [EUR/MWh]': 'Gas_Price'
        })
        gas_data['Trading_Day'] = pd.to_datetime(gas_data['Trading_Day'])
        gas_data['Year'] = gas_data['Trading_Day'].dt.year
        gas_data['Month'] = gas_data['Trading_Day'].dt.month
        gas_data['Day'] = gas_data['Trading_Day'].dt.day
        all_gas_data.append(gas_data[['Year', 'Month', 'Day', 'Gas_Price']])
    except Exception as e:
        print(f"Error processing file {gas_file}: {e}")

# Combine all gas price data
gas_data = pd.concat(all_gas_data, ignore_index=True)
gas_data = gas_data.drop_duplicates(subset=['Year', 'Month', 'Day'])


# Forward-fill missing Gas_Price values
gas_data = gas_data.sort_values(by=['Year', 'Month', 'Day'])
gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='ffill')
gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='bfill')  # Backfill if forward-fill doesn't work

print(f"Combined gas data rows after filling: {len(gas_data)}")
print(gas_data.head())

# Process marginalpdbc files and merge all data
marginal_data = []

for file in marginalpdbc_files:
    print(f"Processing file: {file}")
    data = pd.read_csv(file, delimiter=';', header=None, skiprows=1, usecols=range(6), encoding='latin1').iloc[:-1, :]
    data.columns = ['Year', 'Month', 'Day', 'Hour', 'Price1', 'Price2']
    data = data[['Year', 'Month', 'Day', 'Hour', 'Price1']].dropna()
    data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
    data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
    data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
    data = data.dropna(subset=['Hour', 'Month', 'Day'])
    
    # Encode time features
    data['Hour_Sin'], data['Hour_Cos'] = zip(*data['Hour'].apply(lambda x: encode_time(x, 24)))
    data['Day_Sin'], data['Day_Cos'] = zip(*data['Day'].apply(lambda x: encode_time(x, 31)))
    data['Month_Sin'], data['Month_Cos'] = zip(*data['Month'].apply(lambda x: encode_time(x, 12)))

    # Ensure Year is numeric and scaled
    data['Year'] = data['Year'].astype(int)
    data['Year_Scaled'] = (data['Year'] - 2018) * 0.1 + 0.1

    # Merge with gas price data
    merged_data = pd.merge(data, gas_data, on=['Year', 'Month', 'Day'], how='left')

    marginal_data.append(merged_data)

# Combine all marginal data
marginal_data = pd.concat(marginal_data, ignore_index=True)
marginal_data = marginal_data.drop_duplicates(subset=['Year', 'Month', 'Day', 'Hour'])

# Process precious files and merge with marginal data
precious_data = []

for file in precious_files:
    print(f"Processing file: {file}")
    try:
        data = pd.read_csv(file, delimiter=';', header=None, skiprows=1, usecols=range(6), encoding='latin1').iloc[:-1, :]
        data.columns = ['Year', 'Month', 'Day', 'Hour', 'Price_Precious', 'Other_Column']
        data = data[['Year', 'Month', 'Day', 'Hour', 'Price_Precious']].dropna()
        data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
        data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
        data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
        # Replace commas with dots and convert to numeric
        data['Price_Precious'] = pd.to_numeric(data['Price_Precious'].str.replace(',', '.'), errors='coerce')
        data = data.dropna(subset=['Hour', 'Month', 'Day'])

        # Encode time features
        data['Hour_Sin'], data['Hour_Cos'] = zip(*data['Hour'].apply(lambda x: encode_time(x, 24)))
        data['Day_Sin'], data['Day_Cos'] = zip(*data['Day'].apply(lambda x: encode_time(x, 31)))
        data['Month_Sin'], data['Month_Cos'] = zip(*data['Month'].apply(lambda x: encode_time(x, 12)))

        # Ensure Year is numeric and scaled
        data['Year'] = data['Year'].astype(int)
        data['Year_Scaled'] = (data['Year'] - 2018) * 0.1 + 0.1

        # Merge with gas price data
        merged_data = pd.merge(data, gas_data, on=['Year', 'Month', 'Day'], how='left')

        precious_data.append(merged_data)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Combine all precious data
precious_data = pd.concat(precious_data, ignore_index=True)
precious_data = precious_data.drop_duplicates(subset=['Year', 'Month', 'Day', 'Hour'])

# Merge precious data with marginal data using inner join to ensure consistency
final_data = pd.merge(
    marginal_data,
    precious_data,
    on=[
        'Year', 'Month', 'Day', 'Hour', 'Hour_Sin', 'Hour_Cos',
        'Day_Sin', 'Day_Cos', 'Month_Sin', 'Month_Cos', 'Year_Scaled',
        'Gas_Price'
    ],
    how='inner'  # Use 'inner' join to keep only consistent timestamps
)

# Rename columns for clarity
final_data = final_data.rename(columns={
    'Price1': 'DA',  # Day Ahead prices
    'Price_Precious': 'ID'  # Intra Day prices
})

# Add a new column for the price difference
final_data['Diff'] = final_data['DA'] - final_data['ID']

# Save two versions of the data

# 1. Version with original time columns
columns_with_time = [
    'Year', 'Month', 'Day', 'Hour', 'DA', 'ID', 'Diff',
    'Hour_Sin', 'Hour_Cos', 'Day_Sin', 'Day_Cos', 'Month_Sin', 
    'Month_Cos', 'Year_Scaled', 'Gas_Price'
]
data_with_time = final_data[columns_with_time]
data_with_time = data_with_time.sort_values(by=['Year', 'Month', 'Day', 'Hour'])
output_file_with_time = '/home/teitur/DTU/electricproject/deeplearning/DATAProcessing/Processed/final_data_with_time.csv'
data_with_time.to_csv(output_file_with_time, index=False)
print(f"Data with time columns has been processed and saved to '{output_file_with_time}'")

# 2. Version with only encoded values
columns_encoded_only = [
    'DA', 'ID', 'Diff', 'Hour_Sin', 'Hour_Cos', 
    'Day_Sin', 'Day_Cos', 'Month_Sin', 'Month_Cos', 
    'Year_Scaled', 'Gas_Price'
]
data_encoded_only = final_data[columns_encoded_only]
output_file_encoded_only = '/home/teitur/DTU/electricproject/deeplearning/DATAProcessing/Processed/final_data_encoded_only.csv'
data_encoded_only.to_csv(output_file_encoded_only, index=False)
print(f"Encoded data only has been processed and saved to '{output_file_encoded_only}'")

# Print sample outputs for verification
print("Sample data with time columns:")
print(data_with_time.head())
print("Sample encoded-only data:")
print(data_encoded_only.head())






/home/teitur/DTU/electricproject/deeplearning/DATAProcessing
Found 365 marginalpdbc files
Found 365 precious files
Found 6 gas price files
Found 6 gas price files
Processing file: /home/teitur/DTU/electricproject/deeplearning/DATA/data_gas/MIBGAS_Data_2023.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/DATA/data_gas/MIBGAS_Data_2021.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/DATA/data_gas/MIBGAS_Data_2018.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/DATA/data_gas/MIBGAS_Data_2020.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/DATA/data_gas/MIBGAS_Data_2019.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/DATA/data_gas/MIBGAS_Data_2022.csv
Combined gas data rows after filling: 2089
      Year  Month  Day  Gas_Price
1421  2018      1    1      24.50
1420  2018      1    2      23.95
1419  2018      1    3      21.90
1418  2018      1    4      19.63
1417  2018      1    5      19.85
Pr

  gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='ffill')
  gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='bfill')  # Backfill if forward-fill doesn't work


Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230502.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20231024.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20231025.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230514.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230301.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230506.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230615.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230807.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230119.1.csv
Processing file: /home/teitur/DTU/electricproject/deeplearning/raw_data/marginalpdbc_20230531.1.csv


In [None]:
# Load the final data
final_data = pd.read_csv('/home/teitur/DTU/electricproject/deeplearning/DATAProcessing/Processed/final_data_with_time.csv')

print(final_data.head())

# Create a unique identifier for each timestamp using Year, Month, Day, and Hour
final_data['Timestamp'] = (
    final_data['Year'].astype(str) +
    final_data['Month'].astype(str).str.zfill(2) +
    final_data['Day'].astype(str).str.zfill(2) +
    final_data['Hour'].astype(str).str.zfill(2)
)

# Identify duplicated timestamps
duplicate_timestamps = final_data[final_data.duplicated(subset=['Timestamp'], keep=False)]

if not duplicate_timestamps.empty:
    print("Duplicate timestamps found:")
    print(duplicate_timestamps[['Year', 'Month', 'Day', 'Hour']])
else:
    print("No duplicate timestamps found")

   Year  Month  Day  Hour   DA    ID  Diff  Hour_Sin  Hour_Cos   Day_Sin  \
0  2023    1.0  1.0   1.0  0.0  0.14 -0.14  0.258819  0.965926  0.201299   
1  2023    1.0  1.0   2.0  0.0  0.13 -0.13  0.500000  0.866025  0.201299   
2  2023    1.0  1.0   3.0  0.0  0.12 -0.12  0.707107  0.707107  0.201299   
3  2023    1.0  1.0   4.0  0.0  0.00  0.00  0.866025  0.500000  0.201299   
4  2023    1.0  1.0   5.0  0.0  0.00  0.00  0.965926  0.258819  0.201299   

   Day_Cos  Month_Sin  Month_Cos  Year_Scaled  Gas_Price  
0  0.97953        0.5   0.866025          0.6        NaN  
1  0.97953        0.5   0.866025          0.6        NaN  
2  0.97953        0.5   0.866025          0.6        NaN  
3  0.97953        0.5   0.866025          0.6        NaN  
4  0.97953        0.5   0.866025          0.6        NaN  
No duplicate timestamps found
