In [1]:

import pandas as pd
import numpy as np
import glob
import os

#print working directory
print(os.getcwd())

data_dir = '../raw_data/'
output_dir = '../formatted_data/'

# Lets print how many files are in the raw_data directory

marginalpdbc_files = glob.glob(os.path.join(data_dir, 'data_electric/data_da*.csv'))
precious_files = glob.glob(os.path.join(data_dir, 'data_electric/data_id*.csv'))
gas_files = glob.glob(os.path.join(data_dir, 'data_gas/MIBGAS_Data_*.csv'))

print(f"Found {len(marginalpdbc_files)} marginalpdbc files")
print(f"Found {len(precious_files)} precious files")
print(f"Found {len(gas_files)} gas price files")      

# Function to encode time (day, month, hour) as sine and cosine
def encode_time(value, max_value):
    value_sin = np.sin(2 * np.pi * value / max_value)
    value_cos = np.cos(2 * np.pi * value / max_value)
    return value_sin, value_cos


# Load and preprocess all gas price data
all_gas_data = []
print(f"Found {len(gas_files)} gas price files")

for gas_file in gas_files:
    print(f"Processing file: {gas_file}")
    try:
        gas_data = pd.read_csv(gas_file)
        gas_data.columns = gas_data.columns.str.strip().str.replace('\n', ' ', regex=False)  # Normalize column names
        gas_data = gas_data[['Trading Day', 'Price [EUR/MWh]']].rename(columns={
            'Trading Day': 'Trading_Day',
            'Price [EUR/MWh]': 'Gas_Price'
        })
        gas_data['Trading_Day'] = pd.to_datetime(gas_data['Trading_Day'])
        gas_data['Year'] = gas_data['Trading_Day'].dt.year
        gas_data['Month'] = gas_data['Trading_Day'].dt.month
        gas_data['Day'] = gas_data['Trading_Day'].dt.day
        all_gas_data.append(gas_data[['Year', 'Month', 'Day', 'Gas_Price']])
    except Exception as e:
        print(f"Error processing file {gas_file}: {e}")

# Combine all gas price data
gas_data = pd.concat(all_gas_data, ignore_index=True)
gas_data = gas_data.drop_duplicates(subset=['Year', 'Month', 'Day'])


# Forward-fill missing Gas_Price values
gas_data = gas_data.sort_values(by=['Year', 'Month', 'Day'])
gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='ffill')
gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='bfill')  # Backfill if forward-fill doesn't work

print(f"Combined gas data rows after filling: {len(gas_data)}")
print(gas_data.head())

# Process marginalpdbc files and merge all data
marginal_data = []

# Example of marginalpdbc data
# Year,Month,Day,Hour,DA ES
# 2023.0,1.0,1.0,1.0,0.0

for file in marginalpdbc_files:
    print(f"Processing file: {file}")
    data = pd.read_csv(file, delimiter=';', header=None, skiprows=1, usecols=range(6), encoding='latin1').iloc[:-1, :]
    data.columns = ['Year', 'Month', 'Day', 'Hour', 'Price1', 'Price2']
    data = data[['Year', 'Month', 'Day', 'Hour', 'Price1']].dropna()
    data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
    data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
    data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
    data = data.dropna(subset=['Hour', 'Month', 'Day'])
    
    # Encode time features
    data['Hour_Sin'], data['Hour_Cos'] = zip(*data['Hour'].apply(lambda x: encode_time(x, 24)))
    data['Day_Sin'], data['Day_Cos'] = zip(*data['Day'].apply(lambda x: encode_time(x, 31)))
    data['Month_Sin'], data['Month_Cos'] = zip(*data['Month'].apply(lambda x: encode_time(x, 12)))

    # Ensure Year is numeric and scaled
    data['Year'] = data['Year'].astype(int)
    data['Year_Scaled'] = (data['Year'] - 2018) * 0.1 + 0.1

    # Merge with gas price data
    merged_data = pd.merge(data, gas_data, on=['Year', 'Month', 'Day'], how='left')

    marginal_data.append(merged_data)

# Combine all marginal data
marginal_data = pd.concat(marginal_data, ignore_index=True)
marginal_data = marginal_data.drop_duplicates(subset=['Year', 'Month', 'Day', 'Hour'])

# Process precious files and merge with marginal data
precious_data = []

# Example of precious data
# Year,Month,Day,Hour,MaxES,MinES,AvgES
# 2023.0,1.0,1.0,1.0,0.14,-4.0,-0.72

for file in precious_files:
    print(f"Processing file: {file}")
    try:
        data = pd.read_csv(file, delimiter=';', header=None, skiprows=1, usecols=range(6), encoding='latin1').iloc[:-1, :]
        data.columns = ['Year', 'Month', 'Day', 'Hour', 'Price_Precious', 'Other_Column']
        data = data[['Year', 'Month', 'Day', 'Hour', 'Price_Precious']].dropna()
        data['Hour'] = pd.to_numeric(data['Hour'], errors='coerce')
        data['Month'] = pd.to_numeric(data['Month'], errors='coerce')
        data['Day'] = pd.to_numeric(data['Day'], errors='coerce')
        # Replace commas with dots and convert to numeric
        data['Price_Precious'] = pd.to_numeric(data['Price_Precious'].str.replace(',', '.'), errors='coerce')
        data = data.dropna(subset=['Hour', 'Month', 'Day'])

        # Encode time features
        data['Hour_Sin'], data['Hour_Cos'] = zip(*data['Hour'].apply(lambda x: encode_time(x, 24)))
        data['Day_Sin'], data['Day_Cos'] = zip(*data['Day'].apply(lambda x: encode_time(x, 31)))
        data['Month_Sin'], data['Month_Cos'] = zip(*data['Month'].apply(lambda x: encode_time(x, 12)))

        # Ensure Year is numeric and scaled
        data['Year'] = data['Year'].astype(int)
        data['Year_Scaled'] = (data['Year'] - 2018) * 0.1 + 0.1

        # Merge with gas price data
        merged_data = pd.merge(data, gas_data, on=['Year', 'Month', 'Day'], how='left')

        precious_data.append(merged_data)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Combine all precious data
precious_data = pd.concat(precious_data, ignore_index=True)
precious_data = precious_data.drop_duplicates(subset=['Year', 'Month', 'Day', 'Hour'])

# Merge precious data with marginal data using inner join to ensure consistency
final_data = pd.merge(
    marginal_data,
    precious_data,
    on=[
        'Year', 'Month', 'Day', 'Hour', 'Hour_Sin', 'Hour_Cos',
        'Day_Sin', 'Day_Cos', 'Month_Sin', 'Month_Cos', 'Year_Scaled',
        'Gas_Price'
    ],
    how='inner'  # Use 'inner' join to keep only consistent timestamps
)

# Rename columns for clarity
final_data = final_data.rename(columns={
    'Price1': 'DA',  # Day Ahead prices
    'Price_Precious': 'ID'  # Intra Day prices
})

# Add a new column for the price difference
final_data['Diff'] = final_data['DA'] - final_data['ID']

# Save two versions of the data

# 1. Version with original time columns
columns_with_time = [
    'Year', 'Month', 'Day', 'Hour', 'DA', 'ID', 'Diff',
    'Hour_Sin', 'Hour_Cos', 'Day_Sin', 'Day_Cos', 'Month_Sin', 
    'Month_Cos', 'Year_Scaled', 'Gas_Price'
]
data_with_time = final_data[columns_with_time]
data_with_time = data_with_time.sort_values(by=['Year', 'Month', 'Day', 'Hour'])
# We use the output directory we defined earlier
output_file_with_time = output_dir + 'final_data_with_time.csv'
data_with_time.to_csv(output_file_with_time, index=False)
print(f"Data with time columns has been processed and saved to '{output_file_with_time}'")

# 2. Version with only encoded values
columns_encoded_only = [
    'DA', 'ID', 'Diff', 'Hour_Sin', 'Hour_Cos', 
    'Day_Sin', 'Day_Cos', 'Month_Sin', 'Month_Cos', 
    'Year_Scaled', 'Gas_Price'
]
data_encoded_only = final_data[columns_encoded_only]
output_file_encoded_only = output_dir + 'final_data_encoded_only.csv'
data_encoded_only.to_csv(output_file_encoded_only, index=False)
print(f"Encoded data only has been processed and saved to '{output_file_encoded_only}'")

# Print sample outputs for verification
print("Sample data with time columns:")
print(data_with_time.head())
print("Sample encoded-only data:")
print(data_encoded_only.head())




/home/teitur/DTU/electricproject/deeplearning/data_processing
Found 2 marginalpdbc files
Found 2 precious files
Found 6 gas price files
Found 6 gas price files
Processing file: ../raw_data/data_gas/MIBGAS_Data_2023.csv
Processing file: ../raw_data/data_gas/MIBGAS_Data_2021.csv
Processing file: ../raw_data/data_gas/MIBGAS_Data_2018.csv
Processing file: ../raw_data/data_gas/MIBGAS_Data_2020.csv
Processing file: ../raw_data/data_gas/MIBGAS_Data_2019.csv
Processing file: ../raw_data/data_gas/MIBGAS_Data_2022.csv
Combined gas data rows after filling: 2089
      Year  Month  Day  Gas_Price
1421  2018      1    1      24.50
1420  2018      1    2      23.95
1419  2018      1    3      21.90
1418  2018      1    4      19.63
1417  2018      1    5      19.85
Processing file: ../raw_data/data_electric/data_da_23.csv


  gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='ffill')
  gas_data['Gas_Price'] = gas_data['Gas_Price'].fillna(method='bfill')  # Backfill if forward-fill doesn't work


ValueError: Usecols do not match columns, columns expected but not found: [1, 2, 3, 4, 5]

In [12]:
import pandas as pd
import numpy as np
import glob
import os

# Define directories
data_dir = '../raw_data/'
output_dir = '../formatted_data/'

# Discover files
marginal_files = glob.glob(os.path.join(data_dir, 'data_electric/data_da*.csv'))
intra_day_files = glob.glob(os.path.join(data_dir, 'data_electric/data_id*.csv'))
gas_files = glob.glob(os.path.join(data_dir, 'data_gas/MIBGAS_Data_*.csv'))

print(f"Found {len(marginal_files)} marginal files")
print(f"Found {len(intra_day_files)} intra-day files")
print(f"Found {len(gas_files)} gas price files")

# Time encoding function
def encode_time(value, max_value):
    value_sin = np.sin(2 * np.pi * value / max_value)
    value_cos = np.cos(2 * np.pi * value / max_value)
    return value_sin, value_cos

# Function to process marginal data
def process_marginal(file_path):
    print(f"Processing marginal file: {file_path}")
    try:
        data = pd.read_csv(file_path)
        data = data.rename(columns={'DA ES': 'DA'})
        data = data[['Year', 'Month', 'Day', 'Hour', 'DA']].dropna()
        return data
    except Exception as e:
        print(f"Error processing marginal file {file_path}: {e}")
        return None

# Function to process intra-day data
def process_intra_day(file_path):
    print(f"Processing intra-day file: {file_path}")
    try:
        data = pd.read_csv(file_path)
        data = data.rename(columns={'AvgES': 'ID'})
        data = data[['Year', 'Month', 'Day', 'Hour', 'ID']].dropna()
        return data
    except Exception as e:
        print(f"Error processing intra-day file {file_path}: {e}")
        return None

# Function to process gas price data
def process_gas(file_paths):
    print("Processing gas price files...")
    all_gas_data = []
    for file_path in file_paths:
        try:
            # Load the CSV file
            gas_data = pd.read_csv(file_path, delimiter=',')
            
            # Normalize column names by stripping spaces and removing newlines
            gas_data.columns = gas_data.columns.str.strip().str.replace('\n', ' ', regex=False)
            
            # Debugging: Print column names
            print("Columns in gas data:", gas_data.columns)
            
            # Rename columns based on expected order
            gas_data = gas_data.rename(columns={
                gas_data.columns[0]: 'Trading_Day',
                gas_data.columns[4]: 'Price_EUR_MWh',
                gas_data.columns[5]: 'Volume_MWh'
            })
            
            # Convert 'Trading_Day' to datetime
            gas_data['Trading_Day'] = pd.to_datetime(gas_data['Trading_Day'], errors='coerce')
            
            # Drop rows with invalid dates
            gas_data = gas_data.dropna(subset=['Trading_Day'])
            
            # Extract Year, Month, and Day for merging
            gas_data['Year'] = gas_data['Trading_Day'].dt.year
            gas_data['Month'] = gas_data['Trading_Day'].dt.month
            gas_data['Day'] = gas_data['Trading_Day'].dt.day
            
            # Keep only relevant columns
            gas_data = gas_data[['Year', 'Month', 'Day', 'Price_EUR_MWh', 'Volume_MWh']]
            
            # Append to the list
            all_gas_data.append(gas_data)
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
    
    # Combine all gas data
    combined_gas_data = pd.concat(all_gas_data, ignore_index=True)
    
    # Remove duplicates
    combined_gas_data = combined_gas_data.drop_duplicates(subset=['Year', 'Month', 'Day'])
    
    # Handle missing values
    combined_gas_data['Price_EUR_MWh'] = combined_gas_data['Price_EUR_MWh'].fillna(method='ffill').fillna(method='bfill')
    combined_gas_data['Volume_MWh'] = combined_gas_data['Volume_MWh'].fillna(method='ffill').fillna(method='bfill')
    
    print("Gas data processed successfully.")
    return combined_gas_data

# Process files
marginal_data = pd.concat(
    [df for file in marginal_files if (df := process_marginal(file)) is not None and not df.empty],
    ignore_index=True
)
intra_day_data = pd.concat(
    [df for file in intra_day_files if (df := process_intra_day(file)) is not None and not df.empty],
    ignore_index=True
)
gas_data = process_gas(gas_files)

# Merge marginal and intra-day data
final_data = pd.merge(marginal_data, intra_day_data, on=['Year', 'Month', 'Day', 'Hour'], how='inner')

# Optionally merge gas price data
final_data = pd.merge(final_data, gas_data, on=['Year', 'Month', 'Day'], how='left')

# Add encoded time columns
final_data['Hour_Sin'], final_data['Hour_Cos'] = zip(*final_data['Hour'].apply(lambda x: encode_time(x, 24)))
final_data['Day_Sin'], final_data['Day_Cos'] = zip(*final_data['Day'].apply(lambda x: encode_time(x, 31)))
final_data['Month_Sin'], final_data['Month_Cos'] = zip(*final_data['Month'].apply(lambda x: encode_time(x, 12)))

# Add scaled Year and price difference
final_data['Year_Scaled'] = (final_data['Year'] - 2018) * 0.1 + 0.1
final_data['Diff'] = final_data['DA'] - final_data['ID']

# Save the processed data
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'formatted_data.csv')
final_data.to_csv(output_file, index=False)
print(f"Processed data saved to {output_file}")

# Print sample for verification
# print("Sample of final data:")
# print(final_data.head())


# Lets now go through the formatted_data.csv file and check if the data is correct

final_data = pd.read_csv(output_file)


# Check for missing values
missing_values = final_data.isnull().sum()
print("Missing values:")
print(missing_values)

# Check for duplicate dates
duplicate_dates = final_data.duplicated(subset=['Year', 'Month', 'Day', 'Hour'])

# Print how many duplicates we have
print("Number of duplicate dates:", duplicate_dates.sum())

Found 2 marginal files
Found 2 intra-day files
Found 6 gas price files
Processing marginal file: ../raw_data/data_electric/data_da_23.csv
Processing marginal file: ../raw_data/data_electric/data_da_24.csv
Processing intra-day file: ../raw_data/data_electric/data_id_24.csv
Processing intra-day file: ../raw_data/data_electric/data_id_23.csv
Processing gas price files...
Columns in gas data: Index(['Trading Day', 'Product', 'Delivery Zone', 'Gas type',
       'Price [EUR/MWh]', 'Volume  [MWh]'],
      dtype='object')
Columns in gas data: Index(['Trading Day', 'Product', 'Delivery Zone', 'Gas type',
       'Price [EUR/MWh]', 'Volume  [MWh]'],
      dtype='object')
Columns in gas data: Index(['Trading Day', 'Product', 'Delivery Zone', 'Gas type',
       'Price [EUR/MWh]', 'Volume  [MWh]'],
      dtype='object')
Columns in gas data: Index(['Trading Day', 'Product', 'Delivery Zone', 'Gas type',
       'Price [EUR/MWh]', 'Volume  [MWh]'],
      dtype='object')
Columns in gas data: Index(['Trad

  combined_gas_data['Price_EUR_MWh'] = combined_gas_data['Price_EUR_MWh'].fillna(method='ffill').fillna(method='bfill')
  combined_gas_data['Volume_MWh'] = combined_gas_data['Volume_MWh'].fillna(method='ffill').fillna(method='bfill')


Processed data saved to ../formatted_data/formatted_data.csv
Missing values:
Year                 0
Month                0
Day                  0
Hour                 0
DA                   0
ID                   0
Price_EUR_MWh    10415
Volume_MWh       10415
Hour_Sin             0
Hour_Cos             0
Day_Sin              0
Day_Cos              0
Month_Sin            0
Month_Cos            0
Year_Scaled          0
Diff                 0
dtype: int64
Number of duplicate dates: 0
