In [1]:
import requests # library for making HTTP requests
import pandas as pd # library for data analysis
import datetime as dt # library for handling date and time objects
import matplotlib.pyplot as plt # library for creating plots

# Project directory layout. Every location is derived from `root_path`
# (empty string = paths relative to the notebook's working directory).
root_path = ''
model_path = root_path + 'models/'
data_path = root_path + 'data/final_dataset.csv'
path_to_images = root_path + 'images/'
base_log_dir = model_path + 'logs/'
base_tuning_dir = model_path + 'tuning/'
results_dir = root_path + 'results/'
tables_dir = root_path + 'tables/'

# Time window used for every download and for the x-axis limits of the plots
start_time = pd.Timestamp(year=2019, month=10, day=31)
end_time = pd.Timestamp(year=2024, month=7, day=2)

# Investigate data
Here, I inspect the weather observations to find a representative weather station.

In [None]:
# Download hourly DMI weather observations for the selected station(s) and
# parameter(s) and assemble them into one wide DataFrame `data_dmi`, indexed by
# observation time with (stationId, parameterId) column levels.
api_key = '' # insert your own key between the '' signs
DMI_URL = 'https://dmigw.govcloud.dk/v2/metObs/collections/observation/items'

# Specify one or more station IDs
stationId = '06109' #'06072' #Ødum
stationIds = [stationId]

# Station notes:
#   Silstrup 06019: has all parameters, but is located in West Jutland
#   Isenvad 06068: has all parameters and is centrally located

# Specify one or more parameter IDs
# (other candidates: 'radia_glob_past1h', 'wind_speed_past1h', 'sun_last1h_glob')
parameterIds = ['temp_mean_past1h', 'precip_past1h']

# Derive the datetime interval string "<start>/<end>" from the configured window,
# with both ends localized to UTC
datetime_str = start_time.tz_localize('UTC').isoformat() + '/' + end_time.tz_localize('UTC').isoformat()

dfs = []
for station in stationIds:
    for parameter in parameterIds:
        # Query parameters for one (station, parameter) combination
        params = {
            'api-key' : api_key,
            'datetime' : datetime_str,
            'stationId' : station,
            'parameterId' : parameter,
            'limit' : '300000',  # max limit
        }

        # Submit the GET request and fail loudly on an HTTP error (e.g. a
        # missing/invalid API key) instead of hitting a KeyError on 'features'
        r = requests.get(DMI_URL, params=params)
        r.raise_for_status()
        payload = r.json()

        # Flatten the returned feature list into a DataFrame
        dfi = pd.json_normalize(payload['features'])
        if dfi.empty:
            # Nothing observed for this combination in the requested window
            continue

        dfi['time'] = pd.to_datetime(dfi['properties.observed'])

        # Keep only the columns needed downstream
        dfi = dfi[['time', 'properties.value', 'properties.stationId', 'properties.parameterId']]

        # Rename columns, e.g., 'properties.stationId' becomes 'stationId'
        dfi.columns = [c.replace('properties.', '') for c in dfi.columns]

        # Drop identical rows (considers both value and time stamp), then pivot
        # so each (stationId, parameterId) pair becomes its own column
        dfi = dfi.drop_duplicates()
        dfi = dfi.set_index(['time','stationId','parameterId'])
        dfi = dfi['value'].unstack(['stationId','parameterId'])
        dfs.append(dfi)

if not dfs:
    # pd.concat raises an unhelpful "No objects to concatenate" on an empty list
    raise ValueError('DMI returned no observations for the requested stations/parameters/time window')

data_dmi = pd.concat(dfs, axis='columns').sort_index()
print(data_dmi)

In [None]:
#stationIds = ['06052', '06132']

# Select the chosen station(s) and drop the outer column level so that only
# the second column level (the one unstacked from 'parameterId') remains.
station_frame = data_dmi.copy()[stationIds]
station_frame.columns = station_frame.columns.droplevel(0)

# Rebuild the index as a timezone-naive 'time' index (the timestamps keep
# their clock values; only the timezone information is removed).
data_dmi_filter = (
    station_frame
    .assign(time=station_frame.index.tz_localize(None))
    .set_index('time')
)
print(data_dmi_filter)


In [None]:
# Date strings (YYYY-MM-DD) for the Energi Data Service queries; also reused
# by the later download cells.
start_date = start_time.strftime('%Y-%m-%d') #2015-01-01'
end_date = end_time.strftime('%Y-%m-%d') #'2022-06-01'

# Download the Elspotprices dataset filtered to price area DK1
price_area  = '{"PriceArea":["DK1"]}'
url = f'https://api.energidataservice.dk/dataset/Elspotprices?start={start_date}&end={end_date}&filter={price_area}'
response = requests.get(url=url)

if not response.ok:
    print(f"Failed to fetch data: {response.status_code}")
else:
    records = response.json().get('records', [])

    # Keep the UTC hour and the DKK price, rename them, and strip the timezone
    # information from the timestamps
    spot_columns = pd.json_normalize(records)[['HourUTC', 'SpotPriceDKK']]
    data_el_spot_DK1 = (
        spot_columns
        .rename(columns={'HourUTC': 'time', 'SpotPriceDKK': 'SpotPriceDK1'})
        .assign(time=lambda frame: pd.to_datetime(frame['time']).dt.tz_localize(None))
    )

    # Show the processed DataFrame
    print(data_el_spot_DK1)

In [None]:
# Download the Elspotprices dataset filtered to price area DK2
# (same query as for DK1; relies on start_date / end_date from the DK1 cell)
price_area  = '{"PriceArea":["DK2"]}'

response = requests.get(
    url=f'https://api.energidataservice.dk/dataset/Elspotprices?start={start_date}&end={end_date}&filter={price_area}')

if not response.ok:
    print(f"Failed to fetch data: {response.status_code}")
else:
    records = response.json().get('records', [])

    # Keep the UTC hour and the DKK price, rename them, and strip the timezone
    # information from the timestamps
    spot_columns = pd.json_normalize(records)[['HourUTC', 'SpotPriceDKK']]
    data_el_spot_DK2 = (
        spot_columns
        .rename(columns={'HourUTC': 'time', 'SpotPriceDKK': 'SpotPriceDK2'})
        .assign(time=lambda frame: pd.to_datetime(frame['time']).dt.tz_localize(None))
    )

    # Show the processed DataFrame
    print(data_el_spot_DK2)

In [None]:
import requests
import pandas as pd

# Price area for the forecast query (start_date / end_date come from the
# Elspot DK1 cell)
price_area = "DK1"

# Compose the request URL: Forecasts_Hour dataset, filtered on the price area
# and sorted by HourUTC ascending
url = f"https://api.energidataservice.dk/dataset/Forecasts_Hour?offset=0&start={start_date}&end={end_date}&filter=%7B%22PriceArea%22:%5B%22{price_area}%22%5D%7D&sort=HourUTC%20ASC"

response = requests.get(url)

if response.status_code != 200:
    print("Failed to retrieve data. Status code:", response.status_code)
else:
    # Decode the JSON body into a Python dictionary
    data = response.json()

    if 'records' not in data:
        print("No 'records' key in JSON response")
    else:
        df = pd.DataFrame(data['records'])

        # One row per HourUTC, one column per ForecastType, holding the
        # day-ahead forecast value
        df_pivoted = df.pivot(index='HourUTC', columns='ForecastType', values='ForecastDayAhead')

        print("Data loaded and pivoted into DataFrame successfully:")
        print(df_pivoted.head(20))  # Display the first 20 rows of the pivoted DataFrame


In [None]:
# Turn the pivoted forecasts into a frame with a timezone-naive 'time' column
forecast_weather = df_pivoted.reset_index().rename(columns={'HourUTC': 'time'})
forecast_weather['time'] = pd.to_datetime(forecast_weather['time']).dt.tz_localize(None)

# Attach the forecasts to every DK1 spot-price row (left join on 'time'),
# index by time, and fill gaps by time-weighted interpolation
spot_with_forecast = data_el_spot_DK1.merge(forecast_weather, on='time', how='left')
merged_elspot_weather_forecast = spot_with_forecast.set_index('time').interpolate(method='time')
print(merged_elspot_weather_forecast)

In [None]:
# Boolean mask of missing cells in the merged spot-price/forecast frame
nan_mask = merged_elspot_weather_forecast.isna()

# Same shape as the input: original values where the cell is NaN (i.e. NaN),
# and NaN everywhere else
nan_values = merged_elspot_weather_forecast.where(nan_mask)

# Rows that contain at least one NaN value
rows_with_nans = merged_elspot_weather_forecast.loc[nan_mask.any(axis=1)]

# Show the rows found to contain NaN values
print(rows_with_nans)

In [None]:
# Make sure both spot-price frames carry a datetime 'time' column
for spot_frame in (data_el_spot_DK1, data_el_spot_DK2):
    spot_frame['time'] = pd.to_datetime(spot_frame['time'])

# Keep only the hours present in both price areas and index by time
merged_elspot = (
    data_el_spot_DK1
    .merge(data_el_spot_DK2, on='time', how='inner')
    .set_index('time')
)
print(merged_elspot)

In [None]:
import matplotlib.pyplot as plt

# One colour per price series (DK1, DK2)
colors = ['#1E90FF', 'yellow']

# Draw DK1 and DK2 as two stacked subplots
plot_options = dict(
    figsize=(8, 5),
    legend=False,
    fontsize=8,
    rot=0,
    subplots=True,
    color=colors,   # List of colors for each line
    grid=True,
    xlabel='',
)
ax = merged_elspot[['SpotPriceDK1', 'SpotPriceDK2']].plot(**plot_options)

# y-axis label for each subplot, in the same order as the plotted columns
y_labels = [
    'Day Ahead Spot Prices in DK1\n(DKK/MWh)',
    'Day Ahead Spot Prices in DK2\n(DKK/MWh)',
]
for panel, y_label in zip(ax, y_labels):
    panel.set_ylabel(y_label, fontsize=8)

# Write the figure to the images directory, then display it
plt.savefig(f"{path_to_images}elspot_prices.png", dpi=150)
plt.show()


In [None]:
# Right join: keep every weather-observation timestamp and attach the
# spot-price/forecast columns where they exist; index the result by time
spot_forecast_columns = merged_elspot_weather_forecast.reset_index()
merged_data = (
    spot_forecast_columns
    .merge(data_dmi_filter, on='time', how='right')
    .set_index('time')
)
print(merged_data)

In [11]:
# Persist the hourly merged frame; the 'time' index is written as the first
# CSV column. Rows with missing values are kept (dropna is disabled).
#merged_data = merged_data.dropna()
merged_data.to_csv(path_or_buf='data/elspot_and_weather_data_hourly.csv', index=True)

In [12]:
# Aggregate the hourly frame to calendar-day means and persist it,
# writing the date index as the first CSV column
daily_data = merged_data.resample(rule='D').mean()
daily_data.to_csv(path_or_buf='data/elspot_and_weather_data_daily.csv', index=True)

In [None]:
# Wide table: one 'Uge' column plus one column per year
df_water = pd.read_excel('data/hydro_reservoir_data.xlsx') #.set_index('Uge')

# Reshape to long format: one row per ('Uge', Year) pair
df_long = df_water.melt(id_vars=['Uge'], var_name='Year', value_name='hydro_reservoir')

# Build strings like '2020W05-1' and parse them as ISO year / ISO week /
# weekday 1, i.e. the Monday of the given week
iso_week_monday = (
    df_long['Year'].astype(str)
    + 'W'
    + df_long['Uge'].astype(str).str.zfill(2)
    + '-1'
)
df_long['time'] = pd.to_datetime(iso_week_monday, format='%GW%V-%u')

# Keep only 'hydro_reservoir' and 'time', ordered chronologically, without
# rows that contain missing values ('time' stays a regular column)
df_water_final = (
    df_long
    .drop(columns=['Uge', 'Year'])
    .sort_values(by='time', ascending=True)
    .dropna()
)
print(df_water_final)

In [None]:
# Number of hourly rows generated per weekly observation
hours_in_week = 7 * 24

# Repeat every weekly row once per hour of the week. The reservoir value is
# copied unchanged to each hour (it is NOT divided by the number of hours).
repeated_weeks = df_water_final.loc[df_water_final.index.repeat(hours_in_week)]

# Consecutive hourly timestamps starting at the first weekly timestamp, one
# per repeated row
hourly_range = pd.date_range(start=repeated_weeks['time'].iloc[0], periods=len(repeated_weeks), freq='h')

# Swap the weekly timestamps for the hourly ones and order chronologically
hourly_water_data = (
    repeated_weeks
    .drop(columns='time')
    .reset_index(drop=True)
    .assign(time=hourly_range)
    .sort_values(by='time', ascending=True)
)
print(hourly_water_data)


In [None]:
# Attach the hourly reservoir level to every row of the merged frame
water_joined = pd.merge(merged_data, hourly_water_data, on = 'time', how = 'left')

# Reverse the row order (original author's note: "to get earliest date first"
# -- NOTE(review): confirm against the actual row order coming out of the merge),
# index by time, discard the helper 'index' column created by reset_index,
# fill gaps by time-weighted interpolation and drop rows that still have NaNs
merged_data_updated = (
    water_joined
    .iloc[::-1]
    .reset_index(drop=False)
    .set_index('time')
    .drop(columns=['index'])
    .interpolate(method='time')
    .dropna()
)
print(merged_data_updated)

In [None]:
# Single-column frame holding the reservoir series to plot
plot = merged_data_updated['hydro_reservoir'].to_frame()
n_panels = len(plot.columns)

# One subplot per column, sharing the time axis
fig, axes = plt.subplots(nrows=n_panels, ncols=1, figsize=(12, 5 * n_panels), sharex=True)

# With a single subplot, plt.subplots returns a bare Axes; wrap it so the
# loop below can iterate
if n_panels == 1:
    axes = [axes]

# Bar chart of each column on its own subplot
for ax, column in zip(axes, plot.columns):
    ax.bar(plot.index, plot[column], width=0.8, label=column, color='skyblue')
    ax.set_title(column)
    ax.set_ylabel('Values')
    ax.grid(True)
    ax.legend(loc='upper right')

# Auto-format the date tick labels for readability
fig.autofmt_xdate()
plt.tight_layout()
plt.show()

## Importing Carbon, Coal and Natural Gas prices

In [None]:
import pandas as pd

def _load_price_series(csv_path, value_column, price_name):
    """Read one price CSV and return it indexed by a datetime 'time' index.

    Only the 'Date' column and `value_column` are read; rows with missing
    values are dropped; `value_column` is renamed to `price_name`.
    """
    prices = pd.read_csv(csv_path, usecols=['Date', value_column]).dropna()
    prices = prices.rename(columns={value_column: price_name, 'Date': 'time'})
    prices['time'] = pd.to_datetime(prices['time'])
    return prices.set_index('time')


# Carbon (EUA futures), coal and natural gas (Henry Hub) price series
df_carbon = _load_price_series(
    'data/European Union Allowance (EUA) Yearly Futures Historical Data.csv', 'Price', 'carbon_price')
df_coal = _load_price_series(
    'data/Coal_09_06_24-12_10_18.csv', 'Close', 'coal_price')
df_natural_gas = _load_price_series(
    'data/Natural Gas (Henry Hub)_09_06_24-12_10_18.csv', 'Close', 'natural_gas_price')

# Combine on the date index: the carbon dates define the rows (right join),
# and coal / natural gas prices are attached where available
df_commodity_merged = df_coal.merge(df_carbon, left_index=True, right_index=True, how='right')
df_commodity_merged = df_commodity_merged.merge(df_natural_gas, left_index=True, right_index=True, how='left')


## Interpolation

In [None]:
df = pd.DataFrame(data=df_commodity_merged)

# Upsample to an hourly grid; each new hourly row takes the values of the
# most recent original row, so prices are constant until the next observation
df_hourly = df.resample(rule='h').ffill()

# Time-weighted interpolation of whatever NaNs remain after the forward fill
# (the forward fill copies whole rows, so NaNs in the source rows are carried over)
df_commodity_merged_final = df_hourly.interpolate(method='time')
print(df_commodity_merged_final)

In [None]:
# Attach the hourly commodity prices to every row of the merged frame
# (left join on 'time': rows without a matching price get NaN)
merged_data_full = merged_data_updated.merge(df_commodity_merged_final, on = 'time', how = 'left')
print(merged_data_full)

# Illustrate data 

In [None]:
# Fill remaining gaps by time-weighted interpolation, then persist the result
# with the 'time' index written as the first CSV column
merged_data_full_inter = merged_data_full.interpolate(method='time')
merged_data_full_inter.to_csv(path_or_buf='data/el_weather_hydro_coal_carbon_data.csv', index=True)
print(merged_data_full_inter)

In [None]:
import matplotlib.dates as mdates

# Column name -> [y-axis label, colour] for every series in the figure
names = {
    'SpotPriceDK1': ['Day Ahead Spot Price', '#1E90FF'],
    'Offshore Wind': ['Offshore Wind', 'skyblue'],
    'Onshore Wind': ['Onshore Wind', 'aquamarine'],
    'Solar': ['Solar', 'yellow'],
    'precip_past1h': ['Precipitation', 'lightskyblue'],
    'temp_mean_past1h': ['Temperature', 'red'],
    'hydro_reservoir': ['Hydro Reservoir Levels', 'skyblue'],
    'carbon_price': ['CO2 Price (Index)', 'lightgreen'],
    'coal_price': ['Coal Price', '#696969'],
    'natural_gas_price': ['Natural Gas Price', 'yellowgreen']
}

params_to_plot = list(names.keys())
end_time_plot = end_time

# Bring 'time' back as a datetime column and cut the data at the plot end
data_to_plot = merged_data_full_inter.reset_index() #.interpolate(method='time')
data_to_plot['time'] = pd.to_datetime(data_to_plot['time'])
data_to_plot = data_to_plot[data_to_plot['time'] <= end_time_plot]

# Complete hourly grid from the first available timestamp to the plot end
full_time_index = pd.date_range(start=data_to_plot['time'].min(), end=end_time_plot, freq='h')

# Align the data to the full grid and fill the newly created gaps by
# time-weighted interpolation
data_to_plot.set_index('time', inplace=True)
data_to_plot = data_to_plot.reindex(full_time_index).interpolate(method='time')
#final_dataset = data_to_plot.copy()

# One subplot per parameter, all sharing the time axis
fig, axes = plt.subplots(len(params_to_plot), 1, figsize=(12, 2 * len(params_to_plot)), sharex=True)

# With a single parameter plt.subplots returns a bare Axes; wrap it so that
# indexing below works
if len(params_to_plot) == 1:
    axes = [axes]

# Series drawn as bars; everything else is drawn as a line.
# NOTE(review): 'precip' and 'temp_mean' do not match any key in `names`
# ('precip_past1h' / 'temp_mean_past1h'), so those two are currently drawn as
# lines -- confirm whether that is intended before changing it.
bar_params = ['Onshore Wind', 'Offshore Wind', 'Solar', 'precip', 'temp_mean', 'hydro_reservoir']

for i, par in enumerate(params_to_plot):

    if par in bar_params:
        axes[i].bar(data_to_plot.index, data_to_plot[par], color=names[par][1])
    else:
        axes[i].plot(data_to_plot.index, data_to_plot[par], color=names[par][1])

    axes[i].set_ylabel(names[par][0])
    axes[i].tick_params(axis='y')
    axes[i].grid(True)

# Set x-axis limits explicitly on the last subplot (the x-axis is shared)
axes[-1].set_xlim(pd.Timestamp(start_time), pd.Timestamp(end_time_plot))

# One major tick per year, labelled with the four-digit year
axes[-1].xaxis.set_major_locator(mdates.YearLocator())
axes[-1].xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

# Apply the tight layout BEFORE saving so the PNG on disk matches the figure
# shown in the notebook (previously the file was written with the default layout)
plt.tight_layout()
plt.savefig(f"{path_to_images}wheater_variables_and_hydro_coal_carbon_final_with_bars.png", dpi=150)
plt.show()

In [None]:
# `mdates` is matplotlib's date-handling module; a bare `import mdates` fails
# on a fresh kernel because no such top-level module exists
import matplotlib.dates as mdates

# Column name -> [y-axis label with unit, colour] for the exogenous variables
names = {
    #'SpotPriceDK1': ['Day Ahead Spot Price', '#1E90FF'],
    'Offshore Wind': ['Offshore Wind\nproduction (MWh)', 'skyblue'],
    'Onshore Wind': ['Onshore Wind\nproduction (MWh)', 'aquamarine'],
    'Solar': ['Solar production\n(MWh)', 'yellow'],
    'precip_past1h': ['Precipitation\n(mm)', 'lightskyblue'],
    'temp_mean_past1h': ['Temperature\n(°C)', 'red'],
    'hydro_reservoir': ['Hydro Reservoir Levels\n(%)', 'skyblue'],
    'carbon_price': ['CO2 Price\n(Index)', 'lightgreen'],
    'coal_price': ['Coal Price\n(USD/Ton)', '#696969'],
    'natural_gas_price': ['Natural Gas Price\n(USD/MMBtu)', 'yellowgreen']
}

params_to_plot = list(names.keys())
end_time_plot = end_time

# Bring 'time' back as a datetime column and cut the data at the plot end
data_to_plot = merged_data_full_inter.reset_index() #.interpolate(method='time')
data_to_plot['time'] = pd.to_datetime(data_to_plot['time'])
data_to_plot = data_to_plot[data_to_plot['time'] <= end_time_plot]

# Complete hourly grid from the first available timestamp to the plot end
full_time_index = pd.date_range(start=data_to_plot['time'].min(), end=end_time_plot, freq='h')

# Align the data to the full grid and fill the newly created gaps by
# time-weighted interpolation
data_to_plot.set_index('time', inplace=True)
data_to_plot = data_to_plot.reindex(full_time_index).interpolate(method='time')
#final_dataset = data_to_plot.copy()

# One subplot per parameter, all sharing the time axis
fig, axes = plt.subplots(len(params_to_plot), 1, figsize=(12, 2 * len(params_to_plot)), sharex=True)

# With a single parameter plt.subplots returns a bare Axes; wrap it so that
# indexing below works
if len(params_to_plot) == 1:
    axes = [axes]

# Series drawn as bars; everything else is drawn as a line.
# NOTE(review): 'precip' and 'temp_mean' do not match any key in `names`
# ('precip_past1h' / 'temp_mean_past1h'), so those two are currently drawn as
# lines -- confirm whether that is intended before changing it.
bar_params = ['Onshore Wind', 'Offshore Wind', 'Solar', 'precip', 'temp_mean', 'hydro_reservoir']

for i, par in enumerate(params_to_plot):

    if par in bar_params:
        axes[i].bar(data_to_plot.index, data_to_plot[par], color=names[par][1])
    else:
        axes[i].plot(data_to_plot.index, data_to_plot[par], color=names[par][1])

    axes[i].set_ylabel(names[par][0])
    axes[i].tick_params(axis='y')
    axes[i].grid(True)

# Set x-axis limits explicitly on the last subplot (the x-axis is shared)
axes[-1].set_xlim(pd.Timestamp(start_time), pd.Timestamp(end_time_plot))

# One major tick per year, labelled with the four-digit year
axes[-1].xaxis.set_major_locator(mdates.YearLocator())
axes[-1].xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

# Apply the tight layout BEFORE saving so the PNG on disk matches the figure
# shown in the notebook (previously the file was written with the default layout)
plt.tight_layout()
plt.savefig(f"{path_to_images}exogenous_variables.png", dpi=150)
plt.show()

In [None]:
# Source column names of the exogenous variables (spot price excluded)
params_to_plot = [
    'Offshore Wind',
    'Onshore Wind',
    'Solar',
    'precip_past1h',
    'temp_mean_past1h',
    'hydro_reservoir',
    'carbon_price',
    'coal_price',
    'natural_gas_price',
]
end_time_plot = end_time

# Bring 'time' back as a datetime column and cut the data at the end time
hourly_rows = merged_data_full_inter.reset_index() #.interpolate(method='time')
hourly_rows['time'] = pd.to_datetime(hourly_rows['time'])
hourly_rows = hourly_rows[hourly_rows['time'] <= end_time_plot]

# Complete hourly grid from the first available timestamp to the end time
full_time_index = pd.date_range(start=hourly_rows['time'].min(), end=end_time_plot, freq='h')

# Align the data to the full grid and fill the newly created gaps by
# time-weighted interpolation
data_to_plot = hourly_rows.set_index('time').reindex(full_time_index).interpolate(method='time')

# Source column name -> [readable column name, colour]; only the readable
# name is used in this cell
names = {
    'SpotPriceDK1': ['Day Ahead Spot Price', '#1E90FF'],
    'Offshore Wind': ['Offshore Wind', 'skyblue'],
    'Onshore Wind': ['Onshore Wind', 'aquamarine'],
    'Solar': ['Solar', 'yellow'],
    'precip_past1h': ['Precipitation', 'lightskyblue'],
    'temp_mean_past1h': ['Temperature', 'red'],
    'hydro_reservoir': ['Hydro Reservoir Levels', 'skyblue'],
    'carbon_price': ['CO2 Price', 'lightgreen'],
    'coal_price': ['Coal Price', '#696969'],
    'natural_gas_price': ['Natural Gas Price', 'yellowgreen']
}
readable_names = {source: label for source, (label, _colour) in names.items()}

# Rename to the readable column names and turn the (unnamed) time index into
# a regular 'time' column
final_dataset = (
    data_to_plot
    .rename(columns=readable_names)
    .reset_index()
    .rename(columns={'index': 'time'})
)

# 'time' is a regular column now, so the positional index is not written
final_dataset.to_csv('data/dataset.csv', index=False)

# Quick visual check of the result
print(final_dataset.head())
print(final_dataset.index)

# Dummies 

In [None]:
import pandas as pd

# Assumes 'final_dataset' is the dataset built in the previous section.
# reset_index() keeps the old positional index as an 'index' column.
df = pd.DataFrame(final_dataset).reset_index()

# Make sure the 'time' column is a datetime column
df['time'] = pd.to_datetime(df['time'])

# Create dummy variables for the day of the week.
# The dummies must be built from the weekday NAME: one-hot encoding the raw
# 'time' column would create one column per distinct timestamp, which is
# meaningless as a feature and needs memory quadratic in the number of rows.
df = pd.get_dummies(
    df.assign(weekday=df['time'].dt.day_name()),
    columns=['weekday'],
    drop_first=False,
)
print(df.head())


In [None]:
import pandas as pd

# Assumes 'final_dataset' is the dataset built in the previous section.
# Work on an independent copy with a fresh positional index.
df = final_dataset.copy().reset_index(drop=True)

# Make sure the 'time' column is a datetime column, then derive the weekday
# name for every row
df['time'] = pd.to_datetime(df['time'])
df = df.assign(weekday=df['time'].dt.day_name())

# Replace 'weekday' with one dummy column per day of the week
df = pd.get_dummies(df, columns=['weekday'], drop_first=False)

# Save the updated DataFrame to a CSV file and show the first rows
df.to_csv('data/dataset_with_dummies.csv', index=False)
print(df.head())

# Lags

In [None]:
df_lags = df.copy()

# Columns that are replaced in place by their value 24 rows earlier
# (24 hours, given one row per hour)
day_lagged_columns = [
    'Precipitation',
    'Temperature',
    'Hydro Reservoir Levels',
    'CO2 Price',
    'Coal Price',
    'Natural Gas Price'
]
df_lags[day_lagged_columns] = df_lags[day_lagged_columns].shift(24)

# Columns for which additional lagged copies are added as new columns
lags_names = [
    'Day Ahead Spot Price',
]

# Add the 1-row and 2-row lagged copies (1 and 2 hours, given one row per hour)
for lag_hours in (1, 2):
    for column in lags_names:
        df_lags[f'{column}_lagged_{lag_hours}_hour'] = df_lags[column].shift(lag_hours)

# The shifts leave NaNs in the first rows; drop every row that has any
df_lags = df_lags.dropna()

# Display the DataFrame with lagged variables
print(df_lags.head())

df_lags.to_csv('data/dataset_with_dummies_and_lags.csv', index=False)


In [None]:
# Columns for which lagged copies are added as new columns
lags_names = [
    'Day Ahead Spot Price',
]

# NOTE(review): the frame is rebuilt from `df` (the dataset with weekday
# dummies), so none of the lags created in the previous cell are carried
# over into 'data/final_dataset.csv' -- confirm this is intended.
df_lags = df.copy()

# Add the 24-row and 25-row lagged copies (24 and 25 hours, given one row per hour)
for lag_hours in (24, 25):
    for column in lags_names:
        df_lags[f'{column}_lagged_{lag_hours}_hour'] = df_lags[column].shift(lag_hours)

# The shifts leave NaNs in the first rows; drop every row that has any
df_lags = df_lags.dropna()

# Display the DataFrame with lagged variables
print(df_lags.head())

df_lags.to_csv('data/final_dataset.csv', index=False)