# Data Collection

In [1]:
import sys
from pathlib import Path

# Absolute path of the directory the kernel is currently running in
current_dir = Path.cwd()

# Step up one level only when the kernel sits directly inside a folder named
# exactly "notebooks". Comparing the folder *name* (rather than searching the
# whole path string) avoids false matches on unrelated ancestors such as
# ".../my_notebooks_archive/project".
project_root = current_dir.parent if current_dir.name == "notebooks" else current_dir

# Put the project root at the front of the import path (needed for the
# `src.*` imports further down); the membership check keeps re-runs of this
# cell from inserting the same entry again.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Project root added to path: {project_root}")

Project root added to path: d:\Projects\PORTF_2_German_Energy_forcast\german-energy-forcast


In [2]:
from src.data.data_loader import EntsoeClient
from src.config import ENTSOE_API_KEY

# Create the ENTSO-E client; "DE" is Germany's country code
client = EntsoeClient(api_key=ENTSOE_API_KEY, country_code="DE")

# Request two years of history (expressed in hours) together with a
# 24-hour forecast window
actual_load, forecast_load = client.get_load_data(
    hours_back=2 * 365 * 24,
    forecast_hours=24,
)

Fetching load data: 100%|██████████| 105/105 [07:44<00:00,  4.42s/it]
2025-02-13 11:42:32,621 - INFO - Data collection completed. Shape: (70176, 2)
2025-02-13 11:42:32,628 - INFO - Actual load points: 69972
2025-02-13 11:42:32,628 - INFO - Forecast load points: 70176


### Basic information


In [5]:
# --- Actual load: size, covered period, sampling step and quality checks ---
print("=== Actual Load Data ===")
actual_index = actual_load.index
print(f"Total data points: {len(actual_load)}")
print(f"Time range: {actual_index.min()} to {actual_index.max()}")
print("\nDataset Shape:", actual_load.shape)
# The most frequent gap between consecutive timestamps is reported as the frequency
actual_step = actual_index.to_series().diff().mode()[0]
print(f"Time frequency: {actual_step}")
actual_missing = actual_load.isnull().sum()
print(f"Missing values: {actual_missing}")
actual_duplicates = actual_index.duplicated().sum()
print(f"Duplicate timestamps: {actual_duplicates}\n")

# --- Forecast load: same checks (the shape line is only printed for actual load) ---
print("=== Forecast Load Data ===")
forecast_index = forecast_load.index
print(f"Total data points: {len(forecast_load)}")
print(f"Time range: {forecast_index.min()} to {forecast_index.max()}")
forecast_step = forecast_index.to_series().diff().mode()[0]
print(f"Time frequency: {forecast_step}")
forecast_missing = forecast_load.isnull().sum()
print(f"Missing values: {forecast_missing}")
forecast_duplicates = forecast_index.duplicated().sum()
print(f"Duplicate timestamps: {forecast_duplicates}")

=== Actual Load Data ===
Total data points: 69972
Time range: 2023-02-14 12:00:00+01:00 to 2025-02-13 10:45:00+01:00

Dataset Shape: (69972,)
Time frequency: 0 days 00:15:00
Missing values: 0
Duplicate timestamps: 0

=== Forecast Load Data ===
Total data points: 70176
Time range: 2023-02-14 11:45:00+01:00 to 2025-02-14 11:30:00+01:00
Time frequency: 0 days 00:15:00
Missing values: 0
Duplicate timestamps: 0


# Weather Data Collection for Load Forecasting

## 1. Data Source Selection
### Why Weather Data?

- Temperature strongly correlates with electricity consumption (heating/cooling needs)
- Weather conditions affect renewable energy generation
- Essential for accurate load forecasting models

### Selected Data Source: DWD (Deutscher Wetterdienst)
**Reasons for choosing DWD:**
- Official German weather service with high-quality data
- Free access to historical data
- High spatial resolution across Germany
- Matches our temporal requirements (hourly data available)
- Reliable and well-maintained API

## 2. Data Collection Requirements
- Temporal Coverage: 2023-02-14 to 2025-02-13 (matching load data)
- Frequency: Hourly data (can be resampled to 15-min if needed)
- Parameters: Temperature, humidity, wind speed, cloud cover
- Multiple stations for better coverage of load zones

During peak hours, we really only need two key weather parameters:
- Temperature (the most important factor)
- Cloud cover (affects both cooling needs and lighting usage)

In [20]:
from wetterdienst import Settings
from wetterdienst.provider.dwd.observation import DwdObservationRequest
import pandas as pd
import os

def fetch_weather_data(start_date, end_date, station_ids=None,
                       output_path="data/raw/weather_temperature_hourly.csv"):
    """
    Fetch hourly air-temperature observations from DWD for a set of stations.

    Whenever the request returns at least one row, the raw result is also
    written to ``output_path`` as CSV (without the index).

    Parameters
    ----------
    start_date, end_date
        Bounds of the requested period; handed to ``DwdObservationRequest``
        unchanged.
    station_ids : str or iterable of str, optional
        DWD station ids to query. Defaults to Berlin-Tempelhof ("00433"),
        Hamburg-Fuhlsbüttel ("01975") and Frankfurt/Main ("03379").
    output_path : str or path-like, optional
        Destination of the raw CSV. A relative path is resolved against the
        current working directory; missing parent folders are created.

    Returns
    -------
    The frame exposed by ``request...values.all().df``, returned as-is.
    NOTE(review): it is converted with ``.to_pandas()`` before saving, so it
    is presumably a polars DataFrame -- confirm against the installed
    wetterdienst version.

    Raises
    ------
    Exception
        Any error raised while requesting or saving is printed and then
        re-raised unchanged.
    """
    if station_ids is None:
        # Selected stations covering major load centers
        station_ids = [
            "00433",  # Berlin-Tempelhof (Northeast)
            "01975",  # Hamburg-Fuhlsbüttel (North)
            "03379",  # Frankfurt/Main (Central/West) - major industrial area
        ]
    elif isinstance(station_ids, str):
        # A bare string would otherwise be split into single characters
        station_ids = [station_ids]
    else:
        station_ids = list(station_ids)

    try:
        # Just temperature - most important for load forecasting
        request = DwdObservationRequest(
            parameter="temperature_air",
            resolution="hourly",
            start_date=start_date,
            end_date=end_date
        )

        print(f"\nFetching temperature data for stations: {station_ids}")

        # Get values for selected stations
        values = request.filter_by_station_id(station_ids).values.all().df

        # Save raw data (skipped when nothing came back, so no empty file is written)
        if len(values) > 0:
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            values.to_pandas().to_csv(output_path, index=False)
            print(f"\nRaw hourly temperature data saved to: {output_path}")

        return values

    except Exception as e:
        # Report the failure in the cell output, then let it propagate
        print(f"Error fetching data: {str(e)}")
        raise

# Fetch weather for the window 2023-02-14 .. 2025-02-13
start_date = "2023-02-14"
end_date = "2025-02-13"

try:
    weather_df = fetch_weather_data(start_date, end_date)
    print("\n=== Weather Data ===")
    print(f"Total data points: {len(weather_df)}")
    if len(weather_df) > 0:
        print(f"Columns: {weather_df.columns}")
        print(f"Time range: {weather_df['date'].min()} to {weather_df['date'].max()}")

except Exception as e:
    print(f"Error in main execution: {str(e)}")
    # Re-raise instead of swallowing: otherwise `weather_df` stays undefined
    # and later cells fail with an unrelated NameError.
    raise

2025-02-13 13:24:22,909 - INFO - Wetterdienst cache is enabled [CACHE_DIR: C:\Users\risha\AppData\Local\wetterdienst\wetterdienst\Cache]
2025-02-13 13:24:22,910 - INFO - Processing request DwdObservationRequest(parameter=[(air_temperature/air_temperature)], resolution=hourly, period=[None], start_date=2023-02-14 00:00:00+00:00, end_date=2025-02-13 00:00:00+00:00, humanize=True, format=long, si_units=True)
2025-02-13 13:24:22,915 - INFO - Downloading file https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate//hourly/air_temperature/recent/TU_Stundenwerte_Beschreibung_Stationen.txt.
2025-02-13 13:24:22,927 - INFO - Downloading file https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate//hourly/air_temperature/historical/TU_Stundenwerte_Beschreibung_Stationen.txt.
2025-02-13 13:24:22,940 - INFO - Filtering for station_id=['00433', '01975', '03379']
2025-02-13 13:24:22,942 - INFO - 0%|          | 0/3 [00:00<?, ?it/s]
2025-02-13 13:24:22,943 - IN


Fetching temperature data for stations: ['00433', '01975', '03379']


2025-02-13 13:25:17,203 - INFO - Acquiring observation data for air_temperature/hourly/recent/00433/None.
2025-02-13 13:25:17,208 - INFO - Downloading file https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/recent/stundenwerte_TU_00433_akt.zip.
2025-02-13 13:25:27,481 - INFO - Invalid combination air_temperature/hourly/Period.NOW is skipped.
2025-02-13 13:25:27,926 - INFO - 33%|###3      | 1/3 [01:04<02:09, 64.98s/it]
2025-02-13 13:25:27,927 - INFO - Acquiring observation data for air_temperature/hourly/historical/01975/None.
2025-02-13 13:25:27,932 - INFO - Downloading file https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/historical/stundenwerte_TU_01975_19490101_20231231_hist.zip.
2025-02-13 13:25:28,843 - INFO - Acquiring observation data for air_temperature/hourly/recent/01975/None.
2025-02-13 13:25:28,851 - INFO - Downloading file https://opendata.dwd.de/climate_environment/CDC/obse


Raw hourly temperature data saved to: data/raw/weather_temperature_hourly.csv

=== Weather Data ===
Total data points: 105126
Columns: ['station_id', 'dataset', 'parameter', 'date', 'value', 'quality']
Time range: 2023-02-14 00:00:00+00:00 to 2025-02-13 00:00:00+00:00


### SAVE IN .CSV FILE FORMAT

In [None]:
# Combine with load forecast data
# Assuming load_forecast_df is your existing DataFrame
def _weather_long_to_wide(weather_long):
    """Pivot long-format observations into one column per parameter/station, indexed by date."""
    missing = {"date", "station_id", "value"} - set(weather_long.columns)
    if missing:
        raise ValueError(
            f"weather_df needs a DatetimeIndex or the columns 'date', 'station_id' "
            f"and 'value'; missing: {sorted(missing)}"
        )

    # Parse dates up front so the pivoted index is a proper DatetimeIndex
    weather_long = weather_long.assign(date=pd.to_datetime(weather_long["date"]))

    # Keep parameters apart when several are present for the same station
    key_cols = [col for col in ("parameter", "station_id") if col in weather_long.columns]
    wide = weather_long.pivot_table(
        index="date", columns=key_cols, values="value", aggfunc="mean"
    )

    # Flatten (parameter, station_id) tuples into plain "<parameter>_<station_id>" names
    wide.columns = [
        "_".join(str(part) for part in col) if isinstance(col, tuple) else str(col)
        for col in wide.columns
    ]
    return wide.sort_index()


def combine_load_and_weather(load_df, weather_df):
    """
    Combine load forecast and weather data on a 15-minute time index.

    Parameters
    ----------
    load_df : pandas.DataFrame
        Load data indexed by timestamp; every row is kept (left join).
    weather_df : DataFrame
        Either a pandas frame that already has a ``DatetimeIndex``, or a
        long-format frame with ``date``, ``station_id`` and ``value`` columns
        (optionally ``parameter``). Frames that are not pandas but offer
        ``to_pandas()`` are converted first. Long-format input is pivoted to
        one column per parameter/station, named ``<parameter>_<station_id>``.

    Returns
    -------
    pandas.DataFrame
        ``load_df`` with the weather columns attached. Weather values are
        forward-filled onto a 15-minute grid; load timestamps outside the
        weather range get NaN.

    Raises
    ------
    ValueError
        If ``weather_df`` has no ``DatetimeIndex`` and lacks the long-format
        columns.
    """
    # Accept non-pandas frames that know how to convert themselves
    if not isinstance(weather_df, pd.DataFrame) and hasattr(weather_df, "to_pandas"):
        weather_df = weather_df.to_pandas()

    # resample() needs a DatetimeIndex; long-format input has a `date` column instead
    if not isinstance(weather_df.index, pd.DatetimeIndex):
        weather_df = _weather_long_to_wide(weather_df)

    # Express weather timestamps in the load data's timezone so the merged
    # frame keeps the load index unchanged
    load_tz = getattr(load_df.index, "tz", None)
    if load_tz is not None and weather_df.index.tz is not None:
        weather_df = weather_df.tz_convert(load_tz)

    # Resample weather data to 15-minute intervals ("15min" replaces the
    # deprecated "15T" alias)
    weather_resampled = weather_df.resample('15min').ffill()

    # Merge the datasets
    combined_df = pd.merge(
        load_df,
        weather_resampled,
        left_index=True,
        right_index=True,
        how='left'
    )

    return combined_df

# Build the load frame here: the cell that normally creates `load_forecast_df`
# sits further down in the notebook, so on a fresh top-to-bottom run the name
# would not exist yet at this point.
load_forecast_df = pd.DataFrame({
    'Actual Load': actual_load,
    'Forecasted Load': forecast_load
})

# Combine and save
load_forecast_weather_df = combine_load_and_weather(load_forecast_df, weather_df)

# Save to CSV under <project_root>/data/raw instead of a machine-specific
# absolute path; create the folder if it is missing
output_path = project_root / "data" / "raw" / "load_forecast_weather_df.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
load_forecast_weather_df.to_csv(output_path)

print("\n=== Combined Dataset ===")
print(f"Total data points: {len(load_forecast_weather_df)}")
print(f"Time range: {load_forecast_weather_df.index.min()} to {load_forecast_weather_df.index.max()}")
print(f"Columns: {load_forecast_weather_df.columns.tolist()}")
print(f"Data saved to: {output_path}")

In [8]:
import pandas as pd
import numpy as np

# Put actual and forecasted load side by side; pandas aligns the two series
# on their timestamps, so rows present in only one of them get NaN in the other
load_forecast_df = pd.DataFrame({
    'Actual Load': actual_load,
    'Forecasted Load': forecast_load
})

# Define the path relative to the project root instead of a machine-specific
# absolute path, and make sure the target folder exists
output_path = project_root / "data" / "raw" / "load_forecast_data_2years.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save DataFrame to CSV (timestamps are kept as the index column)
load_forecast_df.to_csv(output_path, index=True)

print(f"Data saved successfully to: {output_path}")

Data saved successfully to: D:/Projects/PORTF_2_German_Energy_forcast/german-energy-forcast/data/raw/load_forecast_data_2years.csv
