# 1，preparing electricity price 

In [1]:
import pandas as pd
from pathlib import Path

# Set the data path
electricity_path = Path("electricity")  

def process_electricity():
    dfs = []

    for f in electricity_path.glob("SE3*.csv"):
        try:
            # **Read CSV and parse time**
            df = pd.read_csv(f, sep=",", parse_dates=["MTU (CET/CEST)"], 
                            date_parser=lambda x: pd.to_datetime(x.split(" - ")[0], format="%d.%m.%Y %H:%M"))

            # **Rename columns**
            df.rename(columns={"MTU (CET/CEST)": "timestamp_local"}, inplace=True)

            # **Convert to UTC (handle daylight saving time issues)**
            df["timestamp"] = df["timestamp_local"].dt.tz_localize(
                "Europe/Stockholm", ambiguous="NaT", nonexistent="NaT"
            ).dt.tz_convert("UTC")

            # **Remove timezone information to ensure consistency when merging**
            df["timestamp"] = df["timestamp"].dt.tz_localize(None)

            # **Remove invalid timestamps**
            df.drop(columns=["timestamp_local"], inplace=True)

            # **Keep only data from 2019-01-01 onwards**
            df = df[df["timestamp"] >= "2019-01-01"]

            # **Select required columns**
            df = df[["timestamp", "Day-ahead Price [EUR/MWh]"]]

            # **Rename columns**
            df.rename(columns={"Day-ahead Price [EUR/MWh]": "price"}, inplace=True)

            # **Remove rows with "n/e" values**
            df = df[df["price"] != "n/e"]

            # **Store in list**
            dfs.append(df)
            print(f"✅ Read {f.name}, remaining {len(df)} rows after filtering")

        except Exception as e:
            print(f"❌ Failed to read {f.name}: {e}")

    # **Merge all files**
    if dfs:
        price_df = pd.concat(dfs).set_index("timestamp").sort_index()

        # **Save CSV**
        output_file = "electricity_prepared.csv"
        price_df.to_csv(output_file, index=True)
        print(f"✅ Saved {output_file}")
    else:
        print("⚠️ No valid data to save")

# **Run the function**
process_electricity()


  df = pd.read_csv(f, sep=",", parse_dates=["MTU (CET/CEST)"],


✅ Read SE3_2016_entsoe.csv, remaining 0 rows after filtering


  df = pd.read_csv(f, sep=",", parse_dates=["MTU (CET/CEST)"],


✅ Read SE3_2017_entsoe.csv, remaining 0 rows after filtering


  df = pd.read_csv(f, sep=",", parse_dates=["MTU (CET/CEST)"],


✅ Read SE3_2018_entsoe.csv, remaining 0 rows after filtering


  df = pd.read_csv(f, sep=",", parse_dates=["MTU (CET/CEST)"],


✅ Read SE3_2019_entsoe.csv, remaining 8757 rows after filtering


  df = pd.read_csv(f, sep=",", parse_dates=["MTU (CET/CEST)"],


✅ Read SE3_2020_entsoe.csv, remaining 8782 rows after filtering


  df = pd.read_csv(f, sep=",", parse_dates=["MTU (CET/CEST)"],


✅ Read SE3_2021_entsoe.csv, remaining 8758 rows after filtering


  df = pd.read_csv(f, sep=",", parse_dates=["MTU (CET/CEST)"],


✅ Read SE3_2022_entsoe.csv, remaining 8758 rows after filtering


  df = pd.read_csv(f, sep=",", parse_dates=["MTU (CET/CEST)"],


✅ Read SE3_2023_entsoe.csv, remaining 8758 rows after filtering


  df = pd.read_csv(f, sep=",", parse_dates=["MTU (CET/CEST)"],


✅ Read SE3_2024_entsoe.csv, remaining 6671 rows after filtering
✅ Saved electricity_prepared.csv


# 2 Air temperature preparing

In [2]:
import pandas as pd
from pathlib import Path

def process_air_temperature(data_path):
    dfs = []
    
    # Iterate over all CSV files in the parameter_2 folder
    for f in Path(data_path).glob("parameter_2/**/*.csv"):
        df = pd.read_csv(f, sep=",", parse_dates=["Datum"])
        
        # Standardize column names
        df.rename(columns={"Datum": "date", "Lufttemperatur": "AirTemp"}, inplace=True)
        
        # Filter data after 2019-01-01
        df = df[df["date"] >= "2019-01-01"]
        
        if df.empty:
            continue  # Skip the station if all data is filtered out

        # Compute the daily average to ensure only one temperature value per day
        df = df.groupby("date", as_index=False)["AirTemp"].mean()
        
        # Generate 24-hour data (expand each date to 24 rows)
        df = df.loc[df.index.repeat(24)].reset_index(drop=True)
        df["hour"] = list(range(24)) * (len(df) // 24)
        
        # Generate complete timestamps
        df["timestamp"] = pd.to_datetime(df["date"]) + pd.to_timedelta(df["hour"], unit="h")
        
        # Select required columns
        df = df[["timestamp", "AirTemp"]]
        
        dfs.append(df)
    
    # Merge data from all stations
    if dfs:
        air_temp_df = pd.concat(dfs).set_index("timestamp").sort_index()
        
        # If multiple stations have data for the same timestamp, compute the average
        air_temp_df = air_temp_df.groupby("timestamp").mean()
    else:
        air_temp_df = pd.DataFrame(columns=["timestamp", "AirTemp"]).set_index("timestamp")  # Return an empty DataFrame if no data
    
    return air_temp_df

# Specify the data path
data_path = "smhi_data_2022-today"

# Process data
air_temp_df = process_air_temperature(data_path)

# Specify the output file name
output_file = "AirTemp_prepared.csv"

# Save the processed data (automatically overwrite)
air_temp_df.to_csv(output_file, index=True)

# Print a reminder
print(f"File saved: {output_file} (automatically overwritten)")


File saved: AirTemp_prepared.csv (automatically overwritten)


# 3 precipitation_prepared

In [4]:
import pandas as pd
from pathlib import Path
import os

def process_precipitation(data_path):
    dfs = []
    
    # Iterate through all CSV files under the parameter_5 folder
    for f in Path(data_path).glob("parameter_5/**/*.csv"):
        try:
            # Read the CSV file
            df = pd.read_csv(f, sep=",", encoding="utf-8")

            # Handle separator issues
            if df.shape[1] < 2:  
                df = pd.read_csv(f, sep=",", encoding="utf-8")
            if df.shape[1] < 2:  
                df = pd.read_csv(f, sep=";", encoding="utf-8")
            if df.shape[1] < 2:  
                print(f"⚠️ File {f.name} may have issues (less than 2 columns), skipped")
                continue  

            # Get column names (ensure the second column is precipitation)
            col_names = list(df.columns)
            print(f"✅ Read {f.name}, columns: {col_names}")

            date_col, prec_col = col_names[0], col_names[1]  

            # **Remove whitespace to prevent parsing failure**
            df[date_col] = df[date_col].astype(str).str.strip()

            # **Print original `Datum` column (first 5 rows)**
            print(f"📅 {f.name} Original `Datum`:\n", df[date_col].head())

            # **Parse dates**
            df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")

            # **Check date format**
            print(f"📌 {f.name} `date` column data type: {df['date'].dtype}")

            # **Skip if conversion fails**
            if df["date"].isna().all():
                print(f"❌ {f.name} Date parsing failed, skipped")
                continue

            # **Print data range**
            print(f"📊 {f.name} Data range: {df['date'].min()} - {df['date'].max()}")

            # Select necessary columns and rename them
            df = df[["date", prec_col]].rename(columns={prec_col: "precipitation"})

            # **Select only data from after 2019-01-01**
            df = df[df["date"] >= "2019-01-01"]

            # **Print error if data is empty after filtering**
            if df.empty:
                print(f"⚠️ {f.name} Data is empty after filtering for dates after 2019-01-01, possibly all data is before 2019")
                continue  

            # **Fill NaN values**
            df["precipitation"] = df["precipitation"].fillna(0)

            # Take the average per day
            df = df.groupby("date", as_index=False)["precipitation"].mean()

            # **Expand to 24 hours per day**
            df = df.loc[df.index.repeat(24)].reset_index(drop=True)
            df["hour"] = list(range(24)) * (len(df) // 24)

            # **Generate complete timestamps**
            df["timestamp"] = pd.to_datetime(df["date"]) + pd.to_timedelta(df["hour"], unit="h")

            print(f"📊 {f.name} Processed data sample:\n", df.head())

            # **Select final necessary columns**
            df = df[["timestamp", "precipitation"]]

            dfs.append(df)
        
        except Exception as e:
            print(f"❌ Error processing {f.name}: {e}")
    
    # **Merge data from all stations**
    if dfs:
        precipitation_df = pd.concat(dfs).set_index("timestamp").sort_index()
        precipitation_df = precipitation_df.groupby("timestamp").mean()
    else:
        precipitation_df = pd.DataFrame(columns=["timestamp", "precipitation"]).set_index("timestamp")
    
    return precipitation_df

# **Specify data path**
data_path = "smhi_data_2022-today"

# **Process data**
precipitation_df = process_precipitation(data_path)

# **Specify output filename**
output_file = "precipitation_prepared.csv"

# **Delete file if it exists to avoid permission issues**
if os.path.exists(output_file):
    os.remove(output_file)

# **Check data quantity**
print(f"📊 Number of rows in processed data: {len(precipitation_df)}")

# **Save data**
if not precipitation_df.empty:
    precipitation_df.to_csv(output_file, index=True)
    print(f"✅ File saved: {output_file} (automatically overwritten)")
else:
    print(f"⚠️ No valid data found, {output_file} was not generated")


✅ Read station_102170-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_102170-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_102170-SE3.csv `date` column data type: datetime64[ns]
📊 station_102170-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_102170-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_102540-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_102540-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_102540-SE

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📌 station_103080-SE3.csv `date` column data type: datetime64[ns]
📊 station_103080-SE3.csv Data range: 1945-01-01 00:00:00 - 2024-07-10 00:00:00
📊 station_103080-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2024-01-10            0.0     0 2024-01-10 00:00:00
1 2024-01-10            0.0     1 2024-01-10 01:00:00
2 2024-01-10            0.0     2 2024-01-10 02:00:00
3 2024-01-10            0.0     3 2024-01-10 03:00:00
4 2024-01-10            0.0     4 2024-01-10 04:00:00
✅ Read station_103090-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_103090-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_103090-SE3.csv `date` column data type: datetime64[ns]
📊 station_103090-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_103090-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 20

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📌 station_103570-SE3.csv `date` column data type: datetime64[ns]
📊 station_103570-SE3.csv Data range: 1945-01-01 00:00:00 - 1993-12-02 00:00:00
⚠️ station_103570-SE3.csv Data is empty after filtering for dates after 2019-01-01, possibly all data is before 2019
✅ Read station_104090-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_104090-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_104090-SE3.csv `date` column data type: datetime64[ns]
📊 station_104090-SE3.csv Data range: 2014-01-01 00:00:00 - 2023-12-07 00:00:00
📊 station_104090-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📌 station_104580-SE3.csv `date` column data type: datetime64[ns]
📊 station_104580-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_104580-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_105220-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_105220-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_105220-SE3.csv `date` column data type: datetime64[ns]
📊 station_105220-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_105220-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 20

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📊 station_105310-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_105370-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_105370-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_105370-SE3.csv `date` column data type: datetime64[ns]
📊 station_105370-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_105370-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-0

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📊 station_106040-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_106040-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.4     0 2019-01-01 00:00:00
1 2019-01-01            0.4     1 2019-01-01 01:00:00
2 2019-01-01            0.4     2 2019-01-01 02:00:00
3 2019-01-01            0.4     3 2019-01-01 03:00:00
4 2019-01-01            0.4     4 2019-01-01 04:00:00
✅ Read station_106070-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_106070-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_106070-SE3.csv `date` column data type: datetime64[ns]
📊 station_106070-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_106070-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.8     0 2019-01-01 00:00:00
1 2019-01-01   

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📌 station_106360-SE3.csv `date` column data type: datetime64[ns]
📊 station_106360-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_106360-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_107040-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_107040-SE3.csv Original `Datum`:
 0    1953-04-01
1    1953-04-02
2    1953-04-03
3    1953-04-06
4    1953-04-07
Name: Datum, dtype: object
📌 station_107040-SE3.csv `date` column data type: datetime64[ns]
📊 station_107040-SE3.csv Data range: 1953-01-04 00:00:00 - 2024-12-10 00:00:00
📊 station_107040-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 20

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📊 station_71380-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_71420-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_71420-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_71420-SE3.csv `date` column data type: datetime64[ns]
📊 station_71420-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_71420-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 0

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📊 station_72300-SE3.csv Data range: 2014-01-01 00:00:00 - 2020-12-10 00:00:00
📊 station_72300-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_72400-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_72400-SE3.csv Original `Datum`:
 0    1945-01-01
1    1945-01-02
2    1945-01-03
3    1945-01-04
4    1945-01-05
Name: Datum, dtype: object


  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📌 station_72400-SE3.csv `date` column data type: datetime64[ns]
📊 station_72400-SE3.csv Data range: 1945-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_72400-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_72450-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_72450-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_72450-SE3.csv `date` column data type: datetime64[ns]
📊 station_72450-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_72450-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📊 station_74180-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.6     0 2019-01-01 00:00:00
1 2019-01-01            0.6     1 2019-01-01 01:00:00
2 2019-01-01            0.6     2 2019-01-01 02:00:00
3 2019-01-01            0.6     3 2019-01-01 03:00:00
4 2019-01-01            0.6     4 2019-01-01 04:00:00
✅ Read station_74240-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_74240-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_74240-SE3.csv `date` column data type: datetime64[ns]
📊 station_74240-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_74240-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 0

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📊 station_74440-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_74470-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_74470-SE3.csv Original `Datum`:
 0    1858-12-01
1    1858-12-02
2    1858-12-03
3    1858-12-04
4    1858-12-05
Name: Datum, dtype: object


  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📌 station_74470-SE3.csv `date` column data type: datetime64[ns]
📊 station_74470-SE3.csv Data range: 1858-01-12 00:00:00 - 2024-12-10 00:00:00
📊 station_74470-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_74480-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_74480-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_74480-SE3.csv `date` column data type: datetime64[ns]
📊 station_74480-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_74480-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_77220-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_77220-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_77220-SE3.csv `date` column data type: datetime64[ns]
📊 station_77220-SE3.csv Data range: 2014-01-01 00:00:00 - 2021-12-09 00:00:00
📊 station_77220-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.3     0 2019-01-01 00:00:00
1 2019-01-01            0.3     1 2019-01-01 01:00:00
2 2019-01-01            0.3     2 2019-01-01 02:00:00
3 2019-01-01            0.3     3 2019-01-01 03:00:00
4 2019-01-01            0.3     4 2019-01-01 04:00:00
✅ Read station_78140-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_78140-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_78140-SE3.csv `d

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📊 station_78320-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_78390-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_78390-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_78390-SE3.csv `date` column data type: datetime64[ns]
📊 station_78390-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_78390-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.5     0 2019-01-01 00:00:00
1 2019-01-01            0.5     1 2019-01-01 01:00:00
2 2019-01-01            0.5     2 2019-01-01 0

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_78420-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_78420-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_78420-SE3.csv `date` column data type: datetime64[ns]
📊 station_78420-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_78420-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.4     0 2019-01-01 00:00:00
1 2019-01-01            0.4     1 2019-01-01 01:00:00
2 2019-01-01            0.4     2 2019-01-01 02:00:00
3 2019-01-01            0.4     3 2019-01-01 03:00:00
4 2019-01-01            0.4     4 2019-01-01 04:00:00
✅ Read station_79580-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_79580-SE3.csv Original `Datum`:
 0    1946-12-31
1    1947-01-01
2    1947-01-02
3    1947-01-03
4    1947-01-04
Name: Datum, dtype: object
📌 station_79580-SE3.csv `d

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_81210-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_81210-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_81210-SE3.csv `date` column data type: datetime64[ns]
📊 station_81210-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_81210-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_81540-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_81540-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_81540-SE3.csv `d

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_82230-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_82230-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_82230-SE3.csv `date` column data type: datetime64[ns]
📊 station_82230-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_82230-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_82260-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_82260-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_82260-SE3.csv `d

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_83090-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_83090-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_83090-SE3.csv `date` column data type: datetime64[ns]
📊 station_83090-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_83090-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_83150-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_83150-SE3.csv Original `Datum`:
 0    1945-01-01
1    1945-01-02
2    1945-01-03
3    1945-01-04
4    1945-01-05
Name: Datum, dtype: object
📌 station_83150-SE3.csv `d

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_83210-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_83210-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_83210-SE3.csv `date` column data type: datetime64[ns]
📊 station_83210-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_83210-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_83230-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_83230-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_83230-SE3.csv `d

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📌 station_83270-SE3.csv `date` column data type: datetime64[ns]
📊 station_83270-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_83270-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_83280-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_83280-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_83280-SE3.csv `date` column data type: datetime64[ns]
📊 station_83280-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_83280-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📌 station_84390-SE3.csv `date` column data type: datetime64[ns]
📊 station_84390-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_84390-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_84470-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_84470-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_84470-SE3.csv `date` column data type: datetime64[ns]
📊 station_84470-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_84470-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📊 station_85180-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_85240-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_85240-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_85240-SE3.csv `date` column data type: datetime64[ns]
📊 station_85240-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_85240-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.1     0 2019-01-01 00:00:00
1 2019-01-01            0.1     1 2019-01-01 01:00:00
2 2019-01-01            0.1     2 2019-01-01 0

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_85280-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_85280-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_85280-SE3.csv `date` column data type: datetime64[ns]
📊 station_85280-SE3.csv Data range: 2014-01-01 00:00:00 - 2018-12-12 00:00:00
⚠️ station_85280-SE3.csv Data is empty after filtering for dates after 2019-01-01, possibly all data is before 2019
✅ Read station_85330-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_85330-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_85330-SE3.csv `date` column data type: datetime64[ns]
📊 station_85330-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_85330-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📌 station_85490-SE3.csv `date` column data type: datetime64[ns]
📊 station_85490-SE3.csv Data range: 2014-01-01 00:00:00 - 2023-12-12 00:00:00
📊 station_85490-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_86010-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_86010-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_86010-SE3.csv `date` column data type: datetime64[ns]
📊 station_86010-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_86010-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_86470-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_86470-SE3.csv Original `Datum`:
 0    2014-06-01
1    2014-06-02
2    2014-06-03
3    2014-06-04
4    2014-06-05
Name: Datum, dtype: object
📌 station_86470-SE3.csv `date` column data type: datetime64[ns]
📊 station_86470-SE3.csv Data range: 2014-01-06 00:00:00 - 2014-12-10 00:00:00
⚠️ station_86470-SE3.csv Data is empty after filtering for dates after 2019-01-01, possibly all data is before 2019
✅ Read station_86660-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_86660-SE3.csv Original `Datum`:
 0    1859-08-09
1    1859-08-10
2    1859-08-11
3    1859-08-12
4    1859-08-13
Name: Datum, dtype: object
📌 station_86660-SE3.csv `date` column data type: datetime64[ns]
📊 station_86660-SE3.csv Data range: 1859-01-09 00:00:00 - 2024-12-10 00:00:00
📊 station_86660-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2020-01-11            4.9     0 2020

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📊 station_87450-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_87570-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_87570-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_87570-SE3.csv `date` column data type: datetime64[ns]
📊 station_87570-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_87570-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2020-01-07            0.2     0 2020-01-07 00:00:00
1 2020-01-07            0.2     1 2020-01-07 01:00:00
2 2020-01-07            0.2     2 2020-01-07 0

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📊 station_92100-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_92380-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_92380-SE3.csv Original `Datum`:
 0    1969-06-01
1    1969-06-02
2    1969-06-03
3    1969-06-04
4    1969-06-05
Name: Datum, dtype: object
📌 station_92380-SE3.csv `date` column data type: datetime64[ns]
📊 station_92380-SE3.csv Data range: 1969-01-06 00:00:00 - 2024-12-10 00:00:00
📊 station_92380-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 0

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_94140-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_94140-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_94140-SE3.csv `date` column data type: datetime64[ns]
📊 station_94140-SE3.csv Data range: 2014-01-01 00:00:00 - 2023-12-06 00:00:00
📊 station_94140-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_94180-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_94180-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_94180-SE3.csv `d

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📊 station_94200-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_94450-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_94450-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_94450-SE3.csv `date` column data type: datetime64[ns]
📊 station_94450-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_94450-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 0

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_95030-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_95030-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_95030-SE3.csv `date` column data type: datetime64[ns]
📊 station_95030-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_95030-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_95160-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_95160-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_95160-SE3.csv `d

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📅 station_95490-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_95490-SE3.csv `date` column data type: datetime64[ns]
📊 station_95490-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_95490-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.0     0 2019-01-01 00:00:00
1 2019-01-01            0.0     1 2019-01-01 01:00:00
2 2019-01-01            0.0     2 2019-01-01 02:00:00
3 2019-01-01            0.0     3 2019-01-01 03:00:00
4 2019-01-01            0.0     4 2019-01-01 04:00:00
✅ Read station_95530-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_95530-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_95530-SE3.csv `date` column data type: datetime64[ns]
📊 station_95530-SE3.csv Data range: 2014-0

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_96140-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_96140-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_96140-SE3.csv `date` column data type: datetime64[ns]
📊 station_96140-SE3.csv Data range: 2014-01-01 00:00:00 - 2014-12-06 00:00:00
⚠️ station_96140-SE3.csv Data is empty after filtering for dates after 2019-01-01, possibly all data is before 2019
✅ Read station_96230-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_96230-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_96230-SE3.csv `date` column data type: datetime64[ns]
📊 station_96230-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-09 00:00:00
📊 station_96230-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.6     0 2019

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📌 station_96550-SE3.csv `date` column data type: datetime64[ns]
📊 station_96550-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_96550-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            1.0     0 2019-01-01 00:00:00
1 2019-01-01            1.0     1 2019-01-01 01:00:00
2 2019-01-01            1.0     2 2019-01-01 02:00:00
3 2019-01-01            1.0     3 2019-01-01 03:00:00
4 2019-01-01            1.0     4 2019-01-01 04:00:00
✅ Read station_97070-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_97070-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_97070-SE3.csv `date` column data type: datetime64[ns]
📊 station_97070-SE3.csv Data range: 2014-01-01 00:00:00 - 2018-12-10 00:00:00
⚠️ station_97070-SE3.csv Data is empty after filtering for dates after 2019-01-01, possibly all data is before 201

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📌 station_97120-SE3.csv `date` column data type: datetime64[ns]
📊 station_97120-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_97120-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.3     0 2019-01-01 00:00:00
1 2019-01-01            0.3     1 2019-01-01 01:00:00
2 2019-01-01            0.3     2 2019-01-01 02:00:00
3 2019-01-01            0.3     3 2019-01-01 03:00:00
4 2019-01-01            0.3     4 2019-01-01 04:00:00
✅ Read station_97170-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_97170-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_97170-SE3.csv `date` column data type: datetime64[ns]
📊 station_97170-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_97170-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


📊 station_97520-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.3     0 2019-01-01 00:00:00
1 2019-01-01            0.3     1 2019-01-01 01:00:00
2 2019-01-01            0.3     2 2019-01-01 02:00:00
3 2019-01-01            0.3     3 2019-01-01 03:00:00
4 2019-01-01            0.3     4 2019-01-01 04:00:00
✅ Read station_97530-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_97530-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_97530-SE3.csv `date` column data type: datetime64[ns]
📊 station_97530-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_97530-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            2.1     0 2019-01-01 00:00:00
1 2019-01-01            2.1     1 2019-01-01 01:00:00
2 2019-01-01            2.1     2 2019-01-01 0

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_98140-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_98140-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_98140-SE3.csv `date` column data type: datetime64[ns]
📊 station_98140-SE3.csv Data range: 2014-01-01 00:00:00 - 2024-12-10 00:00:00
📊 station_98140-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.8     0 2019-01-01 00:00:00
1 2019-01-01            0.8     1 2019-01-01 01:00:00
2 2019-01-01            0.8     2 2019-01-01 02:00:00
3 2019-01-01            0.8     3 2019-01-01 03:00:00
4 2019-01-01            0.8     4 2019-01-01 04:00:00
✅ Read station_98170-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_98170-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_98170-SE3.csv `d

  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")
  df["date"] = pd.to_datetime(df[date_col], infer_datetime_format=True, dayfirst=True, errors="coerce")


✅ Read station_98290-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_98290-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_98290-SE3.csv `date` column data type: datetime64[ns]
📊 station_98290-SE3.csv Data range: 2014-01-01 00:00:00 - 2014-12-12 00:00:00
⚠️ station_98290-SE3.csv Data is empty after filtering for dates after 2019-01-01, possibly all data is before 2019
✅ Read station_99270-SE3.csv, columns: ['Datum', 'Nederbördsmängd', 'Kvalitet']
📅 station_99270-SE3.csv Original `Datum`:
 0    2014-01-01
1    2014-01-02
2    2014-01-03
3    2014-01-04
4    2014-01-05
Name: Datum, dtype: object
📌 station_99270-SE3.csv `date` column data type: datetime64[ns]
📊 station_99270-SE3.csv Data range: 2014-01-01 00:00:00 - 2021-12-12 00:00:00
📊 station_99270-SE3.csv Processed data sample:
         date  precipitation  hour           timestamp
0 2019-01-01            0.1     0 2019

# 4 SnowDepth_prepared

In [6]:
import pandas as pd
from pathlib import Path
import os

def process_snow_depth(data_path):
    dfs = []
    
    # Traverse all CSV files under the parameter_8 folder
    for f in Path(data_path).glob("parameter_8/**/*.csv"):
        try:
            # First, try reading with a tab separator
            df = pd.read_csv(f, sep="\t", encoding="utf-8")

            # Handle file format anomalies (try different separators)
            if df.shape[1] < 3:  
                df = pd.read_csv(f, sep=",", encoding="utf-8")  # Try comma separator
            if df.shape[1] < 3:  
                df = pd.read_csv(f, sep=";", encoding="utf-8")  # Try semicolon separator
            if df.shape[1] < 3:  
                print(f"⚠️ File {f.name} may have an issue (fewer than 3 columns), skipped")
                continue  # Skip problematic files

            # Get column names (dynamic detection)
            col_names = list(df.columns)
            print(f"✅ Read {f.name}, Column names: {col_names}")  # Print column names

            date_col, time_col, snow_col = col_names[0], col_names[1], col_names[2]  # Take the first 3 columns

            # Parse datetime
            df["timestamp"] = pd.to_datetime(df[date_col] + " " + df[time_col], errors="coerce")

            # Select required columns
            df = df[["timestamp", snow_col]].rename(columns={snow_col: "SnowDepth"})

            # Filter data from 2019-01-01 onwards
            df = df[df["timestamp"] >= "2019-01-01"]

            if df.empty:
                continue  # Skip if all data for this station is filtered out

            # Compute daily average
            df["date"] = df["timestamp"].dt.date
            df = df.groupby("date", as_index=False)["SnowDepth"].mean()

            # Expand to 24 hours per day
            df = df.loc[df.index.repeat(24)].reset_index(drop=True)
            df["hour"] = list(range(24)) * (len(df) // 24)

            # Generate complete timestamps
            df["timestamp"] = pd.to_datetime(df["date"]) + pd.to_timedelta(df["hour"], unit="h")

            # Select required columns, ensuring `timestamp` is the second column
            df = df[["timestamp", "SnowDepth"]]

            dfs.append(df)
        
        except Exception as e:
            print(f"❌ Error processing {f.name}: {e}")
    
    # Merge data from all stations
    if dfs:
        snow_depth_df = pd.concat(dfs).set_index("timestamp").sort_index()
        snow_depth_df = snow_depth_df.groupby("timestamp").mean()
    else:
        snow_depth_df = pd.DataFrame(columns=["timestamp", "SnowDepth"]).set_index("timestamp")  # Empty table
    
    return snow_depth_df

# Specify data path
data_path = "smhi_data_2022-today"

# Process data
snow_depth_df = process_snow_depth(data_path)

# Specify output file name
output_file = "SnowDepth_prepared.csv"

# Delete the file if it exists to avoid permission errors
if os.path.exists(output_file):
    os.remove(output_file)

# Save the processed data (automatically overwrite)
snow_depth_df.to_csv(output_file, index=True)

# Print notification
print(f"✅ File saved: {output_file} (automatically overwritten)")


✅ Read station_102170-SE3.csv, Column names: ['Datum', 'Tid (UTC)', 'Snödjup', 'Kvalitet']
✅ Read station_102540-SE3.csv, Column names: ['Datum', 'Tid (UTC)', 'Snödjup', 'Kvalitet']
✅ Read station_103080-SE3.csv, Column names: ['Datum', 'Tid (UTC)', 'Snödjup', 'Kvalitet']
✅ Read station_103090-SE3.csv, Column names: ['Datum', 'Tid (UTC)', 'Snödjup', 'Kvalitet']
✅ Read station_103410-SE3.csv, Column names: ['Datum', 'Tid (UTC)', 'Snödjup', 'Kvalitet']
✅ Read station_103570-SE3.csv, Column names: ['Datum', 'Tid (UTC)', 'Snödjup', 'Kvalitet']
✅ Read station_104090-SE3.csv, Column names: ['Datum', 'Tid (UTC)', 'Snödjup', 'Kvalitet']
✅ Read station_104300-SE3.csv, Column names: ['Datum', 'Tid (UTC)', 'Snödjup', 'Kvalitet']
✅ Read station_105310-SE3.csv, Column names: ['Datum', 'Tid (UTC)', 'Snödjup', 'Kvalitet']
✅ Read station_105370-SE3.csv, Column names: ['Datum', 'Tid (UTC)', 'Snödjup', 'Kvalitet']
✅ Read station_105450-SE3.csv, Column names: ['Datum', 'Tid (UTC)', 'Snödjup', 'Kvalitet']

# 5 Sunshine Time preparing

In [7]:
import pandas as pd
from pathlib import Path
import os

def process_sunshine_time(data_path):
    dfs = []
    
    # Iterate over all CSV files in the parameter_10 folder
    for f in Path(data_path).glob("parameter_10/**/*.csv"):
        try:
            # Read CSV file
            df = pd.read_csv(f, sep="\t", encoding="utf-8")

            # Handle different delimiters
            if df.shape[1] < 2:
                df = pd.read_csv(f, sep=",", encoding="utf-8")
            if df.shape[1] < 2:
                df = pd.read_csv(f, sep=";", encoding="utf-8")

            # Ensure the correct number of columns
            if df.shape[1] < 3:
                print(f"⚠️ {f.name} might have issues (fewer than 3 columns), skipping")
                continue  

            # Get column names
            col_names = list(df.columns)
            print(f"✅ Read {f.name}, column names: {col_names}")

            date_col, time_col, sun_col = col_names[0], col_names[1], col_names[2]

            # Trim whitespace
            df[date_col] = df[date_col].astype(str).str.strip()
            df[time_col] = df[time_col].astype(str).str.strip()

            # **Try multiple date format parsing methods**
            df["date_parsed"] = pd.to_datetime(df[date_col], errors="coerce", format="%Y/%m/%d")
            if df["date_parsed"].isna().all():
                df["date_parsed"] = pd.to_datetime(df[date_col], errors="coerce", format="%Y-%m-%d")

            # **If parsing still fails, skip the file**
            if df["date_parsed"].isna().all():
                print(f"❌ {f.name} date parsing failed, skipping")
                continue

            # Parse time
            df["timestamp"] = pd.to_datetime(df["date_parsed"].astype(str) + " " + df[time_col], errors="coerce")

            # **Check if `timestamp` parsing was successful**
            if df["timestamp"].isna().all():
                print(f"❌ {f.name} `timestamp` parsing failed, skipping")
                continue
            
            # Select required columns and rename
            df = df[["timestamp", sun_col]].rename(columns={sun_col: "SunshineTime"})

            # **Filter data after 2019-01-01**
            df = df[df["timestamp"] >= "2019-01-01"]

            if df.empty:
                print(f"⚠️ {f.name} has no data after filtering")
                continue  

            # **Handle NaN values**
            df["SunshineTime"] = df["SunshineTime"].fillna(0)

            # **Calculate hourly average**
            df = df.groupby("timestamp", as_index=False)["SunshineTime"].mean()

            # **Sort by timestamp**
            df = df.sort_values("timestamp")

            print(f"📊 Processed data sample from {f.name}:\n", df.head())

            dfs.append(df)
        
        except Exception as e:
            print(f"❌ Error processing {f.name}: {e}")
    
    # **Merge data from all sites**
    if dfs:
        sunshine_df = pd.concat(dfs).set_index("timestamp").sort_index()
        sunshine_df = sunshine_df.groupby("timestamp").mean()
    else:
        sunshine_df = pd.DataFrame(columns=["timestamp", "SunshineTime"]).set_index("timestamp")
    
    return sunshine_df

# **Specify data path**
data_path = "smhi_data_2022-today"

# **Process data**
sunshine_df = process_sunshine_time(data_path)

# **Specify output file name**
output_file = "SunshineTime_prepared.csv"

# **Delete file if it exists to avoid permission issues**
if os.path.exists(output_file):
    os.remove(output_file)

# **Check data size**
print(f"📊 Number of rows in processed data: {len(sunshine_df)}")

# **Save data**
if not sunshine_df.empty:
    sunshine_df.to_csv(output_file, index=True)
    print(f"✅ File saved: {output_file} (automatically overwritten)")
else:
    print(f"⚠️ No valid data found, {output_file} was not created")


✅ Read station_105285-SE3.csv, column names: ['Datum', 'Tid (UTC)', 'Solskenstid', 'Kvalitet']
📊 Processed data sample from station_105285-SE3.csv:
             timestamp  SunshineTime
0 2019-01-01 00:00:00           0.0
1 2019-01-01 01:00:00           0.0
2 2019-01-01 02:00:00           0.0
3 2019-01-01 03:00:00           0.0
4 2019-01-01 04:00:00           0.0
✅ Read station_71415-SE3.csv, column names: ['Datum', 'Tid (UTC)', 'Solskenstid', 'Kvalitet']
📊 Processed data sample from station_71415-SE3.csv:
             timestamp  SunshineTime
0 2019-01-01 00:00:00           0.0
1 2019-01-01 01:00:00           0.0
2 2019-01-01 02:00:00           0.0
3 2019-01-01 03:00:00           0.0
4 2019-01-01 04:00:00           0.0
✅ Read station_77215-SE3.csv, column names: ['Datum', 'Tid (UTC)', 'Solskenstid', 'Kvalitet']
📊 Processed data sample from station_77215-SE3.csv:
             timestamp  SunshineTime
0 2019-01-01 00:00:00           0.0
1 2019-01-01 01:00:00           0.0
2 2019-01-01 02:0

# 6 Merge Data for Modeling

In [8]:
import pandas as pd

# Read all CSV files
files = {
    "SunshineTime": "SunshineTime_prepared.csv",
    "Precipitation": "precipitation_prepared.csv",
    "SnowDepth": "SnowDepth_prepared.csv",
    "AirTemp": "AirTemp_prepared.csv",
    "Export": "exported_prepared.csv",
    "ElectricityPrice": "electricity_prepared.csv",
}

dfs = {}

for name, file in files.items():
    try:
        # **Check column names first**
        temp_df = pd.read_csv(file, nrows=5)
        print(f"📌 {file} Column Names: {list(temp_df.columns)}")

        # **Automatically detect the time column**
        time_col = "timestamp" if "timestamp" in temp_df.columns else "timestamp_utc"

        # **Read the data**
        df = pd.read_csv(file, parse_dates=[time_col], index_col=time_col)

        # **Special handling for electricity_prepared.csv**
        if name == "ElectricityPrice":
            # **Remove rows containing "n/e"**
            df = df[df.iloc[:, 0] != "n/e"]
            df.iloc[:, 0] = pd.to_numeric(df.iloc[:, 0], errors="coerce")  # Convert to numeric

        dfs[name] = df
        print(f"✅ Successfully read {file}, with {len(df)} rows of data")
    
    except Exception as e:
        print(f"❌ Failed to read {file}: {e}")

# **Merge data**
merged_df = None

for name, df in dfs.items():
    if merged_df is None:
        merged_df = df  # Initialize
    else:
        merged_df = merged_df.merge(df, on=df.index.name, how="outer")

# **Sort by time**
merged_df = merged_df.sort_index()

# **Trim the time range to match the ElectricityPrice dataset**
if "ElectricityPrice" in dfs:
    start_time = dfs["ElectricityPrice"].index.min()
    end_time = dfs["ElectricityPrice"].index.max()
    merged_df = merged_df.loc[start_time:end_time]
    print(f"⏳ Data time range trimmed to {start_time} - {end_time}")

# **Save the merged data**
output_file = "Modeling_Data.csv"
merged_df.to_csv(output_file)

print(f"✅ Successfully merged all data and saved as {output_file} (automatically overwritten)")


📌 SunshineTime_prepared.csv Column Names: ['timestamp', 'SunshineTime']
✅ Successfully read SunshineTime_prepared.csv, with 51151 rows of data
📌 precipitation_prepared.csv Column Names: ['timestamp', 'precipitation']
✅ Successfully read precipitation_prepared.csv, with 51624 rows of data
📌 SnowDepth_prepared.csv Column Names: ['timestamp', 'SnowDepth']
✅ Successfully read SnowDepth_prepared.csv, with 51168 rows of data
📌 AirTemp_prepared.csv Column Names: ['timestamp', 'AirTemp']
✅ Successfully read AirTemp_prepared.csv, with 51144 rows of data
📌 exported_prepared.csv Column Names: ['timestamp', 'export']
✅ Successfully read exported_prepared.csv, with 52608 rows of data
📌 electricity_prepared.csv Column Names: ['timestamp', 'price']
✅ Successfully read electricity_prepared.csv, with 50484 rows of data
⏳ Data time range trimmed to 2019-01-01 00:00:00 - 2024-10-04 21:00:00
✅ Successfully merged all data and saved as Modeling_Data.csv (automatically overwritten)
