In [47]:
import requests
import pandas as pd

# Open-Meteo API Endpoint
BASE_URL = "https://api.open-meteo.com/v1/forecast"

# Location: Berlin, Germany (Example)
LAT, LON = 52.52, 13.41  # Change to your desired location

# Number of past days to fetch (Max = 92 days)
PAST_DAYS = 92  

# API Parameters with All Features
params = {
    "latitude": LAT,
    "longitude": LON,
    "past_days": PAST_DAYS,
    "hourly": "temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,"
              "precipitation_probability,precipitation,rain,showers,snowfall,snow_depth,"
              "weather_code,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,"
              "cloud_cover_mid,cloud_cover_high,visibility,evapotranspiration,"
              "et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_180m,"
              "wind_direction_180m,wind_gusts_10m,temperature_180m,soil_temperature_54cm,"
              "soil_moisture_27_to_81cm,is_day",
    "daily": "weather_code,temperature_2m_max,temperature_2m_min,apparent_temperature_max,"
             "apparent_temperature_min,sunrise,sunset,daylight_duration,sunshine_duration,"
             "uv_index_max,precipitation_sum,rain_sum,showers_sum,snowfall_sum,"
             "precipitation_hours,precipitation_probability_max,wind_speed_10m_max,"
             "wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,"
             "et0_fao_evapotranspiration",
    "timezone": "auto"
}

# Make API Request
response = requests.get(BASE_URL, params=params)

# Check for successful response
if response.status_code == 200:
    data = response.json()
    print("✅ Historical Weather Data Fetched Successfully!")
    
    # Extract and format hourly data
    hourly_data = pd.DataFrame(data["hourly"])
    if "time" in hourly_data:
        hourly_data["time"] = pd.to_datetime(hourly_data["time"])  # Convert time to datetime
    
    print("\n📊 Hourly Weather Data Sample:")
    print(hourly_data.head())

    # Extract and format daily data
    daily_data = pd.DataFrame(data["daily"])
    if "time" in daily_data:
        daily_data["time"] = pd.to_datetime(daily_data["time"])  # Convert time to datetime
    
    print("\n📊 Daily Weather Data Sample:")
    print(daily_data.head())

else:
    print(f"❌ Error {response.status_code}: {response.text}")


✅ Historical Weather Data Fetched Successfully!

📊 Hourly Weather Data Sample:
                 time  temperature_2m  relative_humidity_2m  dew_point_2m  \
0 2024-12-06 00:00:00             2.3                    79          -1.0   
1 2024-12-06 01:00:00             3.2                    73          -1.2   
2 2024-12-06 02:00:00             3.9                    65          -2.1   
3 2024-12-06 03:00:00             4.1                    67          -1.5   
4 2024-12-06 04:00:00             3.6                    79           0.3   

   apparent_temperature  precipitation_probability  precipitation  rain  \
0                  -3.2                         15            0.0   0.0   
1                  -2.1                         23            0.0   0.0   
2                  -1.6                         29            0.0   0.0   
3                  -1.5                         35            0.0   0.0   
4                  -1.8                         45            0.6   0.6   

   show

In [48]:
hourly_data.shape

(2376, 29)

In [51]:
daily_data.shape

(99, 22)

In [53]:
# Save Hourly Data
hourly_data.to_csv("hourly_weather_data.csv", index=False)

# Save Daily Data
daily_data.to_csv("daily_weather_data.csv", index=False)

print("✅ Data saved successfully as CSV files!")


✅ Data saved successfully as CSV files!


In [55]:
import os

# Get the current working directory
cwd = os.getcwd()
print("Files saved in:", cwd)


Files saved in: C:\Users\samyu\Downloads\project-bolt-sb1-8uymgqgg\project


In [57]:
hourly_data.to_csv("C:/Users/samyu/Documents/hourly_weather_data.csv", index=False)
daily_data.to_csv("C:/Users/samyu/Documents/daily_weather_data.csv", index=False)


In [11]:
import pandas as pd
import numpy as np

# Load dataset
file_path = r"C:\Users\samyu\Downloads\project-bolt-sb1-mdwuunns\model\daily_weather_data.csv"
output_file = r"C:\Users\samyu\Downloads\project-bolt-sb1-mdwuunns\model\standardized_weather_data.csv"

df = pd.read_csv(file_path)

# Print column names for debugging
print("Columns in dataset:", df.columns.tolist())

# Convert 'time' column to proper datetime format if it exists
if "time" in df.columns:
    df["Date Time (ISO 8601)"] = pd.to_datetime(df["time"]).dt.strftime("%Y-%m-%d %H:%M:%S")

# Rename columns to match required format
rename_dict = {
    "temperature_2m_max": "T (°C)",
    "temperature_2m_min": "Tdew (°C)",  # Assuming min temp is dew point
    "pressure_msl": "p (mbar)",  # Ensure correct pressure naming
    "relative_humidity_2m_max": "rh (%)",
    "wind_speed_10m_max": "wv (km/h)",
    "wind_gusts_10m_max": "max. wv (km/h)",
    "wind_direction_10m_dominant": "wd (°)",
}

df.rename(columns=rename_dict, inplace=True)

# Print column names after renaming
print("Columns after renaming:", df.columns.tolist())

# Ensure required columns exist
required_columns = rename_dict.values()
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    print("Warning: Missing columns after renaming:", missing_columns)
    for col in missing_columns:
        df[col] = np.nan  # Fill missing columns with NaN

# Constants
R_SPECIFIC = 287.05  # J/kg·K (Specific gas constant for dry air)

# Calculate vapor pressures
df['VPmax (mbar)'] = 6.1078 * 10 ** ((7.5 * df['T (°C)']) / (df['T (°C)'] + 237.3))
df['VPact (mbar)'] = 6.1078 * 10 ** ((7.5 * df['Tdew (°C)']) / (df['Tdew (°C)'] + 237.3))
df['VPdef (mbar)'] = df['VPmax (mbar)'] - df['VPact (mbar)']
df['rh (%)'] = 100 * (df['VPact (mbar)'] / df['VPmax (mbar)'])
df['rh (%)'].fillna(50, inplace=True)  # Default to 50% if NaN remains

# Calculate specific humidity (sh) and water content
df['sh (g/kg)'] = (0.622 * df['VPact (mbar)']) / (df['p (mbar)'].fillna(1013.25) - (0.378 * df['VPact (mbar)'])) * 1000
df['H2OC (mmol/mol)'] = df['sh (g/kg)'] * 1000 / 18

# Convert temperature to Kelvin
df["Tpot (K)"] = df["T (°C)"].fillna(0) + 273.15  

# Calculate air density (ρ)
df['rho (g/m**3)'] = df['p (mbar)'].fillna(1013.25) * 100 / (R_SPECIFIC * df['Tpot (K)'])

# **NEW: Calculate virtual temperature (Tv)**
df["q"] = df["sh (g/kg)"] / 1000  # Convert g/kg to kg/kg
df["Tv (K)"] = df["Tpot (K)"] * (1 + 0.61 * df["q"])

# **NEW: Calculate pressure (p in mbar) dynamically**
df["p (mbar)"] = (df["rho (g/m**3)"] * R_SPECIFIC * df["Tv (K)"]) / 100

# Convert wind speeds
df["wv (m/s)"] = df["wv (km/h)"].fillna(0) / 3.6
df["max. wv (m/s)"] = df["max. wv (km/h)"].fillna(0) / 3.6

# Ensure all required columns exist
desired_columns = [
    "Date Time (ISO 8601)", "p (mbar)", "T (°C)", "Tpot (K)", "Tdew (°C)",
    "rh (%)", "VPmax (mbar)", "VPact (mbar)", "VPdef (mbar)", "sh (g/kg)",
    "H2OC (mmol/mol)", "rho (g/m**3)", "wv (m/s)", "max. wv (m/s)", "wd (°)"
]

for col in desired_columns:
    if col not in df.columns:
        df[col] = np.nan  # Ensure all columns exist

df = df[desired_columns]  # Reorder columns

df.to_csv(output_file, index=False)
print(f"Processed dataset saved at: {output_file}")


Columns in dataset: ['time', 'weather_code', 'temperature_2m_max', 'temperature_2m_min', 'apparent_temperature_max', 'apparent_temperature_min', 'sunrise', 'sunset', 'daylight_duration', 'sunshine_duration', 'uv_index_max', 'precipitation_sum', 'rain_sum', 'showers_sum', 'snowfall_sum', 'precipitation_hours', 'precipitation_probability_max', 'wind_speed_10m_max', 'wind_gusts_10m_max', 'wind_direction_10m_dominant', 'shortwave_radiation_sum', 'et0_fao_evapotranspiration']
Columns after renaming: ['time', 'weather_code', 'T (°C)', 'Tdew (°C)', 'apparent_temperature_max', 'apparent_temperature_min', 'sunrise', 'sunset', 'daylight_duration', 'sunshine_duration', 'uv_index_max', 'precipitation_sum', 'rain_sum', 'showers_sum', 'snowfall_sum', 'precipitation_hours', 'precipitation_probability_max', 'wv (km/h)', 'max. wv (km/h)', 'wd (°)', 'shortwave_radiation_sum', 'et0_fao_evapotranspiration', 'Date Time (ISO 8601)']
Processed dataset saved at: C:\Users\samyu\Downloads\project-bolt-sb1-mdwuu

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rh (%)'].fillna(50, inplace=True)  # Default to 50% if NaN remains


In [151]:
df.info()
print(df['Date Time (ISO 8601)'].unique())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2376 entries, 0 to 2375
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Date Time (ISO 8601)  2376 non-null   object 
 1   p (mbar)              0 non-null      float64
 2   T (°C)                0 non-null      float64
 3   Tpot (K)              2376 non-null   float64
 4   Tdew (°C)             0 non-null      float64
 5   rh (%)                2376 non-null   float64
 6   VPmax (mbar)          0 non-null      float64
 7   VPact (mbar)          0 non-null      float64
 8   VPdef (mbar)          0 non-null      float64
 9   sh (g/kg)             0 non-null      float64
 10  H2OC (mmol/mol)       0 non-null      float64
 11  rho (g/m**3)          2376 non-null   float64
 12  wv (m/s)              2376 non-null   float64
 13  max. wv (m/s)         2376 non-null   float64
 14  wd (°)                0 non-null      float64
dtypes: float64(14), objec

In [23]:
#changing the API retrev

In [123]:
import pandas as pd

# Load API-extracted dataset
api_data = pd.read_csv("standardized_weather_data.csv")

# Ensure there's a timestamp column (modify this based on actual column names)
if 'timestamp' not in api_data.columns:
    raise ValueError("Timestamp column is missing in API dataset. Add or generate it.")

# Convert timestamp column to datetime format
api_data['timestamp'] = pd.to_datetime(api_data['timestamp'])

# Set timestamp as index
api_data.set_index('timestamp', inplace=True)

# Resample data to 10-minute intervals (matching Jena Climate dataset)
api_data_resampled = api_data.resample('10T').interpolate(method='linear')

# Save the resampled data
api_data_resampled.to_csv("resampled_weather_data.csv")

print("API dataset resampled to 10-minute intervals and saved as resampled_weather_data.csv")


ValueError: Timestamp column is missing in API dataset. Add or generate it.

In [119]:
df_shifted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       100 non-null    int64
 1   B       100 non-null    int64
dtypes: int64(2)
memory usage: 1.7 KB


In [43]:
import pandas as pd

# Rename 'Date Time' columns for consistency
df_shifted.rename(columns={"Date Time (ISO 8601)": "date_time"}, inplace=True)
jena_climate_2009_2016.rename(columns={"Date Time": "date_time"}, inplace=True)

# Convert 'date_time' to datetime format
df_shifted["date_time"] = pd.to_datetime(df_shifted["date_time"], errors="coerce")
jena_climate_2009_2016["date_time"] = pd.to_datetime(jena_climate_2009_2016["date_time"], errors="coerce")

# Find common columns
common_columns = list(set(df_shifted.columns) & set(jena_climate_2009_2016.columns))

# Select only the common columns from both dataframes
df_shifted = df_shifted[common_columns]
jena_climate_2009_2016 = jena_climate_2009_2016[common_columns]

# Concatenate row-wise
combined_df = pd.concat([df_shifted, jena_climate_2009_2016], ignore_index=True)

# Check for NaN values
print(combined_df.isna().sum())  # Count missing values per column
print(combined_df.shape)  # Check the final dataframe shape
print(combined_df.head())  # Preview data


VPdef (mbar)       0
wv (m/s)           0
p (mbar)           0
VPact (mbar)       0
Tpot (K)           0
VPmax (mbar)       0
max. wv (m/s)      0
rho (g/m**3)       0
H2OC (mmol/mol)    0
date_time          0
sh (g/kg)          0
rh (%)             0
dtype: int64
(420640, 12)
   VPdef (mbar)  wv (m/s)     p (mbar)  VPact (mbar)  Tpot (K)  VPmax (mbar)  \
0      2.339745  7.000000  1017.392663     10.874116    284.25     13.213861   
1      0.999421  6.333333  1017.392663     10.874116    282.65     11.873537   
2      4.232198  4.694444  1016.736722      9.158213    284.45     13.390411   
3      5.199762  6.166667  1016.935828      9.679296    286.05     14.879059   
4      1.509438  6.833333  1016.258621      7.906140    279.25      9.415579   

   max. wv (m/s)  rho (g/m**3)  H2OC (mmol/mol)  date_time  sh (g/kg)  \
0      16.388889      1.241820       372.357933 2024-12-16   6.702443   
1      14.111111      1.248849       372.357933 2024-12-17   6.702443   
2      11.000000      

In [51]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420640 entries, 0 to 420639
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   VPdef (mbar)     420640 non-null  float64       
 1   wv (m/s)         420640 non-null  float64       
 2   p (mbar)         420640 non-null  float64       
 3   VPact (mbar)     420640 non-null  float64       
 4   Tpot (K)         420640 non-null  float64       
 5   VPmax (mbar)     420640 non-null  float64       
 6   max. wv (m/s)    420640 non-null  float64       
 7   rho (g/m**3)     420640 non-null  float64       
 8   H2OC (mmol/mol)  420640 non-null  float64       
 9   date_time        420640 non-null  datetime64[ns]
 10  sh (g/kg)        420640 non-null  float64       
 11  rh (%)           420640 non-null  float64       
dtypes: datetime64[ns](1), float64(11)
memory usage: 38.5 MB


In [55]:
# Check for missing values
print(combined_df.isnull().sum())

# Option 1: Fill NaNs with interpolation (recommended for time-series)
combined_df.interpolate(method='linear', inplace=True)

# Option 2: Drop rows with NaNs (if very few)
combined_df.dropna(inplace=True)

VPdef (mbar)       0
wv (m/s)           0
p (mbar)           0
VPact (mbar)       0
Tpot (K)           0
VPmax (mbar)       0
max. wv (m/s)      0
rho (g/m**3)       0
H2OC (mmol/mol)    0
date_time          0
sh (g/kg)          0
rh (%)             0
dtype: int64


In [57]:
from sklearn.preprocessing import MinMaxScaler

# Select numerical columns (excluding 'date_time')
num_cols = combined_df.select_dtypes(include=['float64', 'int64']).columns

# Initialize scaler
scaler = MinMaxScaler()

# Fit and transform numerical columns
combined_df[num_cols] = scaler.fit_transform(combined_df[num_cols])

# Save scaler for future inverse transformation (if needed)
import joblib
joblib.dump(scaler, "scaler.pkl")

# Check normalized data
print(combined_df.head())


   VPdef (mbar)  wv (m/s)  p (mbar)  VPact (mbar)  Tpot (K)  VPmax (mbar)  \
0      0.050853  0.997857  1.000000      0.366296  0.554001      0.195222   
1      0.021722  0.997790  1.000000      0.366296  0.527659      0.173886   
2      0.091984  0.997627  0.993680      0.303967  0.557293      0.198033   
3      0.113014  0.997774  0.995599      0.322895  0.583635      0.221730   
4      0.032807  0.997840  0.989074      0.258487  0.471683      0.134759   

   max. wv (m/s)  rho (g/m**3)  H2OC (mmol/mol)  date_time  sh (g/kg)  \
0       0.999290      0.000024         1.000000 2024-12-16   0.351812   
1       0.999063      0.000029         1.000000 2024-12-17   0.351812   
2       0.998753      0.000023         0.841321 2024-12-18   0.291616   
3       0.999013      0.000018         0.889487 2024-12-19   0.309888   
4       0.999102      0.000039         0.725664 2024-12-20   0.247741   

     rh (%)  
0  0.796591  
1  0.903306  
2  0.636919  
3  0.598543  
4  0.815838  


In [61]:
import numpy as np

def create_sliding_window_sequences(data, window_size):
    X, Y = [], []
    
    for i in range(len(data) - window_size):
        X.append(data[i:i + window_size])  # Past 'window_size' steps
        Y.append(data[i + window_size])    # Predict next step
    
    return np.array(X), np.array(Y)

# Example window size (choose based on your prediction goal)
window_size = 10  

# Convert DataFrame to NumPy array
data_array = combined_df.values  # Assuming merged_df is your final stacked dataset

# Create sequences
X, Y = create_sliding_window_sequences(data_array, window_size)

# Print shape to verify
print("Shape of X:", X.shape)  # (samples, window_size, features)
print("Shape of Y:", Y.shape)  # (samples, features)


Shape of X: (420630, 10, 12)
Shape of Y: (420630, 12)


In [81]:
import numpy as np
from sklearn.model_selection import train_test_split

def create_sliding_window_sequences(data, target_column, window_size):
    X, Y = [], []
    
    target_idx = combined_df.columns.get_loc(target_column)  # Get column index of target
    
    for i in range(len(data) - window_size):
        X.append(data[i:i + window_size])  # Past 'window_size' steps
        Y.append(data[i + window_size, target_idx])  # Predict target value at next step
    
    return np.array(X), np.array(Y)

# Example window size
window_size = 10  
target_column = "T (°C)"  # Your target column

# Convert DataFrame to NumPy array
data_array = combined_df.values  # Assuming combined_df is your final stacked dataset

# Create sequences
X, y = create_sliding_window_sequences(data_array, target_column, window_size)

# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)  # No shuffle for time-series!

# Reshape y to (samples, 1) for LSTM
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# Print final shapes
print(f"Training Data Shape: {X_train.shape}, Labels Shape: {y_train.shape}")
print(f"Testing Data Shape: {X_test.shape}, Labels Shape: {y_test.shape}")


KeyError: 'T (°C)'

In [85]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420640 entries, 0 to 420639
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   VPdef (mbar)     420640 non-null  float64       
 1   wv (m/s)         420640 non-null  float64       
 2   p (mbar)         420640 non-null  float64       
 3   VPact (mbar)     420640 non-null  float64       
 4   Tpot (K)         420640 non-null  float64       
 5   VPmax (mbar)     420640 non-null  float64       
 6   max. wv (m/s)    420640 non-null  float64       
 7   rho (g/m**3)     420640 non-null  float64       
 8   H2OC (mmol/mol)  420640 non-null  float64       
 9   date_time        420640 non-null  datetime64[ns]
 10  sh (g/kg)        420640 non-null  float64       
 11  rh (%)           420640 non-null  float64       
dtypes: datetime64[ns](1), float64(11)
memory usage: 38.5 MB


In [None]:
import keras_tuner as kt

# Define model function for Keras Tuner
def build_model(hp):
    model = Sequential()
    model.add(LSTM(hp.Int('units1', min_value=32, max_value=256, step=32), return_sequences=True, input_shape=(10, 12)))
    model.add(LSTM(hp.Int('units2', min_value=32, max_value=128, step=32), return_sequences=False))
    model.add(Dense(12))  # Output layer

    # Tune Learning Rate
    lr = hp.Choice('learning_rate', values=[0.001, 0.0005, 0.0001])
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return model

# Initialize tuner
tuner = kt.Hyperband(
    build_model, 
    objective='val_loss', 
    max_epochs=30, 
    factor=3, 
    directory='tuner_results', 
    project_name='lstm_tuning'
)

# Search for best hyperparameters
tuner.search(X, Y, epochs=30, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=3)])

# Get the best model
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.hypermodel.build(best_hps)

# Train the best model
history = best_model.fit(X, Y, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Evaluate
loss, mae = best_model.evaluate(X, Y)
print(f"Optimized Loss: {loss}, MAE: {mae}")


In [121]:
import pandas as pd

# Load the Kaggle dataset
kaggle_df = pd.read_csv("jena_climate_2009_2016.csv", parse_dates=["timestamp"])  # Replace with actual filename

# Calculate time difference
time_diffs = kaggle_df["timestamp"].diff().dropna()

# Get the most common time step
time_step = time_diffs.value_counts().idxmax()

print("Detected time step:", time_step)


ValueError: Missing column provided to 'parse_dates': 'timestamp'

In [None]:
import pandas as pd

# Load dataset
file_path = r"C:\Users\samyu\Downloads\project-bolt-sb1-mdwuunns\model\daily_weather_data.csv"
output_file = r"C:\Users\samyu\Downloads\project-bolt-sb1-mdwuunns\model\standardized_weather_data.csv"

df = pd.read_csv(file_path)

# Print column names for debugging
print("Columns in dataset:", df.columns)

# Convert 'time' column to proper datetime format
df["Date Time (ISO 8601)"] = pd.to_datetime(df["time"]).dt.strftime("%Y-%m-%d %H:%M:%S")

# Rename columns to match standard format
rename_dict = {
    "temperature_2m_max": "T (°C)",
    "temperature_2m_min": "Tdew (°C)",  # Assuming Min Temp is Dew Point Temp
    "p (mbar)": "p (hPa)",  # Convert naming convention
    "wind_speed_max": "wv (km/h)",
    "wind_gust_max": "max. wv (km/h)",
    "wind_direction": "wd (°)"
}

df.rename(columns=rename_dict, inplace=True)

# Ensure missing wind speed columns exist and check if they contain actual values
if "wind_speed_max" in df.columns:
    df.rename(columns={"wind_speed_max": "wv (km/h)"}, inplace=True)
    df["wv (km/h)"] = df["wv (km/h)"].fillna(0)  # Replace NaN with 0

if "wind_gust_max" in df.columns:
    df.rename(columns={"wind_gust_max": "max. wv (km/h)"}, inplace=True)
    df["max. wv (km/h)"] = df["max. wv (km/h)"].fillna(0)

if "wind_direction" in df.columns:
    df.rename(columns={"wind_direction": "wd (°)"}, inplace=True)
    df["wd (°)"] = df["wd (°)"].fillna("Unknown")  # Keep Unknown for missing wind direction

# Print to verify changes
print(df.head())

# Rename wind-related columns
df.rename(columns={
    "wind_speed_10m_max": "wv (km/h)",
    "wind_gusts_10m_max": "max. wv (km/h)",
    "wind_direction_10m_dominant": "wd (°)"
}, inplace=True)

# Ensure no missing values in wind-related columns
df["wv (km/h)"] = df["wv (km/h)"].fillna(0)  # Replace NaN with 0
df["max. wv (km/h)"] = df["max. wv (km/h)"].fillna(0)
df["wd (°)"] = df["wd (°)"].fillna("Unknown")  # Replace NaN with 'Unknown'

# Verify the changes
print(df.head())


# Ensure the column exists before trying to modify it
if "p (hPa)" in df.columns:
    df["p (hPa)"] = df["p (hPa)"].astype(float)

# Fix Tpot (K) values
if "T (°C)" in df.columns:
    df["Tpot (K)"] = df["T (°C)"] + 273.15
else:
    df["Tpot (K)"] = None  # Handle missing values

# Ensure missing wind speed columns exist
for col in ["wv (km/h)", "max. wv (km/h)", "wd (°)"]:
    if col not in df.columns:
        df[col] = None  # Assign default value if missing

# Reorder columns to match standard format
standard_columns = [
    "Date Time (ISO 8601)", "p (hPa)", "T (°C)", "Tpot (K)", "Tdew (°C)",
    "rh (%)", "VPmax (hPa)", "VPact (hPa)", "VPdef (hPa)", "sh (g/kg)",
    "H2OC (mmol/mol)", "rho (g/m³)", "wv (km/h)", "max. wv (km/h)", "wd (°)"
]

# Keep only available columns from standard list
df = df[[col for col in standard_columns if col in df.columns]]

# Save to CSV
df.to_csv(output_file, index=False)

print(f"Converted dataset saved at: {output_file}")
