In [8]:
# importing libraries
import pandas as pd

In [9]:
# Locations of the raw input and of the two derived datasets, all kept
# side by side in the shared data directory one level above the notebook.
_data_dir = "../data"
input_file = _data_dir + "/combined_weather_data.csv"
processed_file = _data_dir + "/processed_weather_data.csv"
feature_engineered_file = _data_dir + "/feature_engineered_weather_data.csv"

In [10]:
# Load the raw combined weather export.
df = pd.read_csv(input_file, encoding='utf-8')

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called on its own line rather than passed to print() (which would append
# a stray "None" after the label).
print("Initial Data Info:")
df.info()

# Parse day-first dates; values that do not match the format become NaT and
# those rows are dropped, followed by exact duplicate rows.
df["Date"] = pd.to_datetime(df["Date"], format="%d/%m/%Y", errors="coerce")
df = (
    df.dropna(subset=["Date"])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Any column still stored as text is coerced to numeric; entries that cannot
# be parsed become NaN. "Date" is already datetime64 here, so it is not among
# the object-dtype columns and needs no special-casing.
text_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in text_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Fill remaining gaps in numeric columns with that column's median.
# NOTE(review): a column that is entirely NaN has no median and stays NaN.
df = df.fillna(df.median(numeric_only=True))

print("Final Data Info:")
df.info()

# "Date" is a datetime64 column at this point, so to_csv writes it in ISO
# form (YYYY-MM-DD), not in the original day-first form.
df.to_csv(processed_file, index=False)
print(f"Processed data saved to {processed_file}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2441 entries, 0 to 2440
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Date                          2441 non-null   object 
 1   Air Temp degC - min           2441 non-null   float64
 2   Air Temp degC - ave           2441 non-null   float64
 3   Air Temp degC - max           2441 non-null   float64
 4   Air Temp (Canopy) degC - ave  2410 non-null   float64
 5   Humidity % - ave              2441 non-null   float64
 6   Humidity (Canopy) % - ave     2410 non-null   float64
 7   Wind Speed km/h - min         2422 non-null   float64
 8   Wind Speed km/h - ave         2422 non-null   float64
 9   Wind Speed km/h - max         2422 non-null   float64
 10  Rainfall mm - raw             2441 non-null   float64
 11  Solar Radiation W/m^2 - ave   2441 non-null   float64
 12  ETo mm - computed             2413 non-null   float64
 13  Soi

In [11]:
# Reload the cleaned dataset written by the cleaning step.
df = pd.read_csv(processed_file, encoding='utf-8')

# The cleaning step saves "Date" from a datetime64 column via to_csv, which
# serialises it as ISO "YYYY-MM-DD". Parsing that with only "%d/%m/%Y" and
# errors="coerce" would turn every date into NaT, so ISO is tried first and
# the day-first format is kept as a fallback for files that still use it.
date_text = df["Date"]
iso_dates = pd.to_datetime(date_text, format="%Y-%m-%d", errors="coerce")
dayfirst_dates = pd.to_datetime(date_text, format="%d/%m/%Y", errors="coerce")
df["Date"] = iso_dates.fillna(dayfirst_dates)

# Calendar components derived from the parsed date.
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Day"] = df["Date"].dt.day
df["Weekday"] = df["Date"].dt.weekday  # Monday = 0, Sunday = 6

def assign_season(month):
    """Return the season name for a calendar month number.

    December-February map to "Summer", March-May to "Autumn" and
    June-August to "Winter". Every other value (September-November, but also
    anything unrecognised such as NaN) falls through to "Spring".
    """
    season_months = (
        ("Summer", (12, 1, 2)),
        ("Autumn", (3, 4, 5)),
        ("Winter", (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    return "Spring"

# Label each row with its season, derived from the calendar month.
df["Season"] = df["Month"].apply(assign_season)

# Daily temperature spread (max minus min).
df["Temp_Range"] = df["Air Temp degC - max"] - df["Air Temp degC - min"]
# Despite the name, this is the difference between the ambient and the canopy
# average humidity, not a max-min range.
df["Humidity_Range"] = df["Humidity % - ave"] - df["Humidity (Canopy) % - ave"]

# The rolling means and the lag below operate on row position, so the rows
# must be in chronological order first; a stable sort keeps the existing
# order among rows that share a date.
df = df.sort_values("Date", kind="stable").reset_index(drop=True)

# Windows count rows, not calendar days: if dates are missing from the data,
# a 7-row window spans more than 7 days.
for col in ["Air Temp degC - ave", "Humidity % - ave", "Rainfall mm - raw"]:
    df[f"{col}_7d_avg"] = df[col].rolling(window=7).mean()
    df[f"{col}_30d_avg"] = df[col].rolling(window=30).mean()
    df[f"{col}_lag1"] = df[col].shift(1)

# The 30-row window leaves the first 29 rows without a value; those rows (and
# any other row containing a missing value in any column) are removed.
df = df.dropna()
df.to_csv(feature_engineered_file, index=False)
print(f"Feature-engineered dataset saved to {feature_engineered_file}")

Feature-engineered dataset saved to ../data/feature_engineered_weather_data.csv
