In [6]:
import pandas as pd

def preprocess_data(input_file, output_file):
    """Clean a CSV dataset and write the result to a new CSV file.

    The cleaning steps, in order:

    1. Parse the ``Date`` column as day/month/year; unparseable values
       become ``NaT``.
    2. Drop rows whose date could not be parsed.
    3. Drop exact duplicate rows.
    4. Coerce every remaining object-dtype column to numeric; values that
       cannot be parsed as numbers become ``NaN``.
    5. Fill missing values in numeric columns with that column's median.

    A summary of the frame (``DataFrame.info()``) is printed before and
    after cleaning.

    Args:
        input_file: Path of the UTF-8 encoded CSV file to read. It must
            contain a ``Date`` column.
        output_file: Path the cleaned CSV is written to (without the index).

    Raises:
        KeyError: If the input has no ``Date`` column.
    """
    df = pd.read_csv(input_file, encoding='utf-8')

    # DataFrame.info() prints its report itself and returns None, so it must
    # not be passed to print() -- that would emit the report *before* the
    # header and then print a stray "None".
    print("Initial Data Info:")
    df.info()

    if "Date" not in df.columns:
        raise KeyError("Error: 'Date' column not found in the dataset!")

    # errors="coerce" turns values that do not match dd/mm/YYYY into NaT so
    # they can be dropped in the next step instead of aborting the run.
    df["Date"] = pd.to_datetime(df["Date"], format="%d/%m/%Y", errors="coerce")

    # Drop rows where Date conversion failed, then exact duplicate rows.
    df.dropna(subset=["Date"], inplace=True)
    df.drop_duplicates(inplace=True)

    # Columns still typed as object were read as text; force them to numeric.
    # NOTE: any value that is not a number becomes NaN here.
    object_cols = [
        col
        for col in df.select_dtypes(include=['object']).columns
        if col != "Date"
    ]
    for col in object_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Fill gaps in numeric columns with the per-column median.
    df.fillna(df.median(numeric_only=True), inplace=True)

    print("Final Data Info:")
    df.info()

    df.to_csv(output_file, index=False)
    print(f"Processed data saved to {output_file}")

if __name__ == "__main__":
    # Script entry point: clean the combined weather CSV and write the
    # processed copy next to it.
    preprocess_data(
        input_file="../data/combined_weather_data.csv",
        output_file="../data/processed_weather_data.csv",
    )


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2441 entries, 0 to 2440
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Date                          2441 non-null   object 
 1   Air Temp degC - min           2441 non-null   float64
 2   Air Temp degC - ave           2441 non-null   float64
 3   Air Temp degC - max           2441 non-null   float64
 4   Air Temp (Canopy) degC - ave  2410 non-null   float64
 5   Humidity % - ave              2441 non-null   float64
 6   Humidity (Canopy) % - ave     2410 non-null   float64
 7   Wind Speed km/h - min         2422 non-null   float64
 8   Wind Speed km/h - ave         2422 non-null   float64
 9   Wind Speed km/h - max         2422 non-null   float64
 10  Rainfall mm - raw             2441 non-null   float64
 11  Solar Radiation W/m^2 - ave   2441 non-null   float64
 12  ETo mm - computed             2413 non-null   float64
 13  Soi