In [1]:
import pandas as pd
import numpy as np
import zipfile
import urllib.request
import os

def advanced_load_and_clean():
    # --- MODULE 1: DATA COLLECTION ---
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip"
    if not os.path.exists('household_power_consumption.txt'):
        urllib.request.urlretrieve(url, "energy_data.zip")
        with zipfile.ZipFile("energy_data.zip", "r") as zip_ref:
            zip_ref.extractall()

    # Handle '?' as NaN
    df = pd.read_csv('household_power_consumption.txt', sep=';', low_memory=False, na_values=['?'])

    # Timestamp Formatting and Dataset Structuring [cite: 47, 81]
    df['dt'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], dayfirst=True)
    df.set_index('dt', inplace=True)
    df.drop(['Date', 'Time'], axis=1, inplace=True)

    # --- MODULE 2: CLEANING & ENCODING ---
    # Type Conversion / Encoding for ML Compatibility
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Missing Values: Using Forward Fill for time-series continuity
    df = df.ffill()

    # Outlier Detection (IQR Method) to protect model accuracy
    for col in ['Global_active_power', 'Voltage']:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        df[col] = np.clip(df[col], Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)

    # Resampling to Hourly consumption [cite: 47, 81]
    df_hourly = df.resample('H').mean()
    print("Module 1 & 2 Complete.")
    return df_hourly

df_energy = advanced_load_and_clean()

Module 1 & 2 Complete.


  df_hourly = df.resample('H').mean()
