# Exploratory Data Analysis (EDA)

In this notebook, we perform exploratory data analysis on the preprocessed smart meter data.
We review summary statistics, then visualize the distribution of energy consumption and its trend over time.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the processed smart meter data from CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's current working directory.
data_path = '../data/processed/smart_meter_data_processed.csv'
data = pd.read_csv(data_path)
# Preview the first rows as a quick sanity check that the file loaded as expected.
data.head()

In [3]:
# Display summary statistics for the loaded data.
# With default arguments, describe() reports on numeric columns only when the frame
# contains any; non-numeric columns need include='all' to appear.
data.describe()

In [4]:
# Histogram of energy consumption. When the column is absent, only a message
# is printed so the rest of the notebook can still run.
if 'energy_consumption' not in data.columns:
    print('Column energy_consumption not found in the dataset.')
else:
    # Explicit figure/axes handles instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(data['energy_consumption'], bins=30, alpha=0.7)
    ax.set_title('Distribution of Energy Consumption')
    ax.set_xlabel('Energy Consumption')
    ax.set_ylabel('Frequency')
    plt.show()

In [5]:
# Line plot of energy consumption over time. Both 'timestamp' and
# 'energy_consumption' must be present; otherwise only a message is printed.
if {'timestamp', 'energy_consumption'}.issubset(data.columns):
    # Parse the timestamps and order the rows chronologically. `data` keeps the
    # parsed, sorted result, so any later cell sees the same state as before.
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    data = data.sort_values(by='timestamp')

    fig, ax = plt.subplots(figsize=(14, 7))
    ax.plot(data['timestamp'], data['energy_consumption'], label='Energy Consumption')
    ax.set_xlabel('Time')
    ax.set_ylabel('Energy Consumption')
    ax.set_title('Time Series of Energy Consumption')
    ax.legend()
    plt.show()
else:
    print('Timestamp or energy_consumption column not found in the dataset.')