In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Location of the raw semicolon-delimited export, relative to the notebook.
file_path = "../data/household_power_consumption.txt"

# "?" entries are parsed as NaN; low_memory=False reads the file in one pass
# instead of inferring dtypes chunk by chunk.
df = pd.read_csv(file_path, sep=";", na_values=["?"], low_memory=False)

df.head()


In [None]:
# Combine the "Date" and "Time" text columns into a single DatetimeIndex named
# "Datetime" and drop the two source columns.
#
# The guard keeps the cell safe to re-run: after the first execution the
# "Date"/"Time" columns no longer exist, and the unguarded version raised a
# KeyError on a second run. Rebinding `df` (instead of inplace=True) also
# avoids mutating a frame that an earlier cell already displayed.
if {"Date", "Time"}.issubset(df.columns):
    df = (
        df.assign(
            Datetime=pd.to_datetime(
                df["Date"] + " " + df["Time"],
                format="%d/%m/%Y %H:%M:%S",
            )
        )
        .set_index("Datetime")
        .drop(columns=["Date", "Time"])
    )

df.head()


In [None]:
# Print a concise summary of the frame (index type, columns and their dtypes).
df.info()


In [None]:
# Line plot of the unresampled "Global_active_power" column against its index.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df["Global_active_power"])
ax.set_title("Household Electricity Consumption Over Time")
ax.set_xlabel("Time")
ax.set_ylabel("Global Active Power (kilowatts)")
plt.show()


In [None]:
# Number of missing values in each column.
df.isna().sum()


In [None]:
# Fill gaps by interpolating along the datetime index; the result is a new
# frame, so `df` itself keeps its missing values.
df_interpolated = df.interpolate(method="time")

# Re-count the missing values to see what, if anything, is still unfilled.
df_interpolated.isna().sum()


In [None]:
# Downsample the interpolated active-power readings to one mean value per
# calendar day.
active_power = df_interpolated["Global_active_power"]
daily_consumption = active_power.resample("D").mean()

daily_consumption.head()


In [None]:
# Daily mean series on an explicit Figure/Axes pair.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(daily_consumption)
ax.set(
    title="Daily Average Electricity Consumption",
    xlabel="Date",
    ylabel="Global Active Power (kilowatts)",
)
plt.show()


In [None]:
# 30-observation rolling mean of the daily series, used as a smoothed trend.
rolling_30 = daily_consumption.rolling(30).mean()

# Overlay the smoothed line on the (semi-transparent) daily values.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(daily_consumption, alpha=0.5, label="Daily Consumption")
ax.plot(rolling_30, label="30-Day Rolling Mean")
ax.set_title("Daily Energy Consumption with 30-Day Rolling Mean")
ax.set_xlabel("Date")
ax.set_ylabel("Global Active Power (kilowatts)")
ax.legend()
plt.show()


In [None]:
# Show the first rows of the daily series.
# NOTE(review): the same preview is already displayed where
# `daily_consumption` is created; this cell looks redundant.
daily_consumption.head()


In [None]:
# NOTE(review): this cell recomputes and re-plots exactly what the earlier
# 30-day rolling-mean cell already shows; consider deleting one of the two.
rolling_30 = daily_consumption.rolling(30).mean()

fig, ax = plt.subplots(figsize=(12, 5))
for series, label, alpha in (
    (daily_consumption, "Daily Consumption", 0.5),
    (rolling_30, "30-Day Rolling Mean", None),
):
    ax.plot(series, label=label, alpha=alpha)
ax.set(
    title="Daily Energy Consumption with 30-Day Rolling Mean",
    xlabel="Date",
    ylabel="Global Active Power (kilowatts)",
)
ax.legend()
plt.show()


In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition of the daily series with a 365-observation seasonal
# period; NaNs are removed first.
series_without_gaps = daily_consumption.dropna()
decomposition = seasonal_decompose(
    series_without_gaps, model="additive", period=365
)

# The result object draws its own multi-panel figure.
decomposition.plot()
plt.show()


In [None]:
# Mean of the daily values grouped by calendar month number of the index
# (all years pooled together).
monthly_mean = daily_consumption.groupby(daily_consumption.index.month).mean()

ax = monthly_mean.plot(
    kind="bar",
    figsize=(10, 4),
    title="Average Energy Consumption by Month",
)
ax.set_xlabel("Month")
ax.set_ylabel("Average Consumption")
plt.show()


In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the daily series (NaNs removed first).
adf_result = adfuller(daily_consumption.dropna())

# The first two entries of the result are the test statistic and the p-value.
adf_statistic, adf_pvalue = adf_result[0], adf_result[1]
print("ADF Statistic:", adf_statistic)
print("p-value:", adf_pvalue)


In [None]:
# First difference of the daily series; the leading NaN produced by diff()
# is dropped before testing.
first_difference = daily_consumption.diff()
daily_diff = first_difference.dropna()

# Repeat the ADF test on the differenced series.
adf_diff = adfuller(daily_diff)

diff_statistic, diff_pvalue = adf_diff[0], adf_diff[1]
print("ADF Statistic (Differenced):", diff_statistic)
print("p-value (Differenced):", diff_pvalue)


In [None]:
# Visual check of the differenced series.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(daily_diff)
ax.set(
    title="Differenced Daily Energy Consumption",
    xlabel="Date",
    ylabel="Differenced Value",
)
plt.show()


In [None]:
# Chronological 80/20 split: the first 80% of days train the model, the
# remainder is held out. No shuffling, so the test window follows the
# training window in time.
train_size = int(len(daily_consumption) * 0.8)

train, test = (
    daily_consumption.iloc[:train_size],
    daily_consumption.iloc[train_size:],
)

print("Train size:", len(train))
print("Test size:", len(test))


In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Fit a non-seasonal ARIMA(p=1, d=1, q=1) on the training window only.
arima_model = ARIMA(train, order=(1, 1, 1))
arima_result = arima_model.fit()

# Text summary of the fitted model.
print(arima_result.summary())


In [None]:
# Forecast one value per held-out day, starting right after the training data.
n_test_steps = len(test)
forecast = arima_result.forecast(steps=n_test_steps)

# Label the forecasts with the test dates so they line up with `test`.
forecast.index = test.index


In [None]:
# Training history, held-out actuals and the ARIMA forecast on one axis.
fig, ax = plt.subplots(figsize=(12, 5))
for series, label in (
    (train, "Training Data"),
    (test, "Actual Consumption"),
    (forecast, "ARIMA Forecast"),
):
    ax.plot(series, label=label)
ax.set_title("ARIMA Forecast vs Actual Energy Consumption")
ax.set_xlabel("Date")
ax.set_ylabel("Global Active Power")
ax.legend()
plt.show()


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Error of the ARIMA forecast against the held-out days.
# RMSE is taken as the square root of the mean squared error.
mae = mean_absolute_error(test, forecast)
mse = mean_squared_error(test, forecast)
rmse = np.sqrt(mse)

for metric_name, metric_value in (("MAE:", mae), ("RMSE:", rmse)):
    print(metric_name, metric_value)


In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Seasonal ARIMA on the training window: (1, 1, 1) non-seasonal terms plus
# (1, 1, 1) seasonal terms with a 365-step season. Stationarity and
# invertibility constraints on the parameters are both switched off.
# NOTE(review): a 365-step seasonal period presumably makes the state vector
# very large, so this fit may be slow and memory-hungry -- confirm it is
# feasible on the target machine.
sarima_settings = {
    "order": (1, 1, 1),
    "seasonal_order": (1, 1, 1, 365),
    "enforce_stationarity": False,
    "enforce_invertibility": False,
}
sarima_model = SARIMAX(train, **sarima_settings)

# disp=False silences the optimizer's convergence output.
sarima_result = sarima_model.fit(disp=False)

print(sarima_result.summary())


In [None]:
# One SARIMA forecast per held-out day, relabelled with the test dates so it
# can be compared with `test` directly.
n_test_steps = len(test)
sarima_forecast = sarima_result.forecast(steps=n_test_steps)
sarima_forecast.index = test.index


In [None]:
# Held-out actuals against both forecasts.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(test, label="Actual Consumption")
ax.plot(forecast, label="ARIMA Forecast")
ax.plot(sarima_forecast, label="SARIMA Forecast")
ax.set(
    title="ARIMA vs SARIMA Forecast Comparison",
    xlabel="Date",
    ylabel="Global Active Power",
)
ax.legend()
plt.show()


In [None]:
# Same error metrics as for ARIMA, now for the SARIMA forecast.
sarima_mae = mean_absolute_error(test, sarima_forecast)
sarima_mse = mean_squared_error(test, sarima_forecast)
sarima_rmse = np.sqrt(sarima_mse)

# Side-by-side report; `mae` and `rmse` are the ARIMA values computed earlier.
for metric_name, metric_value in (
    ("ARIMA MAE:", mae),
    ("ARIMA RMSE:", rmse),
    ("SARIMA MAE:", sarima_mae),
    ("SARIMA RMSE:", sarima_rmse),
):
    print(metric_name, metric_value)


In [None]:
# Print the `aic` attribute (Akaike information criterion) of the fitted SARIMA result.
print(sarima_result.aic)


In [None]:
# Preview the first SARIMA forecast values (indexed by the test dates).
sarima_forecast.head()


In [None]:
# NOTE(review): this figure is identical to the earlier ARIMA-vs-SARIMA
# comparison plot; consider deleting one of the two cells.
fig, ax = plt.subplots(figsize=(12, 5))
for series, label in (
    (test, "Actual Consumption"),
    (forecast, "ARIMA Forecast"),
    (sarima_forecast, "SARIMA Forecast"),
):
    ax.plot(series, label=label)
ax.set_title("ARIMA vs SARIMA Forecast Comparison")
ax.set_xlabel("Date")
ax.set_ylabel("Global Active Power")
ax.legend()
plt.show()


In [None]:
# Supervised-learning table: target "y" is the daily value, and lag_1..lag_7
# hold the values from the previous one to seven rows.
df_ml = daily_consumption.to_frame(name="y")

lag_columns = {f"lag_{lag}": df_ml["y"].shift(lag) for lag in range(1, 8)}
df_ml = df_ml.assign(**lag_columns)

# Shifting leaves NaNs in the first rows; rows with any NaN are discarded.
df_ml = df_ml.dropna()
df_ml.head()


In [None]:
# Features are every column except the target "y".
X, y = df_ml.drop(columns="y"), df_ml["y"]

# Chronological 80/20 split of the lag-feature table.
# NOTE(review): this rebinds `train_size`, which the ARIMA split also uses;
# the value now refers to `df_ml`, not to `daily_consumption`.
train_size = int(len(df_ml) * 0.8)

X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]


In [None]:
from sklearn.ensemble import RandomForestRegressor

# 200-tree random forest; the fixed random_state makes the fit repeatable.
rf_model = RandomForestRegressor(n_estimators=200, random_state=42)

# Train on the lag features of the training window.
rf_model.fit(X_train, y_train)


In [None]:
# Predict the held-out rows from their lag features.
rf_predictions = rf_model.predict(X_test)

# Same metrics as for the ARIMA/SARIMA models, on the lag-table test window.
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_rmse = np.sqrt(rf_mse)

for metric_name, metric_value in (
    ("Random Forest MAE:", rf_mae),
    ("Random Forest RMSE:", rf_rmse),
):
    print(metric_name, metric_value)


In [None]:
# Compare the ARIMA and Random Forest predictions with the actual values on
# the Random Forest hold-out dates.
#
# `forecast` is indexed by the ARIMA test window (`test.index`), which is not
# the same length as `y_test`: the lag-feature table drops its first rows and
# is then split 80/20 again. Plotting `forecast` directly against
# `y_test.index` therefore raised a shape-mismatch ValueError. Reindexing
# aligns the ARIMA values with the dates actually plotted; any date missing
# from the ARIMA window becomes NaN and is left blank on the chart.
arima_on_rf_dates = forecast.reindex(y_test.index)

# NOTE(review): the Random Forest predicts each day from observed lags, while
# the ARIMA values come from one multi-step forecast, so the two lines are not
# a like-for-like comparison.
plt.figure(figsize=(12, 5))
plt.plot(y_test.index, y_test, label="Actual")
plt.plot(y_test.index, arima_on_rf_dates, label="ARIMA Forecast")
plt.plot(y_test.index, rf_predictions, label="RF Forecast")
plt.title("ARIMA vs Random Forest Forecast")
plt.xlabel("Date")
plt.ylabel("Energy Consumption")
plt.legend()
plt.show()
