In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime



2025-11-11 06:16:12.418469: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Location of the household power consumption CSV. It defaults to the original
# local copy; set the POWER_DATA_CSV environment variable to point at the file
# on another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    'POWER_DATA_CSV',
    '/Users/Ptanner/ml_projects/uci_eletric_power_consumption_rnn/data/household_power_consumption.csv',
)
# low_memory=False parses the file in one pass instead of in chunks, so every
# column gets a single, consistently inferred dtype.
df = pd.read_csv(DATA_PATH, low_memory=False)

In [None]:
# Preview the first rows to check that the file was parsed into the expected columns.
df.head()

In [None]:
# Day-first layout of the combined "<Date> <Time>" text.
format_code = "%d/%m/%Y %H:%M:%S"
# Join the two text columns with a single space, then parse them in one call.
timestamp_text = df['Date'].str.cat(df['Time'], sep=' ')
df['DateTime'] = pd.to_datetime(timestamp_text, format=format_code)


In [None]:
# Count NaN entries per column. Literal '?' placeholders are ordinary strings at
# this point, so they are not included in these counts.
df.isna().sum()

In [None]:
# Fill NaNs in Sub_metering_3 with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selection: the in-place form acts on an
# intermediate object, which pandas warns about (chained assignment) and which
# leaves df unchanged once Copy-on-Write is enabled.
df['Sub_metering_3'] = df['Sub_metering_3'].fillna(df['Sub_metering_3'].mean())




In [None]:
# Select the rows whose Voltage holds the literal '?' placeholder once, then
# report how many there are and display a small sample of them.
voltage_placeholder_rows = df[df['Voltage'] == '?']
print("Number of records with '?' in Voltage:", len(voltage_placeholder_rows))
print("\nExample rows:")
voltage_placeholder_rows.head()


In [None]:
## Drop every record in which any column holds '?'
# '?' is converted to NaN first so that a single dropna() removes those rows
# (along with any row that already contained a NaN).
df = df.replace('?', np.nan).dropna()

In [None]:
### Cast the measurement columns from object dtype to numeric

columns_to_convert = [
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
]
# Convert one column at a time; pd.to_numeric raises if a value cannot be parsed.
for column_name in columns_to_convert:
    df[column_name] = pd.to_numeric(df[column_name])

In [None]:
# One stacked panel per measurement, each showing its month-level mean.
cols = [
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
]
fig, axes = plt.subplots(len(cols), 1, figsize=(20, 10))
# Pair each column with its own axis instead of tracking a subplot counter.
for ax, col in zip(axes, cols):
    monthly_data = df.resample('M', on='DateTime')[col].mean()
    ax.plot(monthly_data.index, monthly_data.values)
    ax.set_title(col + ' data resample over month for mean', y=0.75, loc='left')
plt.tight_layout()
plt.show()

In [None]:
# One stacked panel per measurement, each showing its day-level mean.
cols = [
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
]
fig, axes = plt.subplots(len(cols), 1, figsize=(20, 10))
# Pair each column with its own axis instead of tracking a subplot counter.
for ax, col in zip(axes, cols):
    daily_avg = df.resample('D', on='DateTime')[col].mean()
    ax.plot(daily_avg.index, daily_avg.values)
    ax.set_title(col + ' data resample over day for mean', y=0.75, loc='center')
plt.tight_layout()
plt.show()


In [None]:
# Report the (rows, columns) shape of df at this point in the notebook.
print(f'Data set size: {df.shape}')

# Correlation HeatMap

Voltage appears to be negatively correlated with our potential target, Global Active Power. After some research, this appears to be related to the voltage drop that occurs at the moment power demand surges. To abstract this away, it is much like how the water pressure in a house temporarily drops when you turn on a faucet: the pressure falls because of the sudden release, and stays lower until it returns to baseline, just as the voltage does. For this reason, Voltage could be a great candidate to drop.


In [None]:
# Rank the measurement columns by their correlation with Global_active_power,
# strongest positive first (DataFrame.corr() defaults to Pearson).
correlation_matrix = df[columns_to_convert].corr()
correlations = correlation_matrix['Global_active_power'].sort_values(ascending=False)
print("Pearson Correlation with Global_active_power:")
print(correlations)

In [None]:
# Add the timestamp column so the list can select everything needed for the
# time-based resampling in the following cell. The membership check keeps this
# cell idempotent: re-running it does not append a duplicate 'DateTime' entry.
if 'DateTime' not in columns_to_convert:
    columns_to_convert.append('DateTime')

In [None]:
# Correlation heatmaps of the period means at three resampling granularities.
# The timestamp-indexed frame is built once and shared by all three resamples,
# and every heatmap is drawn on an explicit axis rather than on pyplot's
# implicit "current axes".
indexed_by_time = df[columns_to_convert].set_index('DateTime')
dfm = indexed_by_time.resample('M').mean()
dfd = indexed_by_time.resample('D').mean()
dfh = indexed_by_time.resample('H').mean()

panels = (
    (dfm, 'Monthly resampling'),
    (dfd, 'Daily resampling'),
    (dfh, 'Hourly resampling'),
)
f, axes = plt.subplots(1, len(panels), figsize=(30, 10))
for ax, (resampled, panel_title) in zip(axes, panels):
    # Fixed colour scale of [-1, 1] so the three panels are directly comparable.
    sns.heatmap(resampled.corr(), vmin=-1, vmax=1, annot=True, ax=ax)
    ax.set_title(panel_title, size=12)
# Render the figure without echoing the repr of the last call.
plt.show()

In [None]:
def df_to_X_y(df, window_size = 5):
  """Slice a sequence into overlapping input windows and next-step labels.

  Args:
    df: Object exposing ``to_numpy()`` (e.g. a pandas Series); rows are used
      in their existing order.
    window_size: Number of consecutive observations in each input window.

  Returns:
    Tuple ``(X, y)`` of NumPy arrays. ``X[k]`` holds observations
    ``k .. k + window_size - 1``, each wrapped in its own length-1 list, and
    ``y[k]`` is observation ``k + window_size``. For 1-D input with ``n``
    observations, ``X`` has shape ``(n - window_size, window_size, 1)``.
    Both arrays are empty when ``n <= window_size``.

  Raises:
    ValueError: If ``window_size`` is smaller than 1.
  """
  if window_size < 1:
    raise ValueError("window_size must be at least 1")
  df_as_np = df.to_numpy()
  X = list()
  y = list()
  for i in range(len(df_as_np) - window_size):
    # Window and label are derived from window_size (not a literal 5) so that
    # they stay consistent with the loop bound for any window length.
    row = [[a] for a in df_as_np[i:i + window_size]]
    X.append(row)
    label = df_as_np[i + window_size]
    y.append(label)
  return np.array(X), np.array(y)

In [None]:
# Use the timestamps as the row index so that later cells can call .resample()
# without the on= argument.
# NOTE(review): 'DateTime' was already parsed with pd.to_datetime when the
# column was created, so re-parsing it here is presumably a no-op — confirm.
df.index = pd.to_datetime(df['DateTime'],format = format_code)

In [None]:
# Select the Global_active_power column on its own.
# Despite the name, this is a pandas Series taken from df, not a NumPy array.
df_array = df['Global_active_power']

In [None]:
# Shape of the selected series, i.e. (number of observations,).
df_array.shape

In [None]:
### Mean Global_active_power at hourly, daily, weekly and monthly resolution
# The rules are listed in the same order as the names being assigned.
hourly_mean, daily_mean, weekly_mean, monthly_mean = (
    df_array.resample(rule).mean() for rule in ('H', 'D', 'W', 'M')
)


In [None]:
# Report how many periods each resampled series contains.
print(f'Hourly Mean record count: {len(hourly_mean)}')
print(f'columns: {hourly_mean.shape}')
print(f'Daily Mean record count: {len(daily_mean)}')
print(f'Weekly Mean record count: {len(weekly_mean)}')
# Plain call: wrapping print() in a list literal made the cell echo `[None]`.
print(f'monthly_mean record count: {len(monthly_mean)}')

In [None]:
# Plot the hourly mean series on a single large axis.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(hourly_mean)
# Cap the y-axis at 4 through the axes API. Assigning `plt.ylim = 4` would only
# rebind the pyplot function to an int: no limit is applied and every later
# plt.ylim(...) call in the session fails.
ax.set_ylim(top=4)
plt.show()

In [None]:
# Build supervised samples from the daily means: each sample holds WINDOW_SIZE
# consecutive days and its label is the day that follows.
# NOTE(review): resample('D').mean() yields NaN for any day that has no rows
# (e.g. days whose records were all dropped during cleaning); such NaNs would
# flow into X and y unchanged — confirm whether they must be filled or removed.
WINDOW_SIZE = 5
power_daily = daily_mean 
X,y = df_to_X_y(df =power_daily, window_size=WINDOW_SIZE)
print(f'X shape:{X.shape}')
print(f'Y shape {y.shape}')

In [None]:
# Order-preserving split by position: the first 1000 windows form the training
# set, the next 200 the validation set, and the remainder the test set.
TRAIN_END, VAL_END = 1000, 1200
X_train, X_val, X_test = X[:TRAIN_END], X[TRAIN_END:VAL_END], X[VAL_END:]
y_train, y_val, y_test = y[:TRAIN_END], y[TRAIN_END:VAL_END], y[VAL_END:]

print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_val shape: {X_val.shape}, y_val shape: {y_val.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import * 
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras.metrics import RootMeanSquaredError