In [9]:
import pandas as pd

# Load the new dataset
df = pd.read_csv('/content/ZZZPowerSouthCental.csv')

# Display the first few rows of the dataframe to understand its structure
print(new_df.head())

        Hour_End      SOUTH_C  Temperature  Dew Point  DHI  GHI  \
0  1/1/2012 1:00  4162.162310         13.7        7.9    0    0   
1  1/1/2012 2:00  4070.416593         12.8        7.6    0    0   
2  1/1/2012 3:00  3946.994332         12.0        7.3    0    0   
3  1/1/2012 4:00  3884.929253         11.8        7.0    0    0   
4  1/1/2012 5:00  3911.610238         12.0        6.5    0    0   

   Relative Humidity  DNI  Solar Zenith Angle  Precipitable Water  Pressure  \
0              67.99    0              110.91                 1.1       987   
1              70.54    0              123.70                 1.1       988   
2              73.11    0              136.72                 1.1       989   
3              72.54    0              149.83                 1.2       989   
4              69.27    0              162.77                 1.2       990   

   Wind Speed  Global Horizontal UV Irradiance (280-400nm)  
0         1.6                                          0.0  


check for missing values

In [10]:
# Convert the Hour_End column to datetime format
df['Hour_End'] = pd.to_datetime(df['Hour_End'])

# Normalize the numerical columns except for 'Hour_End'
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Columns to be normalized
columns_to_normalize = ['SOUTH_C', 'Temperature', 'Dew Point', 'DHI', 'GHI', 'Relative Humidity', 'DNI', 'Solar Zenith Angle', 'Precipitable Water', 'Pressure', 'Wind Speed', 'Global Horizontal UV Irradiance (280-400nm)']

# Applying normalization
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

# Display the first few rows of the normalized dataframe
print(df.head())

             Hour_End   SOUTH_C  Temperature  Dew Point  DHI  GHI  \
0 2012-01-01 01:00:00  0.078472     0.382353   0.612472  0.0  0.0   
1 2012-01-01 02:00:00  0.067168     0.363445   0.605791  0.0  0.0   
2 2012-01-01 03:00:00  0.051961     0.346639   0.599109  0.0  0.0   
3 2012-01-01 04:00:00  0.044314     0.342437   0.592428  0.0  0.0   
4 2012-01-01 05:00:00  0.047602     0.346639   0.581292  0.0  0.0   

   Relative Humidity  DNI  Solar Zenith Angle  Precipitable Water  Pressure  \
0           0.667532  0.0            0.623425            0.133333  0.439024   
1           0.694017  0.0            0.699430            0.133333  0.463415   
2           0.720710  0.0            0.776801            0.133333  0.487805   
3           0.714790  0.0            0.854706            0.150000  0.487805   
4           0.680827  0.0            0.931602            0.150000  0.512195   

   Wind Speed  Global Horizontal UV Irradiance (280-400nm)  
0    0.122951                                    

Normalize data.  Keeping outliers as they are extreme weather events perhpas.   Running correlations to power usage.

In [11]:
# Assuming 'SOUTH_C' is column B as per the user's reference
correlations = df.corr()['SOUTH_C'].sort_values()

# Display the correlations with 'SOUTH_C'
print(correlations)

Solar Zenith Angle                            -0.539157
Relative Humidity                             -0.242846
Pressure                                      -0.088345
Wind Speed                                     0.165219
Dew Point                                      0.283407
Precipitable Water                             0.401165
DNI                                            0.415813
DHI                                            0.436254
Temperature                                    0.482236
Global Horizontal UV Irradiance (280-400nm)    0.531249
GHI                                            0.532548
SOUTH_C                                        1.000000
Name: SOUTH_C, dtype: float64


  correlations = df.corr()['SOUTH_C'].sort_values()


Solar Zenith, nice negative correlation.
Relative humidity to a lesser degree.
Global Horizontal Irradiance and Global Horizontal UV irradiance show nice positive correlations.  

Split data into Train, Validation, and Test sets.

In [12]:
from sklearn.model_selection import train_test_split

# Assuming 'SOUTH_C' is the target variable and the rest are features
X = df.drop('SOUTH_C', axis=1)
y = df['SOUTH_C']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Printing the shapes of the splits to confirm
print('Training set shape:', X_train.shape, y_train.shape)
print('Testing set shape:', X_test.shape, y_test.shape)


# Further splitting the training set to create a validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Printing the shapes of the new splits to confirm
print('Training set shape:', X_train.shape, y_train.shape)
print('Validation set shape:', X_val.shape, y_val.shape)
print('Testing set shape:', X_test.shape, y_test.shape)

Training set shape: (21023, 12) (21023,)
Testing set shape: (5256, 12) (5256,)
Training set shape: (15767, 12) (15767,)
Validation set shape: (5256, 12) (5256,)
Testing set shape: (5256, 12) (5256,)


Put data into 3D array shape.  Sample, Time Steps, and Features.
LSTM Model
Fit training data
Use validation to monitor
Evaluate

In [13]:
import numpy as np

# Reshaping the input features for LSTM model
# LSTM expects input to be in the format of [samples, time steps, features]
# Currently, our data is in the format of [samples, features]
# We will reshape it into [samples, 1, features] as we are using one timestep for each sample

X_train_reshaped = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
X_val_reshaped = np.reshape(X_val.values, (X_val.shape[0], 1, X_val.shape[1]))
X_test_reshaped = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))

print('Reshaped training set shape:', X_train_reshaped.shape)
print('Reshaped validation set shape:', X_val_reshaped.shape)
print('Reshaped testing set shape:', X_test_reshaped.shape)

Reshaped training set shape: (15767, 1, 12)
Reshaped validation set shape: (5256, 1, 12)
Reshaped testing set shape: (5256, 1, 12)


In [15]:

# Since X_train_reshaped and y_train are numpy arrays, we'll convert them to pandas DataFrames to inspect their data types
import pandas as pd
import numpy as np

# Converting to DataFrame for inspection
X_train_df = pd.DataFrame(X_train_reshaped.reshape(X_train_reshaped.shape[0], X_train_reshaped.shape[2]))
y_train_df = pd.DataFrame(y_train)

# Checking the data types
print(X_train_df.dtypes)
print(y_train_df.dtypes)
# Converting the datetime objects in X_train to Unix timestamps
# Assuming the first column contains the datetime objects based on the dtypes output
X_train_df[0] = pd.to_datetime(X_train_df[0]).astype(int) / 10**9

# Now, converting the object types to numeric, assuming they are meant to be numeric
X_train_df = X_train_df.apply(pd.to_numeric, errors='coerce')

# Checking the conversion
print(X_train_df.head())
print(X_train_df.dtypes)

0     datetime64[ns]
1             object
2             object
3             object
4             object
5             object
6             object
7             object
8             object
9             object
10            object
11            object
dtype: object
SOUTH_C    float64
dtype: object
             0         1         2         3         4         5         6   \
0  1.369368e+09  0.663866  0.908686  0.000000  0.000000  0.689967  0.000000   
1  1.365408e+09  0.497899  0.819599  0.000000  0.000000  0.876298  0.000000   
2  1.400962e+09  0.789916  0.899777  0.829077  0.793040  0.466452  0.443050   
3  1.393434e+09  0.294118  0.427617  0.424361  0.203297  0.479643  0.008687   
4  1.357474e+09  0.100840  0.363029  0.000000  0.000000  0.755920  0.000000   

         7         8         9         10        11  
0  0.634716  0.550000  0.390244  0.385246  0.000000  
1  0.807939  0.383333  0.243902  0.295082  0.000000  
2  0.050986  0.650000  0.341463  0.336066  0.792391  
3  0.21660

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Defining a simple neural network model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_df.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)
])

# Compiling the model
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mean_absolute_error'])

# Training the model
history = model.fit(X_train_df, y_train_df, epochs=10, validation_split=0.2, verbose=1)

print('Model training completed.')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model training completed.


In [19]:


test_df.dtypes
# Convert 'Hour_End' to datetime and extract year, month, day, and hour
test_df['Hour_End'] = pd.to_datetime(test_df['Hour_End'])
test_df['Year'] = test_df['Hour_End'].dt.year
test_df['Month'] = test_df['Hour_End'].dt.month
test_df['Day'] = test_df['Hour_End'].dt.day
test_df['Hour'] = test_df['Hour_End'].dt.hour

# Drop the original 'Hour_End' column as it's no longer needed
test_df = test_df.drop('Hour_End', axis=1)

# Display the modified dataframe to verify the changes
test_df.head()

NameError: name 'test_df' is not defined