In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error



In [10]:
# Load datasets

temp_df = pd.read_csv("../train.csv", nrows=0)  # Read only the header
total_columns = len(temp_df.columns)
columns_to_use = temp_df.columns[2:total_columns] 
train_data = pd.read_csv("../train.csv", usecols=columns_to_use)


train_data = train_data.drop(columns=[ 'site_id', 'sub_primaryspaceusage', 'season'])

test_data = pd.read_csv("../test.csv", usecols=columns_to_use)

test_data = test_data.drop(columns=[ 'site_id', 'sub_primaryspaceusage', 'season'])
train_data.head()


Unnamed: 0,building_id,meter,date,meter_reading,building_id_kaggle,site_id_kaggle,sqm,sqft,timezone,airTemperature,cloudCoverage,dewTemperature,precipDepth1HR,precipDepth6HR,seaLvlPressure,windDirection,windSpeed
0,Bear_education_Alfredo,electricity,2016-01-01,2.905,636.0,4.0,609.8,6564.0,US/Pacific,5.246861,1.927009,0.254484,0.351088,10.801125,1018.888301,172.924863,3.807399
1,Bear_education_Alfredo,electricity,2016-01-02,2.77,636.0,4.0,609.8,6564.0,US/Pacific,5.993973,1.997893,0.892188,0.409453,11.105558,1014.347411,181.359441,4.202455
2,Bear_education_Alfredo,electricity,2016-01-03,2.6725,636.0,4.0,609.8,6564.0,US/Pacific,5.660314,1.946017,0.778475,0.552568,11.167389,1010.396019,208.978674,4.015919
3,Bear_education_Alfredo,electricity,2016-01-04,4.565,636.0,4.0,609.8,6564.0,US/Pacific,5.048507,1.987616,-0.268905,0.479493,11.089874,1008.903334,211.37704,3.909701
4,Bear_education_Alfredo,electricity,2016-01-05,4.7825,636.0,4.0,609.8,6564.0,US/Pacific,4.745567,2.007311,0.321921,1.033857,11.723586,1012.7477,170.002007,3.528571


In [11]:
# Preprocessing
def preprocess_data(data):
    data['date'] = pd.to_datetime(data['date'])
    data['year'] = data['date'].dt.year
    data['month'] = data['date'].dt.month
    data['day'] = data['date'].dt.day
    data['dayofweek'] = data['date'].dt.dayofweek
    
    categorical_features = ['meter']
    numerical_features = ['sqm', 'airTemperature', 'cloudCoverage', 'dewTemperature', 
                          'precipDepth1HR', 'precipDepth6HR', 'seaLvlPressure', 
                          'windDirection', 'windSpeed']
    
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    numerical_transformer = StandardScaler()
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    return preprocessor.fit_transform(data), data['meter_reading']  # Return processed features and target

X_train, y_train = preprocess_data(train_data)
X_test, y_test = preprocess_data(test_data)

In [58]:
meter_types = train_data['meter'].unique()  # Get unique meter types from the dataset

# Create a dictionary to hold the models and their history
models = {}
histories = {}
evaluations = {}
predictions = {}
rmses = {}


for meter_type in meter_types:
    # Segment the datasets by meter type
    train_data_meter = train_data[train_data['meter'] == meter_type]
    test_data_meter = test_data[test_data['meter'] == meter_type]
    
    # Preprocess the data
    X_train_meter, y_train_meter = preprocess_data(train_data_meter)
    X_test_meter, y_test_meter = preprocess_data(test_data_meter)
    
    # Create the model for this meter type
    input_shape = X_train_meter.shape[1]  # Get input shape from the preprocessed data
    model = create_model(input_shape)
    
    # Train the model
    print(f'Training model for {meter_type}...')
    history = model.fit(X_train_meter, y_train_meter, validation_split=0.2, epochs=50, batch_size=32)
    
    # Save the model and its history
    models[meter_type] = model
    histories[meter_type] = history
    
    # Evaluate the model
    print(f'Evaluating model for {meter_type}...')
    loss = model.evaluate(X_test_meter, y_test_meter)
    evaluations[meter_type] = loss
    
    # Generate predictions
    print(f'Predicting with model for {meter_type}...')
    prediction = model.predict(X_test_meter).flatten()  # Ensure predictions match the shape of y_test_meter
    
    # Handling NaNs in predictions or test labels, if needed
    if np.isnan(prediction).any() or np.isnan(y_test_meter).any():
        prediction = np.nan_to_num(prediction, nan=0.0)  # Replace NaNs in predictions
        y_test_meter = np.nan_to_num(y_test_meter, nan=0.0)  # Replace NaNs in test labels

    # Calculate RMSE
    rmse = mean_squared_error(y_test_meter, prediction, squared=False)
    rmses[meter_type] = rmse
    print(f'RMSE for {meter_type}: {rmse}\n')
    
    # Store the predictions along with building_id and meter
    results_df = pd.DataFrame({
        'BuildingID': test_data_meter['building_id'].values,
        'Meter': meter_type,
        'ActualMeterReading': y_test_meter,
        'PredictedMeterReading': prediction
    })
    
    # Optionally save to CSV
    results_df.to_csv(f'predictions_{meter_type}.csv', index=False)
    
    # Add to the dictionary if you want to keep in memory
    predictions[meter_type] = results_df
    
    print(f'Completed {meter_type}.\n')  # Separator for readability


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['date'] = pd.to_datetime(data['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = data['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['month'] = data['date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

Training model for electricity...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Evaluating model for electricity...
Predicting with model for electricity...




RMSE for electricity: 5643.688955366916



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['date'] = pd.to_datetime(data['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = data['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['month'] = data['date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

Completed electricity.

Training model for chilledwater...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Evaluating model for chilledwater...
Predicting with model for chilledwater...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['date'] = pd.to_datetime(data['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = data['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['month'] = data['date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

RMSE for chilledwater: 7523772.530578289

Completed chilledwater.

Training model for gas...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Evaluating model for gas...
Predicting with model for gas...
RMSE for gas: 52345.64513098371

Completed gas.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['date'] = pd.to_datetime(data['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = data['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['month'] = data['date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

Training model for hotwater...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Evaluating model for hotwater...
Predicting with model for hotwater...
RMSE for hotwater: 2393173.4056963995

Completed hotwater.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['date'] = pd.to_datetime(data['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = data['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['month'] = data['date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

Training model for solar...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Evaluating model for solar...
Predicting with model for solar...
RMSE for solar: 1506.5089486904917

Completed solar.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['date'] = pd.to_datetime(data['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = data['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['month'] = data['date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

Training model for water...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Evaluating model for water...
Predicting with model for water...
RMSE for water: 15573.747483523011

Completed water.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['date'] = pd.to_datetime(data['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = data['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['month'] = data['date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

Training model for steam...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Evaluating model for steam...
Predicting with model for steam...
RMSE for steam: 20007.570771014744

Completed steam.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['date'] = pd.to_datetime(data['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = data['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['month'] = data['date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

Training model for irrigation...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Evaluating model for irrigation...
Predicting with model for irrigation...
RMSE for irrigation: 432.687690271091

Completed irrigation.





In [59]:
rmses

{'electricity': 5643.688955366916,
 'chilledwater': 7523772.530578289,
 'gas': 52345.64513098371,
 'hotwater': 2393173.4056963995,
 'solar': 1506.5089486904917,
 'water': 15573.747483523011,
 'steam': 20007.570771014744,
 'irrigation': 432.687690271091}

### RNN

In [69]:
temp_df = pd.read_csv("../train.csv", nrows=0)  # Read only the header
total_columns = len(temp_df.columns)
columns_to_use = temp_df.columns[2:total_columns] 
train_data = pd.read_csv("../train.csv", usecols=columns_to_use)


train_data = train_data.drop(columns=[ 'site_id', 'sub_primaryspaceusage', 'season'])

test_data = pd.read_csv("../test.csv", usecols=columns_to_use)

test_data = test_data.drop(columns=[ 'site_id', 'sub_primaryspaceusage', 'season'])
train_data.head()


train_data = train_data[train_data['meter'] == 'electricity']
test_data = test_data[test_data['meter'] == 'electricity']

In [70]:
# Define categorical and numerical features from your dataset
categorical_features = ['meter']  # Add other categorical features as needed
numerical_features = ['airTemperature', 'cloudCoverage', 'dewTemperature', 'precipDepth1HR', 
                      'precipDepth6HR', 'seaLvlPressure', 'windDirection', 'windSpeed']  # Based on your dataset

# Combine all features for processing
all_features = categorical_features + numerical_features
target = 'meter_reading'

# One-hot encode categorical features and normalize numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Fit on 2016 data and transform both 2016 and 2017 data
X_2016 = preprocessor.fit_transform(train_data[all_features])
X_2017 = preprocessor.transform(test_data[all_features])

# Extract target values
y_2016 = train_data[target].values
y_2017 = test_data[target].values


In [71]:
def create_sequences(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:(i + time_steps)])
        ys.append(y[i + time_steps])
    return np.array(Xs), np.array(ys)

# Define your time_steps (e.g., 30 days)
time_steps = 30

# Create sequences from the prepared data
X_train, y_train = create_sequences(X_2016, y_2016, time_steps=time_steps)
X_test, y_test = create_sequences(X_2017, y_2017, time_steps=time_steps)


In [72]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=len(X_train)).batch(32)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)


In [74]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

# Define the model
model = Sequential([
    # SimpleRNN layer: Adjust 'units' as needed, input_shape should be [time_steps, num_features]
    SimpleRNN(units=32, input_shape=(None, X_train.shape[2]), activation='relu'),
    # Output layer: Predict the meter reading, hence one unit
    Dense(1)
])

# Compile the model: Use Mean Squared Error for regression tasks
model.compile(optimizer='adam', loss='mse', metrics=[tf.metrics.RootMeanSquaredError()])

# Display the model's architecture
model.summary()

# Train the model: Adjust epochs, batch_size as necessary
history = model.fit(
    train_ds,
    epochs=20,  # Number of epochs to train for
    validation_data=test_ds,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]  # Stop training when validation loss stops improving
)


Model: "sequential_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 32)                1344      
                                                                 
 dense_78 (Dense)            (None, 1)                 33        
                                                                 
Total params: 1377 (5.38 KB)
Trainable params: 1377 (5.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20


In [75]:
predictions_2017 = model.predict(test_ds)



In [83]:
# Extract the dates and building IDs for the days we're predicting
# Assuming the 'date' and 'building_id' for the last day of each sequence are what we're predicting for
# This part needs to be adjusted based on how you've structured your data
predicted_dates = test_data['date'].iloc[time_steps:][::time_steps].reset_index(drop=True)
predicted_building_ids = test_data['building_id'].iloc[time_steps:][::time_steps].reset_index(drop=True)

# Create a DataFrame for the predictions
predictions_df = pd.DataFrame({
    'date': predicted_dates,
    'building_id': predicted_building_ids,
    'predicted_meter_reading': predictions_2017.flatten()  # Flatten in case predictions have an extra dimension
})

# Display the head of the predictions DataFrame to check
predictions_df.head()


ValueError: array length 220430 does not match index length 7348

In [88]:
expected_sequences = len(test_data) - time_steps

if len(predictions_2017) == expected_sequences:
    predicted_dates = test_data['date'].iloc[time_steps:].reset_index(drop=True)
    predicted_building_ids = test_data['building_id'].iloc[time_steps:].reset_index(drop=True)

    predictions_df = pd.DataFrame({
        'date': predicted_dates,
        'building_id': predicted_building_ids,
        'predicted_meter_reading': predictions_2017.flatten()  # Adjust this if not one-to-one
    })
else:
    print("The lengths do not match, check sequence creation and prediction steps.")


In [92]:
actual_meter_readings = test_data.loc[test_data.index.isin(predictions_df.index), 'meter_reading']

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(actual_meter_readings, predictions_df['predicted_meter_reading']))
print("RMSE:", rmse)

ValueError: Found input variables with inconsistent numbers of samples: [95600, 220430]