In [1]:
# Importing the required libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,  r2_score
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load datasets

# Train Data 
temp_df = pd.read_csv("../../data/cleaned/train.csv", nrows=0)  # Read only the header
total_columns = len(temp_df.columns)
columns_to_use = temp_df.columns[1:total_columns] 
train_data = pd.read_csv("../../data/cleaned/train.csv", usecols=columns_to_use)

# Test Data 
test_data = pd.read_csv("../../data/cleaned/test.csv", usecols=columns_to_use)

# Dropping the columns that are not relevant to our analysis 
train_data = train_data.drop(columns=['building_name', 'site_name','date'])
test_data = test_data.drop(columns=['building_name', 'site_name','date'])

# Building index on building_id for furhter assessment 
train_data.set_index('building_id', inplace=True)
test_data.set_index('building_id', inplace=True)

In [3]:
# Filtering for solar meter_reading
train_data = train_data[train_data['meter'] == 'solar']
test_data = test_data[test_data['meter'] == 'solar']

train_data = train_data.drop(columns=['meter'])
test_data = test_data.drop(columns=['meter'])

In [4]:
# Inspecting the data frames
print(train_data.sample(2))
print('-------------------------------------------------------------')
print(test_data.sample(2))

             meter_reading sub_primaryspaceusage      sqm      sqft  \
building_id                                                           
72                  218.47        Student Center  17358.0  186840.0   
68                    0.00              Academic  11254.8  121146.0   

                timezone  airTemperature  cloudCoverage  dewTemperature  \
building_id                                                               
72           US/Mountain       22.499341       1.961674       13.727692   
68           US/Mountain       21.304167       1.935755       13.459868   

             precipDepth1HR  precipDepth6HR  seaLvlPressure  windDirection  \
building_id                                                                  
72                 1.130758       17.704461     1017.221018     193.535068   
68                 0.763789       12.655104     1014.832614     145.953550   

             windSpeed  season  site_id  
building_id                              
72            3.5

In [5]:
# Separating into X and Y dataframes 
X_train = train_data.drop(columns=['meter_reading'])  # Exclude target variable
y_train = train_data['meter_reading']

X_test = test_data.drop(columns=['meter_reading'])  # Exclude target variable
y_test = test_data['meter_reading']

In [6]:
# Convert 'site_id' from numeric to categorical 
X_train['site_id'] = X_train['site_id'].astype('category')
X_test['site_id'] = X_test['site_id'].astype('category')

In [7]:
print(X_train.dtypes)
print(X_train.columns)

sub_primaryspaceusage      object
sqm                       float64
sqft                      float64
timezone                   object
airTemperature            float64
cloudCoverage             float64
dewTemperature            float64
precipDepth1HR            float64
precipDepth6HR            float64
seaLvlPressure            float64
windDirection             float64
windSpeed                 float64
season                     object
site_id                  category
dtype: object
Index(['sub_primaryspaceusage', 'sqm', 'sqft', 'timezone', 'airTemperature',
       'cloudCoverage', 'dewTemperature', 'precipDepth1HR', 'precipDepth6HR',
       'seaLvlPressure', 'windDirection', 'windSpeed', 'season', 'site_id'],
      dtype='object')


In [8]:
# Define features and types based on your dataset
numerical_features = ['sqm', 'sqft', 'airTemperature', 'cloudCoverage', 'dewTemperature',
                      'precipDepth1HR', 'precipDepth6HR', 'seaLvlPressure', 'windDirection', 'windSpeed']
categorical_features = ['timezone', 'season', 'sub_primaryspaceusage', 'site_id']

In [9]:
# Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fit the preprocessor on the training data and transform both training and test data
preprocessor.fit(X_train)
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [10]:
# Convert the processed data back to dense DataFrames
X_train_processed_df = pd.DataFrame(X_train_processed, columns=preprocessor.get_feature_names_out())
X_test_processed_df = pd.DataFrame(X_test_processed, columns=preprocessor.get_feature_names_out())

In [11]:
# Checking the columns 
X_train_processed_df.columns

Index(['num__sqm', 'num__sqft', 'num__airTemperature', 'num__cloudCoverage',
       'num__dewTemperature', 'num__precipDepth1HR', 'num__precipDepth6HR',
       'num__seaLvlPressure', 'num__windDirection', 'num__windSpeed',
       'cat__timezone_US/Mountain', 'cat__season_Fall', 'cat__season_Spring',
       'cat__season_Summer', 'cat__season_Winter',
       'cat__sub_primaryspaceusage_Academic',
       'cat__sub_primaryspaceusage_Student Center', 'cat__site_id_2'],
      dtype='object')

In [12]:
# Assume y_train and y_test are Pandas Series with 'meter_reading' for training and testing datasets
scaler = MinMaxScaler()

# Fit scaler on the training set
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))

# Only transform the test set, do not fit the scaler to it to avoid data leakage
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))

### Random Forest

In [13]:
rf = RandomForestRegressor()
rf.fit(X_train_processed, y_train_scaled)

  return fit_method(estimator, *args, **kwargs)


In [14]:
y_pred = rf.predict(X_test_processed)

In [15]:
-mean_squared_error(y_pred,y_test_scaled)

-0.027587999444376166