In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import random

from scipy.stats import kruskal
from sklearn.manifold import TSNE

from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import StratifiedShuffleSplit

# Fix the seeds of both global random number generators (NumPy's legacy global
# state and the stdlib `random` module) so that stochastic steps are repeatable
# when the notebook is run top to bottom.
np.random.seed(0)
random.seed(0)

In [None]:
# List the contents of the data folder that the loading cell reads its CSV files from.
# NOTE(review): `dir` is a shell command and is not available in every shell/OS —
# confirm it runs on the target platform.
!dir ashrae-energy-prediction

# Load and Merge Data 
This code assumes that the `ashrae-energy-prediction` data folder is located in the same directory as the notebook.

In [None]:
# Read the three raw tables (building metadata, meter readings, weather) from the
# data folder that sits next to the notebook, in that order.
building, train, weather_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./ashrae-energy-prediction/building_metadata.csv",
        "./ashrae-energy-prediction/train.csv",
        "./ashrae-energy-prediction/weather_train.csv",
    )
)

In [None]:
# Summary statistics of the building metadata table (`describe` with default arguments).
building.describe()

In [None]:
# Summary statistics of the meter-reading table (`describe` with default arguments).
train.describe()

In [None]:
# Summary statistics of the weather table (`describe` with default arguments).
weather_train.describe()

In [None]:
# Build one flat frame: start from the building metadata, attach every meter
# reading of each building, then attach the weather observed at the building's
# site at the reading's timestamp. Left joins keep every row of the left-hand side.
df = (
    building
    .merge(train, how='left', on='building_id')
    .merge(weather_train, how='left', on=['site_id', 'timestamp'])
)

# The source tables are no longer needed once merged; drop the names to free memory.
del building, train, weather_train

In [None]:
# Pairwise correlations are only defined for numeric data. Selecting the numeric
# columns explicitly keeps this cell working on pandas >= 2.0, where `DataFrame.corr()`
# no longer silently drops non-numeric columns (e.g. the still-unparsed `timestamp`)
# and raises instead.
numeric_corr = df.select_dtypes(include=[np.number]).corr()

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(numeric_corr, ax=ax)
ax.set_title("Pre-Feature Engineering Feature Correlations")
plt.show()

# Basic Feature Engineering / Manipulation

In [None]:
# Log-transform the size, precipitation and target columns. `log1p` produces -inf
# for an input of exactly -1 and NaN for inputs below -1 or missing inputs.
df['log_square_feet'] = np.log1p(df['square_feet'])
df['log_precip_depth_1_hr'] = np.log1p(df['precip_depth_1_hr'])
df['log_meter_reading'] = np.log1p(df['meter_reading'])


# Fill any infinity or NaN values in the log features with zero.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that chained in-place form operates on a
# temporary Series, which triggers a FutureWarning on pandas 2.x and leaves `df`
# unchanged when Copy-on-Write is enabled.
df['log_square_feet'] = df['log_square_feet'].replace([np.inf, -np.inf, np.nan], 0)
df['log_precip_depth_1_hr'] = df['log_precip_depth_1_hr'].replace([np.inf, -np.inf, np.nan], 0)

# Get time (granular down to the hour; the year is not extracted)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour
df['day'] = df['timestamp'].dt.day
df['month'] = df['timestamp'].dt.month
df['weekday'] = df['timestamp'].dt.dayofweek

# Track weather metrics means (to detect spikes)
weather_features = ['cloud_coverage', 'dew_temperature', 'air_temperature',
                    'sea_level_pressure', 'wind_direction', 'wind_speed', 'precip_depth_1_hr',]

# Mean of every weather metric for each (hour of day, month, site) combination.
hourly_by_site = df.groupby(['hour', 'month', 'site_id'])[weather_features].mean().reset_index()

# Attach the group means next to the raw values; only the right-hand (mean) columns
# receive the '_hourly_by_site' suffix.
df = df.merge(hourly_by_site, on=['hour', 'month', 'site_id'], how='left', suffixes=(None, '_hourly_by_site'))
del hourly_by_site

# Deviation of each observation from its (hour, month, site) mean.
for feature in weather_features:
    df[feature + "_diff_hourly_from_mean"] = df[feature] - df[feature + "_hourly_by_site"]

# The helper mean columns are only needed to compute the deviations above.
df = df.drop(columns = [feat + "_hourly_by_site" for feat in weather_features])

# Map meter values to their true name 
# df['meter'] = df['meter'].replace({
#     0: 'electricity',
#     1: 'chilledwater',
#     2: 'steam',
#     3: 'hotwater'
# })

In [None]:
# Preview the first rows of the merged frame, including the engineered columns.
df.head()

In [None]:
# Histogram (log-scaled counts) of every numeric feature, laid out on a grid.
# The grid is sized from the number of numeric columns so that no feature is
# dropped when columns are added, and non-numeric columns are skipped explicitly
# rather than by catching whatever error the plot call happens to raise.
N_GRID_COLS = 7
numeric_feats = df.select_dtypes(include=[np.number]).columns
n_grid_rows = max(1, -(-len(numeric_feats) // N_GRID_COLS))  # ceiling division

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(n_grid_rows, N_GRID_COLS, figsize=(20, 2 * n_grid_rows), squeeze=False)
flat_axes = axes.ravel()

# Generate a histogram for each feature
i = 0
for feat in df.columns:
    if feat not in numeric_feats:
        print(f"Skipping {feat}")
        continue

    sub_axis = flat_axes[i]
    try:
        df[feat].plot.hist(ax=sub_axis)
    except (TypeError, ValueError) as err:
        # Best effort: report the column that cannot be plotted and reuse its axis
        # for the next feature, but let any other kind of error surface.
        print(f"Skipping {feat}: {err}")
        sub_axis.clear()
        continue

    sub_axis.set_yscale('log')
    sub_axis.set_title(feat)
    i += 1

# Hide the grid cells that did not receive a histogram.
for unused_axis in flat_axes[i:]:
    unused_axis.set_visible(False)

plt.suptitle("Distributions ")
plt.show()

# Hypothesis Testing 
Given the non-Gaussian nature of the data, I used a non-parametric test (the Kruskal–Wallis H-test) to compare feature distributions across buildings.

In [None]:
# Compare metrics across building IDs 
# for feature in df.columns:
#     _, pval = kruskal(
#         *[df[df['building_id'] == sid][feature].dropna() for sid in df['building_id'].unique()]
#     )
#     print(feature, pval)

In [None]:
# The data isn't necessarily linear, so visualize it with t-SNE
# for meter in df['meter'].unique():
#     data = df[df['meter'] == meter]
#     tsne = TSNE(n_jobs=4)
#     transformed_df = tsne.fit_transform(data)
#     plt.scatter(transformed_df[:,0], transformed_df[:,1], c=data['building_id'], cmap='viridis')
#     plt.title(f"TSNE {meter}")
#     plt.colorbar()
#     plt.show()

# Random Forest
Intuition: the climate differs from month to month, so the model should be trained on 75% of each month's data and tested on the remaining 25%. This is achieved using a stratified shuffle split, where the month is treated as the class.

In [None]:
# Evaluate a random forest on repeated month-stratified 75% / 25% splits.
TARGET = 'meter_reading'
# Columns computed from the target; using them as predictors would leak the answer.
TARGET_DERIVED = ['log_meter_reading']
# Placeholder written into missing feature values so the forest can be fitted on
# scikit-learn versions without native NaN support.
# NOTE(review): assumes no feature legitimately takes this value — confirm.
MISSING_SENTINEL = -999

# A row needs a target to be scored and a month to be stratified on.
model_df = df.dropna(subset=[TARGET, 'month'])

# The estimator only accepts numeric input, so non-numeric columns (such as the
# datetime `timestamp`, already represented by hour/day/month/weekday) are excluded
# together with the target and anything derived from it.
feature_cols = [
    col for col in model_df.select_dtypes(include=[np.number]).columns
    if col != TARGET and col not in TARGET_DERIVED
]

splitter = StratifiedShuffleSplit(
    n_splits=4,
    test_size=0.25,   # without this the default split is 90% / 10%
    random_state=0,   # same splits every time the cell is run
)

n_trees = 50
for split, (train_idx, test_idx) in enumerate(splitter.split(model_df, model_df['month']), start=1):
    train, test = model_df.iloc[train_idx], model_df.iloc[test_idx]

    regressor = RandomForestRegressor(
        n_estimators=n_trees,
        random_state=0,
        max_samples=0.1   # each tree is fitted on a 10% bootstrap sample of the training rows
    )

    regressor.fit(train[feature_cols].fillna(MISSING_SENTINEL), train[TARGET])

    y_hat = regressor.predict(test[feature_cols].fillna(MISSING_SENTINEL))

    print(f"Split {split}: MSE = {mean_squared_error(test[TARGET], y_hat)}")