In [16]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_log_error
# WHEN CHECKING THE MODEL: use as np.sqrt(mean_squared_log_error( y_test, predictions ))
from pandas.tseries.holiday import USFederalHolidayCalendar
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import metrics
from sklearn import tree
from sklearn import preprocessing
%matplotlib inline

---
# Load the Data

### Building and Weather Data

In [2]:
# Note: These assume that the data has been saved to a subdirectory named "energy"
bldgData = pd.read_csv("energy/building_metadata.csv")
weatherTrain = pd.read_csv("energy/weather_train.csv")
weatherTest = pd.read_csv("energy/weather_test.csv")
allWeather = weatherTrain.append(weatherTest)

I'm combining the weather data into one dataframe. There's really no need for it to be separate. Plus, I have a hunch that the _current_ meter reading is better modeled as a function of the average of the _current_ weather data and the _last_ weather data. In other words, how much energy a building used in the last hour is based on the weather for the last hour, not necessarily the weather at the current timestamp. If this is the case, then the first prediction in the test data will need the last weather data from the training set.

### Training Data

In [3]:
energyTrain = pd.read_csv("energy/train.csv")

### Test Data
Warning: This will use a lot of memory! My virtual machine is sitting at 5.5 GB of RAM utilization right now.

In [4]:
energyTest = pd.read_csv("energy/test.csv")

# Decision Tree Model

In [5]:
trainJoinBldg = pd.merge(energyTrain, bldgData, on='building_id')
fullTraining = pd.merge(trainJoinBldg, allWeather, how='left', on=['site_id', 'timestamp'])

In [6]:
fullTraining5 = fullTraining
fullTraining5["timestamp"] = pd.to_datetime(fullTraining5["timestamp"])

In [8]:
#this takes a while (around 3 minutes)
fullTraining5['workday'] = [0 if x.weekday() > 4 else 1 for x in fullTraining5['timestamp']]

In [9]:
sr = fullTraining5['timestamp']
fullTraining5['hour'] = sr.dt.hour

In [10]:
airTempMean = fullTraining5['air_temperature'].mean()
fullTraining5['air_temperature'] - airTempMean
fullTraining5['air_temperature'] = fullTraining5['air_temperature'].abs()
fullTraining5['year_built'] = 2019 - fullTraining5['year_built']

In [43]:
# fullTraining5Dummies = pd.get_dummies(fullTraining5, columns=['meter', 'primary_use', 'site_id'], drop_first=True)

In [18]:
le = preprocessing.LabelEncoder()

In [19]:
le.fit(fullTraining5['primary_use'])

LabelEncoder()

In [23]:
fullTraining5['primary_use'] = le.transform(fullTraining5['primary_use'])

In [57]:
fullTraining5['primary_use'].unique().tolist()

[0, 4, 6, 1, 7, 11, 8, 9, 15, 2, 10, 3, 14, 13, 5, 12]

In [30]:
#Splitting into features and labels
X = fullTraining5[['meter', 'site_id', 'primary_use', 'square_feet',
       'year_built', 'floor_count', 'air_temperature', 'workday', 'hour']].values
y = fullTraining5['meter_reading'].values

In [31]:
#replace null values with column means
indsX = np.where(np.isnan(X))
col_mean = np.nanmean(X, axis=1)
X[indsX] = np.take(col_mean, indsX[1])

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [45]:
model = tree.DecisionTreeRegressor()
model.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [46]:
model.score(X, y)

0.46961054778219996

## Predictions for split data

In [37]:
predicted = model.predict(X_test)

In [38]:
print(predicted)

[104.5        128.5096       2.8209     ...   0.         367.58136667
 109.568     ]


In [39]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': predicted})
df1 = df.head(25)
df1

Unnamed: 0,Actual,Predicted
0,109.0,104.5
1,17.425,128.5096
2,4.1231,2.8209
3,1089.8,1042.27
4,82.944,82.944
5,65.4333,51.275
6,6.0,7.214286
7,8.7921,0.0
8,185.0,108.666667
9,0.3332,4.7481


# Testing Tree Model

In [47]:
testJoinBldg = pd.merge(energyTest, bldgData, sort=True, on='building_id')
fullTesting5 = pd.merge(testJoinBldg, allWeather, sort=True, how='left', on=['site_id', 'timestamp'])

In [48]:
fullTesting5["timestamp"] = pd.to_datetime(fullTesting5["timestamp"])

In [49]:
#this takes a while (around 3 minutes)
fullTesting5['workday'] = [0 if x.weekday() > 4 else 1 for x in fullTesting5['timestamp']]

In [50]:
sr = fullTesting5['timestamp']
fullTesting5['hour'] = sr.dt.hour

In [51]:
airTempMean = fullTesting5['air_temperature'].mean()
fullTesting5['air_temperature'] - airTempMean
fullTesting5['air_temperature'] = fullTesting5['air_temperature'].abs()
fullTesting5['year_built'] = 2019 - fullTesting5['year_built']

In [52]:
le_test = preprocessing.LabelEncoder()

In [53]:
le_test.fit(fullTesting5['primary_use'])

LabelEncoder()

In [54]:
fullTesting5['primary_use'] = le_test.transform(fullTesting5['primary_use'])

In [56]:
fullTesting5['primary_use'].unique().tolist()

[0, 4, 6, 1, 7, 11, 8, 9, 15, 2, 10, 3, 14, 13, 5, 12]

In [58]:
#Splitting into features and labels
X_test = fullTesting5[['meter', 'site_id', 'primary_use', 'square_feet',
       'year_built', 'floor_count', 'air_temperature', 'workday', 'hour']].values

In [59]:
#replace null values with column means
indsX = np.where(np.isnan(X_test))
col_mean = np.nanmean(X_test, axis=1)
X_test[indsX] = np.take(col_mean, indsX[1])

In [60]:
y_test_pred = model.predict(X_test)

In [61]:
print(y_test_pred[:5])

[ 54.83233333  19.3847       2.04766667 113.41866667 472.33      ]


In [62]:
print(X_test[:5])

[[0.00000e+00 0.00000e+00 0.00000e+00 7.43200e+03 1.10000e+01 1.00460e+03
  1.78000e+01 0.00000e+00 0.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 2.72000e+03 1.50000e+01 1.00460e+03
  1.78000e+01 0.00000e+00 0.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 5.37600e+03 2.80000e+01 1.00460e+03
  1.78000e+01 0.00000e+00 0.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 2.36850e+04 1.70000e+01 1.00460e+03
  1.78000e+01 0.00000e+00 0.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 1.16607e+05 4.40000e+01 1.00460e+03
  1.78000e+01 0.00000e+00 0.00000e+00]]


In [63]:
row_ids = fullTesting5['row_id']
pred = pd.DataFrame(y_test_pred)
tree01 = pd.concat([row_ids, pred], axis=1)
tree01.columns = ['row_id', 'meter_reading']

In [64]:
print(tree01.head(5))
print(tree01.columns)
print(tree01.shape)
print(row_ids.head(5))

   row_id  meter_reading
0       0      54.832333
1       1      19.384700
2       2       2.047667
3       3     113.418667
4       4     472.330000
Index(['row_id', 'meter_reading'], dtype='object')
(41697600, 2)
0    0
1    1
2    2
3    3
4    4
Name: row_id, dtype: int64


In [65]:
tree01.to_csv('decisionTree01.csv', index=False)

In [66]:
resultTest = pd.read_csv("decisionTree01.csv")
print(resultTest.head(5))

   row_id  meter_reading
0       0      54.832333
1       1      19.384700
2       2       2.047667
3       3     113.418667
4       4     472.330000


In [67]:
print(resultTest.shape)

(41697600, 2)
