# Import Statements and Magic Commands

In [2]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_log_error
# WHEN CHECKING THE MODEL: use as np.sqrt(mean_squared_log_error( y_test, predictions ))
from pandas.tseries.holiday import USFederalHolidayCalendar

---
# Load the Data

### Building and Weather Data

In [6]:
# Note: These assume that the data has been saved to a subdirectory named "energy"
bldgData = pd.read_csv("energy/building_metadata.csv")
weatherTrain = pd.read_csv("energy/weather_train.csv")
weatherTest = pd.read_csv("energy/weather_test.csv")
allWeather = weatherTrain.append(weatherTest)

I'm combining the weather data into one dataframe. There's really no need for it to be separate. Plus, I have a hunch that the _current_ meter reading is better modeled as a function of the average of the _current_ weather data and the _last_ weather data. In other words, how much energy a building used in the last hour is based on the weather for the last hour, not necessarily the weather at the current timestamp. If this is the case, then the first prediction in the test data will need the last weather data from the training set.

### Training Data

In [7]:
energyTrain = pd.read_csv("energy/train.csv")

### Test Data
Warning: This will use a lot of memory! My virtual machine is sitting at 5.5 GB of RAM utilization right now.

In [8]:
energyTest = pd.read_csv("energy/test.csv")

### Explore the Data

In [9]:
print(bldgData.head(5))
print(allWeather.head(5))
#print(allWeather.tail(5))
print(energyTrain.head(5))

   site_id  building_id primary_use  square_feet  year_built  floor_count
0        0            0   Education         7432      2008.0          NaN
1        0            1   Education         2720      2004.0          NaN
2        0            2   Education         5376      1991.0          NaN
3        0            3   Education        23685      2002.0          NaN
4        0            4   Education       116607      1975.0          NaN
   site_id            timestamp  air_temperature  cloud_coverage  \
0        0  2016-01-01 00:00:00             25.0             6.0   
1        0  2016-01-01 01:00:00             24.4             NaN   
2        0  2016-01-01 02:00:00             22.8             2.0   
3        0  2016-01-01 03:00:00             21.1             2.0   
4        0  2016-01-01 04:00:00             20.0             2.0   

   dew_temperature  precip_depth_1_hr  sea_level_pressure  wind_direction  \
0             20.0                NaN              1019.7             

---
# 0. No Model

Just submit the "sample_submission.csv" and see what score that yields.

### Kaggle RMSLE score for No Model = 

---
# 1.a. The "Naïve Model" - Version 1

This model computes the average usage for a `(building_id, meter, hourly time)` tuple. This version of the Naïve Model is not aware of "workdays" vs. "non-workdays" (i.e. weekends and holidays).

**Train**

In [10]:
naiveModel = {}

for row in energyTrain.itertuples(index=False):
    building_id = str(row.building_id)
    meter = str(row.meter)
    time = row.timestamp.split(" ")[1]
    meter_reading = row.meter_reading
    
    key = building_id + "-" + meter + "-" + time
    
    if key in naiveModel:
        naiveModel[key][0] += meter_reading
        naiveModel[key][1] += 1.0
    else:
        naiveModel[key] = [meter_reading,1.0]
        
for key in naiveModel.keys():
    totalUsed = naiveModel[key][0]
    readingCount = naiveModel[key][1]
    naiveModel[key] = totalUsed / readingCount

print(naiveModel)

{'0-0-00:00:00': 146.7318142076503, '1-0-00:00:00': 74.31176448087434, '2-0-00:00:00': 7.909866120218582, '3-0-00:00:00': 226.09059754098354, '4-0-00:00:00': 939.8014480874317, '5-0-00:00:00': 14.828896994535565, '6-0-00:00:00': 64.51872540983607, '7-0-00:00:00': 299.99732513661206, '8-0-00:00:00': 229.94260655737693, '9-0-00:00:00': 56.95684808743167, '10-0-00:00:00': 824.1969316939891, '11-0-00:00:00': 292.1693169398905, '12-0-00:00:00': 150.7227442622951, '13-0-00:00:00': 184.7947841530054, '14-0-00:00:00': 178.3496420765027, '15-0-00:00:00': 118.53797267759558, '16-0-00:00:00': 802.9102158469941, '17-0-00:00:00': 59.86387650273227, '18-0-00:00:00': 1223.8417213114758, '19-0-00:00:00': 117.4764262295082, '20-0-00:00:00': 266.0975983606555, '21-0-00:00:00': 44.489502185792375, '22-0-00:00:00': 16.675217213114756, '23-0-00:00:00': 490.98085792349747, '24-0-00:00:00': 411.4271584699452, '25-0-00:00:00': 162.3849890710384, '26-0-00:00:00': 43.1120644808743, '27-0-00:00:00': 177.49827868

**Predict**

In [11]:
# Note: Create a subdirectory named "results" before running
naiveResults = "row_id,meter_reading\n"

for row in energyTest.itertuples(index=False):
    row_id = str(row.row_id)
    building_id = str(row.building_id)
    meter = str(row.meter)
    time = row.timestamp.split(" ")[1]
    
    key = building_id + "-" + meter + "-" + time
    
    pred = naiveModel[key]

    naiveResults += row_id + "," + '{0:.4f}'.format(pred) + "\n"
    
outFile = open("results/naiveResults01.csv","w")
outFile.write(naiveResults)
outFile.close()

In [17]:
resultTest = pd.read_csv("results/naiveResults01.csv")
print(resultTest.head(5))

   row_id  meter_reading
0       0       146.7318
1       1        74.3118
2       2         7.9099
3       3       226.0906
4       4       939.8014


### Kaggle RMSLE score for the Naïve Method, Version 1 = 1.46

---
# 1.b. The "Naïve Model" - Version 2

This model improves on Version 1 by computing two averages for each `(building_id, meter, hourly time)` tuple: one for "workdays" and one for "non-workdays" (i.e. weekends and holidays).

**Train**

In [18]:
cal = USFederalHolidayCalendar() # This assumes all of the buildings are in the US, but they are not.
holidays = cal.holidays(start='2016-01-01', end='2018-12-31').to_pydatetime()

naiveModel = {}

for row in energyTrain.itertuples(index=False):
    building_id = str(row.building_id)
    meter = str(row.meter)
    dateStr, time = row.timestamp.split(" ")
    meter_reading = row.meter_reading
    
    date = datetime.datetime.strptime(dateStr, '%Y-%m-%d')
    
    if (date in holidays) or (date.weekday() > 4):
        workday = '0'
    else:
        workday = '1'
    
    key = workday + "-" + building_id + "-" + meter + "-" + time
    
    if key in naiveModel:
        naiveModel[key][0] += meter_reading
        naiveModel[key][1] += 1.0
    else:
        naiveModel[key] = [meter_reading,1.0] 
        
for key in naiveModel.keys():
    totalUsed = naiveModel[key][0]
    readingCount = naiveModel[key][1]
    naiveModel[key] = totalUsed / readingCount

KeyboardInterrupt: 

**Predict**

In [15]:
# Note: Create a subdirectory named "results" before running
naiveResults = "row_id,meter_reading\n"

for row in energyTest.itertuples(index=False):
    row_id = str(row.row_id)
    building_id = str(row.building_id)
    meter = str(row.meter)
    dateStr, time = row.timestamp.split(" ")
    
    date = datetime.datetime.strptime(dateStr, '%Y-%m-%d')
    
    if (date in holidays) or (date.weekday() > 4):
        workday = '0'
    else:
        workday = '1'
    
    key = workday + "-" + building_id + "-" + meter + "-" + time
    
    pred = naiveModel[key]

    naiveResults += row_id + "," + '{0:.4f}'.format(pred) + "\n"
    
    print(naiveResults)
    
outFile = open("results/naiveResults02.csv","w")
outFile.write(naiveResults)
outFile.close()

NameError: name 'holidays' is not defined

### Kaggle RMSLE score for the Naïve Method, Version 2 = 1.45

---
# Intermission: Merge Training Data prior to Machine Learning

In [12]:
trainJoinBldg = pd.merge(energyTrain, bldgData, on='building_id')
fullTraining = pd.merge(trainJoinBldg, allWeather, on=['site_id', 'timestamp'])

In [21]:
# Explore new dataframes
print(energyTrain.shape)
print(trainJoinBldg.shape)
thrownOut = energyTrain.shape[0] - trainJoinBldg.shape[0]
thrownOutPercent = 100.* thrownOut * 1./ energyTrain.shape[0]
print("% thrown out by merging with building data: " + '{0:.2f}'.format(thrownOutPercent) + "%")
print(fullTraining.shape)
thrownOut = trainJoinBldg.shape[0] - fullTraining.shape[0]
thrownOutPercent = 100.* thrownOut * 1./ trainJoinBldg.shape[0]
print("% thrown out by merging with weather data: " + '{0:.2f}'.format(thrownOutPercent) + "%")
#print(trainJoinBldg.head(5))
#print(fullTraining.head(5))
print(fullTraining.columns)

(20216100, 4)
(20216100, 9)
% thrown out by merging with building data: 0.00%
(20125605, 16)
% thrown out by merging with weather data: 0.45%
Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed'],
      dtype='object')


`fullTraining` now has all the provided training data merged into one dataframe. i.e. Each row contains the meter reading, building info, and weather for that site+timestamp all in one row.

Only 0.45% of the rows get lost when merging with the weather data. This implies that not every site+timestamp is present in the weather data.

I assume it will also be necessary to repeat these steps for the test data prior to making predictions; however, if weather data is not present for a given site+timestamp, we cannot just drop the row - we must still make a prediction.

---
# 2. Linear Regression

I have a hunch that we'll need separate models for workdays and holidays/weekends.
So, we may end up needing a model for each of the following:

* workday / electricity
* workday / chilledwater
* workday / steam
* workday / hotwater
* non-workday / electricity
* non-workday / chilledwater
* non-workday / steam
* non-workday / hotwater

And we could maybe even add the `primary_use` as yet another variable if we really want/need to.