# Import Statements and Magic Commands

In [1]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_log_error
# WHEN CHECKING THE MODEL: use as np.sqrt(mean_squared_log_error( y_test, predictions ))
from pandas.tseries.holiday import USFederalHolidayCalendar

---
# Load the Data

### Building and Weather Data

In [2]:
# Note: These assume that the data has been saved to a subdirectory named "energy"
bldgData = pd.read_csv("energy/building_metadata.csv")
weatherTrain = pd.read_csv("energy/weather_train.csv")
weatherTest = pd.read_csv("energy/weather_test.csv")
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat stacks the two frames the same way (each frame keeps its own row labels).
allWeather = pd.concat([weatherTrain, weatherTest])

I'm combining the weather data into one dataframe. There's really no need for it to be separate. Plus, I have a hunch that the _current_ meter reading is better modeled as a function of the average of the _current_ weather data and the _last_ weather data. In other words, how much energy a building used in the last hour is based on the weather for the last hour, not necessarily the weather at the current timestamp. If this is the case, then the first prediction in the test data will need the last weather data from the training set.

### Training Data

In [3]:
# Load the training meter readings from the "energy" subdirectory
# (columns: building_id, meter, timestamp, meter_reading).
energyTrain = pd.read_csv("energy/train.csv")

### Test Data
Warning: This will use a lot of memory! My virtual machine is sitting at 5.5 GB of RAM utilization right now.

In [4]:
# Load the rows to predict from the "energy" subdirectory. The prediction cells
# read row_id, building_id, meter and timestamp from this frame.
energyTest = pd.read_csv("energy/test.csv")

### Explore the Data

In [4]:
# Peek at the first rows of one frame at a time; uncomment a line below to
# inspect the building metadata or either end of the combined weather data.
#print(bldgData.head(5))
#print(allWeather.head(5))
#print(allWeather.tail(5))
print(energyTrain.head(5))

   building_id  meter            timestamp  meter_reading
0            0      0  2016-01-01 00:00:00            0.0
1            1      0  2016-01-01 00:00:00            0.0
2            2      0  2016-01-01 00:00:00            0.0
3            3      0  2016-01-01 00:00:00            0.0
4            4      0  2016-01-01 00:00:00            0.0


---
# 0. No Model

Just submit the "sample_submission.csv" and see what score that yields.

### Kaggle RMSLE score for No Model = _(not yet recorded)_

---
# 1.a. The "Naïve Model" - Version 1

This model computes the average usage for a `(building_id, meter, hourly time)` tuple. This version of the Naïve Model is not aware of "workdays" vs. "non-workdays" (i.e. weekends and holidays).

**Train**

In [6]:
# Maps "building_id-meter-time" -> [sum of readings, number of readings] while
# accumulating; each entry is replaced by its mean once all rows are seen.
naiveModel = {}

for rec in energyTrain.itertuples(index=False):
    # The time-of-day is the part of the timestamp after the space.
    timeOfDay = rec.timestamp.split(" ")[1]
    key = "-".join([str(rec.building_id), str(rec.meter), timeOfDay])

    tally = naiveModel.get(key)
    if tally is None:
        # First reading for this key: start the running sum and count.
        naiveModel[key] = [rec.meter_reading, 1.0]
    else:
        tally[0] += rec.meter_reading
        tally[1] += 1.0

# Collapse every [sum, count] pair into the average reading for that key.
# list(...) snapshots the items so the dict can be reassigned while looping.
for key, (totalUsed, readingCount) in list(naiveModel.items()):
    naiveModel[key] = totalUsed / readingCount

**Predict**

In [7]:
# Note: Create a subdirectory named "results" before running
# Collect one CSV line per test row and join once at the end; growing a single
# string with += inside the loop can re-copy the whole buffer on every row.
resultLines = ["row_id,meter_reading\n"]

for row in energyTest.itertuples(index=False):
    row_id = str(row.row_id)
    building_id = str(row.building_id)
    meter = str(row.meter)
    time = row.timestamp.split(" ")[1]

    # Same "building_id-meter-time" key format that the training cell builds.
    key = building_id + "-" + meter + "-" + time

    # Raises KeyError if this combination never appeared in the training data.
    pred = naiveModel[key]

    resultLines.append(row_id + "," + '{0:.4f}'.format(pred) + "\n")

naiveResults = "".join(resultLines)

# The with-block closes the file even if the write fails part-way through.
with open("results/naiveResults01.csv","w") as outFile:
    outFile.write(naiveResults)

### Kaggle RMSLE score for the Naïve Method, Version 1 = 1.46

---
# 1.b. The "Naïve Model" - Version 2

This model improves on Version 1 by computing two averages for each `(building_id, meter, hourly time)` tuple: one for "workdays" and one for "non-workdays" (i.e. weekends and holidays).

**Train**

In [8]:
cal = USFederalHolidayCalendar() # This assumes all of the buildings are in the US, but they are not.
holidays = cal.holidays(start='2016-01-01', end='2018-12-31').to_pydatetime()
# `holidays` is an array, so `date in holidays` scans every entry on each test.
# A set gives the same membership answer with a hash lookup.
holidaySet = set(holidays)

# Cache of date string -> workday flag ('1' workday, '0' weekend/holiday), so
# each calendar date is parsed and classified once instead of once per row.
workdayByDate = {}

naiveModel = {}

for row in energyTrain.itertuples(index=False):
    building_id = str(row.building_id)
    meter = str(row.meter)
    dateStr, time = row.timestamp.split(" ")
    meter_reading = row.meter_reading

    workday = workdayByDate.get(dateStr)
    if workday is None:
        date = datetime.datetime.strptime(dateStr, '%Y-%m-%d')

        # weekday() > 4 means Saturday (5) or Sunday (6).
        if (date in holidaySet) or (date.weekday() > 4):
            workday = '0'
        else:
            workday = '1'
        workdayByDate[dateStr] = workday

    key = workday + "-" + building_id + "-" + meter + "-" + time

    # Accumulate [sum of readings, number of readings] per key.
    if key in naiveModel:
        naiveModel[key][0] += meter_reading
        naiveModel[key][1] += 1.0
    else:
        naiveModel[key] = [meter_reading,1.0]

# Replace each [sum, count] pair with the average reading for that key.
for key in naiveModel.keys():
    totalUsed = naiveModel[key][0]
    readingCount = naiveModel[key][1]
    naiveModel[key] = totalUsed / readingCount

**Predict**

In [9]:
# Note: Create a subdirectory named "results" before running
# Hash-based copy of the holiday dates: `date in holidays` would otherwise scan
# the whole collection for every test row.
holidayLookup = set(holidays)

# Cache of date string -> workday flag ('1' workday, '0' weekend/holiday), so
# each calendar date is parsed and classified once instead of once per row.
workdayFlags = {}

# Collect one CSV line per test row and join once at the end; growing a single
# string with += inside the loop can re-copy the whole buffer on every row.
resultLines = ["row_id,meter_reading\n"]

for row in energyTest.itertuples(index=False):
    row_id = str(row.row_id)
    building_id = str(row.building_id)
    meter = str(row.meter)
    dateStr, time = row.timestamp.split(" ")

    workday = workdayFlags.get(dateStr)
    if workday is None:
        date = datetime.datetime.strptime(dateStr, '%Y-%m-%d')

        # weekday() > 4 means Saturday (5) or Sunday (6).
        if (date in holidayLookup) or (date.weekday() > 4):
            workday = '0'
        else:
            workday = '1'
        workdayFlags[dateStr] = workday

    # Same "workday-building_id-meter-time" key format that the training cell builds.
    key = workday + "-" + building_id + "-" + meter + "-" + time

    # Raises KeyError if this combination never appeared in the training data.
    pred = naiveModel[key]

    resultLines.append(row_id + "," + '{0:.4f}'.format(pred) + "\n")

naiveResults = "".join(resultLines)

# The with-block closes the file even if the write fails part-way through.
with open("results/naiveResults02.csv","w") as outFile:
    outFile.write(naiveResults)

### Kaggle RMSLE score for the Naïve Method, Version 2 = 1.45

---
# Intermission: Merge Training Data prior to Machine Learning

In [4]:
# Step 1: attach the building metadata to every meter reading.
trainJoinBldg = energyTrain.merge(bldgData, on='building_id')
# Step 2: attach the weather observed at the same site and timestamp.
# merge defaults to an inner join, so readings without a matching weather row
# are dropped here.
fullTraining = trainJoinBldg.merge(allWeather, on=['site_id', 'timestamp'])

# Release the source frames; only the merged frame is used from here on.
del energyTrain, trainJoinBldg, bldgData, weatherTrain, weatherTest, allWeather

In [10]:
# Distinct (building_id, meter) pairs, in order of first appearance.
# drop_duplicates keeps the first occurrence of each pair and does the work in
# one vectorised pass, instead of a list-membership scan for every row.
buildingMeters = list(
    fullTraining[['building_id', 'meter']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)

print(len(buildingMeters))

2380


In [16]:
# One (placeholder) model per (building_id, meter) pair.
trainedModels = {}

# Group the frame once up front; get_group then returns the rows for a pair
# without re-scanning the whole frame with a boolean mask on every iteration.
groupedTraining = fullTraining.groupby(['building_id', 'meter'], sort=False)

# Derive the progress denominator from the data rather than hard-coding 2380.
totalPairs = len(buildingMeters)

counter = 0
oldPercent = 0.
for building_id, meter in buildingMeters:
    buildingMeterData = groupedTraining.get_group((building_id, meter))
    # TRAIN A MODEL USING buildingMeterData
    # y_vals = buildingMeterData[meter_reading]
    # x_vals = buildingMeterData[whatever,else,we,need]
    trainedModel = "I'm a model" #training happens here
    trainedModels[(building_id, meter)] = trainedModel
    counter += 1
    # Print progress each time it has advanced by more than 5 percentage points.
    percent = 100. * counter / totalPairs
    if percent - oldPercent > 5:
        print('{0:.2f}'.format(percent))
        oldPercent = percent

print(len(trainedModels))

5.04
10.08
15.13
20.17
25.21
30.25
35.29
40.34
45.38
50.42
55.46
60.50
65.55
70.59
75.63
80.67
85.71
90.76
95.80
2380


In [8]:
# Show the (rows, columns) dimensions of the merged training frame.
print(fullTraining.shape)

(20125605, 16)


In [7]:
# Explore new dataframes
# NOTE: the commented-out lines below reference energyTrain and trainJoinBldg,
# which the merge cell deletes; they only work if run before those `del`
# statements. They report what share of rows each merge step dropped.
#print(energyTrain.shape)
#print(trainJoinBldg.shape)
#thrownOut = energyTrain.shape[0] - trainJoinBldg.shape[0]
#thrownOutPercent = 100.* thrownOut * 1./ energyTrain.shape[0]
#print("% thrown out by merging with building data: " + '{0:.2f}'.format(thrownOutPercent) + "%")
#print(fullTraining.shape)
#thrownOut = trainJoinBldg.shape[0] - fullTraining.shape[0]
#thrownOutPercent = 100.* thrownOut * 1./ trainJoinBldg.shape[0]
#print("% thrown out by merging with weather data: " + '{0:.2f}'.format(thrownOutPercent) + "%")
#print(trainJoinBldg.head(5))
#print(fullTraining.head(5))
# List the column names of the merged frame.
print(fullTraining.columns)

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed'],
      dtype='object')


`fullTraining` now has all the provided training data merged into one dataframe, i.e., each row contains the meter reading, building info, and weather for that site+timestamp.

Only 0.45% of the rows get lost when merging with the weather data. This implies that not every site+timestamp is present in the weather data.

I assume it will also be necessary to repeat these steps for the test data prior to making predictions; however, if weather data is not present for a given site+timestamp, we cannot just drop the row - we must still make a prediction.

---
# Intermission: Manipulate the Data

In [5]:
# Helper functions that derive new column values from existing columns
def getMonth(row):
    """Return the abbreviated month name (e.g. 'Jan') for a row's timestamp.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'timestamp' whose value is a string in
        '%Y-%m-%d %H:%M:%S' form.

    Returns
    -------
    str
        The '%b' rendering of the parsed timestamp.

    Raises
    ------
    ValueError
        If the timestamp string does not match the expected format.
    """
    parsed = datetime.datetime.strptime(row['timestamp'], '%Y-%m-%d %H:%M:%S')
    return format(parsed, '%b')

In [5]:
# Split the merged frame into one frame per meter code:
# 0 -> elecTrain, 1 -> chillWaterTrain, 2 -> steamTrain, 3 -> hotWaterTrain.
elecTrain, chillWaterTrain, steamTrain, hotWaterTrain = (
    fullTraining[fullTraining['meter'] == meterCode] for meterCode in range(4)
)

# The combined frame is no longer needed once the per-meter frames exist.
del fullTraining

In [8]:
# Create new columns
# Parse the whole timestamp column in one vectorised call and render it as the
# abbreviated month name ('Jan', 'Feb', ...), instead of calling strptime inside
# a Python-level iterrows loop. assign() returns a new frame, which also avoids
# writing a column onto a filtered slice of another frame.
elecTrain = elecTrain.assign(
    month=pd.to_datetime(elecTrain['timestamp'], format='%Y-%m-%d %H:%M:%S').dt.strftime('%b')
)

In [9]:
# Show the first five rows of the electricity-meter frame.
print(elecTrain.head(5))

   building_id  meter            timestamp  meter_reading  site_id  \
0            0      0  2016-01-01 00:00:00            0.0        0   
1            1      0  2016-01-01 00:00:00            0.0        0   
2            2      0  2016-01-01 00:00:00            0.0        0   
3            3      0  2016-01-01 00:00:00            0.0        0   
4            4      0  2016-01-01 00:00:00            0.0        0   

  primary_use  square_feet  year_built  floor_count  air_temperature  \
0   Education         7432      2008.0          NaN             25.0   
1   Education         2720      2004.0          NaN             25.0   
2   Education         5376      1991.0          NaN             25.0   
3   Education        23685      2002.0          NaN             25.0   
4   Education       116607      1975.0          NaN             25.0   

   cloud_coverage  dew_temperature  precip_depth_1_hr  sea_level_pressure  \
0             6.0             20.0                NaN              10

---
# 2. Linear Regression

I have a hunch that we'll need separate models for workdays and holidays/weekends.
So, we may end up needing a model for each of the following:

* workday / electricity
* workday / chilledwater
* workday / steam
* workday / hotwater
* non-workday / electricity
* non-workday / chilledwater
* non-workday / steam
* non-workday / hotwater

And we could maybe even add the `primary_use` as yet another variable if we really want/need to.