# Import Statements and Magic Commands

In [1]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
#from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
#from xgboost import XGBRegressor
#from sklearn import linear_model
#from sklearn.metrics import mean_squared_log_error
# WHEN CHECKING THE MODEL: use as np.sqrt(mean_squared_log_error( y_test, predictions ))
from pandas.tseries.holiday import USFederalHolidayCalendar
%matplotlib inline

---
# Load the Data

### Building and Weather Data

In [6]:
# Note: These assume that the data has been saved to a subdirectory named "energy"
bldgData = pd.read_csv("energy/building_metadata.csv")
weatherTrain = pd.read_csv("energy/weather_train.csv")
weatherTest = pd.read_csv("energy/weather_test.csv")
allWeather = weatherTrain.append(weatherTest)

I'm combining the weather data into one dataframe. There's really no need for it to be separate. Plus, I have a hunch that the _current_ meter reading is better modeled as a function of the average of the _current_ weather data and the _last_ weather data. In other words, how much energy a building used in the last hour is based on the weather for the last hour, not necessarily the weather at the current timestamp. If this is the case, then the first prediction in the test data will need the last weather data from the training set.

### Training Data

In [7]:
energyTrain = pd.read_csv("energy/train.csv")

### Test Data
Warning: This will use a lot of memory! My virtual machine is sitting at 5.5 GB of RAM utilization right now.

In [8]:
energyTest = pd.read_csv("energy/test.csv")

### Explore the Data

In [4]:
#print(bldgData.head(5))
#print(allWeather.head(5))
#print(allWeather.tail(5))
print(energyTrain.head(5))

   building_id  meter            timestamp  meter_reading
0            0      0  2016-01-01 00:00:00            0.0
1            1      0  2016-01-01 00:00:00            0.0
2            2      0  2016-01-01 00:00:00            0.0
3            3      0  2016-01-01 00:00:00            0.0
4            4      0  2016-01-01 00:00:00            0.0


---
# 0. No Model

Just submit the "sample_submission.csv" and see what score that yields.

### Kaggle RMSLE score for No Model = 

---
# 1.a. The "Naïve Model" - Version 1

This model computes the average usage for a `(building_id, meter, hourly time)` tuple. This version of the Naïve Model is not aware of "workdays" vs. "non-workdays" (i.e. weekends and holidays).

**Train**

In [6]:
naiveModel = {}

for row in energyTrain.itertuples(index=False):
    building_id = str(row.building_id)
    meter = str(row.meter)
    time = row.timestamp.split(" ")[1]
    meter_reading = row.meter_reading
    
    key = building_id + "-" + meter + "-" + time
    
    if key in naiveModel:
        naiveModel[key][0] += meter_reading
        naiveModel[key][1] += 1.0
    else:
        naiveModel[key] = [meter_reading,1.0]
        
for key in naiveModel.keys():
    totalUsed = naiveModel[key][0]
    readingCount = naiveModel[key][1]
    naiveModel[key] = totalUsed / readingCount

**Predict**

In [7]:
# Note: Create a subdirectory named "results" before running
naiveResults = "row_id,meter_reading\n"

for row in energyTest.itertuples(index=False):
    row_id = str(row.row_id)
    building_id = str(row.building_id)
    meter = str(row.meter)
    time = row.timestamp.split(" ")[1]
    
    key = building_id + "-" + meter + "-" + time
    
    pred = naiveModel[key]

    naiveResults += row_id + "," + '{0:.4f}'.format(pred) + "\n"
    
outFile = open("results/naiveResults01.csv","w")
outFile.write(naiveResults)
outFile.close()

### Kaggle RMSLE score for the Naïve Method, Version 1 = 1.46

---
# 1.b. The "Naïve Model" - Version 2

This model improves on Version 1 by computing two averages for each `(building_id, meter, hourly time)` tuple: one for "workdays" and one for "non-workdays" (i.e. weekends and holidays).

**Train**

In [8]:
cal = USFederalHolidayCalendar() # This assumes all of the buildings are in the US, but they are not.
holidays = cal.holidays(start='2016-01-01', end='2018-12-31').to_pydatetime()

naiveModel = {}

for row in energyTrain.itertuples(index=False):
    building_id = str(row.building_id)
    meter = str(row.meter)
    dateStr, time = row.timestamp.split(" ")
    meter_reading = row.meter_reading
    
    date = datetime.datetime.strptime(dateStr, '%Y-%m-%d')
    
    if (date in holidays) or (date.weekday() > 4):
        workday = '0'
    else:
        workday = '1'
    
    key = workday + "-" + building_id + "-" + meter + "-" + time
    
    if key in naiveModel:
        naiveModel[key][0] += meter_reading
        naiveModel[key][1] += 1.0
    else:
        naiveModel[key] = [meter_reading,1.0] 
        
for key in naiveModel.keys():
    totalUsed = naiveModel[key][0]
    readingCount = naiveModel[key][1]
    naiveModel[key] = totalUsed / readingCount

**Predict**

In [9]:
# Note: Create a subdirectory named "results" before running
naiveResults = "row_id,meter_reading\n"

for row in energyTest.itertuples(index=False):
    row_id = str(row.row_id)
    building_id = str(row.building_id)
    meter = str(row.meter)
    dateStr, time = row.timestamp.split(" ")
    
    date = datetime.datetime.strptime(dateStr, '%Y-%m-%d')
    
    if (date in holidays) or (date.weekday() > 4):
        workday = '0'
    else:
        workday = '1'
    
    key = workday + "-" + building_id + "-" + meter + "-" + time
    
    pred = naiveModel[key]

    naiveResults += row_id + "," + '{0:.4f}'.format(pred) + "\n"
    
outFile = open("results/naiveResults02.csv","w")
outFile.write(naiveResults)
outFile.close()

### Kaggle RMSLE score for the Naïve Method, Version 2 = 1.45

---
# 2.a. Decision Tree Regressor - Version 1

In [8]:
trainJoinBldg = pd.merge(energyTrain, bldgData, on='building_id')
fullTraining = pd.merge(trainJoinBldg, allWeather, on=['site_id', 'timestamp'])

del energyTrain
del trainJoinBldg
del bldgData
del weatherTrain
del weatherTest
del allWeather

In [9]:
print(fullTraining.shape)
fullTraining = fullTraining[fullTraining.meter_reading != 0.0]
print(fullTraining.shape)

slimTraining = fullTraining[['building_id', 'meter', 'timestamp', 'meter_reading']]

del fullTraining

print(slimTraining.shape)

(20125605, 16)
(18257718, 16)
(18257718, 5)


In [10]:
slimTraining["hour"] = ""
slimTraining["month"] = ""
slimTraining["dayOfWeek"] = ""

for i, row in slimTraining.iterrows():
    slimTraining.at[i,'hour'] = row['timestamp'].split(" ")[1]
    date = datetime.datetime.strptime(row['timestamp'], '%Y-%m-%d %H:%M:%S')
    slimTraining.at[i,'month'] = date.strftime('%b')
    slimTraining.at[i,'dayOfWeek'] = str(date.weekday())
    
slimTraining.drop("timestamp", axis=1, inplace=True)

In [13]:
print(slimTraining.shape)
slimTraining.to_csv("custom/hourMonthDayofweek.csv")

(18257718, 7)


In [2]:
slimTraining = pd.read_csv("custom/hourMonthDayofweek.csv")
slimTraining.head(5)

Unnamed: 0.1,Unnamed: 0,building_id,meter,meter_reading,air_temperature,hour,month,dayOfWeek
0,45,46,0,53.2397,25.0,00:00:00,Jan,4
1,72,74,0,43.0013,25.0,00:00:00,Jan,4
2,91,93,0,52.4206,25.0,00:00:00,Jan,4
3,123,20,0,91.9886,24.4,01:00:00,Jan,4
4,148,46,0,53.6492,24.4,01:00:00,Jan,4


In [3]:
buildingMeters = []

for row in slimTraining.itertuples(index=False):
    if (row.building_id, row.meter) not in buildingMeters:
        buildingMeters.append((row.building_id, row.meter))
    
print(len(buildingMeters))

2380


In [4]:
readyToTrain = pd.get_dummies(slimTraining, columns=['hour', 'month', 'dayOfWeek'])

In [5]:
readyToTrain.head(5)

Unnamed: 0.1,Unnamed: 0,building_id,meter,meter_reading,air_temperature,hour_00:00:00,hour_01:00:00,hour_02:00:00,hour_03:00:00,hour_04:00:00,...,month_Nov,month_Oct,month_Sep,dayOfWeek_0,dayOfWeek_1,dayOfWeek_2,dayOfWeek_3,dayOfWeek_4,dayOfWeek_5,dayOfWeek_6
0,45,46,0,53.2397,25.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,72,74,0,43.0013,25.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,91,93,0,52.4206,25.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,123,20,0,91.9886,24.4,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,148,46,0,53.6492,24.4,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
trainedModels = {}

counter = 0
oldPercent = 0.
for building_id, meter in buildingMeters:
    buildingMeterData = readyToTrain[(readyToTrain['building_id'] == building_id) & (readyToTrain['meter'] == meter)]
    # TRAIN A MODEL USING buildingMeterData
    y = buildingMeterData['meter_reading']
    X = buildingMeterData[[
        'hour_00:00:00',
        'hour_01:00:00',
        'hour_02:00:00',
        'hour_03:00:00',
        'hour_04:00:00',
        'hour_05:00:00',
        'hour_06:00:00',
        'hour_07:00:00',
        'hour_08:00:00',
        'hour_09:00:00',
        'hour_10:00:00',
        'hour_11:00:00',
        'hour_12:00:00',
        'hour_13:00:00',
        'hour_14:00:00',
        'hour_15:00:00',
        'hour_16:00:00',
        'hour_17:00:00',
        'hour_18:00:00',
        'hour_19:00:00',
        'hour_20:00:00',
        'hour_21:00:00',
        'hour_22:00:00',
        'hour_23:00:00',
        'month_Jan',
        'month_Feb',
        'month_Mar',
        'month_Apr',
        'month_May',
        'month_Jun',
        'month_Jul',
        'month_Aug',
        'month_Sep',
        'month_Oct',
        'month_Nov',
        'month_Dec',
        'dayOfWeek_0',
        'dayOfWeek_1',
        'dayOfWeek_2',
        'dayOfWeek_3',
        'dayOfWeek_4',
        'dayOfWeek_5',
        'dayOfWeek_6'
    ]]
    dtRegressor = tree.DecisionTreeRegressor()
    trainedModels[(building_id, meter)] = dtRegressor.fit(X, y)
    counter += 1
    percent = 100. * counter / 2380
    if percent - oldPercent > 5:
        print('{0:.2f}'.format(percent))
        oldPercent = percent

print(len(trainedModels))

5.04
10.08
15.13
20.17
25.21
30.25
35.29
40.34
45.38
50.42
55.46
60.50
65.55
70.59
75.63
80.67
85.71
90.76
95.80
2380


In [7]:
#print(trainedModels[(93,0)].feature_importances_)
del slimTraining
del readyToTrain

In [9]:
print(energyTest.shape)

(41697600, 4)


In [14]:
# Note: Create a subdirectory named "results" before running
dtrResults = "row_id,meter_reading\n"

counter = 0
oldPercent = 0.
for row in energyTest.itertuples(index=False):
    row_id = str(row.row_id)
    building_id = row.building_id
    meter = row.meter
    dateStr, hour = row.timestamp.split(" ")
    
    date = datetime.datetime.strptime(dateStr, '%Y-%m-%d')
    
    month = date.strftime('%b')
    dayOfWeek = str(date.weekday())
    
    features = pd.Series(0, index=[
        'hour_00:00:00',
        'hour_01:00:00',
        'hour_02:00:00',
        'hour_03:00:00',
        'hour_04:00:00',
        'hour_05:00:00',
        'hour_06:00:00',
        'hour_07:00:00',
        'hour_08:00:00',
        'hour_09:00:00',
        'hour_10:00:00',
        'hour_11:00:00',
        'hour_12:00:00',
        'hour_13:00:00',
        'hour_14:00:00',
        'hour_15:00:00',
        'hour_16:00:00',
        'hour_17:00:00',
        'hour_18:00:00',
        'hour_19:00:00',
        'hour_20:00:00',
        'hour_21:00:00',
        'hour_22:00:00',
        'hour_23:00:00',
        'month_Jan',
        'month_Feb',
        'month_Mar',
        'month_Apr',
        'month_May',
        'month_Jun',
        'month_Jul',
        'month_Aug',
        'month_Sep',
        'month_Oct',
        'month_Nov',
        'month_Dec',
        'dayOfWeek_0',
        'dayOfWeek_1',
        'dayOfWeek_2',
        'dayOfWeek_3',
        'dayOfWeek_4',
        'dayOfWeek_5',
        'dayOfWeek_6'
    ])
    
    features['hour_' + hour] = 1
    features['month_' + month] = 1
    features['dayOfWeek_' + dayOfWeek] = 1
    
    pred = trainedModels[(building_id, meter)].predict([features])[0]

    dtrResults += row_id + "," + '{0:.4f}'.format(pred) + "\n"
    
    counter += 1
    percent = 100. * counter / 41697600
    if percent - oldPercent > 5:
        print('{0:.2f}'.format(percent))
        oldPercent = percent
    
outFile = open("results/decisionTreeRegressor.csv","w")
outFile.write(dtrResults)
outFile.close()

5.00
10.00
15.00
20.00
25.00
30.00
35.00
40.00
45.00
50.00
55.00
60.00
65.00
70.00
75.00
80.00
85.00
90.00
95.00


### Kaggle RMSLE score for the Decision Tree Regressor, Version 1 = 1.38

---
# 2.b. Decision Tree Regressor - Version 2

In [2]:
slimTraining = pd.read_csv("custom/hourMonthDayofweek.csv")
slimTraining.head(5)

Unnamed: 0.1,Unnamed: 0,building_id,meter,meter_reading,air_temperature,hour,month,dayOfWeek
0,45,46,0,53.2397,25.0,00:00:00,Jan,4
1,72,74,0,43.0013,25.0,00:00:00,Jan,4
2,91,93,0,52.4206,25.0,00:00:00,Jan,4
3,123,20,0,91.9886,24.4,01:00:00,Jan,4
4,148,46,0,53.6492,24.4,01:00:00,Jan,4


In [3]:
buildingMeters = []

for row in slimTraining.itertuples(index=False):
    if (row.building_id, row.meter) not in buildingMeters:
        buildingMeters.append((row.building_id, row.meter))
    
print(len(buildingMeters))

2380


In [4]:
readyToTrain = pd.get_dummies(slimTraining, columns=['hour', 'month', 'dayOfWeek'])

In [5]:
readyToTrain.head(5)

Unnamed: 0.1,Unnamed: 0,building_id,meter,meter_reading,air_temperature,hour_00:00:00,hour_01:00:00,hour_02:00:00,hour_03:00:00,hour_04:00:00,...,month_Nov,month_Oct,month_Sep,dayOfWeek_0,dayOfWeek_1,dayOfWeek_2,dayOfWeek_3,dayOfWeek_4,dayOfWeek_5,dayOfWeek_6
0,45,46,0,53.2397,25.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,72,74,0,43.0013,25.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,91,93,0,52.4206,25.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,123,20,0,91.9886,24.4,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,148,46,0,53.6492,24.4,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
print(readyToTrain.shape)
readyToTrain = readyToTrain[pd.notnull(readyToTrain['air_temperature'])]
print(readyToTrain.shape)

(18257718, 48)
(18252676, 48)


In [9]:
trainedModels = {}

counter = 0
oldPercent = 0.
for building_id, meter in buildingMeters:
    buildingMeterData = readyToTrain[(readyToTrain['building_id'] == building_id) & (readyToTrain['meter'] == meter)]
    # TRAIN A MODEL USING buildingMeterData
    y = buildingMeterData['meter_reading']
    X = buildingMeterData[[
        'air_temperature',
        'hour_00:00:00',
        'hour_01:00:00',
        'hour_02:00:00',
        'hour_03:00:00',
        'hour_04:00:00',
        'hour_05:00:00',
        'hour_06:00:00',
        'hour_07:00:00',
        'hour_08:00:00',
        'hour_09:00:00',
        'hour_10:00:00',
        'hour_11:00:00',
        'hour_12:00:00',
        'hour_13:00:00',
        'hour_14:00:00',
        'hour_15:00:00',
        'hour_16:00:00',
        'hour_17:00:00',
        'hour_18:00:00',
        'hour_19:00:00',
        'hour_20:00:00',
        'hour_21:00:00',
        'hour_22:00:00',
        'hour_23:00:00',
        'month_Jan',
        'month_Feb',
        'month_Mar',
        'month_Apr',
        'month_May',
        'month_Jun',
        'month_Jul',
        'month_Aug',
        'month_Sep',
        'month_Oct',
        'month_Nov',
        'month_Dec',
        'dayOfWeek_0',
        'dayOfWeek_1',
        'dayOfWeek_2',
        'dayOfWeek_3',
        'dayOfWeek_4',
        'dayOfWeek_5',
        'dayOfWeek_6'
    ]]
    dtRegressor = tree.DecisionTreeRegressor()
    trainedModels[(building_id, meter)] = dtRegressor.fit(X, y)
    counter += 1
    percent = 100. * counter / 2380
    if percent - oldPercent > 5:
        print('{0:.2f}'.format(percent))
        oldPercent = percent

print(len(trainedModels))

5.04
10.08
15.13
20.17
25.21
30.25
35.29
40.34
45.38
50.42
55.46
60.50
65.55
70.59
75.63
80.67
85.71
90.76
95.80
2380


In [12]:
print(trainedModels[(46,0)].feature_importances_)
#del slimTraining
#del readyToTrain

[3.93925253e-01 1.03464868e-02 7.34604696e-03 4.23519032e-03
 3.11844798e-03 3.81030062e-03 3.08926658e-03 5.25449611e-03
 9.26175569e-03 7.13031299e-03 7.71541332e-03 7.72026709e-03
 7.75983110e-03 9.52487387e-03 8.98468725e-03 9.25170509e-03
 1.14937883e-02 9.79922901e-03 8.38481383e-03 6.63852892e-03
 4.09198792e-02 7.50164560e-03 6.10773359e-03 7.39284342e-03
 9.56555767e-03 1.76016391e-01 7.97085490e-03 2.87905758e-04
 0.00000000e+00 9.70105676e-03 3.25079957e-03 2.44662761e-03
 3.87167277e-02 2.78061799e-03 5.28982929e-03 4.08859964e-03
 4.82367443e-03 1.26597327e-02 1.80941843e-02 2.56455750e-02
 2.20017978e-02 1.93342865e-02 2.17269423e-02 1.88860431e-02]


In [13]:
energyTest = pd.read_csv("energy/test.csv")
print(energyTest.shape)

bldgData = pd.read_csv("energy/building_metadata.csv")
weatherTest = pd.read_csv("energy/weather_test.csv")

testJoinBldg = pd.merge(energyTest, bldgData, on='building_id')
fullTest = pd.merge(testJoinBldg, weatherTest, on=['site_id', 'timestamp'])

del energyTest
del testJoinBldg
del bldgData
del weatherTest

print(fullTest.shape)

(41697600, 4)
(41498571, 16)


In [15]:
print(fullTest.shape)
fullTest = fullTest[pd.notnull(fullTest['air_temperature'])]
print(fullTest.shape)

(41498571, 16)
(41475699, 16)


In [16]:
# Note: Create a subdirectory named "results" before running
dtrResults = "row_id,meter_reading\n"

counter = 0
oldPercent = 0.
for row in fullTest.itertuples(index=False):
    row_id = str(row.row_id)
    building_id = row.building_id
    meter = row.meter
    air_temperature = row.air_temperature
    dateStr, hour = row.timestamp.split(" ")
    
    date = datetime.datetime.strptime(dateStr, '%Y-%m-%d')
    
    month = date.strftime('%b')
    dayOfWeek = str(date.weekday())
    
    features = pd.Series(0, index=[
        'air_temperature',
        'hour_00:00:00',
        'hour_01:00:00',
        'hour_02:00:00',
        'hour_03:00:00',
        'hour_04:00:00',
        'hour_05:00:00',
        'hour_06:00:00',
        'hour_07:00:00',
        'hour_08:00:00',
        'hour_09:00:00',
        'hour_10:00:00',
        'hour_11:00:00',
        'hour_12:00:00',
        'hour_13:00:00',
        'hour_14:00:00',
        'hour_15:00:00',
        'hour_16:00:00',
        'hour_17:00:00',
        'hour_18:00:00',
        'hour_19:00:00',
        'hour_20:00:00',
        'hour_21:00:00',
        'hour_22:00:00',
        'hour_23:00:00',
        'month_Jan',
        'month_Feb',
        'month_Mar',
        'month_Apr',
        'month_May',
        'month_Jun',
        'month_Jul',
        'month_Aug',
        'month_Sep',
        'month_Oct',
        'month_Nov',
        'month_Dec',
        'dayOfWeek_0',
        'dayOfWeek_1',
        'dayOfWeek_2',
        'dayOfWeek_3',
        'dayOfWeek_4',
        'dayOfWeek_5',
        'dayOfWeek_6'
    ])
    
    features['air_temperature'] = air_temperature
    features['hour_' + hour] = 1
    features['month_' + month] = 1
    features['dayOfWeek_' + dayOfWeek] = 1
    
    pred = trainedModels[(building_id, meter)].predict([features])[0]

    dtrResults += row_id + "," + '{0:.4f}'.format(pred) + "\n"
    
    counter += 1
    percent = 100. * counter / 41475699
    if percent - oldPercent > 1:
        print('{0:.2f}'.format(percent))
        oldPercent = percent
    
outFile = open("results/decisionTreeRegressor02.csv","w")
outFile.write(dtrResults)
outFile.close()

1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
25.00
26.00
27.00
28.00
29.00
30.00
31.00
32.00
33.00
34.00
35.00
36.00
37.00
38.00
39.00
40.00
41.00
42.00
43.00
44.00
45.00
46.00
47.00
48.00
49.00
50.00
51.00
52.00
53.00
54.00
55.00
56.00
57.00
58.00
59.00
60.00
61.00
62.00
63.00
64.00
65.00
66.00
67.00
68.00
69.00
70.00
71.00
72.00
73.00
74.00
75.00
76.00
77.00
78.00
79.00
80.00
81.00
82.00
83.00
84.00
85.00
86.00
87.00
88.00
89.00
90.00
91.00
92.00
93.00
94.00
95.00
96.00
97.00
98.00
99.00


In [17]:
# Merge DTR2 results with DTR1 (to fill in missing predictions)
dtr1results = pd.read_csv("results/decisionTreeRegressor.csv")
dtr2results = pd.read_csv("results/decisionTreeRegressor02.csv")

dtr2results.rename(columns={"meter_reading":"meter_reading2"}, inplace=True)
dtr2results.head(5)


Unnamed: 0,row_id,meter_reading2
0,0,37.5408
1,1,14.1973
2,2,1.6381
3,3,336.502
4,4,86.0026


In [18]:
mergedResults = pd.merge(dtr1results, dtr2results, on='row_id', how='left')
mergedResults.tail(5)

Unnamed: 0,row_id,meter_reading,meter_reading2
41697595,41697595,4.475,
41697596,41697596,5.3375,
41697597,41697597,10.775,
41697598,41697598,170.0938,
41697599,41697599,4.4625,


In [19]:
mergedResults.head(10)

Unnamed: 0,row_id,meter_reading,meter_reading2
0,0,37.5408,37.5408
1,1,13.1052,14.1973
2,2,1.6381,1.6381
3,3,385.817,336.502
4,4,86.0026,86.0026
5,5,32.8311,4.4366
6,6,91.5723,99.1077
7,7,489.9072,526.937
8,8,981.3908,886.418
9,9,379.5888,478.133


In [20]:
mergedResults.drop("meter_reading", axis=1, inplace=True)
mergedResults.rename(columns={"meter_reading2":"meter_reading"}, inplace=True)
mergedResults.head(10)

Unnamed: 0,row_id,meter_reading
0,0,37.5408
1,1,14.1973
2,2,1.6381
3,3,336.502
4,4,86.0026
5,5,4.4366
6,6,99.1077
7,7,526.937
8,8,886.418
9,9,478.133


In [21]:
mergedResults.tail(5)

Unnamed: 0,row_id,meter_reading
41697595,41697595,
41697596,41697596,
41697597,41697597,
41697598,41697598,
41697599,41697599,


In [22]:
dtr1results.update(mergedResults)
print(dtr1results.shape)

(41697600, 2)


In [23]:
dtr1results.head(10)

Unnamed: 0,row_id,meter_reading
0,0,37.5408
1,1,14.1973
2,2,1.6381
3,3,336.502
4,4,86.0026
5,5,4.4366
6,6,99.1077
7,7,526.937
8,8,886.418
9,9,478.133


In [24]:
dtr1results.tail(10)

Unnamed: 0,row_id,meter_reading
41697590,41697590,297.875
41697591,41697591,298.0062
41697592,41697592,84.1
41697593,41697593,35.291
41697594,41697594,80.0625
41697595,41697595,4.475
41697596,41697596,5.3375
41697597,41697597,10.775
41697598,41697598,170.0938
41697599,41697599,4.4625


In [26]:
dtr1results.to_csv("results/decisionTreeRegressor02_merged.csv", index=False)

### Kaggle RMSLE score for the Decision Tree Regressor, Version 2 = 1.34

---
# 3. Random Forest Regressor

In [2]:
energyTrain = pd.read_csv("energy/train.csv")
bldgData = pd.read_csv("energy/building_metadata.csv")
weatherTrain = pd.read_csv("energy/weather_train.csv")

trainJoinBldg = pd.merge(energyTrain, bldgData, on='building_id')
fullTraining = pd.merge(trainJoinBldg, weatherTrain, on=['site_id', 'timestamp'])

del energyTrain
del bldgData
del weatherTrain
del trainJoinBldg

print(fullTraining.shape)

KeyboardInterrupt: 

In [21]:
#print(fullTraining.shape)
#fullTraining = fullTraining[fullTraining.meter_reading != 0.0]
#print(fullTraining.shape)

In [22]:
#fullTraining.head(5)

In [3]:
fullTraining["hour"] = ""
fullTraining["month"] = ""
fullTraining["dayOfWeek"] = ""

counter = 0
oldPercent = 0.
for i, row in fullTraining.iterrows():
    fullTraining.at[i,'hour'] = row['timestamp'].split(" ")[1]
    date = datetime.datetime.strptime(row['timestamp'], '%Y-%m-%d %H:%M:%S')
    fullTraining.at[i,'month'] = date.strftime('%b')
    fullTraining.at[i,'dayOfWeek'] = str(date.weekday())
    
    counter += 1
    percent = 100. * counter / 20125605
    if percent - oldPercent > 1:
        print('{0:.2f}'.format(percent))
        oldPercent = percent

1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
25.00
26.00
27.00
28.00
29.00
30.00
31.00
32.00
33.00
34.00
35.00
36.00
37.00
38.00
39.00
40.00
41.00
42.00
43.00
44.00
45.00
46.00
47.00
48.00
49.00
50.00
51.00
52.00
53.00
54.00
55.00
56.00
57.00
58.00
59.00
60.00
61.00
62.00
63.00
64.00
65.00
66.00
67.00
68.00
69.00
70.00
71.00
72.00
73.00
74.00
75.00
76.00
77.00
78.00
79.00
80.00
81.00
82.00
83.00
84.00
85.00
86.00
87.00
88.00
89.00
90.00
91.00
92.00
93.00
94.00
95.00
96.00
97.00
98.00
99.00


In [4]:
fullTraining.head(5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,month,dayOfWeek
0,0,0,2016-01-01 00:00:00,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,00:00:00,Jan,4
1,1,0,2016-01-01 00:00:00,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,00:00:00,Jan,4
2,2,0,2016-01-01 00:00:00,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,00:00:00,Jan,4
3,3,0,2016-01-01 00:00:00,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,00:00:00,Jan,4
4,4,0,2016-01-01 00:00:00,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,00:00:00,Jan,4


In [5]:
print(fullTraining.shape)
fullTraining.to_csv("custom/FULL_hourMonthDayofweek.csv")

(20125605, 19)


In [2]:
fullTraining = pd.read_csv("custom/FULL_hourMonthDayofweek.csv")

In [3]:
fullTraining = fullTraining[fullTraining.meter_reading != 0.0]
fullTraining = fullTraining[pd.notnull(fullTraining['air_temperature'])]
fullTraining.drop("timestamp", axis=1, inplace=True)
fullTraining.drop("site_id", axis=1, inplace=True)
fullTraining.drop("primary_use", axis=1, inplace=True)
fullTraining.drop("square_feet", axis=1, inplace=True)
fullTraining.drop("year_built", axis=1, inplace=True)
fullTraining.drop("floor_count", axis=1, inplace=True)
fullTraining.drop("sea_level_pressure", axis=1, inplace=True)
fullTraining.drop("dew_temperature", axis=1, inplace=True)
fullTraining.fillna(0, inplace=True)
print(fullTraining.shape)

(18252676, 12)


In [7]:
fullTraining.head(5)

Unnamed: 0.1,Unnamed: 0,building_id,meter,meter_reading,air_temperature,cloud_coverage,precip_depth_1_hr,wind_direction,wind_speed,hour,month,dayOfWeek
45,45,46,0,53.2397,25.0,6.0,0.0,0.0,0.0,00:00:00,Jan,4
72,72,74,0,43.0013,25.0,6.0,0.0,0.0,0.0,00:00:00,Jan,4
91,91,93,0,52.4206,25.0,6.0,0.0,0.0,0.0,00:00:00,Jan,4
123,123,20,0,91.9886,24.4,0.0,-1.0,70.0,1.5,01:00:00,Jan,4
148,148,46,0,53.6492,24.4,0.0,-1.0,70.0,1.5,01:00:00,Jan,4


In [4]:
buildingMeters = []

for row in fullTraining.itertuples(index=False):
    if (row.building_id, row.meter) not in buildingMeters:
        buildingMeters.append((row.building_id, row.meter))
    
print(len(buildingMeters))

2380


In [5]:
readyToTrain = pd.get_dummies(fullTraining, columns=['hour', 'month', 'dayOfWeek'])

In [11]:
readyToTrain.head(5)

Unnamed: 0.1,Unnamed: 0,building_id,meter,meter_reading,air_temperature,cloud_coverage,precip_depth_1_hr,wind_direction,wind_speed,hour_00:00:00,...,month_Nov,month_Oct,month_Sep,dayOfWeek_0,dayOfWeek_1,dayOfWeek_2,dayOfWeek_3,dayOfWeek_4,dayOfWeek_5,dayOfWeek_6
45,45,46,0,53.2397,25.0,6.0,0.0,0.0,0.0,1,...,0,0,0,0,0,0,0,1,0,0
72,72,74,0,43.0013,25.0,6.0,0.0,0.0,0.0,1,...,0,0,0,0,0,0,0,1,0,0
91,91,93,0,52.4206,25.0,6.0,0.0,0.0,0.0,1,...,0,0,0,0,0,0,0,1,0,0
123,123,20,0,91.9886,24.4,0.0,-1.0,70.0,1.5,0,...,0,0,0,0,0,0,0,1,0,0
148,148,46,0,53.6492,24.4,0.0,-1.0,70.0,1.5,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
trainedModels = {}

counter = 0
oldPercent = 0.
for building_id, meter in buildingMeters:
    buildingMeterData = readyToTrain[(readyToTrain['building_id'] == building_id) & (readyToTrain['meter'] == meter)]
    # TRAIN A MODEL USING buildingMeterData
    y = buildingMeterData['meter_reading']
    X = buildingMeterData[[
        'air_temperature',
        'cloud_coverage',
        'precip_depth_1_hr',
        'wind_speed',
        'hour_00:00:00',
        'hour_01:00:00',
        'hour_02:00:00',
        'hour_03:00:00',
        'hour_04:00:00',
        'hour_05:00:00',
        'hour_06:00:00',
        'hour_07:00:00',
        'hour_08:00:00',
        'hour_09:00:00',
        'hour_10:00:00',
        'hour_11:00:00',
        'hour_12:00:00',
        'hour_13:00:00',
        'hour_14:00:00',
        'hour_15:00:00',
        'hour_16:00:00',
        'hour_17:00:00',
        'hour_18:00:00',
        'hour_19:00:00',
        'hour_20:00:00',
        'hour_21:00:00',
        'hour_22:00:00',
        'hour_23:00:00',
        'month_Jan',
        'month_Feb',
        'month_Mar',
        'month_Apr',
        'month_May',
        'month_Jun',
        'month_Jul',
        'month_Aug',
        'month_Sep',
        'month_Oct',
        'month_Nov',
        'month_Dec',
        'dayOfWeek_0',
        'dayOfWeek_1',
        'dayOfWeek_2',
        'dayOfWeek_3',
        'dayOfWeek_4',
        'dayOfWeek_5',
        'dayOfWeek_6'
    ]]
    rf = RandomForestRegressor(n_estimators = 10)
    trainedModels[(building_id, meter)] = rf.fit(X, y)
    counter += 1
    percent = 100. * counter / 2380
    if percent - oldPercent > 1:
        print('{0:.2f}'.format(percent))
        oldPercent = percent

print(len(trainedModels))

1.01
2.02
3.03
4.03
5.04
6.05
7.06
8.07
9.08
10.08
11.09
12.10
13.11
14.12
15.13
16.13
17.14
18.15
19.16
20.17
21.18
22.18
23.19
24.20
25.21
26.22
27.23
28.24
29.24
30.25
31.26
32.27
33.28
34.29
35.29
36.30
37.31
38.32
39.33
40.34
41.34
42.35
43.36
44.37
45.38
46.39
47.39
48.40
49.41
50.42
51.43
52.44
53.45
54.45
55.46
56.47
57.48
58.49
59.50
60.50
61.51
62.52
63.53
64.54
65.55
66.55
67.56
68.57
69.58
70.59
71.60
72.61
73.61
74.62
75.63
76.64
77.65
78.66
79.66
80.67
81.68
82.69
83.70
84.71
85.71
86.72
87.73
88.74
89.75
90.76
91.76
92.77
93.78
94.79
95.80
96.81
97.82
98.82
99.83
2380


In [7]:
del fullTraining
del readyToTrain

In [8]:
energyTest = pd.read_csv("energy/test.csv")
print(energyTest.shape)

bldgData = pd.read_csv("energy/building_metadata.csv")
bldgData.drop("primary_use", axis=1, inplace=True)
bldgData.drop("square_feet", axis=1, inplace=True)
bldgData.drop("year_built", axis=1, inplace=True)
bldgData.drop("floor_count", axis=1, inplace=True)

weatherTest = pd.read_csv("energy/weather_test.csv")
weatherTest.drop("sea_level_pressure", axis=1, inplace=True)
weatherTest.drop("dew_temperature", axis=1, inplace=True)

testJoinBldg = pd.merge(energyTest, bldgData, on='building_id')
fullTest = pd.merge(testJoinBldg, weatherTest, on=['site_id', 'timestamp'])

del energyTest
del testJoinBldg
del bldgData
del weatherTest

print(fullTest.shape)

(41697600, 4)
(41498571, 10)


In [9]:
print(fullTest.shape)
fullTest = fullTest[pd.notnull(fullTest['air_temperature'])]
fullTest.drop("site_id", axis=1, inplace=True)
fullTest.fillna(0, inplace=True)
print(fullTest.shape)

(41498571, 10)
(41475699, 9)


In [10]:
# Note: Create a subdirectory named "results" before running
rfResults = "row_id,meter_reading\n"

counter = 0
oldPercent = 0.
for row in fullTest.itertuples(index=False):
    row_id = str(row.row_id)
    building_id = row.building_id
    meter = row.meter
    air_temperature = row.air_temperature
    cloud_coverage = row.cloud_coverage
    precip_depth_1_hr = row.precip_depth_1_hr
    wind_speed = row.wind_speed
    dateStr, hour = row.timestamp.split(" ")
    
    date = datetime.datetime.strptime(dateStr, '%Y-%m-%d')
    
    month = date.strftime('%b')
    dayOfWeek = str(date.weekday())
    
    features = pd.Series(0, index=[
        'air_temperature',
        'cloud_coverage',
        'precip_depth_1_hr',
        'wind_speed',
        'hour_00:00:00',
        'hour_01:00:00',
        'hour_02:00:00',
        'hour_03:00:00',
        'hour_04:00:00',
        'hour_05:00:00',
        'hour_06:00:00',
        'hour_07:00:00',
        'hour_08:00:00',
        'hour_09:00:00',
        'hour_10:00:00',
        'hour_11:00:00',
        'hour_12:00:00',
        'hour_13:00:00',
        'hour_14:00:00',
        'hour_15:00:00',
        'hour_16:00:00',
        'hour_17:00:00',
        'hour_18:00:00',
        'hour_19:00:00',
        'hour_20:00:00',
        'hour_21:00:00',
        'hour_22:00:00',
        'hour_23:00:00',
        'month_Jan',
        'month_Feb',
        'month_Mar',
        'month_Apr',
        'month_May',
        'month_Jun',
        'month_Jul',
        'month_Aug',
        'month_Sep',
        'month_Oct',
        'month_Nov',
        'month_Dec',
        'dayOfWeek_0',
        'dayOfWeek_1',
        'dayOfWeek_2',
        'dayOfWeek_3',
        'dayOfWeek_4',
        'dayOfWeek_5',
        'dayOfWeek_6'
    ])
    
    features['air_temperature'] = air_temperature
    features['cloud_coverage'] = cloud_coverage
    features['precip_depth_1_hr'] = precip_depth_1_hr
    features['wind_speed'] = wind_speed
    features['hour_' + hour] = 1
    features['month_' + month] = 1
    features['dayOfWeek_' + dayOfWeek] = 1
    
    pred = trainedModels[(building_id, meter)].predict([features])[0]

    rfResults += row_id + "," + '{0:.4f}'.format(pred) + "\n"
    
    counter += 1
    percent = 100. * counter / 41475699
    if percent - oldPercent > 1:
        print('{0:.2f}'.format(percent))
        oldPercent = percent
    
outFile = open("results/randomForestRegressor.csv","w")
outFile.write(dtrResults)
outFile.close()

1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
25.00
26.00
27.00
28.00
29.00
30.00
31.00
32.00
33.00
34.00
35.00
36.00
37.00
38.00
39.00
40.00
41.00
42.00
43.00
44.00
45.00
46.00
47.00
48.00
49.00
50.00
51.00
52.00
53.00
54.00
55.00
56.00
57.00
58.00
59.00
60.00
61.00
62.00
63.00
64.00
65.00
66.00
67.00
68.00
69.00
70.00
71.00
72.00
73.00
74.00
75.00
76.00
77.00
78.00
79.00
80.00
81.00
82.00
83.00
84.00
85.00
86.00
87.00
88.00
89.00
90.00
91.00
92.00
93.00
94.00
95.00
96.00
97.00
98.00
99.00


NameError: name 'dtrResults' is not defined

In [11]:
outFile.write(rfResults)
outFile.close()

In [2]:
# Merge DTR2 results with DTR1 (to fill in missing predictions)
dtr1results = pd.read_csv("results/decisionTreeRegressor02_merged.csv")
dtr2results = pd.read_csv("results/randomForestRegressor.csv")

dtr2results.rename(columns={"meter_reading":"meter_reading2"}, inplace=True)
dtr2results.head(5)


Unnamed: 0,row_id,meter_reading2
0,0,94.398
1,1,23.6712
2,2,4.7916
3,3,343.6349
4,4,535.4001


In [3]:
mergedResults = pd.merge(dtr1results, dtr2results, on='row_id', how='left')
mergedResults.tail(5)

Unnamed: 0,row_id,meter_reading,meter_reading2
41697595,41697595,4.475,
41697596,41697596,5.3375,
41697597,41697597,10.775,
41697598,41697598,170.0938,
41697599,41697599,4.4625,


In [4]:
mergedResults.head(10)

Unnamed: 0,row_id,meter_reading,meter_reading2
0,0,37.5408,94.398
1,1,14.1973,23.6712
2,2,1.6381,4.7916
3,3,336.502,343.6349
4,4,86.0026,535.4001
5,5,4.4366,5.5629
6,6,99.1077,109.9195
7,7,526.937,543.7276
8,8,886.418,1287.4163
9,9,478.133,457.759


In [5]:
mergedResults.drop("meter_reading", axis=1, inplace=True)
mergedResults.rename(columns={"meter_reading2":"meter_reading"}, inplace=True)
mergedResults.head(10)

Unnamed: 0,row_id,meter_reading
0,0,94.398
1,1,23.6712
2,2,4.7916
3,3,343.6349
4,4,535.4001
5,5,5.5629
6,6,109.9195
7,7,543.7276
8,8,1287.4163
9,9,457.759


In [6]:
mergedResults.tail(5)

Unnamed: 0,row_id,meter_reading
41697595,41697595,
41697596,41697596,
41697597,41697597,
41697598,41697598,
41697599,41697599,


In [7]:
dtr1results.update(mergedResults)
print(dtr1results.shape)

(41697600, 2)


In [8]:
dtr1results.head(10)

Unnamed: 0,row_id,meter_reading
0,0,94.398
1,1,23.6712
2,2,4.7916
3,3,343.6349
4,4,535.4001
5,5,5.5629
6,6,109.9195
7,7,543.7276
8,8,1287.4163
9,9,457.759


In [9]:
dtr1results.tail(10)

Unnamed: 0,row_id,meter_reading
41697590,41697590,297.875
41697591,41697591,298.0062
41697592,41697592,84.1
41697593,41697593,35.291
41697594,41697594,80.0625
41697595,41697595,4.475
41697596,41697596,5.3375
41697597,41697597,10.775
41697598,41697598,170.0938
41697599,41697599,4.4625


In [10]:
dtr1results.to_csv("results/randomForestRegressor_merged.csv", index=False)

### Kaggle RMSLE score for the Random Forest Regressor = 1.35

---
# 3.b. Random Forest Regressor - Version 2

In [3]:
fullTraining = pd.read_csv("custom/FULL_hourMonthDayofweek.csv")

In [4]:
fullTraining = fullTraining[fullTraining.meter_reading != 0.0]
fullTraining = fullTraining[pd.notnull(fullTraining['air_temperature'])]
fullTraining.drop("timestamp", axis=1, inplace=True)
fullTraining.drop("site_id", axis=1, inplace=True)
fullTraining.drop("primary_use", axis=1, inplace=True)
fullTraining.drop("square_feet", axis=1, inplace=True)
fullTraining.drop("year_built", axis=1, inplace=True)
fullTraining.drop("floor_count", axis=1, inplace=True)
fullTraining.drop("cloud_coverage", axis=1, inplace=True)
fullTraining.drop("precip_depth_1_hr", axis=1, inplace=True)
fullTraining.drop("wind_direction", axis=1, inplace=True)
fullTraining.drop("wind_speed", axis=1, inplace=True)
fullTraining.drop("sea_level_pressure", axis=1, inplace=True)
fullTraining.drop("dew_temperature", axis=1, inplace=True)
fullTraining.fillna(0, inplace=True)
print(fullTraining.shape)

(18252676, 8)


In [5]:
fullTraining.head(5)

Unnamed: 0.1,Unnamed: 0,building_id,meter,meter_reading,air_temperature,hour,month,dayOfWeek
45,45,46,0,53.2397,25.0,00:00:00,Jan,4
72,72,74,0,43.0013,25.0,00:00:00,Jan,4
91,91,93,0,52.4206,25.0,00:00:00,Jan,4
123,123,20,0,91.9886,24.4,01:00:00,Jan,4
148,148,46,0,53.6492,24.4,01:00:00,Jan,4


In [6]:
buildingMeters = []

for row in fullTraining.itertuples(index=False):
    if (row.building_id, row.meter) not in buildingMeters:
        buildingMeters.append((row.building_id, row.meter))
    
print(len(buildingMeters))

2380


In [7]:
readyToTrain = pd.get_dummies(fullTraining, columns=['hour', 'month', 'dayOfWeek'])

In [8]:
readyToTrain.head(5)

Unnamed: 0.1,Unnamed: 0,building_id,meter,meter_reading,air_temperature,hour_00:00:00,hour_01:00:00,hour_02:00:00,hour_03:00:00,hour_04:00:00,...,month_Nov,month_Oct,month_Sep,dayOfWeek_0,dayOfWeek_1,dayOfWeek_2,dayOfWeek_3,dayOfWeek_4,dayOfWeek_5,dayOfWeek_6
45,45,46,0,53.2397,25.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
72,72,74,0,43.0013,25.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
91,91,93,0,52.4206,25.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
123,123,20,0,91.9886,24.4,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
148,148,46,0,53.6492,24.4,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
trainedModels = {}

counter = 0
oldPercent = 0.
for building_id, meter in buildingMeters:
    buildingMeterData = readyToTrain[(readyToTrain['building_id'] == building_id) & (readyToTrain['meter'] == meter)]
    # TRAIN A MODEL USING buildingMeterData
    y = buildingMeterData['meter_reading']
    X = buildingMeterData[[
        'air_temperature',
        'hour_00:00:00',
        'hour_01:00:00',
        'hour_02:00:00',
        'hour_03:00:00',
        'hour_04:00:00',
        'hour_05:00:00',
        'hour_06:00:00',
        'hour_07:00:00',
        'hour_08:00:00',
        'hour_09:00:00',
        'hour_10:00:00',
        'hour_11:00:00',
        'hour_12:00:00',
        'hour_13:00:00',
        'hour_14:00:00',
        'hour_15:00:00',
        'hour_16:00:00',
        'hour_17:00:00',
        'hour_18:00:00',
        'hour_19:00:00',
        'hour_20:00:00',
        'hour_21:00:00',
        'hour_22:00:00',
        'hour_23:00:00',
        'month_Jan',
        'month_Feb',
        'month_Mar',
        'month_Apr',
        'month_May',
        'month_Jun',
        'month_Jul',
        'month_Aug',
        'month_Sep',
        'month_Oct',
        'month_Nov',
        'month_Dec',
        'dayOfWeek_0',
        'dayOfWeek_1',
        'dayOfWeek_2',
        'dayOfWeek_3',
        'dayOfWeek_4',
        'dayOfWeek_5',
        'dayOfWeek_6'
    ]]
    rf = RandomForestRegressor(n_estimators = 10)
    trainedModels[(building_id, meter)] = rf.fit(X, y)
    counter += 1
    percent = 100. * counter / 2380
    if percent - oldPercent > 1:
        print('{0:.2f}'.format(percent))
        oldPercent = percent

print(len(trainedModels))

1.01
2.02
3.03
4.03
5.04
6.05
7.06
8.07
9.08
10.08
11.09
12.10
13.11
14.12
15.13
16.13
17.14
18.15
19.16
20.17
21.18
22.18
23.19
24.20
25.21
26.22
27.23
28.24
29.24
30.25
31.26
32.27
33.28
34.29
35.29
36.30
37.31
38.32
39.33
40.34
41.34
42.35
43.36
44.37
45.38
46.39
47.39
48.40
49.41
50.42
51.43
52.44
53.45
54.45
55.46
56.47
57.48
58.49
59.50
60.50
61.51
62.52
63.53
64.54
65.55
66.55
67.56
68.57
69.58
70.59
71.60
72.61
73.61
74.62
75.63
76.64
77.65
78.66
79.66
80.67
81.68
82.69
83.70
84.71
85.71
86.72
87.73
88.74
89.75
90.76
91.76
92.77
93.78
94.79
95.80
96.81
97.82
98.82
99.83
2380


In [10]:
del fullTraining
del readyToTrain

In [11]:
energyTest = pd.read_csv("energy/test.csv")
print(energyTest.shape)

bldgData = pd.read_csv("energy/building_metadata.csv")
bldgData.drop("primary_use", axis=1, inplace=True)
bldgData.drop("square_feet", axis=1, inplace=True)
bldgData.drop("year_built", axis=1, inplace=True)
bldgData.drop("floor_count", axis=1, inplace=True)

weatherTest = pd.read_csv("energy/weather_test.csv")
weatherTest.drop("cloud_coverage", axis=1, inplace=True)
weatherTest.drop("precip_depth_1_hr", axis=1, inplace=True)
weatherTest.drop("wind_direction", axis=1, inplace=True)
weatherTest.drop("wind_speed", axis=1, inplace=True)
weatherTest.drop("sea_level_pressure", axis=1, inplace=True)
weatherTest.drop("dew_temperature", axis=1, inplace=True)

testJoinBldg = pd.merge(energyTest, bldgData, on='building_id')
fullTest = pd.merge(testJoinBldg, weatherTest, on=['site_id', 'timestamp'])

del energyTest
del testJoinBldg
del bldgData
del weatherTest

print(fullTest.shape)

(41697600, 4)
(41498571, 6)


In [12]:
print(fullTest.shape)
fullTest = fullTest[pd.notnull(fullTest['air_temperature'])]
fullTest.drop("site_id", axis=1, inplace=True)
fullTest.fillna(0, inplace=True)
print(fullTest.shape)

(41498571, 6)
(41475699, 5)


In [None]:
# Note: Create a subdirectory named "results" before running
rfResults = "row_id,meter_reading\n"

counter = 0
oldPercent = 0.
for row in fullTest.itertuples(index=False):
    row_id = str(row.row_id)
    building_id = row.building_id
    meter = row.meter
    air_temperature = row.air_temperature
    dateStr, hour = row.timestamp.split(" ")
    
    date = datetime.datetime.strptime(dateStr, '%Y-%m-%d')
    
    month = date.strftime('%b')
    dayOfWeek = str(date.weekday())
    
    features = pd.Series(0, index=[
        'air_temperature',
        'hour_00:00:00',
        'hour_01:00:00',
        'hour_02:00:00',
        'hour_03:00:00',
        'hour_04:00:00',
        'hour_05:00:00',
        'hour_06:00:00',
        'hour_07:00:00',
        'hour_08:00:00',
        'hour_09:00:00',
        'hour_10:00:00',
        'hour_11:00:00',
        'hour_12:00:00',
        'hour_13:00:00',
        'hour_14:00:00',
        'hour_15:00:00',
        'hour_16:00:00',
        'hour_17:00:00',
        'hour_18:00:00',
        'hour_19:00:00',
        'hour_20:00:00',
        'hour_21:00:00',
        'hour_22:00:00',
        'hour_23:00:00',
        'month_Jan',
        'month_Feb',
        'month_Mar',
        'month_Apr',
        'month_May',
        'month_Jun',
        'month_Jul',
        'month_Aug',
        'month_Sep',
        'month_Oct',
        'month_Nov',
        'month_Dec',
        'dayOfWeek_0',
        'dayOfWeek_1',
        'dayOfWeek_2',
        'dayOfWeek_3',
        'dayOfWeek_4',
        'dayOfWeek_5',
        'dayOfWeek_6'
    ])
    
    features['air_temperature'] = air_temperature
    features['hour_' + hour] = 1
    features['month_' + month] = 1
    features['dayOfWeek_' + dayOfWeek] = 1
    
    pred = trainedModels[(building_id, meter)].predict([features])[0]

    rfResults += row_id + "," + '{0:.4f}'.format(pred) + "\n"
    
    counter += 1
    percent = 100. * counter / 41475699
    if percent - oldPercent > 1:
        print('{0:.2f}'.format(percent))
        oldPercent = percent
    
outFile = open("results/randomForestRegressor_02.csv","w")
outFile.write(rfResults)
outFile.close()

1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
25.00
26.00


In [11]:
outFile.write(rfResults)
outFile.close()

In [2]:
# Merge DTR2 results with DTR1 (to fill in missing predictions)
dtr1results = pd.read_csv("results/decisionTreeRegressor02_merged.csv")
dtr2results = pd.read_csv("results/randomForestRegressor_02.csv")

dtr2results.rename(columns={"meter_reading":"meter_reading2"}, inplace=True)
dtr2results.head(5)


Unnamed: 0,row_id,meter_reading2
0,0,43.2743
1,1,15.2894
2,2,4.7916
3,3,335.6488
4,4,83.5454


In [3]:
mergedResults = pd.merge(dtr1results, dtr2results, on='row_id', how='left')
mergedResults.tail(5)

Unnamed: 0,row_id,meter_reading,meter_reading2
41697595,41697595,4.475,
41697596,41697596,5.3375,
41697597,41697597,10.775,
41697598,41697598,170.0938,
41697599,41697599,4.4625,


In [4]:
mergedResults.head(10)

Unnamed: 0,row_id,meter_reading,meter_reading2
0,0,37.5408,43.2743
1,1,14.1973,15.2894
2,2,1.6381,4.7916
3,3,336.502,335.6488
4,4,86.0026,83.5454
5,5,4.4366,4.3343
6,6,99.1077,99.1793
7,7,526.937,517.4833
8,8,886.418,947.6229
9,9,478.133,497.7912


In [5]:
mergedResults.drop("meter_reading", axis=1, inplace=True)
mergedResults.rename(columns={"meter_reading2":"meter_reading"}, inplace=True)
mergedResults.head(10)

Unnamed: 0,row_id,meter_reading
0,0,43.2743
1,1,15.2894
2,2,4.7916
3,3,335.6488
4,4,83.5454
5,5,4.3343
6,6,99.1793
7,7,517.4833
8,8,947.6229
9,9,497.7912


In [6]:
mergedResults.tail(5)

Unnamed: 0,row_id,meter_reading
41697595,41697595,
41697596,41697596,
41697597,41697597,
41697598,41697598,
41697599,41697599,


In [7]:
dtr1results.update(mergedResults)
print(dtr1results.shape)

(41697600, 2)


In [8]:
dtr1results.head(10)

Unnamed: 0,row_id,meter_reading
0,0,43.2743
1,1,15.2894
2,2,4.7916
3,3,335.6488
4,4,83.5454
5,5,4.3343
6,6,99.1793
7,7,517.4833
8,8,947.6229
9,9,497.7912


In [9]:
dtr1results.tail(10)

Unnamed: 0,row_id,meter_reading
41697590,41697590,297.875
41697591,41697591,298.0062
41697592,41697592,84.1
41697593,41697593,35.291
41697594,41697594,80.0625
41697595,41697595,4.475
41697596,41697596,5.3375
41697597,41697597,10.775
41697598,41697598,170.0938
41697599,41697599,4.4625


In [10]:
dtr1results.to_csv("results/randomForestRegressor02_merged.csv", index=False)

### Kaggle RMSLE score for the Random Forest Regressor, Version 2 = 1.35

---
# 4. Gradient Boosting Regressor

In [3]:
fullTraining = pd.read_csv("custom/FULL_hourMonthDayofweek.csv")

In [4]:
fullTraining = fullTraining[fullTraining.meter_reading != 0.0]
fullTraining = fullTraining[pd.notnull(fullTraining['air_temperature'])]
fullTraining.drop("timestamp", axis=1, inplace=True)
fullTraining.drop("site_id", axis=1, inplace=True)
fullTraining.drop("primary_use", axis=1, inplace=True)
fullTraining.drop("square_feet", axis=1, inplace=True)
fullTraining.drop("year_built", axis=1, inplace=True)
fullTraining.drop("floor_count", axis=1, inplace=True)
fullTraining.drop("cloud_coverage", axis=1, inplace=True)
fullTraining.drop("precip_depth_1_hr", axis=1, inplace=True)
fullTraining.drop("wind_direction", axis=1, inplace=True)
fullTraining.drop("wind_speed", axis=1, inplace=True)
fullTraining.drop("sea_level_pressure", axis=1, inplace=True)
fullTraining.drop("dew_temperature", axis=1, inplace=True)
fullTraining.fillna(0, inplace=True)
print(fullTraining.shape)

(18252676, 8)


In [5]:
fullTraining.head(5)

Unnamed: 0.1,Unnamed: 0,building_id,meter,meter_reading,air_temperature,hour,month,dayOfWeek
45,45,46,0,53.2397,25.0,00:00:00,Jan,4
72,72,74,0,43.0013,25.0,00:00:00,Jan,4
91,91,93,0,52.4206,25.0,00:00:00,Jan,4
123,123,20,0,91.9886,24.4,01:00:00,Jan,4
148,148,46,0,53.6492,24.4,01:00:00,Jan,4


In [6]:
buildingMeters = []

for row in fullTraining.itertuples(index=False):
    if (row.building_id, row.meter) not in buildingMeters:
        buildingMeters.append((row.building_id, row.meter))
    
print(len(buildingMeters))

2380


In [7]:
readyToTrain = pd.get_dummies(fullTraining, columns=['hour', 'month', 'dayOfWeek'])

In [8]:
readyToTrain.head(5)

Unnamed: 0.1,Unnamed: 0,building_id,meter,meter_reading,air_temperature,hour_00:00:00,hour_01:00:00,hour_02:00:00,hour_03:00:00,hour_04:00:00,...,month_Nov,month_Oct,month_Sep,dayOfWeek_0,dayOfWeek_1,dayOfWeek_2,dayOfWeek_3,dayOfWeek_4,dayOfWeek_5,dayOfWeek_6
45,45,46,0,53.2397,25.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
72,72,74,0,43.0013,25.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
91,91,93,0,52.4206,25.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
123,123,20,0,91.9886,24.4,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
148,148,46,0,53.6492,24.4,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
trainedModels = {}

counter = 0
oldPercent = 0.
for building_id, meter in buildingMeters:
    buildingMeterData = readyToTrain[(readyToTrain['building_id'] == building_id) & (readyToTrain['meter'] == meter)]
    # TRAIN A MODEL USING buildingMeterData
    y = buildingMeterData['meter_reading']
    X = buildingMeterData[[
        'air_temperature',
        'hour_00:00:00',
        'hour_01:00:00',
        'hour_02:00:00',
        'hour_03:00:00',
        'hour_04:00:00',
        'hour_05:00:00',
        'hour_06:00:00',
        'hour_07:00:00',
        'hour_08:00:00',
        'hour_09:00:00',
        'hour_10:00:00',
        'hour_11:00:00',
        'hour_12:00:00',
        'hour_13:00:00',
        'hour_14:00:00',
        'hour_15:00:00',
        'hour_16:00:00',
        'hour_17:00:00',
        'hour_18:00:00',
        'hour_19:00:00',
        'hour_20:00:00',
        'hour_21:00:00',
        'hour_22:00:00',
        'hour_23:00:00',
        'month_Jan',
        'month_Feb',
        'month_Mar',
        'month_Apr',
        'month_May',
        'month_Jun',
        'month_Jul',
        'month_Aug',
        'month_Sep',
        'month_Oct',
        'month_Nov',
        'month_Dec',
        'dayOfWeek_0',
        'dayOfWeek_1',
        'dayOfWeek_2',
        'dayOfWeek_3',
        'dayOfWeek_4',
        'dayOfWeek_5',
        'dayOfWeek_6'
    ]]
    gb = GradientBoostingRegressor(n_estimators = 100)
    trainedModels[(building_id, meter)] = gb.fit(X, y)
    counter += 1
    percent = 100. * counter / 2380
    if percent - oldPercent > 1:
        print('{0:.2f}'.format(percent))
        oldPercent = percent

print(len(trainedModels))

1.01
2.02
3.03
4.03
5.04
6.05
7.06
8.07
9.08
10.08
11.09
12.10
13.11
14.12
15.13
16.13
17.14
18.15
19.16
20.17
21.18
22.18
23.19
24.20
25.21
26.22
27.23
28.24
29.24
30.25
31.26
32.27
33.28
34.29
35.29
36.30
37.31
38.32
39.33
40.34
41.34
42.35
43.36
44.37
45.38
46.39
47.39
48.40
49.41
50.42
51.43
52.44
53.45
54.45
55.46
56.47
57.48
58.49
59.50
60.50
61.51
62.52
63.53
64.54
65.55
66.55
67.56
68.57
69.58
70.59
71.60
72.61
73.61
74.62
75.63
76.64
77.65
78.66
79.66
80.67
81.68
82.69
83.70
84.71
85.71
86.72
87.73
88.74
89.75
90.76
91.76
92.77
93.78
94.79
95.80
96.81
97.82
98.82
99.83
2380


In [11]:
del fullTraining
del readyToTrain

In [12]:
energyTest = pd.read_csv("energy/test.csv")
print(energyTest.shape)

bldgData = pd.read_csv("energy/building_metadata.csv")
bldgData.drop("primary_use", axis=1, inplace=True)
bldgData.drop("square_feet", axis=1, inplace=True)
bldgData.drop("year_built", axis=1, inplace=True)
bldgData.drop("floor_count", axis=1, inplace=True)

weatherTest = pd.read_csv("energy/weather_test.csv")
weatherTest.drop("cloud_coverage", axis=1, inplace=True)
weatherTest.drop("precip_depth_1_hr", axis=1, inplace=True)
weatherTest.drop("wind_direction", axis=1, inplace=True)
weatherTest.drop("wind_speed", axis=1, inplace=True)
weatherTest.drop("sea_level_pressure", axis=1, inplace=True)
weatherTest.drop("dew_temperature", axis=1, inplace=True)

testJoinBldg = pd.merge(energyTest, bldgData, on='building_id')
fullTest = pd.merge(testJoinBldg, weatherTest, on=['site_id', 'timestamp'])

del energyTest
del testJoinBldg
del bldgData
del weatherTest

print(fullTest.shape)

(41697600, 4)
(41498571, 6)


In [13]:
print(fullTest.shape)
fullTest = fullTest[pd.notnull(fullTest['air_temperature'])]
fullTest.drop("site_id", axis=1, inplace=True)
fullTest.fillna(0, inplace=True)
print(fullTest.shape)

(41498571, 6)
(41475699, 5)


In [14]:
# Note: Create a subdirectory named "results" before running
gbResults = "row_id,meter_reading\n"

counter = 0
oldPercent = 0.
for row in fullTest.itertuples(index=False):
    row_id = str(row.row_id)
    building_id = row.building_id
    meter = row.meter
    air_temperature = row.air_temperature
    dateStr, hour = row.timestamp.split(" ")
    
    date = datetime.datetime.strptime(dateStr, '%Y-%m-%d')
    
    month = date.strftime('%b')
    dayOfWeek = str(date.weekday())
    
    features = pd.Series(0, index=[
        'air_temperature',
        'hour_00:00:00',
        'hour_01:00:00',
        'hour_02:00:00',
        'hour_03:00:00',
        'hour_04:00:00',
        'hour_05:00:00',
        'hour_06:00:00',
        'hour_07:00:00',
        'hour_08:00:00',
        'hour_09:00:00',
        'hour_10:00:00',
        'hour_11:00:00',
        'hour_12:00:00',
        'hour_13:00:00',
        'hour_14:00:00',
        'hour_15:00:00',
        'hour_16:00:00',
        'hour_17:00:00',
        'hour_18:00:00',
        'hour_19:00:00',
        'hour_20:00:00',
        'hour_21:00:00',
        'hour_22:00:00',
        'hour_23:00:00',
        'month_Jan',
        'month_Feb',
        'month_Mar',
        'month_Apr',
        'month_May',
        'month_Jun',
        'month_Jul',
        'month_Aug',
        'month_Sep',
        'month_Oct',
        'month_Nov',
        'month_Dec',
        'dayOfWeek_0',
        'dayOfWeek_1',
        'dayOfWeek_2',
        'dayOfWeek_3',
        'dayOfWeek_4',
        'dayOfWeek_5',
        'dayOfWeek_6'
    ])
    
    features['air_temperature'] = air_temperature
    features['hour_' + hour] = 1
    features['month_' + month] = 1
    features['dayOfWeek_' + dayOfWeek] = 1
    
    pred = trainedModels[(building_id, meter)].predict([features])[0]

    gbResults += row_id + "," + '{0:.4f}'.format(pred) + "\n"
    
    counter += 1
    percent = 100. * counter / 41475699
    if percent - oldPercent > 1:
        print('{0:.2f}'.format(percent))
        oldPercent = percent
    
outFile = open("results/gradientBoostingRegressor.csv","w")
outFile.write(gbResults)
outFile.close()

1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
25.00
26.00
27.00
28.00
29.00
30.00
31.00
32.00
33.00
34.00
35.00
36.00
37.00
38.00
39.00
40.00
41.00
42.00
43.00
44.00
45.00
46.00
47.00
48.00
49.00
50.00
51.00
52.00
53.00
54.00
55.00
56.00
57.00
58.00
59.00
60.00
61.00
62.00
63.00
64.00
65.00
66.00
67.00
68.00
69.00
70.00
71.00
72.00
73.00
74.00
75.00
76.00
77.00
78.00
79.00
80.00
81.00
82.00
83.00
84.00
85.00
86.00
87.00
88.00
89.00
90.00
91.00
92.00
93.00
94.00
95.00
96.00
97.00
98.00
99.00


In [11]:
outFile.write(rfResults)
outFile.close()

In [16]:
# Merge DTR2 results with DTR1 (to fill in missing predictions)
dtr1results = pd.read_csv("results/decisionTreeRegressor02_merged.csv")
dtr2results = pd.read_csv("results/gradientBoostingRegressor.csv")

dtr2results.rename(columns={"meter_reading":"meter_reading2"}, inplace=True)
dtr2results.head(5)


Unnamed: 0,row_id,meter_reading2
0,0,37.9178
1,1,26.024
2,2,7.5532
3,3,331.3568
4,4,536.3626


In [17]:
mergedResults = pd.merge(dtr1results, dtr2results, on='row_id', how='left')
mergedResults.tail(5)

Unnamed: 0,row_id,meter_reading,meter_reading2
41697595,41697595,4.475,
41697596,41697596,5.3375,
41697597,41697597,10.775,
41697598,41697598,170.0938,
41697599,41697599,4.4625,


In [18]:
mergedResults.head(10)

Unnamed: 0,row_id,meter_reading,meter_reading2
0,0,37.5408,37.9178
1,1,14.1973,26.024
2,2,1.6381,7.5532
3,3,336.502,331.3568
4,4,86.0026,536.3626
5,5,4.4366,13.981
6,6,99.1077,96.7262
7,7,526.937,481.9263
8,8,886.418,1128.7026
9,9,478.133,460.5976


In [19]:
mergedResults.drop("meter_reading", axis=1, inplace=True)
mergedResults.rename(columns={"meter_reading2":"meter_reading"}, inplace=True)
mergedResults.head(10)

Unnamed: 0,row_id,meter_reading
0,0,37.9178
1,1,26.024
2,2,7.5532
3,3,331.3568
4,4,536.3626
5,5,13.981
6,6,96.7262
7,7,481.9263
8,8,1128.7026
9,9,460.5976


In [20]:
mergedResults.tail(5)

Unnamed: 0,row_id,meter_reading
41697595,41697595,
41697596,41697596,
41697597,41697597,
41697598,41697598,
41697599,41697599,


In [21]:
dtr1results.update(mergedResults)
print(dtr1results.shape)

(41697600, 2)


In [22]:
dtr1results.head(10)

Unnamed: 0,row_id,meter_reading
0,0,37.9178
1,1,26.024
2,2,7.5532
3,3,331.3568
4,4,536.3626
5,5,13.981
6,6,96.7262
7,7,481.9263
8,8,1128.7026
9,9,460.5976


In [23]:
dtr1results.tail(10)

Unnamed: 0,row_id,meter_reading
41697590,41697590,297.875
41697591,41697591,298.0062
41697592,41697592,84.1
41697593,41697593,35.291
41697594,41697594,80.0625
41697595,41697595,4.475
41697596,41697596,5.3375
41697597,41697597,10.775
41697598,41697598,170.0938
41697599,41697599,4.4625


In [24]:
dtr1results.to_csv("results/gradientBoostingRegressor_merged.csv", index=False)

### Kaggle RMSLE score for the Gradient Boosting Regressor, = 1.38

---
# 5. Averaging

In [2]:
dtr1results = pd.read_csv("results/decisionTreeRegressor02_merged.csv")
dtr2results = pd.read_csv("results/decisionTree01.csv")

dtr2results.rename(columns={"meter_reading":"meter_reading2"}, inplace=True)
dtr2results.head(5)


Unnamed: 0,row_id,meter_reading2
0,0,54.832333
1,1,19.3847
2,2,2.047667
3,3,113.418667
4,4,472.33


In [4]:
mergedResults = pd.merge(dtr1results, dtr2results, on='row_id', how='left')
mergedResults.tail(5)

Unnamed: 0,row_id,meter_reading,meter_reading2
41697595,41697595,4.475,6.290476
41697596,41697596,5.3375,5.140476
41697597,41697597,10.775,6.105952
41697598,41697598,170.0938,164.836905
41697599,41697599,4.4625,4.365476


In [5]:
mergedResults.head(10)

Unnamed: 0,row_id,meter_reading,meter_reading2
0,0,37.5408,54.832333
1,1,14.1973,19.3847
2,2,1.6381,2.047667
3,3,336.502,113.418667
4,4,86.0026,472.33
5,5,4.4366,5.801767
6,6,99.1077,45.322
7,7,526.937,163.928333
8,8,886.418,1055.26
9,9,478.133,146.750333


In [6]:
mergedResults['average'] = mergedResults[['meter_reading','meter_reading2']].mean(axis=1)
mergedResults.head(10)

Unnamed: 0,row_id,meter_reading,meter_reading2,average
0,0,37.5408,54.832333,46.186567
1,1,14.1973,19.3847,16.791
2,2,1.6381,2.047667,1.842883
3,3,336.502,113.418667,224.960333
4,4,86.0026,472.33,279.1663
5,5,4.4366,5.801767,5.119183
6,6,99.1077,45.322,72.21485
7,7,526.937,163.928333,345.432667
8,8,886.418,1055.26,970.839
9,9,478.133,146.750333,312.441667


In [7]:
mergedResults.drop("meter_reading", axis=1, inplace=True)
mergedResults.drop("meter_reading2", axis=1, inplace=True)
mergedResults.rename(columns={"average":"meter_reading"}, inplace=True)
mergedResults.head(10)

Unnamed: 0,row_id,meter_reading
0,0,46.186567
1,1,16.791
2,2,1.842883
3,3,224.960333
4,4,279.1663
5,5,5.119183
6,6,72.21485
7,7,345.432667
8,8,970.839
9,9,312.441667


In [8]:
mergedResults.tail(5)

Unnamed: 0,row_id,meter_reading
41697595,41697595,5.382738
41697596,41697596,5.238988
41697597,41697597,8.440476
41697598,41697598,167.465352
41697599,41697599,4.413988


In [9]:
mergedResults.to_csv("results/average_of_universal_and_site_specific.csv", index=False)

### Kaggle RMSLE score for the Averaged results = 1.29

---
# 6. Boosting

In [25]:
results = pd.read_csv("results/decisionTreeRegressor02_merged.csv")
results.head(5)

Unnamed: 0,row_id,meter_reading
0,0,37.5408
1,1,14.1973
2,2,1.6381
3,3,336.502
4,4,86.0026


In [26]:
results['meter_reading'] = results['meter_reading'] * 1.1
results.head(5)

Unnamed: 0,row_id,meter_reading
0,0,41.29488
1,1,15.61703
2,2,1.80191
3,3,370.1522
4,4,94.60286


In [27]:
results.to_csv("results/decisionTreeRegressor02_merged_BOOSTED.csv", index=False)

### Kaggle RMSLE score for the Boosted results = 1.36