# Capstone Project #2 Part 3: Pre-processing & Training Data Development

In this section, we'll pre-process the data so that it's ready for modeling.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

from sklearn.metrics import r2_score, mean_squared_log_error, mean_absolute_error
import numpy as np
from tqdm import tqdm
from scipy import stats
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from datetime import datetime
from datetime import timedelta
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import pmdarima as pm

## 1. Load data

In [2]:
# Hourly electricity meter readings, indexed by their parsed timestamp.
elec_path = 'electricity_cleaned.csv'
elec = pd.read_csv(elec_path, parse_dates=['timestamp'])
elec = elec.set_index('timestamp')

In [3]:
# Preview the first rows of the hourly meter readings.
elec.head()

Unnamed: 0_level_0,building_id,meter_reading
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-05-20 18:00:00,3,248.452
2016-05-20 19:00:00,3,336.844
2016-05-20 20:00:00,3,358.685
2016-05-20 21:00:00,3,377.456
2016-05-20 22:00:00,3,373.702


In [4]:
# Per-building metadata (site, primary use, size, ...), one row per building_id.
meta = pd.read_csv('building_metadata.csv')

In [5]:
# Cleaned weather observations, indexed by their parsed timestamp.
weather_path = 'weather_cleaned.csv'
weather = pd.read_csv(weather_path, parse_dates=['timestamp'])
weather = weather.set_index('timestamp')

## 2. Convert hourly data to daily average data

For this project, we select building no. 1126 as the target building.

In [6]:
# building_id of the target building; the cells below filter on this value.
bldg_id = 1126

In [7]:
# Show the metadata row that describes the target building.
meta.loc[meta['building_id'] == bldg_id]

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
1126,13,1126,Education,72332,,


In [8]:
# Keep only the target building's readings; the id column is constant
# after filtering, so it is dropped.
is_target = elec['building_id'] == bldg_id
df = elec.loc[is_target].drop(columns='building_id')

In [9]:
# Average the hourly meter readings within each calendar day.
daily = df['meter_reading'].resample('D').mean()

In [10]:
# Preview the first rows of the daily-mean meter readings.
daily.head()

timestamp
2016-01-01    179.822913
2016-01-02    175.222208
2016-01-03    174.820542
2016-01-04    194.201375
2016-01-05    203.131417
Freq: D, Name: meter_reading, dtype: float64

## 3. Add weather feature

In [11]:
# Look up the target building's site in the metadata instead of hard-coding it
# (the metadata lists site 13 for building 1126), so the weather stays in sync
# with bldg_id if the target building is ever changed.
site_id = meta.loc[meta['building_id'] == bldg_id, 'site_id'].iloc[0]

# Daily mean air and dew temperature for that site. The two columns are picked
# with a list: the bare `['a', 'b']` tuple form was removed for groupby in
# pandas 2.0, and a list is the documented way to select several columns.
w = (
    weather.loc[weather['site_id'] == site_id, ['air_temperature', 'dew_temperature']]
    .resample('D')
    .mean()
)

In [12]:
# Attach the daily weather columns to the daily readings (left join on the
# date index) and discard any day with a missing value.
daily_w = daily.to_frame().join(w, how='left').dropna()

In [13]:
# Preview the joined meter-reading / weather frame.
daily_w.head()

Unnamed: 0_level_0,meter_reading,air_temperature,dew_temperature
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-01,179.822913,-6.229167,-10.283333
2016-01-02,175.222208,-5.8375,-9.945833
2016-01-03,174.820542,-3.008333,-7.595833
2016-01-04,194.201375,-7.229167,-10.004167
2016-01-05,203.131417,-4.6375,-8.208333


## 4. Add holiday feature

In [14]:
# List the US federal holidays that fall inside the 2016 study period.
federal_calendar = calendar()
holidays = federal_calendar.holidays(start='2016-1-1', end='2016-12-31')
print(holidays)

DatetimeIndex(['2016-01-01', '2016-01-18', '2016-02-15', '2016-05-30',
               '2016-07-04', '2016-09-05', '2016-10-10', '2016-11-11',
               '2016-11-24', '2016-12-26'],
              dtype='datetime64[ns]', freq=None)


The holidays returned by the calendar function include some days that are not widely applicable to all businesses. As a result, we manually create a holiday schedule.

In [15]:
# Hand-picked 2016 holiday schedule (ISO date strings), replacing the federal list.
holidays = [
    '2016-01-01',  # New Year's Day
    '2016-05-30',  # Memorial Day
    '2016-07-04',  # Independence Day
    '2016-09-05',  # Labor Day
    '2016-11-24',  # Thanksgiving
    '2016-11-25',  # day after Thanksgiving
    '2016-12-26',  # Christmas Day (observed)
]

In [16]:
# Flag each day that appears in the manual holiday schedule (1 = holiday, 0 = not).
# The holiday strings are converted to timestamps first: comparing a datetime
# column against raw strings with `isin` is deprecated (FutureWarning since
# pandas 2.2) and is slated to stop matching.
holiday_dates = pd.to_datetime(holidays)
# Reduce every index entry to its calendar date so only the day is compared.
day_stamps = pd.to_datetime(daily_w.index.date)
# `isin` yields a boolean array aligned positionally with the rows; no temporary
# column is added, so re-running this cell leaves the frame in the same state.
daily_w['holiday'] = day_stamps.isin(holiday_dates).astype('int64')

In [17]:
# Preview the frame again to check the newly added `holiday` column.
daily_w.head()

Unnamed: 0_level_0,meter_reading,air_temperature,dew_temperature,holiday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-01,179.822913,-6.229167,-10.283333,1
2016-01-02,175.222208,-5.8375,-9.945833,0
2016-01-03,174.820542,-3.008333,-7.595833,0
2016-01-04,194.201375,-7.229167,-10.004167,0
2016-01-05,203.131417,-4.6375,-8.208333,0


In [18]:
# Persist the pre-processed frame; the timestamp index is written as the
# first column (pandas' default for to_csv).
daily_w.to_csv('preprocessed_data.csv')