# Project 2: Cleaning the ASHRAE datasets

##  Dealing with missing data
by: Alissa Stover, Sophia Skowronski, Ying Hua

## Importing data

In [1]:
''' importing basic data analysis packages'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os,random, math, psutil, pickle 
import missingno as msno

''' For ML'''
from sklearn import metrics, svm
from sklearn.linear_model  import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import preprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

ModuleNotFoundError: No module named 'missingno'

# --------------------------------------------------------------------------------------------------------------

# PART 1: Imputing missing data in training dataset

In [None]:
'''Reading in data'''
building_df = pd.read_csv('building_metadata.csv')
train_df = pd.read_csv('train.csv')

##  Initial filtering and merging

In [None]:
# only focusing on electricity meter:
train_df= train_df.loc[train_df['meter']==0]

# merging with buidling data
train_df = train_df.merge(building_df, on=['building_id'], how='left')

# Converting timestamp to right data type
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])


## Add `time_index`, `day_of_week`, and `hour_of_day` variables to dataframe

In [None]:
# adding time index varialbe that counts difference in time vs. beginning date 
train_df['time_index']= train_df['timestamp']- train_df['timestamp'].min()

# Coverting time difference into hours
train_df['time_index']= train_df['time_index'].apply (lambda x: x.days*24+x.seconds//3600) 
train_df['time_index'] = train_df ['time_index'].astype(int)

# Adding day of the week and hour of day
train_df['day_of_week'] = train_df['timestamp'].dt.dayofweek
train_df['hour_of_day'] = train_df['time_index'] % 24
train_df['hour_of_day'] = train_df['hour_of_day'].astype(int)

## Taking a look at missing data - Explanatory variable `meter_reading`

In [None]:
# Examining how much data is missing
missing_count = len(train_df.loc[train_df['meter_reading'].isnull()])
missing_percent = missing_count / len(train_df)
print ("{0:.0%}".format(missing_percent), "of data is missing.")

# Exmaing how much data has 0 reading
zero_value = len(train_df.loc[train_df['meter_reading'] ==0])
zero_percent = zero_value / len(train_df)
print ("{0:.0%}".format(zero_percent), "of meter readings are 0. ")

> We note that 4% of the meter readings are zero.  We theorize that some of the 0 values are not actual meter readings but some sort of erroneous mistake.  

> One of the things we have noticed is that sometimes a 0 reading will be followed by a huge reading and we suspect those are "catch up" reading when one misses a meter reading.  We note that while 4% is not a big number (especially considering some of these data could actually be 0 readings, we decided that we should take two approaches.  One is the simple appraoch to get rid of these 0 readings and treat all as mistakes.  The other is to use ML techniques to impute these missing number. 

> We plan on using both set when we come up with the model to forecast meter reading and see which set perform better. 

## The simple approach - treating all 0 readings as erroneous and deleting all

To more accurately impute missing data, we first need a set of clean meter readings that will be representative (i.e. not erroneous.)  We begin our process by hilighting which data could be erroneous. 

In [None]:
# adding index as a column 
train_df['index'] = train_df.index 

In [None]:
# for data quality purpose, we decided to also delete the spike reading immediately post the 0 readings

# we first define spike reading to be readings that are >2x standard deviation away from mean for each buildign

building_meter_avg = pd.DataFrame(train_df.groupby('building_id')['meter_reading'].mean())
building_meter_std = pd.DataFrame(train_df.groupby('building_id')['meter_reading'].std())
building_meter_outlier = building_meter_avg.merge(building_meter_std, on='building_id')
building_meter_outlier= building_meter_outlier.rename(columns={"meter_reading_x": "avg", "meter_reading_y": "std"})
building_meter_outlier['outlier'] = building_meter_outlier['avg']+2*building_meter_outlier['std']

# Adding the outlier into the original dataframe
train_df= train_df.merge(building_meter_outlier, on='building_id', how='left')

In [None]:
# Determing spike readings that are immediately followed by 0 readings 

spike_index = []
grouped = train_df.groupby('building_id')
for key, group in grouped:
    group.sort_values(by = 'time_index')
    spike_index += list(group.loc[((group.meter_reading > group.outlier) & \
                                   (group.meter_reading.shift(1) == 0))].index)

In [None]:
# filtering out 0s and spike readings
clean_data_naive = train_df.loc[(train_df.meter_reading != 0 & \
                                 ~train_df.index.isin(spike_index))]

In [None]:
print ('With this simple way, we are capturing', "{0:.0%}".format(len(clean_data_naive)/ \
                                                                 len(train_df)), 'of the meter reading data.')

In [None]:
clean_data_naive.isna().sum()

## The advanced approach 
> With this approach, we want to come up with a ML model to impute missing meter reading data.  To do so, we first need to distinguish which among the 0 readings are missing data and which are actual 0 readings.  We define such as 0 readings that are not followed by spikes and does not last longer than 7 consecutive days. 

In [None]:
# We first separate out all the zero readings as well as spikes, which is the naive clean dataset we calcualted above. 
# since we suspect these spikes are erroneous, we will put them in the to impute category
clean_data_advanced = clean_data_naive
data_to_impute = train_df.loc[~train_df.index.isin (clean_data_advanced.index)]



In [None]:
data_to_impute

In [None]:
# We then add back 0 values that are likely to be real zero readings- i.e. those that are not followed by spikes and does not last 7 consecutive days

grouped = train_df.groupby('building_id')
correct_zero_readings_index = []
for key, group in grouped:
    group = group.sort_values(by = 'time_index')
    correct_zero_readings_index += list(group.loc[(((group.meter_reading == 0) & \
                                                   (group.meter_reading.shift(-1)<=group.outlier)) & \
                                                   (group.meter_reading.groupby((group.meter_reading != \
                                                                                 group.meter_reading.shift()).cumsum()).\
                                                                                 transform('count').lt(8)))].index)
    

In [None]:
# we add these back to our clean data and exclude them from data to impute
clean_data_advanced = train_df.loc[((train_df.index.isin(list(clean_data_advanced.index))) | \
                                  (train_df.index.isin(correct_zero_readings_index)))]
data_to_impute = data_to_impute.drop (correct_zero_readings_index)

> From our visual charts, we can also see that in site 0, there are a large chunks of data that are missing at the beginning of the period.  There are some small readings here and there during this period at certain buildings but for the most part, the readings were non existent. We decided not to include these data. 

In [None]:
# Figuring out which part of the site 0 history to ignore
site_zero_issues = train_df.loc[train_df.site_id == 0].groupby('time_index')['meter_reading'].sum()
max_dff = (site_zero_issues - site_zero_issues.shift()).max()
inflection_time = site_zero_issues[site_zero_issues >=  max_dff].index[0]
size_zero_missing_data_index = list(clean_data_advanced.loc[((clean_data_advanced.site_id ==0) & \
                                                  (clean_data_advanced.time_index < inflection_time))].index)

In [None]:
# moving these data from clean_data_advanced to data_to_impute
clean_data_advanced = clean_data_advanced.drop (size_zero_missing_data_index)
data_to_impute =  train_df.loc[((train_df.index.isin(list(data_to_impute.index))) | \
                                  (train_df.index.isin(size_zero_missing_data_index)))]

> We now have a clean dataset (clean_data_advanced) to run ML algorithmn to impute missing data.  

In [None]:
print ('We are using', "{0:.0%}".format(len(clean_data_advanced)/ len(train_df)), \
       'of the meter reading data as clean data to run ML to impute', \
       "{0:0}".format(len(data_to_impute)), ', or', "{0:.0%}".format(len(data_to_impute)/ len(train_df)), "of the data." )

### Using ML to impute missing data

We decided to try different ML methods to impute missing data (in our case 0 readings).  The three mehtods we want to try is KNN, linear regression and naive bayes. 

Before we run different methodology and compare results, we first want to split our clean data into training and test set and define features that will be used to run the test. Since meter_reading is our explanatory variable that we want to use more features later to predict, we want to keep this part of feature engineering simpler.  

We picked 4 variables as features - site id, buidling id, time of the day and day of the week. The first 2 variables we think will indirectly give us some information about buildng specific as well as weather related inforamtion as well. 

In [None]:
clean_data_advanced_x = clean_data_advanced.loc[:,clean_data_advanced.columns.isin(['building_id', 'site_id','hour_of_day', 'day_of_week'])]
clean_data_advanced_y = clean_data_advanced.loc[:,clean_data_advanced.columns.isin(['meter_reading'])].values

In [None]:
# converting & encoding lable to avoid valueerror
# lab_enc = preprocessing.LabelEncoder()
# training_scores_encoded = lab_enc.fit_transform(clean_data_advanced_y)
training_scores_encoded= np.ravel(clean_data_advanced_y).astype('int')

In [None]:
# spliting the clean dataset into 70/30 for training/ test
X_train, X_test, y_train, y_test = train_test_split(clean_data_advanced_x, training_scores_encoded, test_size=0.3, random_state=1)

> METHOD: Liner regression

In [None]:
# Training model using linear regression
regressor = LinearRegression()  
regressor.fit(X_train, y_train)

In [None]:
# predicting y 
y_pred = regressor.predict(X_test)
r2_score(y_test, y_pred)

## Advanced approach, attempt 2

> Given a extremely low success score (especially even on the training set) we suspected that the model is flawed.  One of the issues we see is that we have essentially all categorical variables even though they are in numeric values.  So we decide to experiementing on reconfiguring these variables into more numeric values.  The way we do this is by taking the average that fits each catergory and use that train the dataset. 

In [None]:
# Creating a copy of the x, y variables to test our new method
clean_data_advanced_new_x = clean_data_advanced.loc[:,clean_data_advanced.columns.isin(['building_id', 'site_id','meter_reading','hour_of_day', 'day_of_week'])].copy()
clean_data_advanced_y = clean_data_advanced.loc[:,clean_data_advanced.columns.isin(['meter_reading'])].copy()
clean_data_advanced_y= np.ravel(clean_data_advanced_y).astype('int')

# lab_enc = preprocessing.LabelEncoder()
# training_scores_encoded = lab_enc.fit_transform(clean_data_advanced_y)


In [None]:
# spliting the clean dataset into 70/30 for training/ test
X_train, X_test, y_train, y_test = train_test_split(clean_data_advanced_new_x, clean_data_advanced_y, test_size=0.3, random_state=1)

In [None]:
# adding in avearage meter reading for each variable for training_set
X_train = X_train.copy()
X_train ['avg_building'] = X_train.groupby('building_id')['meter_reading'].transform('mean')
X_train ['avg_site'] = X_train.groupby('site_id')['meter_reading'].transform('mean')
X_train ['avg_dow'] = X_train.groupby('day_of_week')['meter_reading'].transform('mean')
X_train ['avg_hod'] = X_train.groupby('hour_of_day')['meter_reading'].transform('mean')

In [None]:
#creating a dictionary

grouped = X_train.groupby('building_id')
avg_building_dict= {key:group['avg_building'].mean() for key, group in grouped}

grouped = X_train.groupby('site_id')
avg_site_dict= {key:group['avg_site'].mean() for key, group in grouped}

grouped = X_train.groupby('day_of_week')
avg_dow_dict= {key:group['avg_dow'].mean() for key, group in grouped}

grouped = X_train.groupby('hour_of_day')
avg_hod_dict= {key:group['avg_hod'].mean() for key, group in grouped}


In [None]:
# adding in avearage meter reading for each variable for testing_set
X_test = X_test.copy()
X_test ['avg_building'] = X_test['building_id'].map(avg_building_dict)
X_test ['avg_site'] = X_test['site_id'].map(avg_site_dict)
X_test ['avg_dow'] = X_test['day_of_week'].map(avg_dow_dict)
X_test ['avg_hod'] = X_test['hour_of_day'].map(avg_hod_dict)

In [None]:
# dropping categorical columns
X_train = X_train.drop(columns =['building_id','meter_reading','site_id','day_of_week', 'hour_of_day'])
X_test = X_test.drop(columns=['building_id','meter_reading','site_id','day_of_week', 'hour_of_day'])

> **METHOD: Linear_ regression- ATTEMPT 2**

In [None]:
# Training model using linear regression
regressor = LinearRegression()  
regressor.fit(X_train, y_train)

In [None]:
# predicting y 
y_pred = regressor.predict(X_test)

In [None]:
# evaluating results 
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df.head(5)

In [None]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
r2_score(y_test, y_pred)

## Using linear regression to impute missing value
> With a 87% R-square, we feel pretty good about our model and will go ahead and use the model to forecast the missing data

In [None]:
data_to_impute.head

In [None]:
data_to_impute_cal = data_to_impute.loc[:,clean_data_advanced.columns.isin(['building_id', 'site_id','hour_of_day', 'day_of_week'])]

In [None]:
# Translating categorical data
data_to_impute_cal = data_to_impute_cal.copy()
data_to_impute_cal ['avg_building'] = data_to_impute_cal['building_id'].map(avg_building_dict)
data_to_impute_cal ['avg_site'] = data_to_impute_cal['site_id'].map(avg_site_dict)
data_to_impute_cal ['avg_dow'] = data_to_impute_cal['day_of_week'].map(avg_dow_dict)
data_to_impute_cal ['avg_hod'] = data_to_impute_cal['hour_of_day'].map(avg_hod_dict)

In [None]:
# dropping categorical columns
data_to_impute_cal = data_to_impute_cal.drop(columns =['building_id','site_id','day_of_week', 'hour_of_day'])

In [None]:
# predicting data_to_impute
data_to_impute_cal ['meter_reading'] = regressor.predict(data_to_impute_cal)

In [None]:
full_meter_reading = pd.DataFrame (clean_data_advanced['meter_reading'].append(data_to_impute_cal['meter_reading']))
full_meter_reading = full_meter_reading.sort_index()

In [None]:
train_df_imputed = train_df.copy()
train_df_imputed['meter_reading'] = full_meter_reading['meter_reading']

## Taking a look at missing data - Independent variables

In [None]:
missing_data = pd.DataFrame(train_df_imputed.isna().sum())
missing_data.columns = ['missing_count']
missing_data['missing_percent'] = missing_data['missing_count']/ len(train_df)

In [None]:
missing_data

>  The two missing data are year_built and floor_count. We note that both of them have fairly high percent of missing data.  While we will use ML techniques to impute these missing data, we will be more inclined to use other non-missing data if possible. 

### Imputing year_built variable 

In [None]:
train_df_imputed.head(5)

In [None]:
# Are there any buildings that are not missing year_built consistently?
grouped = train_df_imputed.groupby('building_id')
inconsistent_index = []
for key, group in grouped:
    if group['year_built'].isna().count() not in [len(group), 0]:
        inconsistent_index += [key]
    


In [None]:
# We confirm no building is missing partial data
len(inconsistent_index)

In [None]:
# we then separate the missing data form non-missing data 
yb_missing_data = train_df_imputed.loc[train_df_imputed['year_built'].isna()]
yb_clean_data = train_df_imputed.loc[~train_df_imputed['year_built'].isna()]

> For ML algorithm, we will split the non-missing data to 80/20 for train/test.  We will pick the followingvariable as features: meter_reading, day_of_week, hour_of_day, square_feet, primary_use, site_id

In [None]:
# Setting x-variables 
yb_clean_data_x = yb_clean_data.loc[:, yb_clean_data.columns.isin(['meter_reading', 'day_of_week', 'hour_of_day',\
                                                  'squre_feet', 'primary_use', 'site_id'])]
# Setting y-variables
yb_clean_data_y = yb_clean_data.loc[:, yb_clean_data.columns.isin(['year_built'])]

In [None]:
# converting primary_use into a numeric varialbe
lab_enc = preprocessing.LabelEncoder()
yb_clean_data_x = yb_clean_data_x.copy()
yb_clean_data_x['primary_use'] = lab_enc.fit_transform(yb_clean_data_x['primary_use'])

In [None]:
# splitting dataset into 70/30 for train/test
X_train, X_test, y_train, y_test = train_test_split(yb_clean_data_x, yb_clean_data_y, test_size=0.3, random_state=1)
y_train = np.ravel(y_train)

> ML Method: KNN

In [None]:
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 5)
# Fit the classifier to the data
knn.fit(X_train,y_train)


In [None]:
# Predicting on test data
y_predict = knn.predict(X_test)

In [None]:
#check accuracy of our model on the test data
print("Accuracy:",metrics.accuracy_score(y_test, y_predict))

> Linear regression

In [None]:
# Training model using linear regression
regressor = LinearRegression()  
regressor.fit(X_train, y_train)

In [None]:
# predicting y 
y_pred = regressor.predict(X_test)

In [None]:
r2_score(y_test, y_pred)

 > Naive Bayes


In [None]:
gnb = GaussianNB()
# Train classifier
gnb.fit(X_train, y_train)

In [None]:
# Predict values

y_pred = gnb.predict(X_test)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

> Of these methods, KNN gave us the best results.  We now explore accuracy with different n_neighbor


In [None]:
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 15)
# Fit the classifier to the data
knn.fit(X_train,y_train)

# Predicting on test data
y_predict = knn.predict(X_test)

#check accuracy of our model on the test data
print("Accuracy:",metrics.accuracy_score(y_test, y_predict))

> We think using KNN and 15 n-neighbor values gives us highest  accuracy. We will use this to impute missing year_built data

In [None]:
# predicting year_built data in  missing_data 
yb_missing_data_x= yb_missing_data.loc[:,yb_missing_data.columns.isin(['meter_reading', 'day_of_week', 'hour_of_day',\
                                                  'squre_feet', 'primary_use', 'site_id'])]
yb_missing_data_x = yb_missing_data_x.copy()
yb_missing_data_x['primary_use'] = lab_enc.fit_transform(yb_missing_data_x['primary_use'])
yb_missing_data = yb_missing_data.copy()
yb_missing_data ['year_built'] = knn.predict(yb_missing_data_x)

In [None]:
full_yb = pd.DataFrame (yb_clean_data['year_built'].append(yb_missing_data['year_built']))
full_yb = full_yb.sort_index()

In [None]:
train_df_imputed = train_df_imputed.copy()
train_df_imputed['year_built'] = full_yb['year_built']

### Imputing floor_count variable 

In [None]:
# we then separate the missing data form non-missing data 
fc_missing_data = train_df_imputed.loc[train_df_imputed['floor_count'].isna()]
fc_clean_data = train_df_imputed.loc[~train_df_imputed['floor_count'].isna()]

> For ML algorithm, we will split the non-missing data to 70/30 for train/test.  We will pick the followingvariable as features: meter_reading, day_of_week, hour_of_day, square_feet, primary_use, site_id, year_built

In [None]:
# Setting x-variables 
fc_clean_data_x = fc_clean_data.loc[:, fc_clean_data.columns.isin(['meter_reading', 'day_of_week', 'hour_of_day',\
                                                  'squre_feet', 'primary_use', 'site_id', 'year_built'])]
# Setting y-variables
fc_clean_data_y = fc_clean_data.loc[:, fc_clean_data.columns.isin(['floor_count'])]

In [None]:
# converting primary_use into a numeric varialbe
lab_enc = preprocessing.LabelEncoder()
fc_clean_data_x = fc_clean_data_x.copy()
fc_clean_data_x['primary_use'] = lab_enc.fit_transform(fc_clean_data_x['primary_use'])

In [None]:
# splitting dataset into 70/30 for train/test
X_train, X_test, y_train, y_test = train_test_split(fc_clean_data_x, fc_clean_data_y, test_size=0.3, random_state=1)
y_train = np.ravel(y_train)

> ML Method: KNN

In [None]:
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 5)
# Fit the classifier to the data
knn.fit(X_train,y_train)


In [None]:
# Predicting on test data
y_predict = knn.predict(X_test)

In [None]:
#check accuracy of our model on the test data
print("Accuracy:",metrics.accuracy_score(y_test, y_predict))

> 87% accuracy is pretty good. we will use this to impute missing floor_count data

In [None]:
# predicting floor_count data in  missing_data 
fc_missing_data_x= fc_missing_data.loc[:,fc_missing_data.columns.isin(['meter_reading', 'day_of_week', 'hour_of_day',\
                                                  'squre_feet', 'primary_use', 'site_id', 'year_built'])]
fc_missing_data_x = fc_missing_data_x.copy()
fc_missing_data_x['primary_use'] = lab_enc.fit_transform(fc_missing_data_x['primary_use'])
fc_missing_data = fc_missing_data.copy()
fc_missing_data ['floor_count'] = knn.predict(fc_missing_data_x)

In [None]:
full_fc = pd.DataFrame (fc_clean_data['floor_count'].append(fc_missing_data['floor_count']))
full_fc = full_fc.sort_index()

In [None]:
train_df_imputed = train_df_imputed.copy()
train_df_imputed['floor_count'] = full_fc['floor_count']

In [None]:
train_df_imputed.isna().sum()

In [None]:
# This is the final data set with imputed missing values for building data and meter reading
#train_df_imputed.to_pickle('train_df_imputed.pkl')

# --------------------------------------------------------------------------------------------------------------

# PART 2: Daylight savings correction for train & test data

This Jupyter notebook walks through steps to read in and reduce the memory usage of the clean meter reading files. 
It localizes the timezones and adjusts them for daylight savings time, with data derived from the discussion here: https://www.kaggle.com/patrick0302/locate-cities-according-weather-temperature 
This code also derives from code found at this URL https://www.kaggle.com/caesarlupum/ashrae-ligthgbm-simple-fe 

In [2]:
# Cleaned train data
train_tz_df = train_df_imputed
train_tz_df["timestamp"] = pd.to_datetime(train_tz_df["timestamp"], format='%Y-%m-%d %H:%M:%S')

# Test data
test_tz_df = pd.read_csv('test.csv')
test_tz_df["timestamp"] = pd.to_datetime(test_tz_df["timestamp"], format='%Y-%m-%d %H:%M:%S')

# Building data
building_df = pd.read_csv('building_metadata.csv')
building_df['primary_use'] = building_df['primary_use'].astype('category')

# Timezone data
time_zones_df = pd.read_csv('time_zones.csv')

## Merge train/test data with timezone data

In [3]:
### Merge building data on test data

In [4]:
temp_df = test_tz_df[['building_id']]
temp_df = temp_df.merge(building_df, on = ['building_id'], how = 'left')

del temp_df['building_id']
test_tz_df = pd.concat([test_tz_df, temp_df], axis = 1)

del temp_df

### Merge timezone

In [5]:
temp_df = train_tz_df[['site_id']]
temp_df = temp_df.merge(time_zones_df, on = ['site_id'], how = 'left')

del temp_df['site_id']
train_tz_df = pd.concat([train_tz_df, temp_df], axis=1)

del temp_df

temp_df = test_tz_df[['site_id']]
temp_df = temp_df.merge(time_zones_df, on = ['site_id'], how = 'left')

del temp_df['site_id']
test_tz_df = pd.concat([test_tz_df, temp_df], axis=1)

del temp_df

In [None]:
train_tz_df.head()

## Correcting Daylight Savings Time

### Prepare daylight savings time column for adjustment

In [6]:
train_tz_df['dst'] = 0
test_tz_df['dst'] = 0

In [7]:
# Train data

# 2016

train_tz_df.loc[((train_tz_df['timezone'] == 'US/Eastern') & 
                 (train_tz_df['timestamp'] >= '2016-03-13 02:00:00') & 
                 (train_tz_df['timestamp'] < '2016-11-06 01:00:00')), 'dst'] = 1
train_tz_df.loc[((train_tz_df['timezone'] == 'US/Mountain') & 
                 (train_tz_df['timestamp'] >= '2016-03-13 02:00:00') & 
                 (train_tz_df['timestamp'] < '2016-11-06 01:00:00')), 'dst'] = 1
train_tz_df.loc[((train_tz_df['timezone'] == 'US/Pacific') & 
                 (train_tz_df['timestamp'] >= '2016-03-13 02:00:00') & 
                 (train_tz_df['timestamp'] < '2016-11-06 01:00:00')), 'dst'] = 1
train_tz_df.loc[((train_tz_df['timezone'] == 'US/Central') & 
                 (train_tz_df['timestamp'] >= '2016-03-13 02:00:00') & 
                 (train_tz_df['timestamp'] < '2016-11-06 01:00:00')), 'dst'] = 1
train_tz_df.loc[((train_tz_df['timezone'] == 'Canada/Eastern') & 
                 (train_tz_df['timestamp'] >= '2016-03-13 02:00:00') & 
                 (train_tz_df['timestamp'] < '2016-11-06 01:00:00')), 'dst'] = 1
train_tz_df.loc[((train_tz_df['timezone'] == 'Europe/London') & 
                 (train_tz_df['timestamp'] >= '2016-03-27 01:00:00') & 
                 (train_tz_df['timestamp'] < '2016-10-30 02:00:00')), 'dst'] = 1
train_tz_df.loc[((train_tz_df['timezone'] == 'Europe/Dublin') & 
                 (train_tz_df['timestamp'] >= '2016-03-27 01:00:00') & 
                 (train_tz_df['timestamp'] < '2016-10-30 02:00:00')), 'dst'] = 1

In [8]:
# Test data

# 2017

test_tz_df.loc[((test_tz_df['timezone'] == 'US/Eastern') & 
                (test_tz_df['timestamp'] >= '2017-03-12 02:00:00') & 
                (test_tz_df['timestamp'] < '2017-11-05 01:00:00')), 'dst'] = 1
test_tz_df.loc[((test_tz_df['timezone'] == 'US/Mountain') & 
                (test_tz_df['timestamp'] >= '2017-03-12 02:00:00') & 
                (test_tz_df['timestamp'] < '2017-11-05 01:00:00')), 'dst'] = 1
test_tz_df.loc[((test_tz_df['timezone'] == 'US/Pacific') & 
                (test_tz_df['timestamp'] >= '2017-03-12 02:00:00') & 
                (test_tz_df['timestamp'] < '2017-11-05 01:00:00')), 'dst'] = 1
test_tz_df.loc[((test_tz_df['timezone'] == 'US/Central') & 
                (test_tz_df['timestamp'] >= '2017-03-12 02:00:00') & 
                (test_tz_df['timestamp'] < '2017-11-05 01:00:00')), 'dst'] = 1
test_tz_df.loc[((test_tz_df['timezone'] == 'Canada/Eastern') & 
                (test_tz_df['timestamp'] >= '2017-03-12 02:00:00') & 
                (test_tz_df['timestamp'] < '2017-11-05 01:00:00')), 'dst'] = 1
test_tz_df.loc[((test_tz_df['timezone'] == 'Europe/London') & 
                (test_tz_df['timestamp'] >= '2017-03-26 01:00:00') & 
                (test_tz_df['timestamp'] < '2017-10-29 02:00:00')), 'dst'] = 1
test_tz_df.loc[((test_tz_df['timezone'] == 'Europe/Dublin') & 
                (test_tz_df['timestamp'] >= '2017-03-26 01:00:00') & 
                (test_tz_df['timestamp'] < '2017-10-29 02:00:00')), 'dst'] = 1

# 2018

test_tz_df.loc[((test_tz_df['timezone'] == 'US/Eastern') & 
                (test_tz_df['timestamp'] >= '2018-03-11 02:00:00') & 
                (test_tz_df['timestamp'] < '2018-11-04 01:00:00')), 'dst'] = 1
test_tz_df.loc[((test_tz_df['timezone'] == 'US/Mountain') & 
                (test_tz_df['timestamp'] >= '2018-03-11 02:00:00') & 
                (test_tz_df['timestamp'] < '2018-11-04 01:00:00')), 'dst'] = 1
test_tz_df.loc[((test_tz_df['timezone'] == 'US/Pacific') & 
                (test_tz_df['timestamp'] >= '2018-03-11 02:00:00') & 
                (test_tz_df['timestamp'] < '2018-11-04 01:00:00')), 'dst'] = 1
test_tz_df.loc[((test_tz_df['timezone'] == 'US/Central') & 
                (test_tz_df['timestamp'] >= '2018-03-11 02:00:00') & 
                (test_tz_df['timestamp'] < '2018-11-04 01:00:00')), 'dst'] = 1
test_tz_df.loc[((test_tz_df['timezone'] == 'Canada/Eastern') & 
                (test_tz_df['timestamp'] >= '2018-03-11 02:00:00') & 
                (test_tz_df['timestamp'] < '2018-11-04 01:00:00')), 'dst'] = 1
test_tz_df.loc[((test_tz_df['timezone'] == 'Europe/London') & 
                (test_tz_df['timestamp'] >= '2018-03-25 01:00:00') & 
                (test_tz_df['timestamp'] < '2018-10-28 02:00:00')), 'dst'] = 1
test_tz_df.loc[((test_tz_df['timezone'] == 'Europe/Dublin') & 
                (test_tz_df['timestamp'] >= '2018-03-25 01:00:00') & 
                (test_tz_df['timestamp'] < '2018-10-28 02:00:00')), 'dst'] = 1

### Adjust for daylight savings time

In [9]:
from datetime import timedelta 
train_tz_df.loc[train_tz_df['dst'] == 1, 'timestamp'] += timedelta(hours = 1)
test_tz_df.loc[test_tz_df['dst'] == 1, 'timestamp'] += timedelta(hours = 1)

## Data Minification

Save the final dataframes as pickle files.

In [14]:
#train_tz_df.to_pickle('train_imputed_tz_df.pkl')
#test_tz_df.to_pickle('test_imputed_tz_df.pkl')
#del train_tz_df, test_tz_df

## Using the files
To use these files, you must first read them in using the following code.

In [15]:
#train_tz_df = pd.read_pickle('train_imputed_tz_df.pkl')
#test_tz_df = pd.read_pickle('test_imputed_tz_df.pkl')

# --------------------------------------------------------------------------------------------------------------
# PART 3: Timezone correction for weather train & test data

This Jupyter notebook timezone-corrects the weather data; see discussion here for the source of timezone data: https://www.kaggle.com/patrick0302/locate-cities-according-weather-temperature 

It also derives from code found at this URL https://www.kaggle.com/caesarlupum/ashrae-ligthgbm-simple-fe 

## Read in Data

In [2]:
# Train data
weather_train_tz_df = pd.read_csv('weather_train.csv')

# Test data
weather_test_tz_df = pd.read_csv ('weather_test.csv')

# Timezone data
time_zones_df = pd.read_csv('time_zones.csv')

## Adjust weather data timezones

### Merge weather & time zone data

In [3]:
# Train data
temp_df = weather_train_tz_df[['site_id']]
temp_df = temp_df.merge(time_zones_df, on = ['site_id'], how = 'left')

del temp_df['site_id']
weather_train_tz_df = pd.concat([weather_train_tz_df, temp_df], axis=1)

del temp_df

In [4]:
# Test data
temp_df = weather_test_tz_df[['site_id']]
temp_df = temp_df.merge(time_zones_df, on = ['site_id'], how = 'left')

del temp_df['site_id']
weather_test_tz_df = pd.concat([weather_test_tz_df, temp_df], axis=1)

del temp_df

### Create timezones dictionary to map timezone offsets onto `timestamp` series

In [5]:
timezones = list(time_zones_df.timezone.unique())
timezones_offset = [-5, 0, -7, -8, -5, -6, 1]
timezones_dict = dict(zip(timezones, timezones_offset))
timezones_dict

{'US/Eastern': -5,
 'Europe/London': 0,
 'US/Mountain': -7,
 'US/Pacific': -8,
 'Canada/Eastern': -5,
 'US/Central': -6,
 'Europe/Dublin': 1}

### The "Unnamed: 0" is a duplicate `site_id` column

In [9]:
weather_train_tz_df["Unnamed: 0"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
      dtype=int64)

In [10]:
del weather_train_tz_df["Unnamed: 0"], weather_test_tz_df["Unnamed: 0"]

### Convert `timstamp` series in weather test/train datasets

In [11]:
# Train data
weather_train_tz_df['timestamp'] = pd.to_datetime(weather_train_tz_df['timestamp'])

In [1]:
# Test data
weather_test_tz_df['timestamp'] = pd.to_datetime(weather_test_tz_df['timestamp'])

NameError: name 'pd' is not defined

## `timestamp` adjustment using timezone offset

In [13]:
timezones_dict.keys()

dict_keys(['US/Eastern', 'Europe/London', 'US/Mountain', 'US/Pacific', 'Canada/Eastern', 'US/Central', 'Europe/Dublin'])

In [14]:
weather_train_tz_df['timestamp_utc'] = weather_train_tz_df['timestamp']
weather_test_tz_df['timestamp_utc'] = weather_test_tz_df['timestamp']

from datetime import timedelta 
for zone in timezones_dict.keys():
    weather_train_tz_df.timestamp[weather_train_tz_df.timezone==zone] += timedelta(hours = timezones_dict[zone])
    weather_test_tz_df.timestamp[weather_test_tz_df.timezone==zone] += timedelta(hours = timezones_dict[zone])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


## Data Minification

In [19]:
#weather_train_tz_df.to_pickle('weather_train_tz_df.pkl')
#weather_train_tz_df.to_pickle('weather_test_tz_df.pkl')
#del weather_train_tz_df, weather_test_tz_df

### Using the files
To use these files, you must first read them in using the following code.

In [20]:
#weather_train_tz_df = pd.read_pickle('weather_train_tz_df.pkl')
#weather_test_tz_df = pd.read_pickle('weather_test_tz_df.pkl')

# --------------------------------------------------------------------------------------------------------------

# PART 4: Merge imputed and timezone-corrected weather and meter data

This Jupyter notebook merges the meter and weather data after they have been timezone-corrected and after the meter data has been cleaned.
See discussion here for background on the timezone correction: https://www.kaggle.com/patrick0302/locate-cities-according-weather-temperature 
This code also derives from code found at this URL https://www.kaggle.com/caesarlupum/ashrae-ligthgbm-simple-fe 

## Read in Data

In [2]:
# Meter data
#train_tz_df = pd.read_pickle('train_imputed_tz_df.pkl')
#test_tz_df = pd.read_pickle('test_imputed_tz_df.pkl')

# Weather data
#weather_train_tz_df = pd.read_pickle('weather_train_tz_df.pkl')
#weather_test_tz_df = pd.read_pickle('weather_test_tz_df.pkl')

### Weather df merge

In [5]:
temp_df = train_tz_df[['site_id','timestamp', 'timezone', 'country_code', 'location']]
temp_df = temp_df.merge(weather_train_tz_df, on=['site_id','timestamp', 'timezone', 'country_code', 'location'], how='left')
del temp_df['site_id'], temp_df['timestamp'], temp_df['timezone'], temp_df['country_code'], temp_df['location']

train_tz_df = pd.concat([train_tz_df, temp_df], axis=1)

In [8]:
temp_df = test_tz_df[['site_id','timestamp', 'timezone', 'country_code', 'location']]
temp_df = temp_df.merge(weather_test_tz_df, on=['site_id','timestamp', 'timezone', 'country_code', 'location'], how='left')

del temp_df['site_id'], temp_df['timestamp'], temp_df['timezone'], temp_df['country_code'], temp_df['location']
test_tz_df = pd.concat([test_tz_df, temp_df], axis=1)

del temp_df, weather_train_tz_df, weather_test_tz_df

In [12]:
del train_tz_df["Unnamed: 0"], test_tz_df["Unnamed: 0"]

In [13]:
test_tz_df.columns

Index(['row_id', 'building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'timezone', 'country_code',
       'location', 'timezone_offset', 'dst', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed', 'timezone_offset',
       'timestamp_utc'],
      dtype='object')

In [14]:
train_tz_df.columns

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count', 'time_index',
       'day_of_week', 'hour_of_day', 'index', 'avg', 'std', 'outlier',
       'timezone', 'country_code', 'location', 'timezone_offset', 'dst',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'timezone_offset', 'timestamp_utc', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed', 'timezone_offset',
       'timestamp_utc'],
      dtype='object')

## Data Minification

Save the final dataframes as pickle files.

In [None]:
train_tz_df.to_pickle('train_merge_df.pkl')
test_tz_df.to_pickle('test_merge_df.pkl')
   
del train_tz_df, test_tz_df

## Using the files
To use these files, you must first read them in using the following code.

In [None]:
train_df = pd.read_pickle('train_merge_df.pkl')
test_df = pd.read_pickle('test_merge_df.pkl')