# Project 2 - Data cleaning
###  Timezone correct meter reading data for merge
by: Alissa Stover, Sophia Skowronski, Ying Hua

This Jupyter notebook walks through the steps to read in the cleaned meter reading files and reduce their memory usage. 
It localizes the timestamps to each site's timezone and adjusts them for daylight saving time, using timezone data derived from the discussion here: https://www.kaggle.com/patrick0302/locate-cities-according-weather-temperature 
Parts of this code are also adapted from the code found at https://www.kaggle.com/caesarlupum/ashrae-ligthgbm-simple-fe 

In [1]:
## Import Packages

In [2]:
import numpy as np
import pandas as pd
import pickle

In [3]:
def _parse_timestamps(frame):
    """Convert the string 'timestamp' column of `frame` to datetime64 and return `frame`."""
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], format='%Y-%m-%d %H:%M:%S')
    return frame


# Cleaned train data (pickled DataFrame).
# NOTE(review): unpickling can execute arbitrary code; only load pickles you produced yourself.
train_tz_df = _parse_timestamps(pd.read_pickle('train_df_imputed.pkl'))

# Test data
test_tz_df = _parse_timestamps(pd.read_csv('test.csv'))

# Building data; 'primary_use' is stored as a categorical column.
building_df = pd.read_csv('building_metadata.csv').astype({'primary_use': 'category'})

# Timezone data (merged onto the readings by site_id further down).
time_zones_df = pd.read_csv('time_zones.csv')

## Merge train/test data with timezone data

In [4]:
### Merge building data on test data

In [5]:
# Look up the building metadata for every test row; a left merge keeps the
# rows in the same order as test_tz_df.
building_cols = test_tz_df[['building_id']].merge(
    building_df, how='left', on=['building_id']
)

# building_id is already a column of test_tz_df, so only the extra columns are appended.
building_cols = building_cols.drop(columns=['building_id'])
test_tz_df = pd.concat([test_tz_df, building_cols], axis=1)

del building_cols

### Merge timezone

In [6]:
def _attach_timezone_columns(frame, zone_lookup):
    """Return `frame` with the non-key columns of `zone_lookup` appended, matched on site_id.

    Parameters
    ----------
    frame : DataFrame with a 'site_id' column.
    zone_lookup : DataFrame with a 'site_id' column and at most one row per site.

    Raises
    ------
    pandas.errors.MergeError
        If `zone_lookup` has more than one row for a site_id, which would
        otherwise duplicate readings.
    """
    zone_cols = frame[['site_id']].merge(
        zone_lookup, on='site_id', how='left', validate='many_to_one'
    )
    zone_cols = zone_cols.drop(columns='site_id')

    # merge() returns a fresh 0..n-1 index while concat(axis=1) aligns on index
    # labels. Reusing the frame's own index keeps every row paired with its own
    # lookup result even when `frame` does not carry a default RangeIndex
    # (e.g. rows were dropped before it was pickled).
    zone_cols.index = frame.index
    return pd.concat([frame, zone_cols], axis=1)


train_tz_df = _attach_timezone_columns(train_tz_df, time_zones_df)
test_tz_df = _attach_timezone_columns(test_tz_df, time_zones_df)

## Convert timezones

### Prepare daylight savings time column for adjustment

In [7]:
# Every row starts out flagged as "not in daylight saving time" (0);
# the following cells set the flag to 1 for rows inside a DST window.
train_tz_df['dst'] = test_tz_df['dst'] = 0

In [8]:
# Train data

# Flagged rows are shifted forward by one hour in a later cell, so the raw
# timestamps are treated as local *standard* time and every window boundary
# below is expressed in standard time as well:
#   * North America: DST runs from 02:00 standard time until 02:00 daylight
#     time, which is 01:00 standard time.
#   * UK / Ireland: DST runs from 01:00 standard time until 02:00 summer time,
#     which is 01:00 standard time. (The end boundary was previously written as
#     02:00, the wall-clock value, which flagged one standard-time hour too many.)
NORTH_AMERICA_TIMEZONES = ['US/Eastern', 'US/Mountain', 'US/Pacific', 'US/Central', 'Canada/Eastern']
EUROPE_TIMEZONES = ['Europe/London', 'Europe/Dublin']

# (zones, first flagged timestamp, first timestamp no longer flagged)
TRAIN_DST_WINDOWS = [
    (NORTH_AMERICA_TIMEZONES, '2016-03-13 02:00:00', '2016-11-06 01:00:00'),
    (EUROPE_TIMEZONES, '2016-03-27 01:00:00', '2016-10-30 01:00:00'),
]

for dst_zones, dst_start, dst_end in TRAIN_DST_WINDOWS:
    in_dst_window = (
        train_tz_df['timezone'].isin(dst_zones)
        & (train_tz_df['timestamp'] >= pd.Timestamp(dst_start))
        & (train_tz_df['timestamp'] < pd.Timestamp(dst_end))
    )
    train_tz_df.loc[in_dst_window, 'dst'] = 1

# The boolean mask is as long as the frame; release it once the flags are set.
del in_dst_window


In [9]:
# Test data

# Flagged rows are shifted forward by one hour in a later cell, so the raw
# timestamps are treated as local *standard* time and every window boundary
# below is expressed in standard time as well:
#   * North America: DST runs from 02:00 standard time until 02:00 daylight
#     time, which is 01:00 standard time.
#   * UK / Ireland: DST runs from 01:00 standard time until 02:00 summer time,
#     which is 01:00 standard time. (The end boundaries were previously written
#     as 02:00, the wall-clock value, which flagged one standard-time hour too many.)
NORTH_AMERICA_TIMEZONES = ['US/Eastern', 'US/Mountain', 'US/Pacific', 'US/Central', 'Canada/Eastern']
EUROPE_TIMEZONES = ['Europe/London', 'Europe/Dublin']

# (zones, first flagged timestamp, first timestamp no longer flagged)
TEST_DST_WINDOWS = [
    # 2017
    (NORTH_AMERICA_TIMEZONES, '2017-03-12 02:00:00', '2017-11-05 01:00:00'),
    (EUROPE_TIMEZONES, '2017-03-26 01:00:00', '2017-10-29 01:00:00'),
    # 2018
    (NORTH_AMERICA_TIMEZONES, '2018-03-11 02:00:00', '2018-11-04 01:00:00'),
    (EUROPE_TIMEZONES, '2018-03-25 01:00:00', '2018-10-28 01:00:00'),
]

for dst_zones, dst_start, dst_end in TEST_DST_WINDOWS:
    in_dst_window = (
        test_tz_df['timezone'].isin(dst_zones)
        & (test_tz_df['timestamp'] >= pd.Timestamp(dst_start))
        & (test_tz_df['timestamp'] < pd.Timestamp(dst_end))
    )
    test_tz_df.loc[in_dst_window, 'dst'] = 1

# The boolean mask is as long as the frame; release it once the flags are set.
del in_dst_window



### Adjust for daylight savings time

In [10]:
from datetime import timedelta

# Rows flagged as DST move forward by one hour.
# NOTE: this mutates the timestamps in place, so running the cell a second
# time would shift the flagged rows by another hour.
one_hour = timedelta(hours=1)

train_tz_df.loc[train_tz_df['dst'].eq(1), 'timestamp'] = (
    train_tz_df.loc[train_tz_df['dst'].eq(1), 'timestamp'] + one_hour
)
test_tz_df.loc[test_tz_df['dst'].eq(1), 'timestamp'] = (
    test_tz_df.loc[test_tz_df['dst'].eq(1), 'timestamp'] + one_hour
)



### Adjust time zones

In [11]:
# Train data

timezones = train_tz_df.timezone.unique()

# Localize from the untouched naive column into a separate object buffer and
# write the result back once. Writing tz-aware values straight into the naive
# datetime column one zone at a time can upcast the column to object after the
# first zone, at which point `.dt` is no longer available for the others.
train_naive_ts = train_tz_df['timestamp']
train_localized = train_naive_ts.to_numpy(dtype=object, copy=True)

# Per-row DST flags resolve the repeated autumn hour: a flagged row is the
# first (daylight-time) occurrence, an unflagged row the second (standard-time)
# one. A blanket `ambiguous=True` would map both onto the same instant.
train_is_dst = train_tz_df['dst'].eq(1).to_numpy()

for tz_name in timezones:
    if pd.isna(tz_name):
        # Rows whose site has no timezone keep their naive timestamp.
        continue
    in_zone = train_tz_df['timezone'].eq(tz_name).to_numpy()
    train_localized[in_zone] = (
        train_naive_ts[in_zone]
        .dt.tz_localize(tz_name, ambiguous=train_is_dst[in_zone])
        .to_numpy(dtype=object)
    )
    del in_zone

# Rows from different zones cannot share one datetime64 dtype, so the column
# is stored as object dtype holding one tz-aware Timestamp per row.
train_tz_df['timestamp'] = pd.Series(train_localized, index=train_tz_df.index, dtype=object)

del train_naive_ts, train_localized, train_is_dst


In [None]:
# Test data

# Localize from the untouched naive column into a separate object buffer and
# write the result back once. Writing tz-aware values straight into the naive
# datetime column one zone at a time can upcast the column to object after the
# first zone, at which point `.dt` is no longer available for the others.
test_naive_ts = test_tz_df['timestamp']
test_localized = test_naive_ts.to_numpy(dtype=object, copy=True)

# Per-row DST flags resolve the repeated autumn hour: a flagged row is the
# first (daylight-time) occurrence, an unflagged row the second (standard-time)
# one. A blanket `ambiguous=True` would map both onto the same instant.
test_is_dst = test_tz_df['dst'].eq(1).to_numpy()

# Iterate over the zones actually present (as the train cell does) instead of
# repeating one hard-coded statement per zone.
for tz_name in test_tz_df['timezone'].unique():
    if pd.isna(tz_name):
        # Rows whose site has no timezone keep their naive timestamp.
        continue
    in_zone = test_tz_df['timezone'].eq(tz_name).to_numpy()
    test_localized[in_zone] = (
        test_naive_ts[in_zone]
        .dt.tz_localize(tz_name, ambiguous=test_is_dst[in_zone])
        .to_numpy(dtype=object)
    )
    del in_zone

# Rows from different zones cannot share one datetime64 dtype, so the column
# is stored as object dtype holding one tz-aware Timestamp per row.
test_tz_df['timestamp'] = pd.Series(test_localized, index=test_tz_df.index, dtype=object)

del test_naive_ts, test_localized, test_is_dst


## Data Minification

Save the final dataframes as pickle files.

In [None]:
# Write each timezone-adjusted frame to its own pickle file.
pickle_targets = (
    ('train_imputed_tz_df.pkl', train_tz_df),
    ('test_imputed_tz_df.pkl', test_tz_df),
)
for pickle_path, frame in pickle_targets:
    frame.to_pickle(pickle_path)

# Drop the helper names first so they do not keep the frames alive,
# then remove the frames themselves from the namespace.
del pickle_targets, pickle_path, frame
del train_tz_df, test_tz_df

## Using the files
To use these files, you must first read them in using the following code.

In [None]:
# Reload the frames written by the save cell.
# NOTE(review): unpickling can execute arbitrary code; only load pickles you produced yourself.
train_tz_df, test_tz_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('train_imputed_tz_df.pkl', 'test_imputed_tz_df.pkl')
)