# Project 2 - Data cleaning
###  Merge cleaned and timezone-corrected weather and meter data
by: Alissa Stover, Sophia Skowronski, Ying Hua

This Jupyter notebook merges the meter and weather data after they have been timezone-corrected and after the meter data has been cleaned.
See discussion here for background on the timezone correction: https://www.kaggle.com/patrick0302/locate-cities-according-weather-temperature 
This code also derives from code found at this URL https://www.kaggle.com/caesarlupum/ashrae-ligthgbm-simple-fe 

## Import packages

In [1]:
import pandas as pd
import numpy as np

## Read in Data

In [2]:
# Meter data
train_tz_df = pd.read_pickle('train_imputed_tz_df.pkl')
test_tz_df = pd.read_pickle('test_imputed_tz_df.pkl')

# Weather data
weather_train_tz_df = pd.read_pickle('weather_train_tz_df.pkl')
weather_test_tz_df = pd.read_pickle('weather_test_tz_df.pkl')

# EDIT: Commented out part below. Keep `timestamp_utc` column, which shows original weather timestamp 

### Weather df merge

In [5]:
temp_df = train_tz_df[['site_id','timestamp', 'timezone', 'country_code', 'location']]
temp_df = temp_df.merge(weather_train_tz_df, on=['site_id','timestamp', 'timezone', 'country_code', 'location'], how='left')
del temp_df['site_id'], temp_df['timestamp'], temp_df['timezone'], temp_df['country_code'], temp_df['location'], #, temp_df['timestamp_utc']

train_tz_df = pd.concat([train_tz_df, temp_df], axis=1)

In [6]:
train_tz_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,time_index,...,timestamp_utc,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,timezone_offset,timestamp_utc.1
0,0,0,2016-01-01,220.046471,0,Education,7432,2008.0,8.0,0,...,2016-01-01 05:00:00,19.4,,19.4,0.0,,0.0,0.0,-5.0,2016-01-01 05:00:00
1,1,0,2016-01-01,101.917963,0,Education,2720,2004.0,5.0,0,...,2016-01-01 05:00:00,19.4,,19.4,0.0,,0.0,0.0,-5.0,2016-01-01 05:00:00
2,2,0,2016-01-01,5.634698,0,Education,5376,1991.0,4.0,0,...,2016-01-01 05:00:00,19.4,,19.4,0.0,,0.0,0.0,-5.0,2016-01-01 05:00:00
3,3,0,2016-01-01,366.496399,0,Education,23685,2002.0,10.0,0,...,2016-01-01 05:00:00,19.4,,19.4,0.0,,0.0,0.0,-5.0,2016-01-01 05:00:00
4,4,0,2016-01-01,1568.406545,0,Education,116607,1975.0,1.0,0,...,2016-01-01 05:00:00,19.4,,19.4,0.0,,0.0,0.0,-5.0,2016-01-01 05:00:00


In [8]:
temp_df = test_tz_df[['site_id','timestamp', 'timezone', 'country_code', 'location']]
temp_df = temp_df.merge(weather_test_tz_df, on=['site_id','timestamp', 'timezone', 'country_code', 'location'], how='left')

del temp_df['site_id'], temp_df['timestamp'], temp_df['timezone'], temp_df['country_code'], temp_df['location']#, temp_df['timestamp_utc']
test_tz_df = pd.concat([test_tz_df, temp_df], axis=1)

del temp_df, weather_train_tz_df, weather_test_tz_df

In [9]:
test_tz_df.head()

Unnamed: 0.1,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,Unnamed: 0,...,dst,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,timezone_offset,timestamp_utc
0,0,0,0,2017-01-01,0,Education,7432,2008.0,,0,...,0,,,,,,,,,NaT
1,1,1,0,2017-01-01,0,Education,2720,2004.0,,0,...,0,,,,,,,,,NaT
2,2,2,0,2017-01-01,0,Education,5376,1991.0,,0,...,0,,,,,,,,,NaT
3,3,3,0,2017-01-01,0,Education,23685,2002.0,,0,...,0,,,,,,,,,NaT
4,4,4,0,2017-01-01,0,Education,116607,1975.0,,0,...,0,,,,,,,,,NaT


In [12]:
del train_tz_df["Unnamed: 0"], test_tz_df["Unnamed: 0"]

In [13]:
test_tz_df.columns

Index(['row_id', 'building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'timezone', 'country_code',
       'location', 'timezone_offset', 'dst', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed', 'timezone_offset',
       'timestamp_utc'],
      dtype='object')

In [14]:
train_tz_df.columns

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count', 'time_index',
       'day_of_week', 'hour_of_day', 'index', 'avg', 'std', 'outlier',
       'timezone', 'country_code', 'location', 'timezone_offset', 'dst',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'timezone_offset', 'timestamp_utc', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed', 'timezone_offset',
       'timestamp_utc'],
      dtype='object')

# EDIT: There are "NaT" values where the weather data is shifted and no longer matches test/training data

# EDIT: Commenting out section since already in datetime format

### Convert timestamps to datetime

In [None]:
#train_tz_df.loc[(train_tz_df['timezone'] == 'US/Central'), 'timestamp'] = pd.to_datetime(train_tz_df.loc[(train_tz_df['timezone'] == 'US/Central'), 'timestamp'])
#train_tz_df.loc[(train_tz_df['timezone'] == 'US/Mountain'), 'timestamp'] = pd.to_datetime(train_tz_df.loc[(train_tz_df['timezone'] == 'US/Mountain'), 'timestamp'])
#train_tz_df.loc[(train_tz_df['timezone'] == 'US/Pacific'), 'timestamp'] = pd.to_datetime(train_tz_df.loc[(train_tz_df['timezone'] == 'US/Pacific'), 'timestamp'])
#train_tz_df.loc[(train_tz_df['timezone'] == 'Canada/Eastern'), 'timestamp'] = pd.to_datetime(train_tz_df.loc[(train_tz_df['timezone'] == 'Canada/Eastern'), 'timestamp'])
#train_tz_df.loc[(train_tz_df['timezone'] == 'US/Eastern'), 'timestamp'] = pd.to_datetime(train_tz_df.loc[(train_tz_df['timezone'] == 'US/Eastern'), 'timestamp'])
#train_tz_df.loc[(train_tz_df['timezone'] == 'Europe/London'), 'timestamp'] = pd.to_datetime(train_tz_df.loc[(train_tz_df['timezone'] == 'Europe/London'), 'timestamp'])
#train_tz_df.loc[(train_tz_df['timezone'] == 'Europe/Dublin'), 'timestamp'] = pd.to_datetime(train_tz_df.loc[(train_tz_df['timezone'] == 'Europe/Dublin'), 'timestamp'])

In [None]:
#test_tz_df.loc[(test_tz_df['timezone'] == 'US/Central'), 'timestamp'] = pd.to_datetime(test_tz_df.loc[(test_tz_df['timezone'] == 'US/Central'), 'timestamp'])
#test_tz_df.loc[(test_tz_df['timezone'] == 'US/Mountain'), 'timestamp'] = pd.to_datetime(test_tz_df.loc[(test_tz_df['timezone'] == 'US/Mountain'), 'timestamp'])
#test_tz_df.loc[(test_tz_df['timezone'] == 'US/Pacific'), 'timestamp'] = pd.to_datetime(test_tz_df.loc[(test_tz_df['timezone'] == 'US/Pacific'), 'timestamp'])
#test_tz_df.loc[(test_tz_df['timezone'] == 'Canada/Eastern'), 'timestamp'] = pd.to_datetime(test_tz_df.loc[(test_tz_df['timezone'] == 'Canada/Eastern'), 'timestamp'])
#test_tz_df.loc[(test_tz_df['timezone'] == 'US/Eastern'), 'timestamp'] = pd.to_datetime(test_tz_df.loc[(test_tz_df['timezone'] == 'US/Eastern'), 'timestamp'])
#test_tz_df.loc[(test_tz_df['timezone'] == 'Europe/London'), 'timestamp'] = pd.to_datetime(test_tz_df.loc[(test_tz_df['timezone'] == 'Europe/London'), 'timestamp'])
#test_tz_df.loc[(test_tz_df['timezone'] == 'Europe/Dublin'), 'timestamp'] = pd.to_datetime(test_tz_df.loc[(test_tz_df['timezone'] == 'Europe/Dublin'), 'timestamp'])

## Data Minification

Save the final dataframes as pickle files.

In [None]:
train_tz_df.to_pickle('train_merge_df.pkl')
test_tz_df.to_pickle('test_merge_df.pkl')
   
del train_tz_df, test_tz_df

## Using the files
To use these files, you must first read them in using the following code.

In [None]:
train_df = pd.read_pickle('train_merge_df.pkl')
test_df = pd.read_pickle('test_merge_df.pkl')

# EDIT: Summary of pickle file changes

STEP 1: `train_df.pkl` 
`test_df.pkl`

TO

STEP 2: `train_df_imputed.pkl`
`test_df_imputed.pkl`

TO

STEP 3: `train_imputed_tz_df.pkl`
`test_imputed_tz_df.pkl`

TO

STEP 4: `train_merge_df.pkl`
`test_merge_df.pkl`