# Project 2 - Data cleaning
###  Converting weather data for merge
by: Alissa Stover, Sophia Skowronski, Ying Hua

This Jupyter notebook converts the weather data timestamps from UTC to each site's local time zone. The site-to-time-zone mapping comes from this discussion: https://www.kaggle.com/patrick0302/locate-cities-according-weather-temperature

It also adapts code from https://www.kaggle.com/caesarlupum/ashrae-ligthgbm-simple-fe

## Import Packages

In [1]:
import numpy as np
import pandas as pd
import pickle

## Read in Data

In [2]:
# Time zone lookup: provides the timezone, country_code and location columns keyed by site_id
time_zones_df = pd.read_csv(filepath_or_buffer='time_zones.csv')

# Weather observations accompanying the training set
weather_train_tz_df = pd.read_csv(filepath_or_buffer='weather_train.csv')

# Weather observations accompanying the test set
weather_test_tz_df = pd.read_csv(filepath_or_buffer='weather_test.csv')

## Adjust weather data timezones

### Merge weather & time zone data

In [3]:
# Train data
# Attach the time zone columns to every weather row by looking up its site_id.
# A left merge keeps every weather row in its original order and appends the
# lookup columns after the existing ones.
# validate='many_to_one' makes pandas raise a MergeError if time_zones_df ever
# holds more than one row for a site_id, instead of silently duplicating or
# misaligning weather rows.
weather_train_tz_df = weather_train_tz_df.merge(
    time_zones_df,
    on='site_id',
    how='left',
    validate='many_to_one',
)

In [4]:
# Test data
# Attach the time zone columns to every weather row by looking up its site_id.
# A left merge keeps every weather row in its original order and appends the
# lookup columns after the existing ones.
# validate='many_to_one' makes pandas raise a MergeError if time_zones_df ever
# holds more than one row for a site_id, instead of silently duplicating or
# misaligning weather rows.
weather_test_tz_df = weather_test_tz_df.merge(
    time_zones_df,
    on='site_id',
    how='left',
    validate='many_to_one',
)

### Localize timestamps to UTC

In [5]:
# Train data
# Parse the timestamp column into datetimes, then label the values as UTC.
# tz_localize attaches the zone without shifting the clock values; it raises on
# data that is already tz-aware, so run this cell once per fresh load.
weather_train_tz_df['timestamp'] = (
    pd.to_datetime(weather_train_tz_df['timestamp'])
    .dt.tz_localize('UTC')
)

In [6]:
# Test data
# Parse the timestamp column into datetimes, then label the values as UTC.
# tz_localize attaches the zone without shifting the clock values; it raises on
# data that is already tz-aware, so run this cell once per fresh load.
weather_test_tz_df['timestamp'] = (
    pd.to_datetime(weather_test_tz_df['timestamp'])
    .dt.tz_localize('UTC')
)

### Adjust timestamp to local timezone, then rename columns 

In [7]:
# Train data
# Convert every UTC timestamp to the time zone named in the same row's
# 'timezone' column. Rows can belong to different zones, so the conversion is
# done per row and the new column holds tz-aware Timestamps.
# zip() walks the two columns positionally, which avoids a label-based Series
# lookup per row (slow, and only valid for a default 0..n-1 index).
weather_train_tz_df['timestamp_tz'] = [
    utc_ts.tz_convert(tz_name)
    for utc_ts, tz_name in zip(
        weather_train_tz_df['timestamp'], weather_train_tz_df['timezone']
    )
]

# Keep the UTC values under an explicit name; the local-time column takes over
# the 'timestamp' name.
weather_train_tz_df = weather_train_tz_df.rename(
    columns={"timestamp": "timestamp_utc", "timestamp_tz": "timestamp"}
)
print(weather_train_tz_df.columns)


Index(['site_id', 'timestamp_utc', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'timezone', 'country_code', 'location',
       'timestamp'],
      dtype='object')


In [8]:
# Test data
# Convert every UTC timestamp to the time zone named in the same row's
# 'timezone' column. Rows can belong to different zones, so the conversion is
# done per row and the new column holds tz-aware Timestamps.
# zip() walks the two columns positionally, which avoids a label-based Series
# lookup per row (slow, and only valid for a default 0..n-1 index).
weather_test_tz_df['timestamp_tz'] = [
    utc_ts.tz_convert(tz_name)
    for utc_ts, tz_name in zip(
        weather_test_tz_df['timestamp'], weather_test_tz_df['timezone']
    )
]

# Keep the UTC values under an explicit name; the local-time column takes over
# the 'timestamp' name.
weather_test_tz_df = weather_test_tz_df.rename(
    columns={"timestamp": "timestamp_utc", "timestamp_tz": "timestamp"}
)
print(weather_test_tz_df.columns)

Index(['site_id', 'timestamp_utc', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'timezone', 'country_code', 'location',
       'timestamp'],
      dtype='object')


## Save Converted Data

In [9]:
# Persist each converted frame to its own pickle file.
# The test file must be written from the test frame; writing the train frame
# to both paths would leave 'weather_test_tz_df.pkl' holding training data.
weather_train_tz_df.to_pickle('weather_train_tz_df.pkl')
weather_test_tz_df.to_pickle('weather_test_tz_df.pkl')

# Drop the in-memory frames now that they are saved to disk.
del weather_train_tz_df, weather_test_tz_df

### Using the files
To use the saved files (for example, in another notebook), load them back into DataFrames with `pd.read_pickle`, as shown below.

In [10]:
# Load the two saved frames back under their original names.
# Only unpickle files you produced yourself: loading a pickle can execute arbitrary code.
weather_train_tz_df, weather_test_tz_df = (
    pd.read_pickle('weather_train_tz_df.pkl'),
    pd.read_pickle('weather_test_tz_df.pkl'),
)