# Project 2 - Data cleaning
###  Converting weather data for merge
by: Alissa Stover, Sophia Skowronski, Ying Hua

This Jupyter notebook timezone-corrects the weather data; see discussion here for the source of timezone data: https://www.kaggle.com/patrick0302/locate-cities-according-weather-temperature 

It also derives from code found at this URL https://www.kaggle.com/caesarlupum/ashrae-ligthgbm-simple-fe 

## Import Packages

In [1]:
import numpy as np
import pandas as pd
import pickle
import datetime

## Read in Data

In [2]:
# Train data
weather_train_tz_df = pd.read_csv('weather_train.csv')

# Test data
weather_test_tz_df = pd.read_csv ('weather_test.csv')

# Timezone data
time_zones_df = pd.read_csv('time_zones.csv')

## Adjust weather data timezones

### Merge weather & time zone data

In [3]:
# Train data
temp_df = weather_train_tz_df[['site_id']]
temp_df = temp_df.merge(time_zones_df, on = ['site_id'], how = 'left')

del temp_df['site_id']
weather_train_tz_df = pd.concat([weather_train_tz_df, temp_df], axis=1)

del temp_df

In [4]:
# Test data
temp_df = weather_test_tz_df[['site_id']]
temp_df = temp_df.merge(time_zones_df, on = ['site_id'], how = 'left')

del temp_df['site_id']
weather_test_tz_df = pd.concat([weather_test_tz_df, temp_df], axis=1)

del temp_df

# EDIT: Create timezones dictionary to map timestamp series onto

In [5]:
timezones = list(time_zones_df.timezone.unique())
timezones_offset = [-5, 0, -7, -8, -5, -6, 1]
timezones_dict = dict(zip(timezones, timezones_offset))
timezones_dict

{'US/Eastern': -5,
 'Europe/London': 0,
 'US/Mountain': -7,
 'US/Pacific': -8,
 'Canada/Eastern': -5,
 'US/Central': -6,
 'Europe/Dublin': 1}

# EDIT: Check weather data

In [6]:
weather_train_tz_df.head()

Unnamed: 0.1,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,Unnamed: 0,timezone,country_code,location,timezone_offset
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0,0,US/Eastern,US,"Orlando, FL",-5
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5,0,US/Eastern,US,"Orlando, FL",-5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0,0,US/Eastern,US,"Orlando, FL",-5
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0,0,US/Eastern,US,"Orlando, FL",-5
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6,0,US/Eastern,US,"Orlando, FL",-5


In [7]:
weather_test_tz_df.head()

Unnamed: 0.1,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,Unnamed: 0,timezone,country_code,location,timezone_offset
0,0,2017-01-01 00:00:00,17.8,4.0,11.7,,1021.4,100.0,3.6,0,US/Eastern,US,"Orlando, FL",-5
1,0,2017-01-01 01:00:00,17.8,2.0,12.8,0.0,1022.0,130.0,3.1,0,US/Eastern,US,"Orlando, FL",-5
2,0,2017-01-01 02:00:00,16.1,0.0,12.8,0.0,1021.9,140.0,3.1,0,US/Eastern,US,"Orlando, FL",-5
3,0,2017-01-01 03:00:00,17.2,0.0,13.3,0.0,1022.2,140.0,3.1,0,US/Eastern,US,"Orlando, FL",-5
4,0,2017-01-01 04:00:00,16.7,2.0,13.3,0.0,1022.3,130.0,2.6,0,US/Eastern,US,"Orlando, FL",-5


# EDIT: The "Unnamed: 0" is a duplicate `site_id` column

In [8]:
weather_train_tz_df.site_id == weather_train_tz_df["Unnamed: 0"]

0         True
1         True
2         True
3         True
4         True
5         True
6         True
7         True
8         True
9         True
10        True
11        True
12        True
13        True
14        True
15        True
16        True
17        True
18        True
19        True
20        True
21        True
22        True
23        True
24        True
25        True
26        True
27        True
28        True
29        True
          ... 
139743    True
139744    True
139745    True
139746    True
139747    True
139748    True
139749    True
139750    True
139751    True
139752    True
139753    True
139754    True
139755    True
139756    True
139757    True
139758    True
139759    True
139760    True
139761    True
139762    True
139763    True
139764    True
139765    True
139766    True
139767    True
139768    True
139769    True
139770    True
139771    True
139772    True
Length: 139773, dtype: bool

In [9]:
weather_train_tz_df["Unnamed: 0"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
      dtype=int64)

In [10]:
del weather_train_tz_df["Unnamed: 0"], weather_test_tz_df["Unnamed: 0"]

# EDIT: Commenting out the following to increase running time (does not convert to tz-aware)

### Localize timestamps to UTC

In [11]:
# Train data
weather_train_tz_df['timestamp'] = pd.to_datetime(weather_train_tz_df['timestamp'])
#weather_train_tz_df.timestamp = weather_train_tz_df.timestamp.dt.tz_localize('UTC')

In [12]:
# Test data
weather_test_tz_df['timestamp'] = pd.to_datetime(weather_test_tz_df['timestamp'])
#weather_test_tz_df.timestamp = weather_test_tz_df.timestamp.dt.tz_localize('UTC')

# EDIT: Timezone adjustment using offset

In [13]:
timezones_dict.keys()

dict_keys(['US/Eastern', 'Europe/London', 'US/Mountain', 'US/Pacific', 'Canada/Eastern', 'US/Central', 'Europe/Dublin'])

In [14]:
weather_train_tz_df['timestamp_utc'] = weather_train_tz_df['timestamp']
weather_test_tz_df['timestamp_utc'] = weather_test_tz_df['timestamp']

from datetime import timedelta 
for zone in timezones_dict.keys():
    weather_train_tz_df.timestamp[weather_train_tz_df.timezone==zone] += timedelta(hours = timezones_dict[zone])
    weather_test_tz_df.timestamp[weather_test_tz_df.timezone==zone] += timedelta(hours = timezones_dict[zone])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


# EDIT: Commenting out the following to increase running time (does not convert to tz-aware)

### Adjust timestamp to local timezone, then rename columns

In [15]:
# Train data
#weather_train_tz_df['timestamp_tz'] = [weather_train_tz_df['timestamp'][i].tz_convert(weather_train_tz_df['timezone'][i]) for i in range(0, len(weather_train_tz_df['timestamp']))]                                      
#weather_train_tz_df = weather_train_tz_df.rename(columns={"timestamp": "timestamp_utc", "timestamp_tz": "timestamp"})
#print(weather_train_tz_df.columns)


In [16]:
# Test data
#weather_test_tz_df['timestamp_tz'] = [weather_test_tz_df['timestamp'][i].tz_convert(weather_test_tz_df['timezone'][i]) for i in range(0, len(weather_test_tz_df['timestamp']))]                                      
#weather_test_tz_df = weather_test_tz_df.rename(columns={"timestamp": "timestamp_utc", "timestamp_tz": "timestamp"})
#print(weather_test_tz_df.columns)

# EDIT: Check weather data

In [17]:
weather_train_tz_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,timezone,country_code,location,timezone_offset,timestamp_utc
0,0,2015-12-31 19:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0,US/Eastern,US,"Orlando, FL",-5,2016-01-01 00:00:00
1,0,2015-12-31 20:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5,US/Eastern,US,"Orlando, FL",-5,2016-01-01 01:00:00
2,0,2015-12-31 21:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0,US/Eastern,US,"Orlando, FL",-5,2016-01-01 02:00:00
3,0,2015-12-31 22:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0,US/Eastern,US,"Orlando, FL",-5,2016-01-01 03:00:00
4,0,2015-12-31 23:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6,US/Eastern,US,"Orlando, FL",-5,2016-01-01 04:00:00


In [18]:
weather_test_tz_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,timezone,country_code,location,timezone_offset,timestamp_utc
0,0,2016-12-31 19:00:00,17.8,4.0,11.7,,1021.4,100.0,3.6,US/Eastern,US,"Orlando, FL",-5,2017-01-01 00:00:00
1,0,2016-12-31 20:00:00,17.8,2.0,12.8,0.0,1022.0,130.0,3.1,US/Eastern,US,"Orlando, FL",-5,2017-01-01 01:00:00
2,0,2016-12-31 21:00:00,16.1,0.0,12.8,0.0,1021.9,140.0,3.1,US/Eastern,US,"Orlando, FL",-5,2017-01-01 02:00:00
3,0,2016-12-31 22:00:00,17.2,0.0,13.3,0.0,1022.2,140.0,3.1,US/Eastern,US,"Orlando, FL",-5,2017-01-01 03:00:00
4,0,2016-12-31 23:00:00,16.7,2.0,13.3,0.0,1022.3,130.0,2.6,US/Eastern,US,"Orlando, FL",-5,2017-01-01 04:00:00


## Data Minification

In [19]:
weather_train_tz_df.to_pickle('weather_train_tz_df.pkl')
weather_train_tz_df.to_pickle('weather_test_tz_df.pkl')

del weather_train_tz_df, weather_test_tz_df

### Using the files
To use these files, you must first read them in using the following code.

In [20]:
weather_train_tz_df = pd.read_pickle('weather_train_tz_df.pkl')
weather_test_tz_df = pd.read_pickle('weather_test_tz_df.pkl')