# Project 2
By Alissa Stover, adapted from the code at https://www.kaggle.com/caesarlupum/ashrae-ligthgbm-simple-fe

This Jupyter notebook walks through steps to read in and reduce the memory usage of the original CSV files. 
It also merges the files and saves the output as separate pickle files.
Additionally, it creates four versions of the merged weather and meter reading datasets - 2 each of test and train datasets, with one version having timezone-adjusted data derived from the discussion here: https://www.kaggle.com/patrick0302/locate-cities-according-weather-temperature 

## Import Packages

In [1]:
import numpy as np
import pandas as pd
import pickle

## Read in Data

In [2]:
# Each source CSV is parsed exactly once. The "_tz" frames start out as
# independent deep copies of the same raw data (they are timezone-adjusted
# later in the notebook), which avoids re-reading the large files from disk.

# read in train datasets
train_df = pd.read_csv('train.csv')
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"], format='%Y-%m-%d %H:%M:%S')
train_tz_df = train_df.copy()
building_df = pd.read_csv('building_metadata.csv')
weather_train_df = pd.read_csv('weather_train.csv')
weather_train_tz_df = weather_train_df.copy()

# read in test datasets
test_df = pd.read_csv('test.csv')
test_tz_df = test_df.copy()
weather_test_df = pd.read_csv('weather_test.csv')
weather_test_tz_df = weather_test_df.copy()

# read in time zone data
time_zones_df = pd.read_csv('time_zones.csv')

In [3]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Columns are modified in place and the same DataFrame is returned.
    Only columns whose dtype is one of int16/int32/int64/float16/float32/
    float64 are considered; every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (mutated in place).
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.
    use_float16 : bool, default True
        When true (the historical behaviour), float columns whose range fits
        are stored as float16. float16 keeps only about three significant
        decimal digits, so pass ``False`` to stop at float32 when the values
        must stay precise.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose extremes sit exactly on a
            # dtype's limits (e.g. -128..127) still fits in that dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # An all-NaN column fails every comparison and falls through to
            # float64, as do values outside the float32 range.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero if the frame reported no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

Use the function above to reduce the memory usage of all of the dataframes

In [4]:
# Shrink the four meter-reading frames first (same order as the printed report) ...
train_df, test_df, train_tz_df, test_tz_df = [
    reduce_mem_usage(frame)
    for frame in (train_df, test_df, train_tz_df, test_tz_df)
]

# ... then the weather frames and the building metadata.
weather_train_df, weather_train_tz_df, weather_test_df, weather_test_tz_df, building_df = [
    reduce_mem_usage(frame)
    for frame in (weather_train_df, weather_train_tz_df, weather_test_df, weather_test_tz_df, building_df)
]

Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to 596.49 Mb (53.1% reduction)
Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to 596.49 Mb (53.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)
Mem. usage decreased to  0.03 Mb (60.3% reduction)


## Initial Look at Data

In [5]:
# Dimensions first, then the first five rows (five is head()'s default).
print(building_df.shape)
building_df.head()

(1449, 6)


Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


In [6]:
# Dimensions first, then the first five rows (five is head()'s default).
print(train_df.shape)
train_df.head()

(20216100, 4)


Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01,0.0
1,1,0,2016-01-01,0.0
2,2,0,2016-01-01,0.0
3,3,0,2016-01-01,0.0
4,4,0,2016-01-01,0.0


In [7]:
# Dimensions first, then the first five rows (five is head()'s default).
print(weather_train_df.shape)
weather_train_df.head()

(139773, 9)


Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.5,0.0,0.0
1,0,2016-01-01 01:00:00,24.40625,,21.09375,-1.0,1020.0,70.0,1.5
2,0,2016-01-01 02:00:00,22.796875,2.0,21.09375,0.0,1020.0,0.0,0.0
3,0,2016-01-01 03:00:00,21.09375,2.0,20.59375,0.0,1020.0,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.599609


In [8]:
# Dimensions first, then the first five rows (five is head()'s default).
print(time_zones_df.shape)
time_zones_df.head()

(16, 4)


Unnamed: 0,site_id,timezone,country_code,location
0,0,US/Eastern,US,"Orlando, FL"
1,1,Europe/London,UK,"UK, Southampton"
2,2,US/Mountain,US,"Tempe, AZ"
3,3,US/Eastern,US,"Washington, WA"
4,4,US/Pacific,US,"San Francisco, CA"


## Adjust weather data timezones 

### Merge weather data and timezone data

In [9]:
# Look up the time-zone metadata for every weather row by site_id, drop the
# duplicated key, and glue the new columns on positionally (the left merge
# keeps the row order of the weather frame).
tz_columns = (
    weather_train_tz_df[['site_id']]
    .merge(time_zones_df, on=['site_id'], how='left')
    .drop(columns=['site_id'])
)
weather_train_tz_df = pd.concat([weather_train_tz_df, tz_columns], axis=1)

del tz_columns

In [10]:
# Look up the time-zone metadata for every weather row by site_id, drop the
# duplicated key, and glue the new columns on positionally (the left merge
# keeps the row order of the weather frame).
tz_columns = (
    weather_test_tz_df[['site_id']]
    .merge(time_zones_df, on=['site_id'], how='left')
    .drop(columns=['site_id'])
)
weather_test_tz_df = pd.concat([weather_test_tz_df, tz_columns], axis=1)

del tz_columns

### Localize timestamps to UTC

In [11]:
# Parse the timestamp strings and mark the resulting naive datetimes as UTC.
weather_train_tz_df['timestamp'] = (
    pd.to_datetime(weather_train_tz_df['timestamp']).dt.tz_localize('UTC')
)

In [12]:
# Parse the timestamp strings and mark the resulting naive datetimes as UTC.
weather_test_tz_df['timestamp'] = (
    pd.to_datetime(weather_test_tz_df['timestamp']).dt.tz_localize('UTC')
)

### Adjust timestamp to local timezone, then rename columns 

In [13]:
# Convert every UTC timestamp into its own row's time zone. The two columns
# are walked in lockstep, which matches the positional lookups on the frame's
# default 0..n-1 index.
weather_train_tz_df['timestamp_tz'] = [
    utc_time.tz_convert(zone_name)
    for utc_time, zone_name in zip(weather_train_tz_df['timestamp'], weather_train_tz_df['timezone'])
]


In [14]:
# Convert every UTC timestamp into its own row's time zone. The two columns
# are walked in lockstep, which matches the positional lookups on the frame's
# default 0..n-1 index.
weather_test_tz_df['timestamp_tz'] = [
    utc_time.tz_convert(zone_name)
    for utc_time, zone_name in zip(weather_test_tz_df['timestamp'], weather_test_tz_df['timezone'])
]


In [15]:
# Keep the UTC column under an explicit name and let the local-time column
# take over the plain "timestamp" name used as a merge key later on.
weather_train_tz_df = weather_train_tz_df.rename(
    columns={"timestamp": "timestamp_utc", "timestamp_tz": "timestamp"}
)

In [16]:
# Keep the UTC column under an explicit name and let the local-time column
# take over the plain "timestamp" name used as a merge key later on.
weather_test_tz_df = weather_test_tz_df.rename(
    columns={"timestamp": "timestamp_utc", "timestamp_tz": "timestamp"}
)

In [17]:
# Show the column layout of both timezone-adjusted weather frames, one per line.
print(weather_train_tz_df.columns, weather_test_tz_df.columns, sep='\n')

Index(['site_id', 'timestamp_utc', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'timezone', 'country_code', 'location',
       'timestamp'],
      dtype='object')
Index(['site_id', 'timestamp_utc', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'timezone', 'country_code', 'location',
       'timestamp'],
      dtype='object')


In [18]:
# Show the building metadata columns that will be merged onto the meter data.
print(building_df.columns)

Index(['site_id', 'building_id', 'primary_use', 'square_feet', 'year_built',
       'floor_count'],
      dtype='object')


In [19]:
# Preview the train frame that will receive the timezone adjustment.
train_tz_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01,0.0
1,1,0,2016-01-01,0.0
2,2,0,2016-01-01,0.0
3,3,0,2016-01-01,0.0
4,4,0,2016-01-01,0.0


In [20]:
# Preview the test frame that will receive the timezone adjustment.
test_tz_df.head()

Unnamed: 0,row_id,building_id,meter,timestamp
0,0,0,0,2017-01-01 00:00:00
1,1,1,0,2017-01-01 00:00:00
2,2,2,0,2017-01-01 00:00:00
3,3,3,0,2017-01-01 00:00:00
4,4,4,0,2017-01-01 00:00:00


# Merging Data

### Convert all timestamp columns to datetime 

In [21]:
# Parse the "timestamp" column of every frame that still carries one under
# that name (assigning through the loop variable updates the frame in place).
for frame in (train_df, test_df, train_tz_df, test_tz_df, weather_train_df, weather_test_df):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])

# The timezone-adjusted weather frames keep their UTC time under "timestamp_utc".
for frame in (weather_train_tz_df, weather_test_tz_df):
    frame['timestamp_utc'] = pd.to_datetime(frame['timestamp_utc'])
del frame

# primary_use is a repeated label, so store it as a categorical.
building_df['primary_use'] = building_df['primary_use'].astype('category')

### Building df merge

In [22]:
# For each meter frame: look up the building metadata by building_id, drop the
# duplicated key, and append the new columns positionally (the left merge
# keeps the row order of the meter frame).
building_cols = (
    train_df[['building_id']]
    .merge(building_df, on=['building_id'], how='left')
    .drop(columns=['building_id'])
)
train_df = pd.concat([train_df, building_cols], axis=1)

building_cols = (
    test_df[['building_id']]
    .merge(building_df, on=['building_id'], how='left')
    .drop(columns=['building_id'])
)
test_df = pd.concat([test_df, building_cols], axis=1)

building_cols = (
    train_tz_df[['building_id']]
    .merge(building_df, on=['building_id'], how='left')
    .drop(columns=['building_id'])
)
train_tz_df = pd.concat([train_tz_df, building_cols], axis=1)

building_cols = (
    test_tz_df[['building_id']]
    .merge(building_df, on=['building_id'], how='left')
    .drop(columns=['building_id'])
)
test_tz_df = pd.concat([test_tz_df, building_cols], axis=1)

# The metadata now lives on every meter frame; release the originals.
del building_cols, building_df

### Merge timezone

In [23]:
# Attach timezone / country / location to the timezone-adjusted meter frames
# by site_id, using the same lookup-then-concat pattern as the other merges.
tz_columns = (
    train_tz_df[['site_id']]
    .merge(time_zones_df, on=['site_id'], how='left')
    .drop(columns=['site_id'])
)
train_tz_df = pd.concat([train_tz_df, tz_columns], axis=1)

tz_columns = (
    test_tz_df[['site_id']]
    .merge(time_zones_df, on=['site_id'], how='left')
    .drop(columns=['site_id'])
)
test_tz_df = pd.concat([test_tz_df, tz_columns], axis=1)

del tz_columns

## Convert timezones

### Prepare daylight saving time column for adjustment

In [24]:
# Start with every row flagged as "not in daylight saving time".
train_tz_df['dst'] = test_tz_df['dst'] = 0

In [25]:
# Train data

# Flag the 2016 rows that fall inside their zone's daylight-saving window.
# The US/Canada zones share one window, the UK/Ireland zones another.
# NOTE(review): the UK/Ireland window ends at 02:00 while the US one ends at
# 01:00 - confirm both bounds use the same clock convention.
# NOTE(review): the time_zones_df preview lists Tempe, AZ under US/Mountain;
# confirm that site should receive a DST shift at all.
in_us_canada_window = (
    train_tz_df['timezone'].isin(['US/Eastern', 'US/Mountain', 'US/Pacific', 'US/Central', 'Canada/Eastern'])
    & (train_tz_df['timestamp'] >= '2016-03-13 02:00:00')
    & (train_tz_df['timestamp'] < '2016-11-06 01:00:00')
)
in_uk_ireland_window = (
    train_tz_df['timezone'].isin(['Europe/London', 'Europe/Dublin'])
    & (train_tz_df['timestamp'] >= '2016-03-27 01:00:00')
    & (train_tz_df['timestamp'] < '2016-10-30 02:00:00')
)

train_tz_df.loc[in_us_canada_window | in_uk_ireland_window, 'dst'] = 1

del in_us_canada_window, in_uk_ireland_window


In [26]:
# Test data

# 2017

# Flag the 2017 and 2018 rows that fall inside their zone's daylight-saving
# window. The US/Canada zones share one window per year, the UK/Ireland zones another.
# NOTE(review): the UK/Ireland windows end at 02:00 while the US ones end at
# 01:00 - confirm both bounds use the same clock convention.
test_timestamps = test_tz_df['timestamp']
is_us_canada = test_tz_df['timezone'].isin(
    ['US/Eastern', 'US/Mountain', 'US/Pacific', 'US/Central', 'Canada/Eastern']
)
is_uk_ireland = test_tz_df['timezone'].isin(['Europe/London', 'Europe/Dublin'])

in_us_canada_window = (
    # 2017
    ((test_timestamps >= '2017-03-12 02:00:00') & (test_timestamps < '2017-11-05 01:00:00'))
    # 2018
    | ((test_timestamps >= '2018-03-11 02:00:00') & (test_timestamps < '2018-11-04 01:00:00'))
)
in_uk_ireland_window = (
    # 2017
    ((test_timestamps >= '2017-03-26 01:00:00') & (test_timestamps < '2017-10-29 02:00:00'))
    # 2018
    | ((test_timestamps >= '2018-03-25 01:00:00') & (test_timestamps < '2018-10-28 02:00:00'))
)

test_tz_df.loc[
    (is_us_canada & in_us_canada_window) | (is_uk_ireland & in_uk_ireland_window),
    'dst',
] = 1

del test_timestamps, is_us_canada, is_uk_ireland, in_us_canada_window, in_uk_ireland_window



### Adjust for daylight saving time

In [27]:
from datetime import timedelta

# Move every DST-flagged timestamp forward by one hour.
# Caution: this is cumulative - re-running the cell shifts the same rows again.
one_hour = timedelta(hours=1)

train_dst_rows = train_tz_df['dst'] == 1
train_tz_df.loc[train_dst_rows, 'timestamp'] = train_tz_df.loc[train_dst_rows, 'timestamp'] + one_hour

test_dst_rows = test_tz_df['dst'] == 1
test_tz_df.loc[test_dst_rows, 'timestamp'] = test_tz_df.loc[test_dst_rows, 'timestamp'] + one_hour

del one_hour, train_dst_rows, test_dst_rows



### Adjust time zones

In [28]:
# Train data

# Attach each row's own time zone to its (still naive) timestamp, one zone at a time.
#
# On the autumn fall-back day one wall-clock hour occurs twice. Rows that were
# shifted forward for DST (dst == 1) belong to the first, daylight-time
# occurrence; unshifted rows (dst == 0) belong to the second, standard-time
# one. Passing the per-row flag keeps those two instants distinct, whereas a
# blanket ambiguous=True labelled both as daylight time.
# Rows without a timezone are skipped and keep their naive timestamp.
timezones = train_tz_df.timezone.dropna().unique()

for tz_name in timezones:
    in_zone = train_tz_df['timezone'] == tz_name
    shifted_for_dst = (train_tz_df.loc[in_zone, 'dst'] == 1).values
    train_tz_df.loc[in_zone, 'timestamp'] = (
        train_tz_df.loc[in_zone, 'timestamp'].dt.tz_localize(tz_name, ambiguous=shifted_for_dst)
    )
    del in_zone, shifted_for_dst


In [29]:
# Test data
#
# Attach each row's own time zone to its (still naive) timestamp, one zone at
# a time, covering every zone present in the frame rather than a fixed list.
#
# On the autumn fall-back day one wall-clock hour occurs twice. Rows that were
# shifted forward for DST (dst == 1) belong to the first, daylight-time
# occurrence; unshifted rows (dst == 0) belong to the second, standard-time
# one. Passing the per-row flag keeps those two instants distinct, whereas a
# blanket ambiguous=True labelled both as daylight time.
# Rows without a timezone are skipped and keep their naive timestamp.
for tz_name in test_tz_df['timezone'].dropna().unique():
    in_zone = test_tz_df['timezone'] == tz_name
    shifted_for_dst = (test_tz_df.loc[in_zone, 'dst'] == 1).values
    test_tz_df.loc[in_zone, 'timestamp'] = (
        test_tz_df.loc[in_zone, 'timestamp'].dt.tz_localize(tz_name, ambiguous=shifted_for_dst)
    )
    del in_zone, shifted_for_dst


## Data Minification

Save dataframes as pickle files 

In [30]:
#train_df.to_pickle('train_df.pkl')
#test_df.to_pickle('test_df.pkl')
#train_tz_df.to_pickle('train_tz_df.pkl')
#test_tz_df.to_pickle('test_tz_df.pkl')

#weather_train_df.to_pickle('weather_train_df.pkl')
#weather_test_df.to_pickle('weather_test_df.pkl')
#weather_train_tz_df.to_pickle('weather_train_tz_df.pkl')
#weather_test_tz_df.to_pickle('weather_test_tz_df.pkl')
   
#del train_df, test_df, train_tz_df, test_tz_df, weather_train_df, weather_test_df, weather_train_tz_df, weather_test_tz_df

In [31]:
#train_df = pd.read_pickle('train_df.pkl')
#test_df = pd.read_pickle('test_df.pkl')
#train_tz_df = pd.read_pickle('train_tz_df.pkl')
#test_tz_df = pd.read_pickle('test_tz_df.pkl')

#weather_train_df = pd.read_pickle('weather_train_df.pkl')
#weather_test_df = pd.read_pickle('weather_test_df.pkl')
#weather_train_tz_df = pd.read_pickle('weather_train_tz_df.pkl')
#weather_test_tz_df = pd.read_pickle('weather_test_tz_df.pkl')

# maybe stop script here, then merge in another script 

### Weather df merge 

In [32]:
# Join the hourly weather readings onto the meter rows by (site_id, timestamp).
# Only the key columns take part in the merge; the weather columns are then
# appended positionally, relying on the left merge keeping the row order.
weather_cols = (
    train_df[['site_id', 'timestamp']]
    .merge(weather_train_df, on=['site_id', 'timestamp'], how='left')
    .drop(columns=['site_id', 'timestamp'])
)
train_df = pd.concat([train_df, weather_cols], axis=1)

weather_cols = (
    test_df[['site_id', 'timestamp']]
    .merge(weather_test_df, on=['site_id', 'timestamp'], how='left')
    .drop(columns=['site_id', 'timestamp'])
)
test_df = pd.concat([test_df, weather_cols], axis=1)

del weather_cols, weather_train_df, weather_test_df

In [33]:
# Compare the column layouts of the two frames about to be merged, one per line.
print(train_tz_df.columns, weather_train_tz_df.columns, sep='\n')

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count', 'timezone',
       'country_code', 'location', 'dst'],
      dtype='object')
Index(['site_id', 'timestamp_utc', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'timezone', 'country_code', 'location',
       'timestamp'],
      dtype='object')


In [34]:
# Join the timezone-adjusted weather onto the timezone-adjusted meter rows.
# Every column the two frames share is used as a key, so none gets duplicated.
tz_merge_keys = ['site_id', 'timestamp', 'timezone', 'country_code', 'location']

temp_df = train_tz_df[tz_merge_keys].merge(weather_train_tz_df, on=tz_merge_keys, how='left')
# Drop the keys (already on train_tz_df) and the weather frame's UTC helper column.
temp_df = temp_df.drop(columns=tz_merge_keys + ['timestamp_utc'])

train_tz_df = pd.concat([train_tz_df, temp_df], axis=1)
del tz_merge_keys


In [35]:
# Preview the train frame after the weather columns were appended.
train_tz_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,timezone,country_code,location,dst,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01 00:00:00-05:00,0.0,0,Education,7432,2008.0,,US/Eastern,US,"Orlando, FL",0,19.40625,,19.40625,0.0,,0.0,0.0
1,1,0,2016-01-01 00:00:00-05:00,0.0,0,Education,2720,2004.0,,US/Eastern,US,"Orlando, FL",0,19.40625,,19.40625,0.0,,0.0,0.0
2,2,0,2016-01-01 00:00:00-05:00,0.0,0,Education,5376,1991.0,,US/Eastern,US,"Orlando, FL",0,19.40625,,19.40625,0.0,,0.0,0.0
3,3,0,2016-01-01 00:00:00-05:00,0.0,0,Education,23685,2002.0,,US/Eastern,US,"Orlando, FL",0,19.40625,,19.40625,0.0,,0.0,0.0
4,4,0,2016-01-01 00:00:00-05:00,0.0,0,Education,116607,1975.0,,US/Eastern,US,"Orlando, FL",0,19.40625,,19.40625,0.0,,0.0,0.0


In [36]:
# Join the timezone-adjusted weather onto the timezone-adjusted meter rows.
# Every column the two frames share is used as a key, so none gets duplicated.
tz_merge_keys = ['site_id', 'timestamp', 'timezone', 'country_code', 'location']

temp_df = test_tz_df[tz_merge_keys].merge(weather_test_tz_df, on=tz_merge_keys, how='left')
# Drop the keys (already on test_tz_df) and the weather frame's UTC helper column.
temp_df = temp_df.drop(columns=tz_merge_keys + ['timestamp_utc'])
test_tz_df = pd.concat([test_tz_df, temp_df], axis=1)

del tz_merge_keys
del temp_df, weather_train_tz_df, weather_test_tz_df

In [37]:
# Preview the test frame after the weather columns were appended.
test_tz_df.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,timezone,country_code,location,dst,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,0,2017-01-01 00:00:00-05:00,0,Education,7432,2008.0,,US/Eastern,US,"Orlando, FL",0,15.601562,2.0,12.796875,0.0,1022.0,130.0,2.099609
1,1,1,0,2017-01-01 00:00:00-05:00,0,Education,2720,2004.0,,US/Eastern,US,"Orlando, FL",0,15.601562,2.0,12.796875,0.0,1022.0,130.0,2.099609
2,2,2,0,2017-01-01 00:00:00-05:00,0,Education,5376,1991.0,,US/Eastern,US,"Orlando, FL",0,15.601562,2.0,12.796875,0.0,1022.0,130.0,2.099609
3,3,3,0,2017-01-01 00:00:00-05:00,0,Education,23685,2002.0,,US/Eastern,US,"Orlando, FL",0,15.601562,2.0,12.796875,0.0,1022.0,130.0,2.099609
4,4,4,0,2017-01-01 00:00:00-05:00,0,Education,116607,1975.0,,US/Eastern,US,"Orlando, FL",0,15.601562,2.0,12.796875,0.0,1022.0,130.0,2.099609


### Convert timestamps to datetime

In [38]:
# Re-run pd.to_datetime over each zone's slice of the timestamp column and
# write the result back in place, one zone at a time.
for zone_name in ('US/Central', 'US/Mountain', 'US/Pacific', 'Canada/Eastern',
                  'US/Eastern', 'Europe/London', 'Europe/Dublin'):
    zone_rows = train_tz_df['timezone'] == zone_name
    train_tz_df.loc[zone_rows, 'timestamp'] = pd.to_datetime(train_tz_df.loc[zone_rows, 'timestamp'])
del zone_name, zone_rows


In [39]:
# Re-run pd.to_datetime over each zone's slice of the timestamp column and
# write the result back in place, one zone at a time.
for zone_name in ('US/Central', 'US/Mountain', 'US/Pacific', 'Canada/Eastern',
                  'US/Eastern', 'Europe/London', 'Europe/Dublin'):
    zone_rows = test_tz_df['timezone'] == zone_name
    test_tz_df.loc[zone_rows, 'timestamp'] = pd.to_datetime(test_tz_df.loc[zone_rows, 'timestamp'])
del zone_name, zone_rows


# Data Minification

Save the final dataframes as pickle files.

In [46]:
# Write each merged frame to its own pickle file in the working directory.
for frame, pickle_path in (
    (train_df, 'train_df.pkl'),
    (test_df, 'test_df.pkl'),
    (train_tz_df, 'train_tz_df.pkl'),
    (test_tz_df, 'test_tz_df.pkl'),
):
    frame.to_pickle(pickle_path)

# Drop the loop variables too, otherwise `frame` would keep the last frame alive.
del frame, pickle_path
del train_df, test_df, train_tz_df, test_tz_df

# Using the files
To use these files, you must first read them in using the following code.

In [41]:
# Load the four saved frames back, in the same order they were written.
# Only unpickle files you produced yourself: loading a pickle can run arbitrary code.
train_df, test_df, train_tz_df, test_tz_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('train_df.pkl', 'test_df.pkl', 'train_tz_df.pkl', 'test_tz_df.pkl')
)

In [42]:
# Preview the reloaded train frame (timestamps not timezone-adjusted).
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.5,0.0,0.0


In [43]:
# Preview the reloaded timezone-adjusted train frame.
train_tz_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,timezone,country_code,location,dst,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01 00:00:00-05:00,0.0,0,Education,7432,2008.0,,US/Eastern,US,"Orlando, FL",0,19.40625,,19.40625,0.0,,0.0,0.0
1,1,0,2016-01-01 00:00:00-05:00,0.0,0,Education,2720,2004.0,,US/Eastern,US,"Orlando, FL",0,19.40625,,19.40625,0.0,,0.0,0.0
2,2,0,2016-01-01 00:00:00-05:00,0.0,0,Education,5376,1991.0,,US/Eastern,US,"Orlando, FL",0,19.40625,,19.40625,0.0,,0.0,0.0
3,3,0,2016-01-01 00:00:00-05:00,0.0,0,Education,23685,2002.0,,US/Eastern,US,"Orlando, FL",0,19.40625,,19.40625,0.0,,0.0,0.0
4,4,0,2016-01-01 00:00:00-05:00,0.0,0,Education,116607,1975.0,,US/Eastern,US,"Orlando, FL",0,19.40625,,19.40625,0.0,,0.0,0.0


In [44]:
# Preview the reloaded test frame (timestamps not timezone-adjusted).
test_df.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,0,2017-01-01,0,Education,7432,2008.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
1,1,1,0,2017-01-01,0,Education,2720,2004.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
2,2,2,0,2017-01-01,0,Education,5376,1991.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
3,3,3,0,2017-01-01,0,Education,23685,2002.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
4,4,4,0,2017-01-01,0,Education,116607,1975.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609


In [45]:
# Preview the reloaded timezone-adjusted test frame.
test_tz_df.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,timezone,country_code,location,dst,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,0,2017-01-01 00:00:00-05:00,0,Education,7432,2008.0,,US/Eastern,US,"Orlando, FL",0,15.601562,2.0,12.796875,0.0,1022.0,130.0,2.099609
1,1,1,0,2017-01-01 00:00:00-05:00,0,Education,2720,2004.0,,US/Eastern,US,"Orlando, FL",0,15.601562,2.0,12.796875,0.0,1022.0,130.0,2.099609
2,2,2,0,2017-01-01 00:00:00-05:00,0,Education,5376,1991.0,,US/Eastern,US,"Orlando, FL",0,15.601562,2.0,12.796875,0.0,1022.0,130.0,2.099609
3,3,3,0,2017-01-01 00:00:00-05:00,0,Education,23685,2002.0,,US/Eastern,US,"Orlando, FL",0,15.601562,2.0,12.796875,0.0,1022.0,130.0,2.099609
4,4,4,0,2017-01-01 00:00:00-05:00,0,Education,116607,1975.0,,US/Eastern,US,"Orlando, FL",0,15.601562,2.0,12.796875,0.0,1022.0,130.0,2.099609
