In [2]:
import pandas as pd
import numpy as np
import gc
from sklearn.pipeline import Pipeline, TransformerMixin
from os import path
import os

data_folder = '/Users/trudie/geli_code/ashrae-energy-prediction'

In [3]:
class ConvertToDatetime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [4]:
file_dtype = {
    'weather_test' : {'site_id': np.int8, 'air_temperature': np.float16, 'cloud_coverage': np.float16, 'dew_temperature': np.float16,
                     'precip_depth_1_hr': np.float16, 'sea_level_pressure': np.float16, 'wind_direction': np.float16, 'wind_speed': np.float16},
    'weather_train' : {'site_id': np.int8, 'air_temperature': np.float, 'cloud_coverage': np.float16, 'dew_temperature': np.float16,
                     'precip_depth_1_hr': np.float16, 'sea_level_pressure': np.float16, 'wind_direction': np.float16, 'wind_speed': np.float16}
}

def loadFile(name, filepath=None):
    if not filepath:
        filepath = os.getcwd()
    print(f'{filepath}/input/ashrae-energy-prediction/{name}.csv')
    return ConvertToDatetime().transform(
            pd.read_csv(f'{filepath}/input/{name}.csv', dtype=file_dtype[name]))
        
weather_train = loadFile('weather_train', filepath=data_folder)
weather_test = loadFile('weather_test', filepath=data_folder)


/Users/trudie/geli_code/ashrae-energy-prediction/input/ashrae-energy-prediction/weather_train.csv
/Users/trudie/geli_code/ashrae-energy-prediction/input/ashrae-energy-prediction/weather_test.csv


In [5]:

def clean_weather_data(weather_filenm:str, method:str='linear', gap_limit:int=None, limit_direction:str='forward', save_filenm=None):
    """
    Assumes weather_filenm is of the format ASHRAE provided
    
    :param weather_filenm: 
    :param method : {‘linear’, ‘time’, ‘index’, ‘values’, ‘nearest’, ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘barycentric’, ‘krogh’, ‘polynomial’, ‘spline’, ‘piecewise_polynomial’, ‘from_derivatives’, ‘pchip’, ‘akima’} 
    :param gap_limit: Maximum number of consecutive hours to fill. Must be greater than 0.
    :param limit_direction: forward/backward/both
    :return: 
    """
    df_weather_dtypes = {'site_id': np.int8, 'air_temperature': np.float32, 'cloud_coverage': np.float32, 'dew_temperature': np.float32,
                     'precip_depth_1_hr': np.float32, 'sea_level_pressure': np.float32, 'wind_direction': np.float32, 'wind_speed': np.float32}

    weather_df = pd.read_csv(weather_filenm, dtype=df_weather_dtypes, parse_dates=['timestamp'])
    grouped_weather_df = weather_df.groupby('site_id').apply(lambda group: group.interpolate(method=method, limit=gap_limit, limit_direction=limit_direction))
    
    if 'cloud_coverage' in grouped_weather_df.columns:
        grouped_weather_df['cloud_coverage'] = grouped_weather_df['cloud_coverage'].round(decimals=0).clip(0,8)
        
    grouped_weather_df.reset_index(inplace=True)
    if save_filenm!=None:
        grouped_weather_df.to_csv(save_filenm)

    return grouped_weather_df


weather_train_filenm = f'{data_folder}/input/weather_train.csv'

interp_weather_train_filenm = f'{data_folder}/fully_interpolated_weather_train.csv'
grouped_weather_train = clean_weather_data(weather_train_filenm, method='linear', gap_limit=None, save_filenm=interp_weather_train_filenm)

partially_interp_weather_train_filenm = f'{data_folder}/partially_interpolated_weather_train.csv'
grouped_weather_train_with_gap_limit = clean_weather_data(weather_train_filenm, method='linear', gap_limit=3, save_filenm=partially_interp_weather_train_filenm)

print(grouped_weather_train.head(10))
print(grouped_weather_train_with_gap_limit.head(10))

   index  site_id           timestamp  air_temperature  cloud_coverage  \
0      0        0 2016-01-01 00:00:00        25.000000             6.0   
1      1        0 2016-01-01 01:00:00        24.400000             4.0   
2      2        0 2016-01-01 02:00:00        22.799999             2.0   
3      3        0 2016-01-01 03:00:00        21.100000             2.0   
4      4        0 2016-01-01 04:00:00        20.000000             2.0   
5      5        0 2016-01-01 05:00:00        19.400000             4.0   
6      6        0 2016-01-01 06:00:00        21.100000             6.0   
7      7        0 2016-01-01 07:00:00        21.100000             6.0   
8      8        0 2016-01-01 08:00:00        20.600000             6.0   
9      9        0 2016-01-01 09:00:00        21.100000             6.0   

   dew_temperature  precip_depth_1_hr  sea_level_pressure  wind_direction  \
0             20.0                NaN         1019.700012             0.0   
1             21.1             

In [6]:
# align weather
def weatherSiteOffsets():
    weather = pd.concat([weather_train,weather_test],ignore_index=True)
    weather['timestamp'] = pd.to_datetime(weather['timestamp'])
    weather_key = ['site_id', 'timestamp']

    temp_skeleton = weather[weather_key + ['air_temperature']].drop_duplicates(subset=weather_key).sort_values(by=weather_key).copy()

    # calculate ranks of hourly temperatures within date/site_id chunks
    temp_skeleton['temp_rank'] = temp_skeleton.groupby(['site_id', temp_skeleton.timestamp.dt.date])['air_temperature'].rank('average')
    
    # create a dataframe of site_ids (0-16) x mean hour rank of temperature within day (0-23)
    df_2d = temp_skeleton.groupby(['site_id', temp_skeleton.timestamp.dt.hour])['temp_rank'].mean().unstack(level=1)

    # Subtract the columnID of temperature peak by 14, getting the timestamp alignment gap.
    site_ids_offsets = pd.Series(df_2d.values.argmax(axis=1) - 14)
    site_ids_offsets.index.name = 'site_id'
    return site_ids_offsets

site_time_offsets_df = weatherSiteOffsets()

def alignWeather(df):
    df['offset'] = df.site_id.map(site_time_offsets_df)
    df['timestamp_aligned'] = (df.timestamp - pd.to_timedelta(df.offset, unit='H'))
    df['timestamp'] = df['timestamp_aligned']
    del df['timestamp_aligned'], df['offset']
    gc.collect()
    return df

#weather_train = alignWeather(weather_train)
#weather_test = alignWeather(weather_test)



In [7]:
from darksky_weather_connector import *

# 2016
# 0: {'name': 'Orlando', 'lat':28.538336, 'lon':-81.379234},
# 1: {'name': 'London', 'lat':51.507351, 'lon':-0.127758},
SITE_DATA = {'site_id': 0,
             'start_datetime':datetime.strptime('2016-01-01', '%Y-%m-%d'),
             'end_datetime':datetime.strptime('2017-01-01', '%Y-%m-%d'),
             'latitude':28.538336,
             'longitude':-81.379234,
             'pv':True,
             }
API_KEY = '4b6c6722e6c612fd5f789cee71aa7135'
# wd = fetch_historic_weather_data(SITE_DATA, api_key=API_key, file_path=data_folder)



In [26]:
import similaritymeasures as sm

siteid = 0
ds_data = pd.read_csv(f'{data_folder}/input/site{siteid}_darksky_weather.csv')
training_data = pd.read_csv(f'{data_folder}/input/fully_interpolated_weather_train.csv')

# {'site_id': np.int8, 'air_temperature': np.float, 'cloud_coverage': np.float16, 'dew_temperature': np.float16,
#                  'precip_depth_1_hr': np.float16, 'sea_level_pressure': np.float16, 'wind_direction': np.float16, 'wind_speed': np.float16}
ds_data = ds_data.rename(columns={'time':'timestamp','temperature':'air_temperature', 'cloudCover':'cloud_coverage', 'windSpeed':'wind_speed'})
print(ds_data.head())
# 
# # quantify the difference between the two curves using
# # Dynamic Time Warping distance
# dtw, d = sm.dtw(ds_data['air_temperature'], training_data)
# 
# # print the results
# print(dtw)
# 
# # plot the data
# plt.figure()
# plt.plot(exp_data[:, 0], exp_data[:, 1])
# plt.plot(num_data[:, 0], num_data[:, 1])
# plt.show()

                   timestamp  dni  ghi  dhi  apparentTemperature  \
0  2016-01-01 00:00:00-05:00  0.0  0.0  0.0                71.51   
1  2016-01-01 01:00:00-05:00  0.0  0.0  0.0                71.80   
2  2016-01-01 02:00:00-05:00  0.0  0.0  0.0                71.71   
3  2016-01-01 03:00:00-05:00  0.0  0.0  0.0                71.24   
4  2016-01-01 04:00:00-05:00  0.0  0.0  0.0                71.71   

   air_temperature  precipIntensity  humidity  dewPoint  pressure  \
0            70.28              0.0      0.96     69.06    1020.0   
1            70.53              0.0      0.96     69.38    1019.4   
2            70.42              0.0      0.97     69.45    1018.8   
3            70.01              0.0      0.96     68.96    1018.2   
4            70.45              0.0      0.96     69.29    1019.0   

   cloud_coverage  wind_speed  windGust  windBearing  visibility  
0            0.23        0.07      0.08        180.0       2.852  
1            0.75        0.07      0.07   

In [27]:
temperature_df = training_data[['timestamp','air_temperature']].merge(ds_data,on='timestamp')
# print(training_data[training_data['site_id']==siteid])
print(temperature_df.head())
print(training_data.head())

Empty DataFrame
Columns: [timestamp, air_temperature_x, dni, ghi, dhi, apparentTemperature, air_temperature_y, precipIntensity, humidity, dewPoint, pressure, cloud_coverage, wind_speed, windGust, windBearing, visibility]
Index: []
   Unnamed: 0  index  site_id            timestamp  air_temperature  \
0           0      0        0  2016-01-01 00:00:00             25.0   
1           1      1        0  2016-01-01 01:00:00             24.4   
2           2      2        0  2016-01-01 02:00:00             22.8   
3           3      3        0  2016-01-01 03:00:00             21.1   
4           4      4        0  2016-01-01 04:00:00             20.0   

   cloud_coverage  dew_temperature  precip_depth_1_hr  sea_level_pressure  \
0             6.0             20.0                NaN              1019.7   
1             4.0             21.1               -1.0              1020.2   
2             2.0             21.1                0.0              1020.2   
3             2.0             20.6