In [18]:
mode = '_mean' if False else '_all'
print(mode)

_all


In [2]:
import pandas as pd
import numpy as np
import gc
from sklearn.pipeline import Pipeline, TransformerMixin
from os import path

In [3]:
class ConvertToDatetime(TransformerMixin):
        
    def transform(self, df, **transform_params):
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [4]:
file_dtype = {
    'weather_test' : {'site_id': np.int8, 'air_temperature': np.float16, 'cloud_coverage': np.float16, 'dew_temperature': np.float16,
                     'precip_depth_1_hr': np.float16, 'sea_level_pressure': np.float16, 'wind_direction': np.float16, 'wind_speed': np.float16},
    'weather_train' : {'site_id': np.int8, 'air_temperature': np.float16, 'cloud_coverage': np.float16, 'dew_temperature': np.float16,
                     'precip_depth_1_hr': np.float16, 'sea_level_pressure': np.float16, 'wind_direction': np.float16, 'wind_speed': np.float16}
}

def loadFile(name):
    return ConvertToDatetime().transform(
            pd.read_csv('../input/ashrae-energy-prediction/' + name + '.csv', dtype=file_dtype[name]))
        
weather_train = loadFile('weather_train')
weather_test = loadFile('weather_test')


In [5]:
# align weather
def weatherSiteOffsets():
    weather = pd.concat([weather_train,weather_test],ignore_index=True)
    weather['timestamp'] = pd.to_datetime(weather['timestamp'])
    weather_key = ['site_id', 'timestamp']

    temp_skeleton = weather[weather_key + ['air_temperature']].drop_duplicates(subset=weather_key).sort_values(by=weather_key).copy()

    # calculate ranks of hourly temperatures within date/site_id chunks
    temp_skeleton['temp_rank'] = temp_skeleton.groupby(['site_id', temp_skeleton.timestamp.dt.date])['air_temperature'].rank('average')
    
    # create a dataframe of site_ids (0-16) x mean hour rank of temperature within day (0-23)
    df_2d = temp_skeleton.groupby(['site_id', temp_skeleton.timestamp.dt.hour])['temp_rank'].mean().unstack(level=1)

    # Subtract the columnID of temperature peak by 14, getting the timestamp alignment gap.
    site_ids_offsets = pd.Series(df_2d.values.argmax(axis=1) - 14)
    site_ids_offsets.index.name = 'site_id'
    return site_ids_offsets

site_time_offsets_df = weatherSiteOffsets()

def alignWeather(df):
    df['offset'] = df.site_id.map(site_time_offsets_df)
    df['timestamp_aligned'] = (df.timestamp - pd.to_timedelta(df.offset, unit='H'))
    df['timestamp'] = df['timestamp_aligned']
    del df['timestamp_aligned'], df['offset']
    gc.collect()
    return df

weather_train = alignWeather(weather_train)
weather_test = alignWeather(weather_test)

del site_time_offsets_df
gc.collect()

7

In [6]:
if path.exists('../input/ashrae-energy-prediction/weather_train_s_radiation.pickle'):
    weather_train['s_radiation'] =  pd.read_pickle('../input/ashrae-energy-prediction/weather_train_s_radiation.pickle')
    weather_test['s_radiation'] =  pd.read_pickle('../input/ashrae-energy-prediction/weather_test_s_radiation.pickle')
    weather_train['s_radiation'] = weather_test['s_radiation'].astype(np.float16)
    weather_test['s_radiation'] = weather_test['s_radiation'].astype(np.float16)
print(weather_train.head())

   site_id           timestamp  air_temperature  cloud_coverage  \
0        0 2015-12-31 19:00:00        25.000000             6.0   
1        0 2015-12-31 20:00:00        24.406250             NaN   
2        0 2015-12-31 21:00:00        22.796875             2.0   
3        0 2015-12-31 22:00:00        21.093750             2.0   
4        0 2015-12-31 23:00:00        20.000000             2.0   

   dew_temperature  precip_depth_1_hr  sea_level_pressure  wind_direction  \
0         20.00000                NaN              1019.5             0.0   
1         21.09375               -1.0              1020.0            70.0   
2         21.09375                0.0              1020.0             0.0   
3         20.59375                0.0              1020.0             0.0   
4         20.00000               -1.0              1020.0           250.0   

   wind_speed  s_radiation  
0    0.000000          0.0  
1    1.500000          0.0  
2    0.000000          0.0  
3    0.000000     

In [7]:
# TODO: do something with race precipitation is coded as a “-1” value
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/113103#latest-664978
# Test out below
def cleanPrecipDepth(df):
    df.loc[df['precip_depth_1_hr'] == -1, 'precip_depth_1_hr'] = 0.25
    return df

#weather_train = cleanPrecipDepth(weather_train)
#weather_test = cleanPrecipDepth(weather_test)

#print(weather_train['precip_depth_1_hr'].value_counts().sort_index())

In [8]:
class ImputeCloudCoverage(TransformerMixin):
        
    def transform(self, df, **transform_params):
        # set age of building to mediam of site_id
        # else if set ot overall median
        median = df['cloud_coverage'].median()
        # Set all year_built NaNs to site mean for year_built
        for i, i_median in df.groupby(['site_id'])['cloud_coverage'].median().items():
            # TODO add in +9 as a NAN
            if not np.isnan(i_median):
                df.loc[(df['cloud_coverage'].isnull()) & (df['site_id'] == i), 'cloud_coverage'] = i_median
            else:
                df.loc[(df['cloud_coverage'].isnull()) & (df['site_id'] == i), 'cloud_coverage'] = median
        df['cloud_coverage'] = np.uint8(df['cloud_coverage'])
        df['cloud_coverage'] = df['cloud_coverage']
        del median
        gc.collect()
        return df
        
    def fit(self, X, y=None, **fit_params):
        return self

In [9]:
# TODO: try both for direction
class ImputeWeather(TransformerMixin):

    def __init__(self, method:str='linear', gap_limit:int=None, limit_direction:str='both'):
        self._method = method
        self._gap_limit = gap_limit
        self._limit_direction = limit_direction
        
    def transform(self, weather_df, **transform_params):
        grouped_weather_df = weather_df.groupby('site_id').apply(lambda group: group.interpolate(method=self._method, limit=self._gap_limit, limit_direction=self._limit_direction))
        if 'cloud_coverage' in grouped_weather_df.columns:
            grouped_weather_df['cloud_coverage'] = grouped_weather_df['cloud_coverage'].round(decimals=0).clip(0,8)
        grouped_weather_df.reset_index(inplace=True)
        weather_df = grouped_weather_df.drop(['index'], axis=1)
        gc.collect()
        return weather_df

    def fit(self, X, y=None, **fit_params):
        return self
print(weather_train.head(20))
print(ImputeWeather().transform(weather_train.head(20)))

    site_id           timestamp  air_temperature  cloud_coverage  \
0         0 2015-12-31 19:00:00        25.000000             6.0   
1         0 2015-12-31 20:00:00        24.406250             NaN   
2         0 2015-12-31 21:00:00        22.796875             2.0   
3         0 2015-12-31 22:00:00        21.093750             2.0   
4         0 2015-12-31 23:00:00        20.000000             2.0   
5         0 2016-01-01 00:00:00        19.406250             NaN   
6         0 2016-01-01 01:00:00        21.093750             6.0   
7         0 2016-01-01 02:00:00        21.093750             NaN   
8         0 2016-01-01 03:00:00        20.593750             NaN   
9         0 2016-01-01 04:00:00        21.093750             NaN   
10        0 2016-01-01 05:00:00        21.093750             NaN   
11        0 2016-01-01 06:00:00        20.593750             NaN   
12        0 2016-01-01 07:00:00        18.906250             6.0   
13        0 2016-01-01 08:00:00        20.000000

In [10]:
class FillMean(TransformerMixin):

    def __init__(self, cols):
        self._cols = cols
        
    def transform(self, df, **transform_params):
        for col in self._cols:
            df[col] = df[col].fillna(df[col].mean())
        return df

    def fit(self, X, y=None, **fit_params):
        return self

In [11]:
# TODO: rename to rolling
class AddWeatherLags(TransformerMixin):
    
    def __init__(self, window, center=False):
        self._window = window
        self._center = center
        
    def transform(self, weather_df, **transform_params):
        group_df = weather_df.groupby(['site_id'])
        cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
        #if 's_radiation' in weather_df.columns:
        #    cols.append('s_radiation')
        rolled = group_df[cols].rolling(window=self._window, center=self._center, min_periods=0)
        lag_mean = rolled.mean().reset_index().astype(np.float16)
        lag_max = rolled.max().reset_index().astype(np.float16)
        lag_min = rolled.min().reset_index().astype(np.float16)
        lag_std = rolled.std().reset_index().astype(np.float16)
        c_chars = '_c' if self._center== True else ''
        for col in cols:
            weather_df[f'{col}_mean_lag{self._window}{c_chars}'] = lag_mean[col]
            if mode == '_all':
                weather_df[f'{col}_max_lag{self._window}{c_chars}'] = lag_max[col]
                weather_df[f'{col}_min_lag{self._window}{c_chars}'] = lag_min[col]
                weather_df[f'{col}_std_lag{self._window}{c_chars}'] = lag_std[col]
        del group_df, rolled
        gc.collect()
        return weather_df

    def fit(self, X, y=None, **fit_params):
        return self
    
print(AddWeatherLags(72, True).transform(weather_train.head(20)))

    site_id           timestamp  air_temperature  cloud_coverage  \
0         0 2015-12-31 19:00:00        25.000000             6.0   
1         0 2015-12-31 20:00:00        24.406250             NaN   
2         0 2015-12-31 21:00:00        22.796875             2.0   
3         0 2015-12-31 22:00:00        21.093750             2.0   
4         0 2015-12-31 23:00:00        20.000000             2.0   
5         0 2016-01-01 00:00:00        19.406250             NaN   
6         0 2016-01-01 01:00:00        21.093750             6.0   
7         0 2016-01-01 02:00:00        21.093750             NaN   
8         0 2016-01-01 03:00:00        20.593750             NaN   
9         0 2016-01-01 04:00:00        21.093750             NaN   
10        0 2016-01-01 05:00:00        21.093750             NaN   
11        0 2016-01-01 06:00:00        20.593750             NaN   
12        0 2016-01-01 07:00:00        18.906250             6.0   
13        0 2016-01-01 08:00:00        20.000000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
weather_pipes = Pipeline(
    steps=[
        #('convertToDatetime', ConvertToDatetime()),
        ('imputeWeather', ImputeWeather()),
        ('fillMean',FillMean(['air_temperature','dew_temperature'
                              , 'precip_depth_1_hr', 'sea_level_pressure'])),
        ('imputeCloudCoverage', ImputeCloudCoverage()),
        ('addWeatherLags3', AddWeatherLags(3)),
        #('addWeatherLags3C', AddWeatherLags(3, True)),
        ('addWeatherLags72', AddWeatherLags(72)),
    ]
)

In [13]:
print(weather_train)

        site_id           timestamp  air_temperature  cloud_coverage  \
0             0 2015-12-31 19:00:00        25.000000             6.0   
1             0 2015-12-31 20:00:00        24.406250             NaN   
2             0 2015-12-31 21:00:00        22.796875             2.0   
3             0 2015-12-31 22:00:00        21.093750             2.0   
4             0 2015-12-31 23:00:00        20.000000             2.0   
5             0 2016-01-01 00:00:00        19.406250             NaN   
6             0 2016-01-01 01:00:00        21.093750             6.0   
7             0 2016-01-01 02:00:00        21.093750             NaN   
8             0 2016-01-01 03:00:00        20.593750             NaN   
9             0 2016-01-01 04:00:00        21.093750             NaN   
10            0 2016-01-01 05:00:00        21.093750             NaN   
11            0 2016-01-01 06:00:00        20.593750             NaN   
12            0 2016-01-01 07:00:00        18.906250            

In [14]:
all = weather_train.append(weather_test, ignore_index=True)
all = weather_pipes.transform(all)


all.sample(20, random_state=42)

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,s_radiation,...,sea_level_pressure_mean_lag3,wind_direction_mean_lag3,wind_speed_mean_lag3,air_temperature_mean_lag72,cloud_coverage_mean_lag72,dew_temperature_mean_lag72,precip_depth_1_hr_mean_lag72,sea_level_pressure_mean_lag72,wind_direction_mean_lag72,wind_speed_mean_lag72
235004,5,2017-11-25 14:00:00,6.0,0,2.0,,,300.0,9.296875,573.5,...,1021.5,110.0,3.267578,23.34375,6.667969,20.203125,-0.013885,1021.0,149.0,3.119141
217966,4,2017-12-14 14:00:00,16.703125,2,6.101562,0.0,1019.0,300.0,4.101562,858.5,...,1015.0,183.375,2.232422,20.59375,4.722656,15.140625,0.0,1020.5,132.75,1.762695
356861,12,2018-01-26 10:00:00,3.400391,1,2.300781,,1022.0,260.0,4.0,567.0,...,1022.0,165.0,1.200195,-0.970703,6.929688,-3.476562,2.847656,1013.5,142.125,4.972656
344311,11,2018-08-09 02:00:00,19.796875,2,19.40625,4.726562,1006.5,260.0,1.5,0.0,...,1009.5,143.375,2.232422,23.90625,3.458984,18.21875,0.361084,1015.5,121.0625,2.029297
696,0,2016-01-29 19:00:00,15.0,6,1.700195,0.0,1018.5,260.0,3.599609,0.0,...,1018.0,263.25,4.265625,17.125,7.042969,14.179688,9.054688,1014.0,175.25,3.082031
388255,14,2017-09-05 23:00:00,20.0,1,18.90625,8.0,1010.0,0.0,0.0,0.0,...,1022.0,213.375,1.733398,22.34375,1.638672,17.03125,0.208374,1012.5,231.625,3.365234
263477,7,2017-03-10 19:00:00,-13.703125,2,-25.5,2.330078,1018.5,310.0,7.699219,0.0,...,1018.0,133.375,1.700195,4.378906,2.070312,1.487305,1.777344,1010.0,145.625,2.205078
390789,14,2017-12-20 13:00:00,7.199219,0,-7.199219,0.0,1012.0,305.0,3.099609,866.5,...,1031.0,103.3125,0.5,0.447266,0.597168,-7.160156,0.0,1022.0,216.75,3.007812
215557,4,2017-09-05 05:00:00,19.40625,4,17.796875,0.0,1013.0,10.0,3.099609,0.0,...,1020.0,23.328125,8.429688,24.109375,4.207031,17.890625,-0.027771,1017.5,201.5,4.789062
82484,9,2016-06-01 04:00:00,20.59375,0,20.0,0.0,1012.5,0.0,0.0,876.0,...,1007.0,276.75,3.601562,24.390625,4.765625,16.9375,0.166626,1005.0,230.75,4.328125


In [15]:
all.dtypes

site_id                                    int8
timestamp                        datetime64[ns]
air_temperature                         float16
cloud_coverage                            uint8
dew_temperature                         float16
precip_depth_1_hr                       float16
sea_level_pressure                      float16
wind_direction                          float16
wind_speed                              float16
s_radiation                             float16
air_temperature_mean_lag3               float16
cloud_coverage_mean_lag3                float16
dew_temperature_mean_lag3               float16
precip_depth_1_hr_mean_lag3             float16
sea_level_pressure_mean_lag3            float16
wind_direction_mean_lag3                float16
wind_speed_mean_lag3                    float16
air_temperature_mean_lag72              float16
cloud_coverage_mean_lag72               float16
dew_temperature_mean_lag72              float16
precip_depth_1_hr_mean_lag72            

In [16]:
all.to_pickle(f'../input/ashrae-energy-prediction-pickles/weather_processed{mode}.pickle')


In [17]:
all.dtypes


site_id                                    int8
timestamp                        datetime64[ns]
air_temperature                         float16
cloud_coverage                            uint8
dew_temperature                         float16
precip_depth_1_hr                       float16
sea_level_pressure                      float16
wind_direction                          float16
wind_speed                              float16
s_radiation                             float16
air_temperature_mean_lag3               float16
cloud_coverage_mean_lag3                float16
dew_temperature_mean_lag3               float16
precip_depth_1_hr_mean_lag3             float16
sea_level_pressure_mean_lag3            float16
wind_direction_mean_lag3                float16
wind_speed_mean_lag3                    float16
air_temperature_mean_lag72              float16
cloud_coverage_mean_lag72               float16
dew_temperature_mean_lag72              float16
precip_depth_1_hr_mean_lag72            