In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
import gc
from os import path
from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.dtypes import CategoricalDtype
from tqdm import tqdm
from datetime import date 
import holidays
import lightgbm as lgb


# Notebook-wide setup.
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/numpy deprecation and chained-assignment
# warnings raised by the cells below.
warnings.simplefilter('ignore')
# Apply seaborn's default plot styling.
sns.set()
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

# Shared label encoder instance.
# NOTE(review): `le` is not referenced by any of the cells visible here.
le = LabelEncoder()

In [2]:
class ConvertToDatetime(TransformerMixin):
    """Transformer that parses a ``timestamp`` column into pandas datetimes."""

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, df, **transform_params):
        """Convert ``df['timestamp']`` with ``pd.to_datetime`` when the column exists.

        The conversion is written back onto ``df`` itself, and that same
        object is returned. Frames without a ``timestamp`` column are
        returned untouched.
        """
        if 'timestamp' not in df.columns:
            return df
        parsed = pd.to_datetime(df['timestamp'])
        df['timestamp'] = parsed
        return df

In [3]:
# Column dtypes handed to pd.read_csv, keyed by CSV base name (no extension).
file_dtype = {
    'train': {
        'building_id': np.int16,
        'meter': np.int8,
        'meter_reading': np.float32,
    },
    'test': {
        'building_id': np.int16,
        'meter': np.int8,
    },
    # NOTE(review): building_id is unsigned here but signed in train/test.
    'building_metadata': {
        'site_id': np.int8,
        'building_id': np.uint16,
        'square_feet': np.float32,
        'year_built': np.float16,
        'floor_count': np.float16,
    },
}

def loadFile(name):
    """Read ``<name>.csv`` from the first input directory that contains it.

    The candidate directories are tried in order. The CSV is read with the
    column dtypes registered for ``name`` in ``file_dtype`` (pandas infers the
    dtypes when ``name`` has no entry) and the result is passed through
    ``ConvertToDatetime`` so that a ``timestamp`` column, if present, is parsed.

    Parameters
    ----------
    name : str
        File name without the ``.csv`` extension, e.g. ``'train'``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame.

    Raises
    ------
    FileNotFoundError
        If none of the candidate directories contains ``<name>.csv``.
    """
    search_dirs = ['../input/ashrae-energy-prediction/', '../input/_ashrae-energy-prediction/']
    for dir_path in search_dirs:
        csv_path = dir_path + name + '.csv'
        if path.exists(csv_path):
            # .get() yields None for unregistered names, which lets pandas
            # infer dtypes instead of failing with a KeyError.
            return ConvertToDatetime().transform(
                pd.read_csv(csv_path, dtype=file_dtype.get(name)))
    # Falling off the end used to return None silently, which only surfaced
    # later as an unrelated AttributeError on the "frame".
    raise FileNotFoundError(
        "'{}.csv' not found in any of: {}".format(name, ', '.join(search_dirs)))
        


In [4]:
# Load train.csv through loadFile, which applies the 'train' dtypes from
# file_dtype and runs ConvertToDatetime on the result.
train = loadFile('train')

In [5]:
class MeterReadingLog1p(TransformerMixin):
    """Transformer that swaps ``meter_reading`` for its ``log1p`` value."""

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, df, **transform_params):
        """Return a frame with ``meter_reading_log1p`` in place of ``meter_reading``.

        Frames without a ``meter_reading`` column are returned as-is.
        Otherwise the log column is first written onto ``df`` itself, so the
        caller's frame gains ``meter_reading_log1p`` as a side effect. The
        returned object is the new frame produced by dropping
        ``meter_reading``.
        """
        if 'meter_reading' not in df.columns:
            return df
        df['meter_reading_log1p'] = np.log1p(df['meter_reading'])
        return df.drop(columns='meter_reading')
# Sanity check: print 20 sampled rows (fixed seed, so both calls draw the same
# rows) before and after MeterReadingLog1p is applied.
print(train.sample(20, random_state=42))
print(MeterReadingLog1p().transform(train.sample(20, random_state=42)))
# Last expression of the cell: the notebook displays gc.collect()'s return value.
gc.collect()

          building_id  meter           timestamp  meter_reading
14245562         1324      1 2016-09-16 16:00:00       0.000000
1282718          1013      0 2016-01-24 06:00:00      32.000099
13883790          229      1 2016-09-10 07:00:00     567.655029
4781820           217      3 2016-04-01 01:00:00       0.000000
10415393         1434      0 2016-07-10 04:00:00      65.750000
1057008          1047      0 2016-01-20 04:00:00      90.983299
4507399           911      1 2016-03-26 20:00:00     295.063995
19478829         1039      0 2016-12-18 23:00:00      16.900000
8955615           265      0 2016-06-14 06:00:00     128.369995
13799839          896      0 2016-09-08 19:00:00     300.000000
15647011          973      0 2016-10-11 11:00:00     247.000000
2524294           813      0 2016-02-16 08:00:00      10.958300
10016102          870      0 2016-07-03 02:00:00       4.166700
3915750           898      0 2016-03-15 03:00:00      40.000000
17217526          903      0 2016-11-08 

35

In [6]:
class AddTimeFeatures(TransformerMixin):
    """Transformer that derives calendar features from ``timestamp``."""

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, df_a, **transform_params):
        """Add ``dayofweek`` and ``hour`` (both ``uint8``) to ``df_a``.

        The columns are written onto ``df_a`` itself and the same object is
        returned. ``timestamp`` must already hold datetimes, because the
        values are read through the ``.dt`` accessor.
        """
        # TODO: try week of year as numerical
        # Disabled experiments: categorical weekday, a combined
        # dayofweek * 24 + hour category, categorical week number.
        stamps = df_a['timestamp']
        for part in ('dayofweek', 'hour'):  # dayofweek: vs weekend?
            df_a[part] = getattr(stamps.dt, part).astype('uint8')
        return df_a

In [7]:
class CreateMeterHDDescDF(TransformerMixin):
    """Publish per-building/meter/weekday/hour statistics of the log reading.

    The statistics are not attached to the frame being transformed. They are
    stored in the module-level global ``_m_dow_desc_DF``.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, df, **transform_params):
        """Rebuild ``_m_dow_desc_DF`` from ``df`` and return ``df`` unchanged.

        Nothing happens (and the global is left as it was) when ``df`` has no
        ``meter_reading_log1p`` column.
        """
        global _m_dow_desc_DF
        if 'meter_reading_log1p' not in df.columns:
            return df

        keys = ['building_id', 'meter', 'dayofweek', 'hour']
        target = 'meter_reading_log1p'
        stats = df[keys + [target]].groupby(keys)[target].describe().reset_index()

        # Prefix every statistic column; the keys keep their names and
        # 'count' is discarded altogether.
        renames = {
            name: 'meter_h_d_' + name
            for name in stats.columns
            if name not in keys and name != 'count'
        }
        _m_dow_desc_DF = stats.drop(columns='count').rename(columns=renames)
        gc.collect()
        return df

#if 'meter_mean' not in train.columns:
#    print(building_meter_desc_DF)
#    train = train.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    #test = test.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    del building_meter_desc_DF
# Run the feature steps on the full training frame, innermost first:
# log1p of the reading -> dayofweek/hour columns -> statistics table.
# Only the side effect matters here: CreateMeterHDDescDF stores its result in
# the global _m_dow_desc_DF, and the returned frame is discarded.
CreateMeterHDDescDF().transform(
    AddTimeFeatures().transform(
        MeterReadingLog1p().transform(
            train#.sample(2000, random_state=0)
        )
    )
)  
# Print the statistics table published by the call above.
print(_m_dow_desc_DF)

        building_id  meter  dayofweek  hour  meter_h_d_mean  meter_h_d_std  \
0                 0      0          0     0        3.347606       2.679136   
1                 0      0          0     1        3.348162       2.679857   
2                 0      0          0     2        3.348718       2.680495   
3                 0      0          0     3        3.352176       2.682882   
4                 0      0          0     4        3.355895       2.686116   
5                 0      0          0     5        3.354454       2.684861   
6                 0      0          0     6        3.357607       2.687330   
7                 0      0          0     7        3.365461       2.693466   
8                 0      0          0     8        3.363215       2.691854   
9                 0      0          0     9        3.342064       2.675741   
10                0      0          0    10        3.342488       2.676990   
11                0      0          0    11        3.348377     

[399840 rows x 11 columns]


In [7]:
class CreateMeterHDDescDF(TransformerMixin):
    """Compute summary statistics of ``meter_reading_log1p`` per group.

    Groups are ``(building_id, meter, dayofweek, hour)``. The resulting table
    is stored in the module-level global ``_m_dow_desc_DF``; the transformed
    frame itself is passed through unchanged.
    """

    def transform(self, df, **transform_params):
        """Rebuild ``_m_dow_desc_DF`` from ``df`` and return ``df`` as given.

        When ``df`` has no ``meter_reading_log1p`` column the global is left
        untouched.
        """
        global _m_dow_desc_DF
        if 'meter_reading_log1p' in df.columns:
            # Each label needs its own comma: adjacent string literals are
            # concatenated by Python, so 'hour''meter_reading_log1p' became
            # the non-existent column 'hourmeter_reading_log1p' (KeyError).
            group = df[['building_id','meter','dayofweek','hour', 'meter_reading_log1p']].groupby(['building_id','meter','dayofweek','hour'])['meter_reading_log1p']
            desc_DF = group.describe()
            desc_DF = desc_DF.reset_index()
            # Prefix the statistic columns; group keys keep their names and
            # 'count' is skipped because it is dropped right below.
            col_dict = {
                col: 'meter_h_d_' + col
                for col in desc_DF.columns
                if col not in ['building_id', 'meter', 'count','hour','dayofweek']
            }
            _m_dow_desc_DF = desc_DF.rename(columns=col_dict).drop('count', axis=1)
            gc.collect()
        return df

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns the transformer itself."""
        return self

#if 'meter_mean' not in train.columns:
#    print(building_meter_desc_DF)
#    train = train.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    #test = test.merge(building_meter_desc_DF, on=['building_id','meter'], how='left')
#    del building_meter_desc_DF
# NOTE(review): this cell repeats the earlier `In [7]` cell; the class defined
# just above shadows the earlier definition of the same name.
# Run the feature steps on the full training frame, innermost first:
# log1p of the reading -> dayofweek/hour columns -> statistics table.
# The result is read from the global _m_dow_desc_DF, not from the return value.
CreateMeterHDDescDF().transform(
    AddTimeFeatures().transform(
        MeterReadingLog1p().transform(
            train#.sample(2000, random_state=0)
        )
    )
)  
# Print the statistics table published by the call above.
print(_m_dow_desc_DF)

        building_id  meter  dayofweek  hour  meter_h_d_mean  meter_h_d_std  \
0                 0      0          0     0        3.347606       2.679136   
1                 0      0          0     1        3.348162       2.679857   
2                 0      0          0     2        3.348718       2.680495   
3                 0      0          0     3        3.352176       2.682882   
4                 0      0          0     4        3.355895       2.686116   
5                 0      0          0     5        3.354454       2.684861   
6                 0      0          0     6        3.357607       2.687330   
7                 0      0          0     7        3.365461       2.693466   
8                 0      0          0     8        3.363215       2.691854   
9                 0      0          0     9        3.342064       2.675741   
10                0      0          0    10        3.342488       2.676990   
11                0      0          0    11        3.348377     

[399840 rows x 11 columns]


In [10]:
#_m_dow_desc_DF = reduce_mem_usage(_m_dow_desc_DF)
# Down-cast every statistic column (everything except the four group keys)
# to float32, printing each column name as it is converted.
cols =  [x for x in _m_dow_desc_DF.columns if x not in ['building_id', 'meter', 'dayofweek','hour']]
for col in cols:
    print(col)
    _m_dow_desc_DF[col] = _m_dow_desc_DF[col].astype(np.float32)

meter_h_d_mean
meter_h_d_std
meter_h_d_min
meter_h_d_25%
meter_h_d_50%
meter_h_d_75%
meter_h_d_max


In [11]:
# Persist the statistics table next to the notebook.
# NOTE(review): no `index` argument is passed, so with the pandas default the
# row index is written as an extra leading column — confirm readers expect it.
_m_dow_desc_DF.to_csv('desc_h_dow.csv')