In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Any results you write to the current directory are saved as output.

In [None]:
# Read the three training-side CSV files from the competition input directory.
# building_metadata is later joined to the readings on building_id, and
# weather_train on (site_id, timestamp).
building_metadata = pd.read_csv("../input/ashrae-energy-prediction/building_metadata.csv")
train = pd.read_csv("../input/ashrae-energy-prediction/train.csv")
weather_train = pd.read_csv("../input/ashrae-energy-prediction/weather_train.csv")

In [None]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of ``df`` to the smallest dtype that can hold their values.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Per-column rules:
      * datetime and categorical columns are left untouched;
      * signed integers are narrowed to the smallest of int8/16/32/64 whose
        range (bounds included) contains the column's min and max;
      * unsigned integers are narrowed the same way within uint8/16/32/64, so
        they stay integers instead of being turned into floats;
      * floats become float16 (only if ``use_float16``), else float32, else
        float64, based on the value range only -- precision is NOT checked, so
        float16/float32 may round the stored values;
      * object/string columns become ``category``;
      * any other dtype (e.g. bool) is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (mutated in place).
    use_float16 : bool, default False
        Allow float columns to be stored as float16 when their range fits.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype
        if pd.api.types.is_datetime64_any_dtype(col_type) or isinstance(col_type, pd.CategoricalDtype):
            continue

        # Only plain numpy dtypes are downcast numerically; checking the dtype
        # kind (rather than the dtype name prefix) also recognises uint columns.
        is_numpy = isinstance(col_type, np.dtype)

        if is_numpy and col_type.kind in "iu":
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = unsigned_types if col_type.kind == "u" else signed_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Inclusive comparison: a value equal to the type's limit still fits.
                # For an empty column min/max are NaN, nothing matches, dtype is kept.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif is_numpy and col_type.kind == "f":
            c_min = df[col].min()
            c_max = df[col].max()
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if use_float16 and f16.min <= c_min and c_max <= f16.max:
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN columns, where every comparison is False.
                df[col] = df[col].astype(np.float64)
        elif col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    if start_mem > 0:
        print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
def intpo(df):
    """
    Fill gaps in the continuous weather columns by per-site linear interpolation.

    For each ``site_id`` the columns air_temperature, dew_temperature,
    precip_depth_1_hr, sea_level_pressure and wind_speed are interpolated
    linearly in row order (``limit_direction='both'``, so leading and trailing
    gaps are filled as well). Values that are still missing afterwards -- e.g. a
    site with no observation at all for a column -- are filled with the median
    of that column over the whole interpolated frame.

    The result is computed with ``groupby(...).transform`` so every value stays
    on its original row label; rows do not have to be pre-sorted by site_id and
    the index does not have to be a RangeIndex.

    ``df`` is modified in place and returned.
    """
    feat = ["air_temperature", "dew_temperature", "precip_depth_1_hr", "sea_level_pressure", "wind_speed"]

    # transform() returns a frame aligned with df's index, one site at a time.
    interpolated = df.groupby("site_id")[feat].transform(
        lambda values: values.interpolate(limit_direction='both', method='linear')
    )

    # Fallback for columns that are entirely missing within a site.
    df[feat] = interpolated.fillna(interpolated.median())
    return df

In [None]:
def med_fill(df):
    """
    Fill missing cloud_coverage and wind_direction values with medians.

    Each missing value is replaced by the median of the same column within the
    row's ``site_id``. If a site has no observation at all for a column, the
    median of that column over the whole (site-filled) frame is used instead.

    The per-site medians are broadcast with ``groupby(...).transform`` so every
    value stays on its original row label; rows do not have to be pre-sorted by
    site_id and the index does not have to be a RangeIndex.

    ``df`` is modified in place and returned.
    """
    feat = ["cloud_coverage", "wind_direction"]

    # Per-row median of the row's own site, aligned with df's index.
    site_median = df.groupby("site_id")[feat].transform("median")
    filled = df[feat].fillna(site_median)

    # Fallback for sites whose column is entirely missing.
    df[feat] = filled.fillna(filled.median())
    return df

In [None]:
def time_edit(df):
    """
    Parse ``df['timestamp']`` and derive calendar features from it.

    Columns written (``df`` is modified in place and returned):
      * timestamp  -- converted with ``pd.to_datetime``;
      * hour       -- hour of day, uint8;
      * weekday    -- ``dt.dayofweek`` (Monday == 0), uint8;
      * month      -- calendar month 1-12;
      * is_holiday -- 1 if the calendar date is in the hard-coded holiday list
                      below, else 0 (int64);
      * group      -- month folded onto 1-6 (1&7 -> 1, 2&8 -> 2, ... 6&12 -> 6).
    """
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["hour"] = df["timestamp"].dt.hour.astype(np.uint8)
    df["weekday"] = df["timestamp"].dt.dayofweek.astype(np.uint8)
    df["month"] = df["timestamp"].dt.month

    holidays = {"2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04", "2016-09-05", "2016-10-10", "2016-11-11",
                "2016-11-24", "2016-11-25", "2016-12-25", "2017-01-02", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-11-24", "2017-12-25", "2018-01-01", "2018-01-15",
                "2018-02-19", "2018-05-28", "2018-07-04", "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-11-23",
                "2018-12-25"}
    holiday_dates = pd.to_datetime(sorted(holidays))

    # Compare calendar dates, not full timestamps: matching the raw timestamp
    # against date strings can flag at most the 00:00 row of each holiday.
    # Built as a single column assignment instead of chained indexing, which
    # may write to a temporary copy.
    df["is_holiday"] = df["timestamp"].dt.normalize().isin(holiday_dates).astype(np.int64)

    # (month - 1) % 6 + 1 maps 1..6 to themselves and 7..12 onto 1..6.
    df["group"] = (df["timestamp"].dt.month - 1) % 6 + 1

    return df

In [None]:
# Fill the gaps in the training weather table: interpolation for the continuous
# columns, per-site medians for cloud_coverage and wind_direction.
weather_train = intpo(weather_train)
weather_train = med_fill(weather_train)
# Replace the -1 marker in precip_depth_1_hr with 0. A single .loc assignment is
# used because chained indexing (df[col][mask] = ...) may write to a temporary
# copy and leave the frame unchanged.
# NOTE(review): this runs after interpolation, so a -1 next to a gap has already
# influenced the interpolated values -- confirm whether it should run first.
weather_train.loc[weather_train['precip_depth_1_hr'] == -1, 'precip_depth_1_hr'] = 0

In [None]:
#https://www.kaggle.com/isaienkov/lightgbm-fe-1-19
def degToCompass(num):
    """Return the index (0-15) of the 22.5-degree compass sector containing ``num``.

    The angle is divided by the sector width, shifted by half a sector and
    truncated with ``int``; the result wraps modulo 16, so 360 maps to 0.
    """
    sector_width = 22.5
    sector = int(num / sector_width + .5)
    return sector % 16

# Convert wind direction from degrees to a compass-sector index and store it as uint8.
weather_train['wind_direction'] = (
    weather_train['wind_direction']
    .apply(degToCompass)
    .astype(np.uint8)
)

from sklearn import preprocessing
# Replace the primary_use labels with integer codes. The fitted encoder is kept
# in `le`.
le = preprocessing.LabelEncoder()
building_metadata["primary_use"] = le.fit_transform(building_metadata["primary_use"])

In [None]:
# Work on a log scale. square_feet is overwritten in place, so re-running this
# cell would apply the log a second time.
building_metadata["square_feet"] = np.log(building_metadata["square_feet"])
# Training target: log(1 + meter_reading). Predictions are mapped back with np.expm1.
train["meter_reading_log"] = np.log1p(train["meter_reading"])

In [None]:
# Left-join the building attributes onto every reading, then the weather rows
# that match on site_id and timestamp (readings without a match keep NaNs).
train = train.merge(building_metadata, on = 'building_id', how = 'left')
train = train.merge(weather_train, on = ['site_id', 'timestamp'], how = 'left')
# Parse the timestamp and add hour, weekday, month, is_holiday and group.
train = time_edit(train)

# Release the weather table now that it has been merged into train.
del weather_train

In [None]:
#https://www.kaggle.com/ragnar123/another-1-08-lb-no-leak
def make_is_bad_zero(Xy_subset, min_interval=48, summer_start=3000, summer_end=7500):
    """Helper routine for 'find_bad_zeros'.

    This operates upon a single dataframe produced by 'groupby'. We expect an
    additional column 'meter_id' which is a duplicate of 'meter' because groupby
    eliminates the original one.

    Parameters:
        Xy_subset: rows of one (building_id, meter) group; the columns
            'meter_id', 'meter_reading' and 'timestamp' are read.
        min_interval: minimum number of rows in a run of consecutive zero
            readings for that run to be flagged (meters 1, 2 and 3).
        summer_start, summer_end: bounds compared against 'timestamp'; for
            meters 2 and 3 only zero-runs with at least one row before
            summer_start or after summer_end can be flagged. The caller in this
            notebook passes timestamps expressed as hours since 2016-01-01.

    Returns:
        A boolean Series indexed like Xy_subset; True marks a zero reading that
        should be dropped.

    Raises:
        Exception: if the meter value is not 0, 1, 2 or 3.
    """
    meter = Xy_subset.meter_id.iloc[0]
    is_zero = Xy_subset.meter_reading == 0
    if meter == 0:
        # Electrical meters should never be zero. Keep all zero-readings in this table so that
        # they will all be dropped in the train set.
        return is_zero

    # Give every run of consecutive equal is_zero values its own id: a transition
    # is any row whose is_zero differs from the previous row (the first row
    # always counts), and the running sum of transitions numbers the runs.
    transitions = (is_zero != is_zero.shift(1))
    all_sequence_ids = transitions.cumsum()
    # Run id for the zero rows only; non-zero rows are not present in `ids`.
    ids = all_sequence_ids[is_zero].rename("ids")
    if meter in [2, 3]:
        # It's normal for steam and hotwater to be turned off during the summer
        # `keep` holds the ids of zero-runs with at least one row outside the
        # [summer_start, summer_end] window; runs entirely inside are exempt.
        keep = set(ids[(Xy_subset.timestamp < summer_start) |
                       (Xy_subset.timestamp > summer_end)].unique())
        # Flag a zero row when its run is in `keep` and the run has at least
        # min_interval rows.
        is_bad = ids.isin(keep) & (ids.map(ids.value_counts()) >= min_interval)
    elif meter == 1:
        # Same run ids, but looked up by timestamp value instead of row label.
        time_ids = ids.to_frame().join(Xy_subset.timestamp).set_index("timestamp").ids
        is_bad = ids.map(ids.value_counts()) >= min_interval

        # Cold water may be turned off during the winter
        # .get returns False when that timestamp has no zero reading.
        jan_id = time_ids.get(0, False)
        dec_id = time_ids.get(8283, False)
        # If one zero-run covers timestamps 0 and 500, and one covers 8283 and
        # 8783, both runs are excluded from the flagged set.
        if (jan_id and dec_id and jan_id == time_ids.get(500, False) and
                dec_id == time_ids.get(8783, False)):
            is_bad = is_bad & (~(ids.isin(set([jan_id, dec_id]))))
    else:
        raise Exception(f"Unexpected meter type: {meter}")

    # Start from is_zero (non-zero rows are False) and overwrite the zero rows
    # with their is_bad verdict; is_bad is indexed by the zero rows only.
    result = is_zero.copy()
    result.update(is_bad)
    return result

def find_bad_zeros(X, y):
    """Return the index labels of the zero readings flagged by 'make_is_bad_zero'.

    ``y`` is attached to ``X`` as 'meter_reading' and 'meter' is duplicated as
    'meter_id'; the helper is applied to every (building_id, meter) group and
    the two group levels are dropped from the resulting index.
    """
    combined = X.assign(meter_reading=y, meter_id=X.meter)
    flags = combined.groupby(["building_id", "meter"]).apply(make_is_bad_zero)
    flagged = flags[flags]
    return flagged.index.droplevel([0, 1])

def find_bad_sitezero(X):
    """Return the index of meter-0 rows at site 0 (UCF) with timestamp below 3378."""
    is_early = X.timestamp < 3378
    is_site_zero = X.site_id == 0
    is_meter_zero = X.meter == 0
    return X.loc[is_early & is_site_zero & is_meter_zero].index

def find_bad_building1099(X, y):
    """Return the index of building 1099 / meter 2 rows whose reading ``y`` exceeds 3e4."""
    is_target_meter = (X.building_id == 1099) & (X.meter == 2)
    is_too_high = y > 3e4
    return X.loc[is_target_meter & is_too_high].index

# binds all together
def find_bad_rows(X, y):
    """Return the union of the indices reported by the three bad-row finders."""
    bad_index = find_bad_zeros(X, y)
    for extra in (find_bad_sitezero(X), find_bad_building1099(X, y)):
        bad_index = bad_index.union(extra)
    return bad_index

In [None]:
# Replace the datetime timestamp with whole hours elapsed since 2016-01-01 00:00
# (stored as floats); the bad-row helpers compare it against numeric thresholds.
train["timestamp"] = (train["timestamp"] - pd.to_datetime("2016-01-01")).dt.total_seconds() // 3600
# Collect the index labels of unreliable readings and remove those rows.
bad_rows = find_bad_rows(train.drop(["meter_reading"], axis = 1), train["meter_reading"])
train = train.drop(index = bad_rows)

In [None]:
# Downcast column dtypes to save memory.
# NOTE(review): use_float16=True stores every float column whose range fits in
# float16 at half precision, which can include the 'meter_reading_log' target --
# confirm the precision loss is acceptable.
train = reduce_mem_usage(train,use_float16=True)
building_metadata = reduce_mem_usage(building_metadata,use_float16=True)

In [None]:
# Model inputs: every train column except the targets, the raw timestamp and
# the fold-grouping column.
feat = list(train.columns.values)
features = list(filter(
    lambda column: column not in ('timestamp', 'meter_reading', 'year', 'group', "meter_reading_log"),
    feat,
))

In [None]:
from sklearn import metrics
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from tqdm import tqdm
import gc

In [None]:
#https://www.kaggle.com/ragnar123/another-1-08-lb-no-leak
def run_lgbm(train, num_rounds = 1000, folds = 6):
    """
    Train one LightGBM regressor per GroupKFold split and return all of them.

    Folds are formed with ``GroupKFold`` on ``train['group']``. Each model is
    fitted on the module-level ``features`` columns against
    'meter_reading_log', with early stopping on the held-out fold. Out-of-fold
    predictions are collected and their RMSE (after clipping predictions at 0)
    is printed once all folds are done.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the ``features`` columns, 'meter_reading_log' and 'group'.
    num_rounds : int, default 1000
        Maximum number of boosting rounds per fold.
    folds : int, default 6
        Number of GroupKFold splits.

    Returns
    -------
    list
        The trained boosters, one per fold.
    """
    kf = GroupKFold(n_splits = folds)
    param =  {'num_leaves': 500,
             'objective': 'regression',
             'learning_rate': .03,
             'boosting': 'gbdt',
             'feature_fraction': .7,
             'bagging_fraction': .1,
             'bagging_freq': 5,
             'n_jobs': 10,
             'seed': 654321,
             'metric': 'rmse'
              }

    target = 'meter_reading_log'
    categorical = ['building_id', 'meter', 'site_id', 'primary_use' , 'is_holiday', 'weekday', 'month']

    models = []
    # Out-of-fold predictions, filled fold by fold at the validation positions.
    oof = np.zeros(len(train))
    for tr_idx, val_idx in tqdm(kf.split(train, groups = train['group']), total = folds):
        tr_x, tr_y = train[features].iloc[tr_idx], train[target].iloc[tr_idx]
        vl_x, vl_y = train[features].iloc[val_idx], train[target].iloc[val_idx]
        tr_data = lgb.Dataset(tr_x, label = tr_y,  categorical_feature = categorical)
        vl_data = lgb.Dataset(vl_x, label = vl_y,  categorical_feature = categorical)
        clf = lgb.train(param, tr_data, num_rounds, valid_sets = [tr_data, vl_data], verbose_eval = 25,
                        early_stopping_rounds = 50)
        models.append(clf)
        oof[val_idx] = clf.predict(vl_x)
        gc.collect()

    # Report the cross-validated score instead of computing it and throwing it away.
    score = np.sqrt(metrics.mean_squared_error(train[target], np.clip(oof, a_min=0, a_max=None)))
    print("Out-of-fold RMSE: {:.4f}".format(score))

    return models
    
# Train the per-fold LightGBM models with the default round and fold settings.
models = run_lgbm(train)

In [None]:
# Load the test readings and weather, then apply the same weather cleaning
# steps that were used for the training weather table.
test = pd.read_csv("../input/ashrae-energy-prediction/test.csv")
weather_test = pd.read_csv("../input/ashrae-energy-prediction/weather_test.csv")

weather_test = intpo(weather_test)
weather_test = med_fill(weather_test)
# Replace the -1 marker in precip_depth_1_hr with 0. A single .loc assignment is
# used because chained indexing (df[col][mask] = ...) may write to a temporary
# copy and leave the frame unchanged.
weather_test.loc[weather_test['precip_depth_1_hr'] == -1, 'precip_depth_1_hr'] = 0

# Convert wind direction from degrees to a compass-sector index stored as uint8.
weather_test['wind_direction'] = weather_test['wind_direction'].apply(degToCompass)
weather_test["wind_direction"] = weather_test['wind_direction'].astype(np.uint8)

In [None]:
# Downcast the test tables before merging to keep peak memory down.
# NOTE(review): 'timestamp' has not been parsed yet at this point, so if it is an
# object column reduce_mem_usage will turn it into a category before the merge
# on ['site_id', 'timestamp'] -- confirm this is intended.
test = reduce_mem_usage(test,use_float16=True)
weather_test = reduce_mem_usage(weather_test,use_float16=True)

In [None]:
# Build the test feature table the same way as train: building attributes,
# then weather on (site_id, timestamp), then the calendar features.
test = test.merge(building_metadata, on = 'building_id', how = 'left')
test = test.merge(weather_test, on = ['site_id', 'timestamp'], how = 'left')
test = time_edit(test)
# Both source tables are merged in; release them.
del weather_test, building_metadata

In [None]:
#https://www.kaggle.com/ragnar123/another-1-08-lb-no-leak
# Predict the test set in batches of `step_size` rows. For every batch the
# per-fold model predictions are averaged on the log1p scale and then mapped
# back to meter readings with expm1.
result = []
step_size = 50000
for i in tqdm(range(0, test.shape[0], step_size)):
    # Slice the rows first and select the feature columns afterwards, so only
    # the current batch is copied rather than the whole test frame each time.
    batch = test.iloc[i : i + step_size][features]
    result.append(np.expm1(np.mean([model.predict(batch) for model in models], axis=0)))

result = np.concatenate(result)

In [None]:
# Fill the sample submission with the predictions and write it out.
# The assignment is positional: `result` follows the row order of `test`.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv -- confirm.
submission = pd.read_csv("../input/ashrae-energy-prediction/sample_submission.csv")
submission["meter_reading"] = result
submission.to_csv("submission.csv", index = False)

In [None]:
# Drop the references to the LightGBM models and their predictions.
del models, result

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam, RMSprop
from keras import backend as K
from keras.losses import MSE

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Keras-backend RMSE: square root of the mean squared difference of the two tensors."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [None]:
def lstm_model(df):
    """Build and compile the stacked LSTM regressor.

    The input feature dimension is taken from ``df.shape[1]``; the sequence
    length is left unspecified. The network ends in a single linear unit and
    is compiled with Adam (lr=0.01), MSE loss and the RMSE metric.
    """
    n_features = df.shape[1]

    # Layers are instantiated and added in this exact order.
    stack = [
        LSTM(units = 64, return_sequences=True, input_shape=(None, n_features)),
        Dropout(0.25),
        Dense(64, activation='relu'),
        LSTM(units = 64, return_sequences = True),
        Dropout(0.2),
        Dense(64, activation='relu'),
        LSTM(units = 32),
        Dropout(0.25),
        Dense(1),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(optimizer = Adam(lr=0.01), loss = MSE, metrics = [root_mean_squared_error])
    return model

In [None]:
# Keep only the model input columns in the test frame (overwrites `test`).
test = test[features]

In [None]:
def run_model(train, test, folds = 6):
    """
    Train the LSTM once per GroupKFold split and predict the test set each time.

    Folds are formed with ``GroupKFold`` on ``train['group']``. For every fold
    the Keras session is cleared and a fresh model is built, so no weights are
    carried from one fold to the next. Inputs are reshaped to
    (n_samples, 1, n_features), i.e. each row is a sequence of length one.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the module-level ``features`` columns, 'meter_reading_log'
        and 'group'.
    test : pandas.DataFrame
        Must contain the ``features`` columns.
    folds : int, default 6
        Number of GroupKFold splits.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(test), folds); column k holds the predictions (on
        the log1p scale) of the model trained on fold k. Average over axis 1 to
        obtain the ensemble prediction.
    """
    kf = GroupKFold(n_splits = folds)
    target = 'meter_reading_log'

    # One column per fold; the row count matches the test set.
    results = np.zeros((test.shape[0], folds))

    # The network is fitted on 3-D input, so the test matrix needs the same
    # (n_samples, 1, n_features) layout before predict().
    test_x = test[features].values
    test_x = test_x.reshape((test_x.shape[0], 1, test_x.shape[1]))

    splits = kf.split(train, groups = train['group'])
    for fold, (tr_idx, val_idx) in enumerate(tqdm(splits, total = folds)):
        # Build the model after clearing the session: a model created before
        # clear_session() belongs to the discarded graph.
        K.clear_session()
        model = lstm_model(train[features])

        tr_x, tr_y = train[features].iloc[tr_idx].values, train[target].iloc[tr_idx].values
        vl_x, vl_y = train[features].iloc[val_idx].values, train[target].iloc[val_idx].values

        tr_x = tr_x.reshape((tr_x.shape[0], 1, tr_x.shape[1]))
        tr_y = tr_y.reshape((tr_y.shape[0], 1))
        vl_x = vl_x.reshape((vl_x.shape[0], 1, vl_x.shape[1]))
        vl_y = vl_y.reshape((vl_y.shape[0], 1))

        rlst = EarlyStopping(monitor = 'val_root_mean_squared_error', min_delta = .0001, patience = 5, verbose = True, mode = 'min')
        model.fit(tr_x, tr_y, batch_size = 512, epochs = 50, validation_data = (vl_x, vl_y), callbacks = [rlst], verbose=1)

        # predict() returns shape (n_test, 1); flatten it into this fold's column.
        results[:, fold] = model.predict(test_x, batch_size=512).ravel()

    return results

In [None]:
#results = run_model(train, test)
#result_f = np.expm1(np.mean(results)).reshape(test.shape[0], 1)
#result_f = np.concatenate(result_f)
#submission = pd.read_csv("../input/ashrae-energy-prediction/sample_submission.csv")
#submission["meter_reading"] = result_f
#submission.to_csv("submission.csv", index = False)