Credit to @aitude for the original kernel:
https://www.kaggle.com/aitude/ashrae-kfold-lightgbm-without-leak-1-08

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc
import pickle
from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
%%time
# Training rows, loaded from the feather copy of train.
df_train = pd.read_feather('../input/ashrae-great-energy-predictor-iii-featherdataset/train.feather')

# Building metadata; primary_use is replaced by LabelEncoder integer codes.
building = pd.read_feather('../input/ashrae-great-energy-predictor-iii-featherdataset/building_metadata.feather')
le = LabelEncoder()
building.primary_use = le.fit_transform(building.primary_use)

# The weather tables are read from CSV: fill_weather_dataset parses the
# timestamp column with strptime, so it needs timestamps as strings.
DATA_PATH = "../input/ashrae-energy-prediction/"
weather_train = pd.read_csv(DATA_PATH + 'weather_train.csv')
# NOTE(review): `weather_df` is bound to the same frame here, but no later cell
# in this notebook reads that global — confirm the alias can be dropped.
weather_test = weather_df = pd.read_csv(DATA_PATH + 'weather_test.csv')
# Feather alternatives for the weather tables, currently disabled:
# weather_train = pd.read_feather('../input/ashrae-great-energy-predictor-iii-featherdataset/weather_train.feather')
# weather_test = pd.read_feather('../input/ashrae-great-energy-predictor-iii-featherdataset/weather_test.feather')

# Baseline model with raw data (without cleaning)

In [None]:
# # Remove outliers
# df_train = df_train [ df_train['building_id'] != 1099 ]
# df_train = df_train.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

In [None]:
# # building_meter map

# bm_cols = ['bm', 'weekday', 'hour',]
# df_train['hour'] = df_train['timestamp'].dt.hour
# df_train['weekday'] = df_train['timestamp'].dt.weekday
# df_train['bm'] = df_train['building_id'].apply(lambda x: str(x)) + '_' + df_train['meter'].apply(lambda x: str(x))
# bm = df_train.groupby(bm_cols)['meter_reading'].mean().rename('bm_week_hour').to_frame()

In [None]:
# df_train = df_train.merge(bm, right_index=True, left_on=bm_cols, how='left')
# df_train.drop(['bm'], axis=1, inplace=True)
# df_train.head()

## Utility Functions

In [None]:
# Original code from https://www.kaggle.com/aitude/ashrae-missing-weather-data-handling by @aitude

def fill_weather_dataset(weather_df, site_ids=None):
    """Add the missing hourly rows of every site and impute weather gaps.

    The hourly grid spans the earliest to the latest timestamp found in
    ``weather_df``. Every site in ``site_ids`` gets a row for each hour of
    that grid it does not already have. Missing measurements are then
    replaced by the mean of the same (site_id, day-of-month, month) group.
    For cloud_coverage, sea_level_pressure and precip_depth_1_hr, groups
    with no observation at all are first forward-filled from the previous
    group in (site_id, day, month) sort order, which can carry a value
    across days and across sites.

    Parameters
    ----------
    weather_df : pandas.DataFrame
        Needs ``site_id``, the seven weather columns filled below, and
        ``timestamp`` as strings formatted ``%Y-%m-%d %H:%M:%S``.
        The frame passed in is not modified.
    site_ids : iterable of int, optional
        Sites that must cover the full hourly grid. Defaults to
        ``range(16)``, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``site_id`` as first column, ``timestamp`` parsed
        to datetime, the added rows appended after the original ones, and
        gaps filled wherever a group mean was available.
    """
    if site_ids is None:
        site_ids = range(16)

    # Build the complete hourly grid as strings, newest hour first.
    time_format = "%Y-%m-%d %H:%M:%S"
    start_date = datetime.datetime.strptime(weather_df['timestamp'].min(), time_format)
    end_date = datetime.datetime.strptime(weather_df['timestamp'].max(), time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [
        (end_date - datetime.timedelta(hours=offset)).strftime(time_format)
        for offset in range(total_hours)
    ]

    # Collect the rows to add per site and concatenate once, instead of
    # rebuilding the whole frame on every iteration.
    frames = [weather_df]
    for site_id in site_ids:
        present = set(weather_df.loc[weather_df['site_id'] == site_id, 'timestamp'])
        # Sorted to keep the chronological order np.setdiff1d used to give.
        absent = sorted(hour for hour in hours_list if hour not in present)
        if absent:
            frames.append(pd.DataFrame({'timestamp': absent, 'site_id': site_id}))
    weather_df = pd.concat(frames).reset_index(drop=True)

    # Calendar keys used for grouping; they are dropped again before returning.
    weather_df["timestamp"] = pd.to_datetime(weather_df["timestamp"])
    weather_df["day"] = weather_df["timestamp"].dt.day
    weather_df["month"] = weather_df["timestamp"].dt.month

    group_keys = ['site_id', 'day', 'month']
    weather_df = weather_df.set_index(group_keys)

    # (column, forward-fill groups that have no observation at all)
    fill_plan = (
        ('air_temperature', False),
        ('cloud_coverage', True),
        ('dew_temperature', False),
        ('sea_level_pressure', True),
        ('wind_direction', False),
        ('wind_speed', False),
        ('precip_depth_1_hr', True),
    )
    for column, forward_fill in fill_plan:
        is_missing = weather_df[column].isna().to_numpy()
        if not is_missing.any():
            # Nothing to fill; leave the column (and its dtype) alone.
            continue
        filler = weather_df.groupby(group_keys)[column].mean()
        if forward_fill:
            filler = filler.ffill()
        # Look up each row's group value positionally. The row index holds
        # repeated (site_id, day, month) keys, so label alignment is avoided.
        per_row = filler.reindex(weather_df.index).to_numpy()
        weather_df[column] = np.where(is_missing, per_row, weather_df[column].to_numpy())

    weather_df = weather_df.reset_index()
    weather_df = weather_df.drop(['day', 'month'], axis=1)

    return weather_df

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Column handling:
      * datetime and categorical columns are skipped;
      * object / string columns become ``category``;
      * signed and unsigned integer columns are cast to the narrowest
        integer type of the same signedness that holds their min and max
        (bounds inclusive);
      * float columns become float16 (only when ``use_float16`` is true and
        the range fits), else float32 if the range fits, else float64;
      * anything else (bool, timedelta, other extension dtypes) is left
        unchanged rather than being pushed through the float branch.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    use_float16 : bool, default False
        Allow float16 for float columns whose range fits. This lowers
        precision as well as memory.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """

    def _fits(limits, low, high):
        # Inclusive: a value equal to the dtype's bound is still representable.
        return limits.min <= low and high <= limits.max

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_datetime64_any_dtype(col_type) or isinstance(
            col_type, pd.api.types.CategoricalDtype
        ):
            continue

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype("category")
            continue

        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            # bool, timedelta, nullable extension dtypes, ...: leave as is.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "f":
            if use_float16 and _fits(np.finfo(np.float16), c_min, c_max):
                df[col] = df[col].astype(np.float16)
            elif _fits(np.finfo(np.float32), c_min, c_max):
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN or infinite columns, whose
                # min/max compare false against every bound.
                df[col] = df[col].astype(np.float64)
            continue

        candidates = signed_types if col_type.kind == "i" else unsigned_types
        for candidate in candidates:
            if _fits(np.iinfo(candidate), c_min, c_max):
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(decrease))

    return df


def features_engineering(df):
    """Derive model features from a merged meter/building/weather frame.

    Adds ``hour`` and ``weekday`` from ``timestamp``, replaces
    ``square_feet`` by ``log1p(square_feet)``, adds ``sm`` (the string
    ``"<site_id>_<meter>"``), drops the columns the model does not use,
    label-encodes ``primary_use`` and finally downcasts dtypes with
    ``reduce_mem_usage(..., use_float16=True)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain timestamp, site_id, meter, square_feet, primary_use,
        sea_level_pressure, wind_direction, wind_speed, year_built and
        floor_count. The new and transformed columns are written into this
        frame before the column drop, so the caller's frame is modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the unused columns removed, rows in the same
        order as the input.
    """
    # Row order is deliberately left untouched. The previous
    # `df.sort_values("timestamp")` / `df.reset_index(drop=True)` calls threw
    # their results away, so they only cost a full copy each. Sorting for real
    # would break callers that pair the output positionally with data taken
    # from the frame beforehand (e.g. the row ids used for the submission).

    # Add more features
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["timestamp"].dt.hour
    df["weekday"] = df["timestamp"].dt.weekday

    df['square_feet'] = np.log1p(df['square_feet'])
    # One label per (site, meter type) pair.
    df['sm'] = df['site_id'].apply(str) + '_' + df['meter'].apply(str)

    # Remove Unused Columns
    drop = ["timestamp", 'site_id', "sea_level_pressure", "wind_direction", "wind_speed", "year_built", "floor_count"]
    df = df.drop(drop, axis=1)
    gc.collect()

    # Encode Categorical Data
    # NOTE(review): the encoder is fitted on whatever frame is passed in, so
    # codes only agree between calls if both frames hold the same set of
    # primary_use values — confirm for train vs. test.
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])

    # reduce memory
    df = reduce_mem_usage(df, use_float16=True)

    return df

In [None]:
def rmse(ytrue, ypred):
    """Root mean squared error of ``ypred`` against ``ytrue``, reduced over axis 0."""
    residual = ypred - ytrue
    mean_squared = np.mean(residual * residual, axis=0)
    return np.sqrt(mean_squared)
def rmsle(ytrue, ypred):
    """Root mean squared error between ``log1p(ypred)`` and ``log1p(ytrue)``, reduced over axis 0."""
    log_gap = np.log1p(ypred) - np.log1p(ytrue)
    mean_squared = np.mean(log_gap * log_gap, axis=0)
    return np.sqrt(mean_squared)

## Fill Weather Information

I'm using [this kernel](https://www.kaggle.com/aitude/ashrae-missing-weather-data-handling) to handle missing weather information.

In [None]:
# Add the missing hourly rows per site and impute gaps in the training weather table.
weather_train = fill_weather_dataset(weather_train)

## Memory Reduction

In [None]:
# Downcast each table before merging. use_float16=True allows float columns
# whose value range fits to be stored as float16.
df_train = reduce_mem_usage(df_train,use_float16=True)
building = reduce_mem_usage(building,use_float16=True)
weather_train = reduce_mem_usage(weather_train,use_float16=True)

## Merge Data

We need to add building and weather information into the training dataset.

In [None]:
# Attach building metadata to every training row, then the weather of the
# building's site at the row's timestamp. Left joins keep all training rows.
df_train = df_train.merge(building, how='left', on='building_id')
df_train = df_train.merge(weather_train, how='left', on=['site_id', 'timestamp'])
del weather_train
gc.collect()

## Features Engineering

In [None]:
%%time
# Add the derived features and drop unused columns (see features_engineering).
df_train = features_engineering(df_train)

In [None]:
# Preview the engineered training frame.
df_train.head()

## Features & Target Variables

In [None]:
# The models are trained on log1p(meter_reading); every other column is a feature.
y_train = np.log1p(df_train["meter_reading"])
X_train = df_train.drop(columns='meter_reading')
del df_train
gc.collect()

##  KFOLD LIGHTGBM Model

In [None]:
%%time
categorical_features = ['sm', "building_id", "meter", "primary_use", "weekday"]
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
    "num_threads": 2
}

# Held-out predictions and their targets, one entry per fold. Both stay on
# the log1p scale the models are trained on.
pred_L1 = []
valid_L1 = []
seed = None
# shuffle is left at its default, so each fold is a contiguous block of rows.
kf = KFold(n_splits=3, random_state=seed)

models = []
for train_index,test_index in kf.split(X_train):
    # KFold.split yields positional indices, so rows are selected with .iloc.
    # .loc is label-based and only matches while the index is exactly 0..n-1.
    train_features = X_train.iloc[train_index]
    train_target = y_train.iloc[train_index]
    
    test_features = X_train.iloc[test_index]
    test_target = y_train.iloc[test_index]
    
    d_training = lgb.Dataset(train_features, label=train_target,categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target,categorical_feature=categorical_features, free_raw_data=False)
    
    # The held-out fold is both the early-stopping validation set and the
    # source of the fold's stored predictions.
    model = lgb.train(params, train_set=d_training, num_boost_round=1000, valid_sets=[d_training,d_test], verbose_eval=25, early_stopping_rounds=50)
    models.append(model)
    pred_L1.append(model.predict(test_features))
    valid_L1.append(test_target)
    
    # Release the per-fold copies before the next fold is built.
    del train_features, train_target, test_features, test_target, d_training, d_test
    gc.collect()

In [None]:
# The fitted boosters live in `models`; the training matrices are no longer needed.
del X_train, y_train
gc.collect()

## Important Features

In [None]:
# One feature-importance chart per fold model.
for model in models:
    lgb.plot_importance(model)
    plt.show()

## Load Test Data

In [None]:
# Load the test rows and set row_id aside: it is not a feature and is only
# needed again when the submission file is assembled.
df_test = pd.read_feather('../input/ashrae-great-energy-predictor-iii-featherdataset/test.feather')
row_ids = df_test.pop("row_id")
df_test = reduce_mem_usage(df_test)

In [None]:
# df_test['hour'] = df_test['timestamp'].dt.hour
# df_test['weekday'] = df_test['timestamp'].dt.weekday
# df_test['bm'] = df_test['building_id'].apply(lambda x: str(x)) + '_' + df_test['meter'].apply(lambda x: str(x))
# df_test = df_test.merge(bm, right_index=True, left_on=bm_cols, how='left')
# df_test.drop('bm', axis=1, inplace=True)

## Merge Building Data

In [None]:
# Attach building metadata to every test row; `building` is not used afterwards.
df_test = df_test.merge(building, how='left', on='building_id')
del building
gc.collect()

## Fill Weather Information

In [None]:
# Same gap filling as for the training weather table.
weather_test = fill_weather_dataset(weather_test)
# NOTE(review): use_float16 is not passed here, whereas the training tables
# were reduced with use_float16=True — confirm the asymmetry is intended.
weather_test = reduce_mem_usage(weather_test)

## Merge Weather Data

In [None]:
# Left join keeps every test row; weather is matched on site and timestamp.
df_test = df_test.merge(weather_test,how='left',on=['timestamp','site_id'])
del weather_test
gc.collect()

## Features Engineering

In [None]:
# Apply the same feature pipeline that was used on the training frame.
df_test = features_engineering(df_test)

In [None]:
# Print the column dtypes and memory footprint of the engineered test frame.
df_test.info()

In [None]:
# Preview the engineered test frame.
df_test.head()

## Prediction

In [None]:
%%time
# Average the fold models' predictions after mapping them back from the
# log1p training scale with expm1.
# The accumulator starts as zeros. The former `pred == []` test compared an
# ndarray with an empty list from the second model on, which newer NumPy
# releases reject with a broadcasting error.
pred = np.zeros(len(df_test))
for model in tqdm(models):
    pred += np.expm1(model.predict(df_test, num_iteration=model.best_iteration)) / len(models)
    del model
    gc.collect()

# Save Model

In [None]:
# save model to file
# Persist the fold models plus the per-fold held-out predictions and targets.
# Each file is written inside a `with` block so its handle is flushed and
# closed; the previous bare open(...) calls never closed them.
with open("models.pkl", "wb") as handle:
    pickle.dump(models, handle)
with open("pred_L1.pkl", "wb") as handle:
    pickle.dump(pred_L1, handle)
with open("valid_L1.pkl", "wb") as handle:
    pickle.dump(valid_L1, handle)

In [None]:
# Predictions are in `pred` and the models are pickled, so both can be released.
del df_test, models
gc.collect()

## Submission

In [None]:
# Pair each row_id with its prediction; negative predictions are clipped to 0.
submission = pd.DataFrame({"row_id": row_ids, "meter_reading": np.clip(pred, 0, a_max=None)})
del row_ids, pred
gc.collect()
# Narrow the dtypes before writing the CSV.
submission['meter_reading'] = submission['meter_reading'].astype('float32')
submission['row_id'] = submission['row_id'].astype('int32')
# Written in chunks of 25000 rows.
submission.to_csv("submission.csv", index=False, chunksize=25000)

In [None]:
# Preview the submission frame.
submission.head()

In [None]:
# Summary statistics of the predicted meter readings, as a quick sanity check.
print(f"submission mean: {submission['meter_reading'].mean():.4f}")
print(f"submission std: {submission['meter_reading'].std():.4f}")
print(f"submission min: {submission['meter_reading'].min():.4f}")
print(f"submission max: {submission['meter_reading'].max():.4f}")

In [None]:
# Histogram (no KDE curve) of the predictions on the log1p scale.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# the installed version still provides it.
sns.distplot(np.log1p(submission['meter_reading'].values), kde=False);
gc.collect()