5.  [Modeling](#section-five)
    - [Model Validation](#subsection-v)
    - [Test Set](#subsection-test)
    - [Testing Model](#subsection-pred)
    
6. [Conclusion](#section-six)
    - [Future Work](#subsection-future)

# Data Preparation

This notebook is the continuation of [ASHRAE - Energy Prediction1](https://www.kaggle.com/fatmanuranl/ashrae-energy-prediction1). The data is prepared according to the EDA in that notebook. Run the cells below to prepare the data for modeling.

In [None]:
import pandas as pd
import numpy as np
import gc
import seaborn as sns

Function for reducing size.

In [None]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to the narrowest fitting dtype.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched. Integer columns are tried against int8, int16, int32 and int64;
    float columns against float16 and float32, falling back to float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    float16 has an 11-bit significand (roughly 3 significant decimal digits), so
    float columns that fit its range are rounded when they are downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN): keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

Function to fill missing meteorological data

In [None]:
# Function to fill missing meteorological data
def site_mean_weather(table):
    """Fill missing values in place with the mean of the same site, hour and month.

    Every column of ``table`` that contains at least one null is imputed with
    the mean of that column over rows sharing the same ``site_id``, ``hour``
    and ``month``. Groups with no observed value stay null.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain ``site_id``, ``hour`` and ``month`` columns. Modified in place.

    Returns
    -------
    None
    """
    columns_with_gaps = list(table.columns[table.isnull().any()])
    for col in columns_with_gaps:
        imputation = table.groupby(['site_id', 'hour', 'month'])[col].transform('mean')
        # Assign the result back to the frame. Calling fillna(inplace=True) on
        # table[col] acts on an intermediate Series, which pandas deprecates and
        # which leaves the frame unchanged under Copy-on-Write.
        table[col] = table[col].fillna(imputation)

Data preparation function.

In [None]:
def prep_func(df):
    """Prepare a merged meter/building/weather frame for modeling, in place.

    Steps, in order: drop unused columns, parse ``timestamp``, label the meter
    codes, impute missing values per site/hour/month, add ``hour``, ``month``,
    ``day`` and ``weekday`` features, cast identifier-like columns to
    ``category``, sort chronologically, drop ``timestamp``, add the boolean
    ``Weekend`` flag and downcast numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``floor_count``, ``year_built``, ``cloud_coverage``,
        ``precip_depth_1_hr``, ``sea_level_pressure``, ``timestamp``, ``meter``,
        ``primary_use``, ``site_id``, ``building_id`` and ``wind_direction``.
        The frame is modified in place. If it is not already in timestamp
        order, its rows are reordered and the index is reset.

    Returns
    -------
    None
    """
    # Drop unnecessary columns
    df.drop(['floor_count', 'year_built', 'cloud_coverage', 'precip_depth_1_hr', 'sea_level_pressure'],
            axis=1, inplace=True)
    gc.collect()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # Encode meter types
    df['meter'] = pd.Categorical(df['meter']).rename_categories(
        {0: 'electricity', 1: 'chilledwater', 2: 'steam', 3: 'hotwater'})
    gc.collect()
    # Time features that site_mean_weather uses as grouping keys
    df['hour'] = df.timestamp.dt.hour
    df['month'] = df.timestamp.dt.month
    # Fill missing data
    site_mean_weather(df)
    gc.collect()
    # Remaining time related features
    df['day'] = df.timestamp.dt.day
    df["weekday"] = df.timestamp.dt.weekday
    # Change column types to category
    category_columns = ['primary_use', 'hour', 'month', 'site_id',
                        'building_id', 'wind_direction', 'weekday', 'day']
    df[category_columns] = df[category_columns].astype('category')
    gc.collect()
    # Sort data chronologically. Sorting only the timestamp Series and discarding
    # the result leaves the frame untouched, so sort the frame itself. The
    # monotonic check skips the costly reorder when the rows are already in
    # order, and mergesort is stable, so rows sharing a timestamp keep their
    # relative order.
    if not df['timestamp'].is_monotonic_increasing:
        df.sort_values('timestamp', kind='mergesort', inplace=True)
        df.reset_index(drop=True, inplace=True)
    df.drop(['timestamp'], axis=1, inplace=True)
    gc.collect()
    # Create weekend feature: weekday 5 and 6 are Saturday and Sunday. Building
    # the boolean column directly avoids an inplace fillna on a temporary
    # Series, after which leftover NaNs would be cast to True.
    df['Weekend'] = df['weekday'].isin([5, 6])
    gc.collect()

    # reduce_mem_usage downcasts in place, so its return value is not needed.
    reduce_mem_usage(df)
    gc.collect()

    print('Data is ready')

Run cell to import data.

In [None]:
# Load the three raw training tables.
train = pd.read_csv('../input/ashrae-energy-prediction/train.csv')
building = pd.read_csv('../input/ashrae-energy-prediction/building_metadata.csv')
weather_train = pd.read_csv('../input/ashrae-energy-prediction/weather_train.csv')
# Attach the building metadata to every meter reading, then the weather
# recorded for that site at that timestamp. Left joins keep every reading.
df = train.merge(building, on="building_id", how="left").merge(
    weather_train, on=["site_id", "timestamp"], how="left")
# The source tables are no longer needed once merged.
del train, weather_train, building
gc.collect()
print('Data is imported')

In [None]:
# Clean the merged training frame and add the engineered features.
# prep_func modifies df in place and returns nothing.
prep_func(df)

In [None]:
# Rescale the electricity readings of site 0 by 0.2931.
# NOTE(review): presumably a kBTU -> kWh unit conversion so that site 0 matches
# the other sites -- confirm against the dataset notes.
# The mask is computed once and the update is vectorised instead of calling a
# Python lambda per row.
site0_electricity = (df['site_id'] == 0) & (df['meter'] == 'electricity')
df.loc[site0_electricity, 'meter_reading'] *= 0.2931

In [None]:
# Meter reading per square foot of the building; used below as the model target.
df['cons/sqft'] = df['meter_reading'] / df['square_feet']

In [None]:
# Downcast the numeric columns again now that cons/sqft has been added,
# then run a garbage-collection pass.
reduce_mem_usage(df)
gc.collect()

<a id="section-five"></a>
# 1. Modeling

The Light Gradient Boosted Machine (LightGBM) model is chosen for the following reasons:
* Faster training speed and higher efficiency.
* Lower memory usage.
* Better accuracy.
* Support of parallel and GPU learning.
* Capable of handling large-scale data

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit

<a id="subsection-v"></a>
# Model Validation

In [None]:
# Features: every column except the raw reading and the target derived from it.
X = df.drop(columns=['meter_reading', 'cons/sqft'])
gc.collect()
# Target: consumption per square foot.
y = df['cons/sqft']
gc.collect()

In [None]:
# X and y now hold everything needed for training; drop the reference to the
# full frame and run a garbage-collection pass.
del df
gc.collect()

The following parameters are used for all models.

In [None]:
# Feature columns passed to LightGBM as categorical_feature.
categorical_columns = [
    'primary_use',
    'hour',
    'month',
    'site_id',
    'building_id',
    'wind_direction',
    'weekday',
    'day',
    'meter',
]

In [None]:
# LightGBM training parameters shared by every fold.
params = dict(
    objective="regression",
    boosting="gbdt",  # gradient boosting
    learning_rate=0.15,
    num_leaves=30,
    feature_fraction=0.6,
    reg_lambda=2,
    metric="rmse",
)

Run either the time series split model or the k-fold model. The prediction part is the same for both.

### Time Series Split Model

In [None]:
# Choose number of splits
tss = TimeSeriesSplit(2)
folds = tss.split(X)
models = []
# Train one model per fold; each fold validates on rows that come after its
# training rows in the frame.
for train_index, test_index in folds:
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Defining train and validation sets. The training Dataset is built once
    # and the same object is passed in valid_sets: wrapping the training data
    # in a second Dataset duplicates it in memory and makes LightGBM treat it
    # as a validation set when checking early stopping. The evaluation log
    # therefore labels it 'training' instead of 'valid_0'.
    dftrainLGB = lgb.Dataset(data=X_train, label=y_train, feature_name=list(X_train))
    lgb_valid = lgb.Dataset(X_test, y_test)
    del X_test, y_train, y_test
    gc.collect()
    # Model training
    model = lgb.train(params, train_set=dftrainLGB, num_boost_round=1000,
                      valid_sets=(dftrainLGB, lgb_valid),
                      verbose_eval=75, early_stopping_rounds=200,
                      categorical_feature=categorical_columns)
    models.append(model)
    del lgb_valid, dftrainLGB
    gc.collect()

In [None]:
# Training is finished; drop the references to the feature matrix and target.
del X, y
gc.collect()

In [None]:
# The splitter and its (now exhausted) fold generator are no longer needed.
del tss, folds
gc.collect()

In [None]:
# Draw one feature-importance chart per trained fold model.
for model in models:
    lgb.plot_importance(model)

In [None]:
# X_train still refers to the last fold's training slice; release it.
del X_train
gc.collect()

In [None]:
#cv_results = lgb.cv(params, dftrainLGB, num_boost_round=200, nfold=4, 
                    #verbose_eval=10, early_stopping_rounds=40)

## Kfold Model

# Alternative to the time series split model; it needs X and y, so run it
# before they are deleted.
# KFold is not among the notebook's imports, so bring it into scope here.
from sklearn.model_selection import KFold

# Choose number of splits
kf = KFold(n_splits=3)
models = []
# Train one model per fold
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Defining train and validation sets. The training Dataset is built once
    # and the same object is passed in valid_sets: wrapping the training data
    # in a second Dataset duplicates it in memory and makes LightGBM treat it
    # as a validation set when checking early stopping. The evaluation log
    # therefore labels it 'training' instead of 'valid_0'.
    dftrainLGB = lgb.Dataset(data=X_train, label=y_train, feature_name=list(X_train))
    lgb_valid = lgb.Dataset(X_test, y_test)
    del X_test, y_train, y_test
    gc.collect()
    # Model training
    model = lgb.train(params, train_set=dftrainLGB, num_boost_round=1000,
                      valid_sets=(dftrainLGB, lgb_valid),
                      verbose_eval=75, early_stopping_rounds=250,
                      categorical_feature=categorical_columns)
    models.append(model)
    del lgb_valid, dftrainLGB
    gc.collect()

<a id="subsection-test"></a>
## Test Set

In [None]:
# Load the test-period tables, downcasting each one right after it is read.
test = reduce_mem_usage(pd.read_csv('../input/ashrae-energy-prediction/test.csv'))
weather_test = reduce_mem_usage(pd.read_csv('../input/ashrae-energy-prediction/weather_test.csv'))
building = reduce_mem_usage(pd.read_csv('../input/ashrae-energy-prediction/building_metadata.csv'))

In [None]:
# Attach the building metadata to every test row (left join keeps all rows).
test_df = test.merge(building, on="building_id", how="left")
gc.collect()
# Add the weather recorded for the row's site at the row's timestamp.
test_df = test_df.merge(weather_test, on=["site_id", "timestamp"], how="left")
gc.collect()
# The source tables are no longer needed once merged.
del test, weather_test, building
gc.collect()

In [None]:
# Apply the same in-place preparation to the test frame as to the training frame.
prep_func(test_df)

In [None]:
# Run a garbage-collection pass before the prediction cells.
gc.collect()

<a id="subsection-pred"></a>
## Testing Model

In [None]:
# Predict in two chunks rather than on the whole test frame at once. The split
# point is derived from the frame length instead of a hard-coded row count, so
# the cell works for a frame of any size. row_id is an identifier, not a model
# feature, so it is dropped from both chunks.
midpoint = len(test_df) // 2
half1 = test_df.iloc[:midpoint].drop('row_id', axis=1)
half2 = test_df.iloc[midpoint:].drop('row_id', axis=1)

In [None]:
# Average the fold models' predictions for the first half of the test rows.
# Each model contributes prediction / number-of-models, so the running total
# ends up as the mean across models.
results1 = []
for position, model in enumerate(models):
    if position:
        # Later models: accumulate into the existing array.
        results1 += model.predict(half1, num_iteration=model.best_iteration) / len(models)
    else:
        # First model: its scaled prediction starts the running total.
        results1 = model.predict(half1, num_iteration=model.best_iteration) / len(models)
    del model
    gc.collect()

In [None]:
# Average the fold models' predictions for the second half of the test rows.
# Each model contributes prediction / number-of-models, so the running total
# ends up as the mean across models.
results2 = []
for position, model in enumerate(models):
    if position:
        # Later models: accumulate into the existing array.
        results2 += model.predict(half2, num_iteration=model.best_iteration) / len(models)
    else:
        # First model: its scaled prediction starts the running total.
        results2 = model.predict(half2, num_iteration=model.best_iteration) / len(models)
    del model
    gc.collect()

In [None]:
# Both halves are predicted; the trained models are no longer needed.
del models
gc.collect()

In [None]:
# Join the two halves back together in the same row order as test_df.
results = np.concatenate((results1, results2), axis=0)

In [None]:
# The per-half arrays have been combined into results; drop them.
del results1, results2
gc.collect()

In [None]:
# The models were trained on consumption per square foot, so multiply by the
# building area to get back to a meter reading.
test_df['meter_reading'] = results * test_df['square_feet']

In [None]:
# Remove the listed feature columns now that the predictions are in
# meter_reading; the columns used by the following cells are left in place.
test_df.drop(
    columns=[
        'building_id', 'primary_use', 'air_temperature', 'dew_temperature',
        'wind_direction', 'wind_speed', 'hour', 'month', 'day', 'weekday',
        'Weekend', 'square_feet',
    ],
    inplace=True,
)
gc.collect()

In [None]:
# Undo the 0.2931 scaling that was applied to the site 0 electricity readings
# before training, so those predictions are back in the original unit.
# The mask is computed once and the update is vectorised instead of calling a
# Python lambda per row.
site0_electricity = (test_df['site_id'] == 0) & (test_df['meter'] == 'electricity')
test_df.loc[site0_electricity, 'meter_reading'] /= 0.2931
gc.collect()

In [None]:
# Keep only the two submission columns, in submission order.
submission_columns = {
    "row_id": test_df['row_id'],
    "meter_reading": test_df['meter_reading'],
}
output = pd.DataFrame(submission_columns)
del submission_columns
# The full test frame is no longer needed.
del test_df
gc.collect()
# Write the submission without the DataFrame index.
output.to_csv("submission.csv", index=False)

<a id="section-six"></a>
# 2. Conclusions

Here is a summary table for all things I tried.

<a id="subsection-future"></a>
## Future Work

* Removing outliers
* Finding feature importance (changing selected features)
* Changing parameters
* Using cross validation 
* Trying different models for different meter types
* Running model without dividing meter reading by area
* Running model after taking log of meter readings
