In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')
import gc


# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import matplotlib.patches as patches
from scipy import stats
from scipy.stats import skew

from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
# Use the fully-qualified option name: the short pattern 'max_columns' is ambiguous in newer
# pandas releases (it also matches styler.render.max_columns) and raises an OptionError.
pd.set_option('display.max_columns', 100)

py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

import os,random, math, psutil, pickle
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

# Reading and understanding our data

In [None]:
# Read the six competition CSV files, in this order, and bind each one to its own name.
(
    train_df,
    test_df,
    weather_train_df,
    weather_test_df,
    building_meta_df,
    sample_submission,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ashrae-energy-prediction/train.csv",
        "../input/ashrae-energy-prediction/test.csv",
        '../input/ashrae-energy-prediction/weather_train.csv',
        '../input/ashrae-energy-prediction/weather_test.csv',
        '../input/ashrae-energy-prediction/building_metadata.csv',
        '../input/ashrae-energy-prediction/sample_submission.csv',
    )
)

train_df

In [None]:
# Report the (rows, columns) shape of four of the loaded frames.
for caption, shape in (
    ('Size of train_df data', train_df.shape),
    ('Size of weather_train_df data', weather_train_df.shape),
    ('Size of weather_test_df data', weather_test_df.shape),
    ('Size of building_meta_df data', building_meta_df.shape),
):
    print(caption, shape)

In [None]:
train_df.info()

# Reduce The Memory For Faster Processing

In [None]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    touched; every other column (objects, categories, int8, unsigned ints, ...) is left
    as it is. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.
    use_float16 : bool, default True
        When true (the historical behaviour) float columns whose range fits are stored as
        float16, which keeps only about three significant decimal digits. Pass ``False``
        to use float32 as the smallest float type.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        # min()/max() skip NaN; for an empty or all-NaN column they are NaN themselves,
        # so every range test below is False and the column keeps its widest type.
        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to the dtype limit still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            new_dtype = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    new_dtype = candidate
                    break
            df[col] = df[col].astype(new_dtype)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot produce a NaN percentage.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast every loaded frame, rebinding each name to the frame returned for it.
train_df, test_df = reduce_mem_usage(train_df), reduce_mem_usage(test_df)

weather_train_df, weather_test_df, building_meta_df = (
    reduce_mem_usage(frame)
    for frame in (weather_train_df, weather_test_df, building_meta_df)
)

In [None]:
train_df.info()

In [None]:
building_meta_df



Features that are likely predictive:

Buildings

    primary_use
    square_feet
    year_built
    floor_count (may be too sparse to use)

Weather

    time of day
    holiday
    weekend
    cloud_coverage + lags
    dew_temperature + lags
    precip_depth + lags
    sea_level_pressure + lags
    wind_direction + lags
    wind_speed + lags

Train

    max, mean, min, std of the specific building historically
    number of meters
    number of buildings at a siteid



**Converting "timestamps" To datetime And String To Category Datatype**

*Categoricals are a pandas data type corresponding to categorical variables in statistics. A categorical variable takes on a limited, and usually fixed, number of possible values*

In [None]:
# Parse the timestamp column of the two meter frames and the two weather frames.
for frame in (train_df, test_df, weather_train_df, weather_test_df):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
del frame  # do not keep an extra reference to the last frame alive

#building_meta_df['primary_use'] = building_meta_df['primary_use'].astype('category')

In [None]:
def _add_calendar_features(frame):
    """Add hour, day, weekend and month columns derived from frame["timestamp"] (in place)."""
    stamps = frame["timestamp"].dt
    frame["hour"] = stamps.hour
    frame["day"] = stamps.day
    # NOTE: despite its name, "weekend" stores dt.weekday (the day-of-week number),
    # not a boolean weekend flag.
    frame["weekend"] = stamps.weekday
    frame["month"] = stamps.month


_add_calendar_features(train_df)

_add_calendar_features(test_df)

# Merging the features of building_meta_df with train_df and test_df

In [None]:
# Look up the building metadata for every row via a left merge on building_id, then glue the
# resulting columns (minus the duplicate key) onto the frame side by side.
meta_cols = (
    train_df[['building_id']]
    .merge(building_meta_df, on=['building_id'], how='left')
    .drop(columns='building_id')
)
train_df = pd.concat([train_df, meta_cols], axis=1)

meta_cols = (
    test_df[['building_id']]
    .merge(building_meta_df, on=['building_id'], how='left')
    .drop(columns='building_id')
)
test_df = pd.concat([test_df, meta_cols], axis=1)

# The metadata now lives in both frames; release the helper and the source table.
del meta_cols, building_meta_df

# Merging the features of weather_df with train_df and test_df

In [None]:
# Look up the weather observation for every row via a left merge on (site_id, timestamp),
# then glue the weather columns (minus the duplicate keys) onto the frame side by side.
weather_cols = (
    train_df[['site_id', 'timestamp']]
    .merge(weather_train_df, on=['site_id', 'timestamp'], how='left')
    .drop(columns=['site_id', 'timestamp'])
)
train_df = pd.concat([train_df, weather_cols], axis=1)

weather_cols = (
    test_df[['site_id', 'timestamp']]
    .merge(weather_test_df, on=['site_id', 'timestamp'], how='left')
    .drop(columns=['site_id', 'timestamp'])
)
test_df = pd.concat([test_df, weather_cols], axis=1)

# The weather data now lives in both frames; release the helper and the source tables.
del weather_cols, weather_train_df, weather_test_df

**Storing train_df and test_df using to_pickle**

*to_pickle(): Saves a DataFrame to disk so that it can be reloaded in a later run without waiting for all of the preprocessing above to run again.*

*gc.collect(): Runs a full garbage collection over all generations and returns the number of unreachable objects it found. Objects that are still referenced (for example by a variable that has not been deleted) are not collected, which is why the DataFrames are removed with `del` first. It is called here to reclaim as much memory as possible.*

In [None]:
# Write both merged frames to pickle files in the current directory.
train_df.to_pickle('train_df.pkl')
test_df.to_pickle('test_df.pkl')
   
# Drop the in-memory copies, then force a collection pass so their memory can be reclaimed.
del train_df, test_df
gc.collect()

In [None]:
# Load the two cached frames back from their pickle files.
train_df, test_df = (
    pd.read_pickle(cache_path) for cache_path in ('train_df.pkl', 'test_df.pkl')
)

train_df

# Feature Engineering

**Encoding**

In [None]:
# Encode primary_use as small integer codes.
# The encoder is fitted once, on the training labels, and then reused for the test set so
# that a given label always maps to the same integer in both frames. Fitting a second time
# on the test set would only agree with the training codes if both frames happened to
# contain exactly the same set of labels.
# transform() raises ValueError if the test set contains a label not seen during fitting.
le = LabelEncoder()

train_df['primary_use'] = le.fit_transform(train_df['primary_use']).astype(np.int8)

test_df['primary_use'] = le.transform(test_df['primary_use']).astype(np.int8)

**Calculating The Age**

In [None]:
#train_df['age'] = train_df['year_built'].max() - train_df['year_built'] + 1
#test_df['age'] = test_df['year_built'].max() - test_df['year_built'] + 1

**Find the optimal feature subset using an evaluation measure. The choice of evaluation metric distinguishes the three main strategies of feature selection algorithms: the wrapper strategy, the filter strategy, and the embedded strategy.**

Filter methods:

    information gain
    chi-square test
    correlation coefficient
    variance threshold
    
    
Wrapper methods:

    recursive feature elimination
    sequential feature selection algorithms
    
    
Embedded methods:

    L1 (LASSO) regularization
    decision tree

In [None]:
train_df

**Dropping Timestamps**

In [None]:
# The calendar features have been extracted, so the raw timestamp column is removed.
train_df = train_df.drop(columns=["timestamp"])

In [None]:
# Remove the raw timestamp column from the test frame as well.
test_df = test_df.drop(columns=["timestamp"])

# Dealing With Cat,Num Features And Dropping Unnecessary Features

In [None]:
# Columns treated as categorical features.
categoricals = [
    "building_id",
    "primary_use",
    "hour",
    "day",
    "weekend",
    "month",
    "meter",
]

# Columns treated as numerical features.
numericals = [
    "square_feet",
    "year_built",
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
]

In [None]:
# Weather columns that are not used as features and get dropped from both frames.
drop_cols = [
    "precip_depth_1_hr",
    "sea_level_pressure",
    "wind_direction",
    "wind_speed",
]

In [None]:
train_df[categoricals + numericals]

In [None]:
test_df[categoricals + numericals]

In [None]:
# Full feature list: the categorical columns first, followed by the numerical ones.
feat_cols = [*categoricals, *numericals]

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
#plotting 100 highest consuming buildings
#import matplotlib.pyplot as plt
#top_buildings = train_df.groupby("building_id")["meter_reading"].mean().sort_values(ascending = False).iloc[:100]

#for value in top_buildings.index:
#    train_df[train_df["building_id"] == value]["meter_reading"].rolling(window = 24).mean().plot()
#    plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram grid of train_df's columns: 20 green bins each, on a 10x10 inch figure.
train_df.hist(bins=20, color="green", figsize=(10, 10))

*When the input value is very small, np.log1p(x) computes log(1 + x) more accurately than calling np.log(1 + x) directly. It is also defined at x = 0, so a meter reading of zero maps to 0.*

In [None]:
# Take meter_reading out of the frame and keep log(1 + reading) as the training target.
target = np.log1p(train_df.pop("meter_reading"))

In [None]:
# Remove the unused weather columns plus site_id and floor_count from the training frame.
train_df = train_df.drop(columns=[*drop_cols, "site_id", "floor_count"])

In [None]:
# Same clean-up for the test frame, which additionally loses its row_id column.
test_df = test_df.drop(columns=[*drop_cols, "site_id", "floor_count", "row_id"])

In [None]:
test_df.info()

In [None]:
train_df.info()

In [None]:
gc.collect()

# Modelling

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

**Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample.**

The general procedure is as follows:

    Shuffle the dataset randomly.
    
    Split the dataset into k groups
    
    For each unique group:
    
        Take the group as a hold out or test data set
        Take the remaining groups as a training data set
        Fit a model on the training set and evaluate it on the test set
        Retain the evaluation score and discard the model
        
    Summarize the skill of the model using the sample of model evaluation scores
    


**You can see that I'm not actually doing Kfold. I am skipping the first 4 folds and then just doing the last one so I am actually training on the first 80% of data and validating on the last 20%. I intended to do kfold and then realized it was too compute heavy and may possibly cause leakage.**

In [None]:
# Hold-out validation of a two-stage model.
#
# Only the LAST split of an unshuffled 5-way KFold is used: the models are fitted on the
# first 80% of the rows and validated on the final 20%. No K-fold averaging takes place.
#   Stage 1 (gbm_class):   binary classifier for "is the log1p target greater than 0?"
#   Stage 2 (gbm_regress): regressor fitted only on rows whose target is greater than 0
# The combined prediction is the regressor output where the classifier says "non-zero",
# and 0 everywhere else.
num_folds = 5
# random_state has no effect without shuffling, and recent scikit-learn versions raise a
# ValueError when it is combined with shuffle=False, so it is not passed.
kf = KFold(n_splits = num_folds, shuffle = False)
error = 0
models = []

# Iterate through the splits, keeping only the index arrays of the last one. Looping
# (rather than building a list) avoids holding all five pairs of index arrays at once.
for train_index, val_index in kf.split(train_df):
    pass
print(train_index.max(), val_index.min())

#Splitting The DF Into Train Test Split
train_X = train_df[feat_cols].iloc[train_index]
val_X = train_df[feat_cols].iloc[val_index]
train_y = target.iloc[train_index]
val_y = target.iloc[val_index]

#Training The Model
# Stage 1: zero / non-zero classifier.
lgb_train = lgb.Dataset(train_X, train_y > 0)
lgb_eval = lgb.Dataset(val_X, val_y > 0)
params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss'},
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq' : 5
        }
# Early stopping and periodic logging are configured through callbacks; the
# early_stopping_rounds / verbose_eval keyword arguments are no longer accepted by
# lgb.train in LightGBM 4. Fresh callback objects are created for every train() call.
gbm_class = lgb.train(params,
            lgb_train,
            num_boost_round=2000,
            valid_sets=[lgb_train, lgb_eval],
            callbacks=[lgb.early_stopping(stopping_rounds=20),
                       lgb.log_evaluation(period=20)])

# Stage 2: the regressor is fitted and validated only on rows with a positive target.
positive_train = train_y > 0
positive_val = val_y > 0
lgb_train = lgb.Dataset(train_X[positive_train], train_y[positive_train])
lgb_eval = lgb.Dataset(val_X[positive_val], val_y[positive_val])
params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'learning_rate': 0.5,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq' : 5
        }
gbm_regress = lgb.train(params,
            lgb_train,
            num_boost_round=2000,
            valid_sets=[lgb_train, lgb_eval],
            callbacks=[lgb.early_stopping(stopping_rounds=20),
                       lgb.log_evaluation(period=20)])

# Combine both stages on the complete validation slice (zero rows included).
is_nonzero = gbm_class.predict(val_X, num_iteration=gbm_class.best_iteration) > .5
predicted_value = gbm_regress.predict(val_X, num_iteration=gbm_regress.best_iteration)
y_pred = is_nonzero * predicted_value

# Exactly one hold-out fold is evaluated, so its RMSE is reported as it is. Dividing it
# by num_folds, as if five folds were being averaged, would understate the error five-fold.
error = np.sqrt(mean_squared_error(val_y, y_pred))
print(error)

In [None]:
# Pair each importance value with its feature name and list the pairs from highest to lowest.
importance_pairs = zip(gbm_regress.feature_importance(), gbm_regress.feature_name())
sorted(importance_pairs, reverse=True)

In [None]:
del train_df
gc.collect()

# Modifying For Submissions

*tqdm: Is Used For Displaying Horizontal Animated Bar*

*`\` : Is the line-continuation character; it lets a single expression continue on the next line*

In [None]:
from tqdm import tqdm

# Predict the test set in batches of step_size rows to bound peak memory.
res=[]
step_size = 50000
for start in tqdm(range(0, test_df.shape[0], step_size)):
    # Select the columns explicitly, in training order. The models were fitted on
    # train_df[feat_cols], while test_df keeps its own column order (the calendar columns
    # come before primary_use there), and LightGBM matches DataFrame columns by position,
    # not by name. Slicing the rows first keeps the column selection cheap.
    batch = test_df.iloc[start:start+step_size][feat_cols]

    is_nonzero = gbm_class.predict(batch, num_iteration=gbm_class.best_iteration) > .5
    predicted_log = gbm_regress.predict(batch, num_iteration=gbm_regress.best_iteration)

    # Zero out rows the classifier marks as zero, then undo the log1p applied to the target.
    res.append(np.expm1(is_nonzero * predicted_log))

In [None]:
del test_df

In [None]:
# Stitch the per-batch prediction arrays into one array and summarise its distribution.
res = np.concatenate(res)
prediction_summary = pd.DataFrame(res).describe()
prediction_summary

In [None]:
print(res)

In [None]:
res.shape

In [None]:
# Load the sample submission file to use as the template for the output.
sub = pd.read_csv("../input/ashrae-energy-prediction/sample_submission.csv")

In [None]:
# Overwrite the template's meter_reading column with the predictions.
# NOTE(review): assumes the rows of res are in the same order as the rows of sub - confirm.
sub["meter_reading"] = res

In [None]:
gc.collect()

In [None]:
# Delete the two pickle cache files from the current directory.
for cache_file in ("./train_df.pkl", "./test_df.pkl"):
    os.remove(cache_file)

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv("submission.csv", index = False)