In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import all the modules that we need.

In [None]:
import gc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Define a function to reduce the memory usage.

In [None]:
#Based on this great kernel https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65
def reduce_mem_usage(df):
    """Shrink ``df`` in place by downcasting its numeric columns.

    Every numeric (or boolean) column is converted to the smallest integer
    dtype that can hold its values when all of its values are whole numbers,
    and to ``float32`` otherwise. Non-numeric columns are left untouched.

    Integer dtypes cannot store NaN, so a column containing non-finite values
    has its NaNs replaced by the sentinel ``column minimum - 1`` before the
    target dtype is chosen.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. It is modified in place.

    Returns
    -------
    tuple
        ``(df, NAlist)`` where ``NAlist`` lists the columns in which
        non-finite values were found (and whose NaNs were filled).
    """
    start_mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in.
    for col in df.columns:
        # Only numeric/boolean columns can be downcast; strings and any other
        # dtype are skipped.
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",df[col].dtype)
        mx = df[col].max()
        mn = df[col].min()
        print("min for this col: ",mn)
        print("max for this col: ",mx)

        # Integer does not support NA, therefore, NA needs to be filled
        if not np.isfinite(df[col]).all():
            NAlist.append(col)
            # The sentinel becomes the new column minimum; track it so the
            # dtype chosen below has room for it (e.g. -1 must not end up in
            # an unsigned dtype).
            mn = mn - 1
            # Assign back instead of a chained in-place fillna, which does not
            # reliably modify the frame.
            df[col] = df[col].fillna(mn)

        # A column is integer-valued when it is entirely finite and no value
        # has a fractional part. Absolute differences are summed so that
        # positive and negative fractions cannot cancel each other out.
        IsInt = False
        if np.isfinite(df[col]).all():
            asint = df[col].astype(np.int64)
            IsInt = bool((df[col] - asint).abs().sum() < 0.01)

        if IsInt:
            # Try the candidate dtypes from narrowest to widest and keep the
            # first whose inclusive range covers [mn, mx]. If none fits (e.g.
            # an empty column whose min/max are NaN) the column is unchanged.
            if mn >= 0:
                candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
            else:
                candidates = (np.int8, np.int16, np.int32, np.int64)
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= mn and mx <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Make float datatypes 32 bit
            df[col] = df[col].astype(np.float32)

        # Print new column type
        print("dtype after: ",df[col].dtype)
        print("******************************")
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return df, NAlist

# Load the CSV files used to train the model

In [None]:
# Read the building metadata, the training weather table and the training
# table, in that order.
building, weather_train, train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ashrae-energy-prediction/building_metadata.csv",
        "../input/ashrae-energy-prediction/weather_train.csv",
        "../input/ashrae-energy-prediction/train.csv",
    )
)

# Data collation: merge the three tables into a single training frame

In [None]:
# Left join: every training row is kept and gains its building's metadata columns.
train = pd.merge(train, building, how="left", on="building_id")

In [None]:
# Left join on (site_id, timestamp): rows without a matching weather record
# are kept and get NaN in the weather columns.
train = pd.merge(train, weather_train, how="left", on=["site_id", "timestamp"])

# Delete unnecessary data to save memory

In [None]:
# weather_train has already been merged into train and is not referenced again,
# so drop the name to let its memory be released.
del weather_train

# Parse the timestamp and extract numeric calendar features (hour, day, weekday, month)

In [None]:
# Parse the timestamp strings once, then derive one integer calendar feature
# per (column name, datetime attribute) pair.
# Note: the "weekend" column stores the day-of-week index taken from
# .dt.weekday, not a weekend flag.
train["timestamp"] = pd.to_datetime(train["timestamp"])
parsed = train["timestamp"].dt
for feature, attribute in (
    ("hour", "hour"),
    ("day", "day"),
    ("weekend", "weekday"),
    ("month", "month"),
):
    train[feature] = getattr(parsed, attribute)

In [None]:
# The raw timestamp is no longer needed once the calendar features exist.
train = train.drop(columns=["timestamp"])

# Encode the categorical `primary_use` column as integers so the model can use it

In [None]:
# Fit the encoder on the training labels, then replace the labels by their
# integer codes. `le` is kept so the same mapping can be applied to the test set.
le = LabelEncoder().fit(train["primary_use"])
train["primary_use"] = le.transform(train["primary_use"])

In [None]:
# Columns that are not part of the model's feature list.
unused_columns = [
    "precip_depth_1_hr",
    "sea_level_pressure",
    "wind_direction",
    "wind_speed",
    "site_id",
    "floor_count",
]
train = train.drop(columns=unused_columns)

In [None]:
# Show the merged and encoded training frame as the cell output for a quick sanity check.
train

In [None]:
# The model is trained on log(1 + meter_reading); predictions are mapped back
# with expm1 at inference time.
target = train["meter_reading"].pipe(np.log1p)

In [None]:
# Show the log1p-transformed target as the cell output.
target

In [None]:
# The raw reading now lives in `target`; remove it from the feature frame.
train = train.drop(columns="meter_reading")

# Reduce the memory usage

In [None]:
# Downcast the numeric columns of the training frame. NAlist receives the names
# of the columns in which reduce_mem_usage found (and filled) non-finite values.
train, NAlist = reduce_mem_usage(train)

# Build the model and train it

In [None]:
# Columns fed to the model; the same list is reused later to select the test columns.
data = ["building_id", "primary_use", "hour", "day", "weekend", "month", "meter","square_feet", "year_built", "air_temperature", "cloud_coverage","dew_temperature"]
num_folds = 3
# Contiguous (unshuffled) folds. random_state only has an effect together with
# shuffle=True and scikit-learn rejects it otherwise, so it is not passed.
kf = KFold(n_splits = num_folds, shuffle = False)
error = 0
models = []
evals_results = []
# The hyper-parameters do not depend on the fold, so they are defined once.
params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'learning_rate': 0.6,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq' : 4
        }
for i, (train_index, test_index) in enumerate(kf.split(train)):
    # Skip every fold except the last one: a single model is trained on the
    # earlier rows and validated on the final block of rows.
    if i + 1 < num_folds:
        continue
    print(train_index.max(), test_index.min())
    train_X = train[data].iloc[train_index]
    test_X = train[data].iloc[test_index]
    train_y = target.iloc[train_index]
    test_y = target.iloc[test_index]

    # Rows whose target is not strictly positive are left out of both the
    # training and the validation set.
    lgb_train = lgb.Dataset(train_X[train_y > 0], train_y[train_y > 0])
    lgb_test = lgb.Dataset(test_X[test_y > 0] , test_y[test_y > 0])
    evals_result = {}
    # Early stopping, periodic logging and metric recording are passed as
    # callbacks; the equivalent keyword arguments of lgb.train are no longer
    # accepted by current LightGBM releases.
    model = lgb.train(params,
                lgb_train,
                num_boost_round=2000,
                valid_sets=[lgb_train, lgb_test],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=30),
                    lgb.log_evaluation(period=25),
                    lgb.record_evaluation(evals_result),
                ])
    models.append(model)
    evals_results.append(evals_result)

# Feature Importance analysis

In [None]:
# One figure per trained model: feature importance in the left panel and the
# recorded RMSE curves in the right panel.
for model, history in zip(models, evals_results):
    _, (importance_ax, metric_ax) = plt.subplots(1, 2, figsize=(15, 6))
    lgb.plot_importance(model, ax=importance_ax)
    lgb.plot_metric(history, metric='rmse', ax=metric_ax)

plt.show()

In [None]:
# Rank the features of the last trained model by importance, highest first.
# The model is taken explicitly from `models` so this cell does not depend on
# a loop variable left behind by another cell.
sorted(zip(models[-1].feature_importance(), models[-1].feature_name()), reverse = True)

# Import the test data and repeat the processing steps applied to the training data

In [None]:
# Load the rows for which predictions have to be produced.
test = pd.read_csv("../input/ashrae-energy-prediction/test.csv")

In [None]:
# Left join: every test row is kept and gains its building's metadata columns.
test = pd.merge(test, building, how="left", on="building_id")

In [None]:
# Reuse the encoder fitted on the training data so that test labels map to the
# same integer codes as in training.
test["primary_use"] = le.transform(test["primary_use"])

# Reduce the memory usage again

In [None]:
# Downcast the numeric columns of the test frame. The returned list overwrites
# the NAlist obtained for the training frame.
# NOTE(review): the weather columns have not been merged into `test` yet at this
# point, so unlike in the training frame their missing values are not replaced
# by reduce_mem_usage's fill value — confirm this train/test difference is intended.
test, NAlist = reduce_mem_usage(test)

# Run the garbage collector to reclaim memory that is no longer reachable

In [None]:
# Force a garbage-collection pass; the cell output is the value returned by gc.collect().
gc.collect()

In [None]:
# Load the weather table that is joined to the test rows below.
weather_test = pd.read_csv("../input/ashrae-energy-prediction/weather_test.csv")

In [None]:
# Remove the weather measurements that are not model features before merging,
# so the merged test frame stays smaller.
unused_weather_columns = [
    "precip_depth_1_hr",
    "sea_level_pressure",
    "wind_direction",
    "wind_speed",
]
weather_test = weather_test.drop(columns=unused_weather_columns)

In [None]:
# Left join on (site_id, timestamp): test rows without a matching weather
# record are kept and get NaN in the weather columns.
test = pd.merge(test, weather_test, how="left", on=["site_id", "timestamp"])

In [None]:
# weather_test has been merged into test and is not referenced again, so drop
# the name to let its memory be released.
del weather_test

In [None]:
# Parse the timestamp strings once, derive the same calendar features as for
# the training frame (stored as uint8), then keep only the model's feature columns.
# Note: the "weekend" column stores the day-of-week index taken from
# .dt.weekday, not a weekend flag.
test["timestamp"] = pd.to_datetime(test["timestamp"])
parsed = test["timestamp"].dt
for feature, attribute in (
    ("hour", "hour"),
    ("day", "day"),
    ("weekend", "weekday"),
    ("month", "month"),
):
    test[feature] = getattr(parsed, attribute).astype(np.uint8)
test = test[data]

# Predict the final result

In [None]:
from tqdm import tqdm

# Predict in slices of step_size rows to bound peak memory. Each slice's result
# is the average over all models of expm1(prediction), which undoes the log1p
# applied to the training target.
step_size = 100000
chunks = []
for i in tqdm(range(0, test.shape[0], step_size)):
    batch = test.iloc[i:i+step_size]
    r = np.zeros(batch.shape[0])
    for model in models:
        r += np.expm1(model.predict(batch, num_iteration=model.best_iteration)) / len(models)
    chunks.append(r)
# Concatenate once at the end instead of re-copying a growing array per slice.
res = np.concatenate(chunks) if chunks else np.array([])

# Make the submission

In [None]:
# Load the sample submission file, which serves as the template for the output.
submission = pd.read_csv("../input/ashrae-energy-prediction/sample_submission.csv")

In [None]:
# Store the predictions in the submission's meter_reading column.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order
# as the rows of test.csv — confirm.
submission["meter_reading"] = res

In [None]:
# Show the filled-in submission frame as the cell output.
submission

In [None]:
# Write the submission to Final_result.csv in the current working directory,
# without the DataFrame index.
submission.to_csv('Final_result.csv', index=False)