In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

# for plotting 
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# Any results you write to the current directory are saved as output.

# Introduction to competition 
Q: How much does it cost to cool a skyscraper in the summer?
A: A lot! And not just in dollars, but in environmental impact.

Thankfully, significant investments are being made to improve building efficiencies to reduce costs and emissions. The question is, are the improvements working? That’s where you come in. Under pay-for-performance financing, the building owner makes payments based on the difference between their real energy consumption and what they would have used without any retrofits. The latter values have to come from a model. Current methods of estimation are fragmented and do not scale well. Some assume a specific meter type or don’t work with different building types.

In this competition, you’ll develop accurate models of metered building energy usage in the following areas: chilled water, electric, hot water, and steam meters. The data comes from over 1,000 buildings over a three-year timeframe. With better estimates of these energy-saving investments, large scale investors and financial institutions will be more inclined to invest in this area to enable progress in building efficiencies.

# Input data

In [None]:
# Read the three training-side tables, in the same order as before:
# building metadata, training weather, training meter readings.
building_metadata, weather_train, train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ashrae-energy-prediction/building_metadata.csv",
        "../input/ashrae-energy-prediction/weather_train.csv",
        "../input/ashrae-energy-prediction/train.csv",
    )
)

In [None]:
# Report (rows, columns) for each loaded table; only the shape tuples are kept,
# so no extra reference to the large frames is created.
shape_report = (
    ('Size of train data', train.shape),
    ('Size of weather_train data', weather_train.shape),
    ('Size of building_meta data', building_metadata.shape),
)
for caption, shape in shape_report:
    print(caption, shape)

So the number of columns in each dataset isn't large, but the number of rows is huge: 20 million+ for train and 41 million+ for test. If we don't reduce memory consumption, the kernel will exceed Kaggle's RAM allocation limit and restart frequently.

# Reducing memory size
This section is borrowed from the great kernel https://www.kaggle.com/caesarlupum/ashrae-start-here-a-gentle-introduction. Reducing the size is important because, with the very large train (20 million+ rows) and test (40 million+ rows) sets, the kernel's RAM overflows and the kernel restarts frequently.


In [None]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    * Integer columns are converted to the narrowest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds are inclusive, so
      a column whose max is exactly 127 still fits int8).
    * Float columns are converted to float32 when their range fits, otherwise
      float64. float16 is deliberately never chosen: it keeps only about three
      significant decimal digits, and later aggregations on float16 columns
      (e.g. ``mean`` used for imputation) can overflow.
    * Columns whose dtype is not listed in ``numerics`` are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Try the integer widths from narrowest to widest and stop at the first fit.
            # An empty column yields NaN min/max, fails every comparison and is left as is.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f32 = np.finfo(np.float32)
            if f32.min < c_min and c_max < f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN columns, where both comparisons are False.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
## Reducing memory
# reduce_mem_usage modifies each frame in place (and returns the same object),
# so the return value does not need to be captured.
for df in (train, weather_train, building_metadata):
    reduce_mem_usage(df)

# Data description
## Files
### train.csv
* building_id - Foreign key for the building metadata.
* meter - The meter id code. Read as {0: electricity, 1: chilledwater, 2: steam, 3: hotwater}. Not every building has all meter types.
* timestamp - When the measurement was taken
* meter_reading - The target variable. Energy consumption in kWh (or equivalent). Note that this is real data with measurement error, which we expect will impose a baseline level of modeling error.

In [None]:
# Preview the first three rows of the training readings.
train.head(3)
# print(train.dtypes)

In [None]:
# Histogram of the raw target column (meter_reading).
train['meter_reading'].hist(figsize=(6, 5))

The distribution is very skewed and also has some outliers. It is less skewed after taking the log, so it is better to train on the log-transformed value.

In [None]:
# Same histogram after applying log1p to the target.
np.log1p(train['meter_reading']).hist(figsize=(6, 5))

### weather_[train/test].csv
Weather data from a meteorological station as close as possible to the site.
* site_id
* air_temperature - Degrees Celsius
* cloud_coverage - Portion of the sky covered in clouds, in oktas
* dew_temperature - Degrees Celsius
* precip_depth_1_hr - Millimeters
* sea_level_pressure - Millibar/hectopascals
* wind_direction - Compass direction (0-360)
* wind_speed - Meters per second

In [None]:
# Preview the first three rows of the training weather table.
weather_train.head(3)
# print(weather_train.dtypes)

In [None]:
# Histogram of air_temperature in the training weather data.
weather_train["air_temperature"].hist(figsize=(6, 4))

In [None]:
# Histogram of cloud_coverage in the training weather data.
weather_train["cloud_coverage"].hist(figsize=(6, 4))

In [None]:
# Histogram of dew_temperature in the training weather data.
weather_train["dew_temperature"].hist(figsize=(6, 4))

In [None]:
# Histogram of sea_level_pressure in the training weather data.
weather_train["sea_level_pressure"].hist(figsize=(6, 4))

In [None]:
# Histogram of wind_speed in the training weather data.
weather_train["wind_speed"].hist(figsize=(6, 4))

### building_metadata.csv
* site_id - Foreign key for the weather files.
* building_id - Foreign key for train.csv
* primary_use - Indicator of the primary category of activities for the building based on EnergyStar property type definitions
* square_feet - Gross floor area of the building
* year_built - Year building was opened
* floor_count - Number of floors of the building

In [None]:
# Print the dtypes first and keep `head(3)` as the cell's last expression:
# a notebook only renders the value of the final expression, so the preview
# was silently discarded when it came before the print call.
print(building_metadata.dtypes)
building_metadata.head(3)

In [None]:
# Histogram of square_feet as stored in the building metadata at this point.
building_metadata["square_feet"].hist(figsize=(6, 4))

In [None]:
# Overwrite square_feet with its log1p and plot the transformed values.
# NOTE: the column is replaced in place, so re-running this cell applies the
# transform a second time.
building_metadata["square_feet"] = np.log1p(building_metadata["square_feet"])
building_metadata["square_feet"].hist(figsize=(6, 4))

In [None]:
# Row counts per primary_use value in the building metadata.
building_metadata["primary_use"].value_counts()

# Fixing missing values

In [None]:
def check_missing(df, ascending=False):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    ascending : bool, default False
        Sort order of the report by number of missing values.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, with columns
        ``Total`` (count of nulls) and ``Percent`` (nulls as a percentage of
        the number of rows). Columns without nulls are omitted.
    """
    # Build the null mask once; on multi-million-row frames a second
    # `isnull()` pass (and a second sort) is pure overhead.
    total = df.isnull().sum().sort_values(ascending=ascending)
    percent = total / len(df) * 100
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

    # Only want to check columns with null values
    return missing_data[missing_data['Total'] != 0]

In [None]:
# check_missing already returns only the columns that contain nulls, so the
# whole report is shown directly; `.head(len(train))` could only ever truncate it.
check_missing(train)

In [None]:
# check_missing already returns only the columns that contain nulls, so the
# whole report is shown directly; `.head(len(weather_train))` could only ever truncate it.
check_missing(weather_train)

So, except for cloud_coverage and precip_depth_1_hr, the columns have less than 30% missing values. We'll drop those two columns and fill the missing values in the others with their column means.

In [None]:
# Drop the two sparsest weather columns, then mean-impute the remaining ones.
weather_train.drop(['cloud_coverage', 'precip_depth_1_hr'], axis=1, inplace=True)
weather_train.replace('NaN', np.nan, inplace=True)
for col in ["air_temperature", "dew_temperature", "sea_level_pressure", "wind_direction", "wind_speed"]:
    # Compute the mean in float64 so a narrow float dtype cannot overflow the sum.
    col_mean = weather_train[col].astype('float64').mean()
    # Assign the result back: `weather_train[col].fillna(..., inplace=True)` acts on a
    # temporary Series and does not reliably update the frame (it has no effect under
    # pandas copy-on-write).
    weather_train[col] = weather_train[col].fillna(col_mean)

In [None]:
# check_missing already returns only the columns that contain nulls, so the
# whole report is shown directly; `.head(len(building_metadata))` could only ever truncate it.
check_missing(building_metadata)

Both floor_count and year_built have a high share of missing values, so we'll drop them both for now.

In [None]:
# Remove the two metadata columns with many missing values (in place).
sparse_columns = ['floor_count', 'year_built']
building_metadata.drop(columns=sparse_columns, inplace=True)

# Merging files

In [None]:
def merging(df, weather):
    """Attach building metadata and weather columns to a readings frame.

    ``df`` is left-joined to the module-level ``building_metadata`` on
    ``building_id``; the result is then left-joined to ``weather`` on
    ``(site_id, timestamp)``. Rows of ``df`` without a match keep NaN in the
    joined columns. Returns the merged frame.
    """
    with_building = df.merge(building_metadata, on="building_id", how="left")
    return with_building.merge(weather, on=["site_id", "timestamp"], how="left")

In [None]:
# Join metadata and weather onto the training readings, then release the
# weather table, which is no longer needed on its own.
train = merging(df=train, weather=weather_train)
del weather_train
gc.collect()

In [None]:
# check_missing already returns only the columns that contain nulls, so the
# whole report is shown directly; `.head(len(train))` could only ever truncate it.
check_missing(train)

Note that the train data, which initially didn't have any null values, now has some. This is because weather_train doesn't have a row for every (site_id, timestamp) pair in the train data.

In [None]:
# Collect the columns stored with the generic "object" dtype.
print(train.columns)
categorical_features = [name for name in train.columns if train[name].dtype == "object"]
print(categorical_features)

Although this only finds 2 categorical columns, there are actually more. For example, from the data description we know:
* meter is an int code with 4 unique values, from 0 to 3.
* year_built has 116 unique values, excluding NaN.
* site_id takes values from 0 to 15.
* building_id is an int, but it has 1449 unique values.

However, to keep the data small, we are not trying one-hot encoding right now.

## Any column with a single unique value?

In [None]:
# Columns holding at most one distinct (non-null) value carry no information.
distinct_counts = train.nunique()
one_value_cols = list(distinct_counts.index[distinct_counts <= 1])
print(f'columns with unique value in train{one_value_cols}')

So no column has only a single unique value.

# Feature engineering
Timestamp-related features. All timestamps are in the year 2016.

In [None]:
# Parse the timestamp once, then derive calendar features from it.
train["timestamp"] = pd.to_datetime(train["timestamp"])
# NOTE: the "weekend" column stores the day-of-week number from `.dt.weekday`,
# not a boolean weekend flag. The name is kept as is because the test-set
# feature cell creates the same column.
for feature, accessor in (("hour", "hour"), ("day", "day"), ("weekend", "weekday"), ("month", "month")):
    train[feature] = getattr(train["timestamp"].dt, accessor)

In [None]:
# Replace the primary_use strings with integer codes. The fitted encoder `le`
# is kept so the same mapping can be applied to the test set later.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train["primary_use"] = le.fit_transform(train["primary_use"])

In [None]:
# Train on log1p(meter_reading); predictions are mapped back with expm1 later.
target = np.log1p(train["meter_reading"])
# The raw timestamp and the target itself must not be used as features.
drop_columns = ["timestamp", "meter_reading"]
train.drop(columns=drop_columns, inplace=True)

# Train with LightGBM

In [None]:
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed for reproducibility.
# NOTE(review): rows are split without regard to their timestamp - confirm that a
# random hold-out is the intended validation scheme for this time-series data.
x_train, x_test , y_train, y_test = train_test_split(train, target , test_size= 0.2, random_state=1)

In [None]:
# LightGBM training parameters.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'rmse'},
    'learning_rate': 0.3,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    # Row subsampling is only active when bagging_freq is non-zero; without it
    # the bagging_fraction above is ignored. 1 = re-sample at every iteration.
    'bagging_freq': 1
}


In [None]:
# Wrap both splits as LightGBM datasets, then drop the pandas copies.
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_test = lgb.Dataset(data=x_test, label=y_test)
del x_train, y_train
del x_test, y_test

In [None]:
# Train for up to 2000 rounds, stopping once the validation metric has not
# improved for 20 rounds and logging the metrics every 20 rounds.
# Early stopping and logging are passed as callbacks because the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of `lgb.train`
# were removed in LightGBM 4.0.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_train, lgb_test],
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(period=20),
    ],
)

In [None]:
# Only the fitted booster `gbm` is needed from here on; free everything else
# before the (larger) test set is loaded.
del lgb_train, lgb_test
del train, target
gc.collect()

# Input test data

In [None]:
# Read the test-side tables (weather first, then the rows to predict) and report their sizes.
weather_test, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ashrae-energy-prediction/weather_test.csv",
        "../input/ashrae-energy-prediction/test.csv",
    )
)
print('Size of weather_test_df data', weather_test.shape)
print('Size of test_df data', test.shape)

In [None]:
# Set row_id aside for the submission file and remove it from the features.
row_id = test.pop('row_id')

# Test data conversion

In [None]:
# Shrink both test-side frames in place.
for df in (test, weather_test):
    reduce_mem_usage(df)
# Drop the loop variable: it would otherwise keep referencing weather_test,
# so the `del weather_test` below would not actually release its memory.
del df

# Same cleaning as for the training weather: drop the two sparsest columns,
# then mean-impute the remaining ones.
weather_test.drop(['cloud_coverage', 'precip_depth_1_hr'], axis=1, inplace=True)
weather_test.replace('NaN', np.nan, inplace=True)
for col in ["air_temperature", "dew_temperature", "sea_level_pressure", "wind_direction", "wind_speed"]:
    # Compute the mean in float64 so a narrow float dtype cannot overflow the sum.
    col_mean = weather_test[col].astype('float64').mean()
    # Assign the result back: `weather_test[col].fillna(..., inplace=True)` acts on a
    # temporary Series and does not reliably update the frame (it has no effect under
    # pandas copy-on-write).
    weather_test[col] = weather_test[col].fillna(col_mean)

test = merging(test, weather_test)
del weather_test, building_metadata
gc.collect()


In [None]:
# Derive the same calendar features as for the training data.
test["timestamp"] = pd.to_datetime(test["timestamp"])
# NOTE: "weekend" stores the day-of-week number from `.dt.weekday`, matching
# the column of the same name built for training.
for feature, accessor in (("hour", "hour"), ("day", "day"), ("weekend", "weekday"), ("month", "month")):
    test[feature] = getattr(test["timestamp"].dt, accessor)
test.drop(columns=["timestamp"], inplace=True)

# Reuse the encoder fitted on the training data so the integer codes match.
test["primary_use"] = le.transform(test["primary_use"])

del le
gc.collect()

# Making prediction
Predicting on the whole test set at once still overflows the RAM, so we predict in chunks, which resolves that problem.

In [None]:
# Predict chunk by chunk, writing straight into one preallocated float array.
# Extending a Python list would box every one of the test-set predictions
# as a separate float object, which costs several times the memory of the array.
step = 50000
n_rows = len(test)
pred = np.empty(n_rows, dtype=np.float64)
for start in range(0, n_rows, step):
    stop = start + step  # slicing clamps at the end, so no min() is needed
    # The model was trained on log1p(meter_reading); expm1 maps back to the original scale.
    pred[start:stop] = np.expm1(
        gbm.predict(test.iloc[start:stop, :], num_iteration=gbm.best_iteration)
    )

In [None]:
# Pair every prediction with its row_id and inspect the summary statistics.
submission = pd.DataFrame(dict(row_id=row_id, meter_reading=pred))
submission['meter_reading'].describe()

There are no negative values in train, and energy consumption can't be negative, but the model seems to predict some. Let's fix them and then write the submission to CSV.

In [None]:
# Energy consumption cannot be negative: clamp negative predictions to zero.
# `clip` is vectorised, unlike a per-row Python lambda over tens of millions of rows.
submission['meter_reading'] = submission['meter_reading'].clip(lower=0)
submission.to_csv("submission.csv", index = False)

In [None]:
# Summary statistics of the predictions after the negative values were replaced.
submission['meter_reading'].describe()