In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('../input/ashrae-energy-prediction'):
    for filename in filenames:
        #df_name = os.path.splitext(filename)[0]
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from time import time
import datetime
pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',1500)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
from collections import Counter 
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, \
                            f1_score, roc_curve, confusion_matrix

In [None]:
# Read every competition CSV with explicit column dtypes so the large files are
# parsed straight into compact numeric types.

# --- weather: the train and test files share one dtype map ---
weather_dtype = {
    "site_id": "uint8",
    'air_temperature': "float16",
    'cloud_coverage': "float16",
    'dew_temperature': "float16",
    'precip_depth_1_hr': "float16",
    'sea_level_pressure': "float32",
    'wind_direction': "float16",
    'wind_speed': "float16",
}
df_weather_train = pd.read_csv(
    '../input/ashrae-energy-prediction/weather_train.csv',
    parse_dates=['timestamp'],
    dtype=weather_dtype,
)
df_weather_test = pd.read_csv(
    '../input/ashrae-energy-prediction/weather_test.csv',
    parse_dates=['timestamp'],
    dtype=weather_dtype,
)

# --- building characteristics ---
metadata_dtype = {
    'site_id': "uint8",
    'building_id': 'uint16',
    'square_feet': 'int',
    'year_built': 'float32',
    'floor_count': "float16",
}
df_buildings = pd.read_csv(
    '../input/ashrae-energy-prediction/building_metadata.csv',
    dtype=metadata_dtype,
)

# --- meter readings: train.csv and test.csv are read with the same dtype map ---
train_dtype = {
    'meter': "uint8",
    'building_id': 'uint16',
    'meter_reading': "float32",
}
df_train = pd.read_csv(
    '../input/ashrae-energy-prediction/train.csv',
    parse_dates=['timestamp'],
    dtype=train_dtype,
)
df_test = pd.read_csv(
    '../input/ashrae-energy-prediction/test.csv',
    parse_dates=['timestamp'],
    dtype=train_dtype,
)

print("data loaded")

In [None]:
# find missing values in dataframes:
def missing_values(df):
    """Return the percentage of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` and a single column named "% NANs"
        holding the share of missing entries on a 0-100 scale. For a frame
        without rows the values are NaN.
    """
    # The mean of the boolean NaN mask is the missing fraction; it is scaled by
    # 100 so the numbers agree with the "%" in the column label.
    return (df.isna().mean() * 100).to_frame("% NANs")

## weather

In [None]:
df_weather_train.isnull().sum(axis = 0)

In [None]:
df_weather_train.describe()

Weather data contains many NaNs. Some should be imputed and some omitted. The cloud_coverage and precip_depth_1_hr columns have the biggest gaps. Based on the distribution of cloud_coverage (data not shown) and its importance, we decided to throw out only the precip_depth_1_hr column. Importantly, we imputed the missing values in the weather dataset *site-specifically*, because each site corresponds to the actual location where the weather data comes from.

We will try to impute the other values.

## buildings

In [None]:
display(df_buildings.describe())
display(missing_values(df_buildings))
df_buildings.info(memory_usage='deep')

Looking at the building data, we decided to remove the "floor_count" and "year_built" features from the dataset, because they contain 53% and 76% NaNs, respectively, which can't be imputed.

## train

In [None]:
# Preview of the raw training rows, share of missing values per column and
# summary statistics, rendered in that order.
display(
    df_train.head(),
    missing_values(df_train),
    df_train.describe(),
)

# Metadata row of building 1099; as the last expression it becomes the cell output.
df_buildings.loc[df_buildings['building_id'] == 1099]

In [None]:
display(df_train[(df_train['building_id'] == 1099) & (df_train['meter'] == 2)]['meter_reading'].describe())
df_buildings[df_buildings['building_id']==1099]

In [None]:
# Outlier fences (1.5 * IQR rule) for the steam readings (meter 2), computed on
# the training data with building 1099 left out.
tt = df_train[df_train.building_id != 1099]
steam_readings = tt.loc[tt['meter'] == 2, 'meter_reading']
q25, q75 = (steam_readings.quantile(level) for level in (0.25, 0.75))

IQR = q75 - q25
lowerIQR, upperIQR = q25 - 1.5 * IQR, q75 + 1.5 * IQR
print(lowerIQR, upperIQR)

# NOTE(review): both numbers are hard-coded; presumably the largest steam reading
# of building 1099 divided by the upper fence printed above - confirm.
print(int(21904700/2483.25))

In [None]:
# Readings of building 1099, split into steam (meter 2) and electricity (meter 0).
is_building_1099 = df_train['building_id'] == 1099
ajut = df_train[is_building_1099 & (df_train['meter'] == 2)]
ajut_e = df_train[is_building_1099 & (df_train['meter'] == 0)]

# Electricity is drawn as recorded; steam is divided by 8820 so both curves
# share one axis.
electricity_days = ajut_e['timestamp'].dt.date
steam_days = ajut['timestamp'].dt.date
plt.plot(electricity_days, ajut_e['meter_reading'])
plt.plot(steam_days, ajut['meter_reading']/8820)
plt.show()

Building no. 1099's 'steam' meter has exceptionally high energy values. We calculated that any 'steam' value above 2483.25 in the dataset can be considered an outlier. So, instead of deleting that building's data, we divided its steam meter values by 8820 so that they fit the rest of the dataset.

In [None]:
# Look closer at the energy consumption of site 0.

# Overwrite the readings with log(1 + x).
# NOTE(review): running this cell a second time applies the transform again.
df_train['meter_reading'] = np.log1p(df_train['meter_reading'])

site0_bds = list(df_buildings.loc[df_buildings['site_id'] == 0, 'building_id'])

# Narrow the training data to site 0 once, then draw one scatter series per
# building (same buildings, same order, same points as filtering the full
# frame for every building).
site0_rows = df_train[df_train['building_id'].isin(site0_bds)]

plt.figure(figsize=(10,6))
for i in site0_bds:
    temp_df = site0_rows[site0_rows['building_id'] == i]
    plt.scatter(temp_df['timestamp'].dt.date, temp_df['meter_reading'], marker='.')

plt.show()

We noticed that after converting the energy values to the log1p scale, we get a more decent data distribution.

We also noticed that site 0 has weird energy consumption pattern until 2016.06. We decided to omit this data until that date. 

In [None]:
# One histogram of the (log-scaled) readings per meter type, overlaid on the
# same axes in meter-code order.
meter_labels = {0: "Electricity", 1: "ChilledWater", 2: "Steam", 3: "HotWater"}
for meter_code, meter_label in meter_labels.items():
    readings = df_train.loc[df_train['meter'] == meter_code, 'meter_reading']
    sns.distplot(readings, kde=False, label=meter_label)

plt.title("Distribution of Log of Meter Reading Variable")
plt.legend()
plt.show()


Most used energy source is Electricity, the others are far less used.

In [None]:
# Building ids grouped by primary use.
btypes = Counter(df_buildings['primary_use'])
building_meters = {
    b: df_buildings.loc[df_buildings['primary_use'] == b, 'building_id'].unique().tolist()
    for b in btypes
}

# Replace the numeric meter codes with readable names. The result is assigned
# back to the frame: calling replace(..., inplace=True) on the selected column
# is a chained in-place update, which does not reach df_train when pandas
# Copy-on-Write is active.
df_train['meter'] = df_train['meter'].replace(
    {0: "Electricity", 1: "ChilledWater", 2: "Steam", 3: "HotWater"}
)

# Meter found in the first training row of every building, looked up once
# instead of scanning the whole training frame for each building.
first_meter = df_train.drop_duplicates(subset='building_id').set_index('building_id')['meter']

# For each building type, count the buildings by that first-listed meter.
# Buildings without any training row are skipped.
for btype, b_list in building_meters.items():
    temp = Counter(first_meter.reindex(b_list).dropna())
    print(btype, dict(temp))

As shown, all building types used 'Electricity'. Many building types contained so little data that we decided to merge them under the building type "Other".

## Fix data

In [None]:
#import train data again:

df_train=pd.read_csv('../input/ashrae-energy-prediction/train.csv',parse_dates=['timestamp'],dtype=train_dtype)

In [None]:
%%time

# Site 0 (building ids up to 104): discard every reading stamped at or before
# 2016-05-20 00:00, where the consumption pattern looked abnormal.
early_site0 = (df_train['building_id'] <= 104) & (df_train['timestamp'] <= "2016-05-20")
to_del = df_train.index[early_site0]
df_train = df_train.drop(index=to_del)


# Unit fix for the electricity meter (meter 0) of site 0:
#   multiplying by 0.2931 brings the model inputs to kWh like the other sites;
#   predictions must be multiplied by 3.4118 to get back to kBTU for scoring.
site0_electricity = (df_train['building_id'] <= 104) & (df_train['meter'] == 0)
df_train.loc[site0_electricity, 'meter_reading'] *= 0.2931

# Scale down the abnormally high steam readings (meter 2) of building 1099 so
# they fall inside the outlier fence computed from all other steam readings.
# NOTE(review): the exploration cell and the notes use 8820 as divisor, this
# cell uses 8744 - whichever is intended, the inverse transform applied to the
# predictions has to use the same number as this line.
building_1099_steam = (df_train['building_id'] == 1099) & (df_train['meter'] == 2)
df_train.loc[building_1099_steam, 'meter_reading'] /= 8744


# Model the readings on the log(1 + x) scale; predictions need np.expm1 to
# return to the original scale.
df_train['meter_reading'] = np.log1p(df_train['meter_reading'])


# Weather: remove 'precip_depth_1_hr' from both the train and the test table.
for weather_frame in (df_weather_train, df_weather_test):
    weather_frame.drop(columns='precip_depth_1_hr', inplace=True)


# BUILDINGS: remove 'floor_count' and 'year_built'.
df_buildings.drop(columns=['floor_count', 'year_built'], inplace=True)

# Group the least common building types under "Other".
# The replaced column is assigned back to the frame: replace(..., inplace=True)
# on df_buildings['primary_use'] is a chained in-place update, which leaves
# df_buildings untouched when pandas Copy-on-Write is active.
rare_building_types = [
    'Healthcare',
    'Parking',
    'Warehouse/storage',
    'Manufacturing/industrial',
    'Retail',
    'Services',
    'Technology/science',
    'Food sales and service',
    'Utility',
    'Religious worship',
]
df_buildings['primary_use'] = df_buildings['primary_use'].replace(
    {building_type: "Other" for building_type in rare_building_types}
)


#impute missing variables for weather (within the site-ids!):
def impute_cols(df):
    """Fill gaps in the numeric columns of ``df`` by interpolation, per site.

    Rows are grouped by their ``site_id`` value and every numeric column other
    than ``site_id`` is interpolated (pandas default: linear, forward) inside
    each group, so a gap is only ever filled from readings of the same site.
    The frame is modified in place.

    Grouping by value replaces the earlier positional slicing, whose inclusive
    ``.loc`` ranges overlapped by one row and let the last reading of one site
    seed the leading gap of the next. It also removes the requirement that the
    frame has a 0..n-1 index with each site stored as one contiguous run.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``site_id`` column. Non-numeric columns (for example a
        datetime ``timestamp``) are left untouched.

    Returns
    -------
    None

    Notes
    -----
    Missing values at the very start of a site have no earlier reading to
    interpolate from and stay NaN; trailing gaps take the last valid value.
    """
    value_cols = list(
        df.select_dtypes(include="number").columns.drop("site_id", errors="ignore")
    )
    if not value_cols:
        return

    filled = df.groupby("site_id", sort=False)[value_cols].transform(
        lambda group: group.interpolate()
    )
    # Cast back so the compact column dtypes chosen at load time are preserved.
    df[value_cols] = filled.astype(df[value_cols].dtypes.to_dict())
        
impute_cols(df_weather_train)
impute_cols(df_weather_test)



#weather_test_df = weather_test_df.groupby('site_id').apply(lambda group: group.interpolate(limit_direction='both'))
#df_weather_test.groupby('site_id').apply(lambda group: group.isna().sum())
#Counter(df_weather_train.isnull().any(axis=1))

gc.collect()

In [None]:
%%time

# Merge all datasets: building metadata is attached by building_id, weather by
# (site_id, timestamp). Both joins are left joins, so every meter row is kept.

# for train:
df_train = df_train.merge(df_buildings, on='building_id', how='left', copy=False)
df_train = df_train.merge(df_weather_train, on=['site_id', 'timestamp'], how='left', copy=False)
print("trainig data shape:", df_train.shape)

# for test:
df_test = df_test.merge(df_buildings, on='building_id', how='left', copy=False)
df_test = df_test.merge(df_weather_test, on=['site_id', 'timestamp'], how='left', copy=False)


# site 8 has more data on more dates, so rows of other sites at these dates end
# up with missing values after the join; every row holding a NaN is dropped.
# NOTE(review): this also removes rows from df_test, so those rows will not
# receive a prediction - confirm this is acceptable for the submission.
df_train = df_train.dropna()
df_test = df_test.dropna()

# Sanity check: count rows that still contain a missing value.
for message, frame in (("NA values in train dataset:", df_train),
                       ("NA values in test dataset:", df_test)):
    print(message, dict(Counter(frame.isnull().any(axis=1))))
#df_train[df_train.isnull().any(axis=1)]


# Generate time data from timestamp and delete the latter:
def preprocess(df):
    """Replace the ``timestamp`` column of ``df`` with calendar features, in place.

    Adds the columns ``hour``, ``day``, ``month``, ``dayofweek`` and
    ``weekend`` (True where ``dayofweek`` is 5 or 6) at the end of the frame
    and removes ``timestamp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``timestamp`` column.

    Returns
    -------
    None
    """
    # pop() removes the column from the frame and hands back its values.
    stamp = df.pop("timestamp").dt
    df["hour"] = stamp.hour
    df["day"] = stamp.day
    df["month"] = stamp.month
    df["dayofweek"] = stamp.dayofweek
    df["weekend"] = df["dayofweek"] >= 5
    

preprocess(df_train) 
preprocess(df_test)

gc.collect()


In [None]:
print(df_test.shape)
print(df_train.shape)

## train for each meter separately:

### {0:"Electricity",1:"ChilledWater",2:"Steam",3:"HotWater"}

In [None]:
# extra preprocess:

def prepare_meter_data(metertype):
    """Build the model inputs for one meter type from ``df_train`` / ``df_test``.

    Reads the module-level frames ``df_train`` and ``df_test``, keeps the rows
    whose ``meter`` equals ``metertype``, drops the columns that are not used
    as features and one-hot encodes ``month``, ``dayofweek`` and
    ``primary_use``.

    Both frames are encoded with the same category levels (the union of the
    values seen in either one), so they always end up with the same dummy
    columns in the same order. Encoding each frame on its own would silently
    produce different feature sets whenever a level occurs in only one of them.

    Parameters
    ----------
    metertype
        Value compared against the ``meter`` column
        (0: Electricity, 1: ChilledWater, 2: Steam, 3: HotWater).

    Returns
    -------
    tuple
        ``(df_train_mod, df_val_mod, df_test_mod)``: training features, the
        matching ``meter_reading`` target Series, and the test features.
        Columns that exist only in ``df_test`` are kept in the test features.
    """
    categorical = ["month", "dayofweek", "primary_use"]
    # Columns removed from the features; "meter" is constant after the filter.
    todrop = ["hour", "day", "weekend", "meter"]

    # get indexes of rows with selected metertype
    tr_rowids = df_train[df_train['meter'] == metertype].index
    ts_rowids = df_test[df_test['meter'] == metertype].index

    # slice out selected rows for train and test dataset separately
    df_val_mod = df_train.loc[tr_rowids, 'meter_reading']
    df_train_mod = df_train.loc[tr_rowids].drop(['meter_reading'] + todrop, axis=1)
    df_test_mod = df_test.loc[ts_rowids].drop(todrop, axis=1)

    # Shared, sorted category levels per categorical column.
    levels = {
        col: sorted(set(df_train_mod[col].unique()) | set(df_test_mod[col].unique()))
        for col in categorical
    }
    df_train_mod = df_train_mod.assign(
        **{col: pd.Categorical(df_train_mod[col], categories=levels[col]) for col in categorical}
    )
    df_test_mod = df_test_mod.assign(
        **{col: pd.Categorical(df_test_mod[col], categories=levels[col]) for col in categorical}
    )

    # one-hot encoding for categorical variables
    df_train_mod = pd.get_dummies(df_train_mod, columns=categorical)
    df_test_mod = pd.get_dummies(df_test_mod, columns=categorical)

    return (df_train_mod, df_val_mod, df_test_mod)


In [None]:
# As example, create train, val, test datsets for "steam"
train, val, test = prepare_meter_data(2)

In [None]:
print(train.shape, val.shape, test.shape)

Create predictions for all models and put them back into the df_test dataset under the 'meter_reading' column for the final conversions:

## After training the model and predicting the results, you need to transform some data back (in the created 'meter_reading' column of the test dataset) before submitting it.

In [None]:
'''

# convert all meter readings back freom log1p scale: 
df_test['meter_reading'] = np.expm1(df_test['meter_reading'])

# de-fix format error for "Energy" in site 0 ((Multiply model inputs 3.4118 to get back to kBTU for scoring)):
df_test.loc[(
    df_test['building_id'] <=104) & (df_test['meter'] == 0), 'meter_reading'] *= 3.4118

# de-fix abnormaly high "steam" values for building 1099 ((multiply values with 8744):
df_test.loc[(
    df_test['building_id'] == 1099) & (df_test['meter'] == 2), 'meter_reading'] *= 8744

'''