In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import gc
from sklearn import preprocessing
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` to the smallest dtype that can hold their values.

    The frame is modified in place and also returned.

    Per-column behaviour:
      * signed integers   -> smallest of int8/int16/int32/int64 that fits
      * unsigned integers -> smallest of uint8/uint16/uint32/uint64 that fits
      * floats            -> float16 (only if ``use_float16``), else float32,
                             else float64 when the range (or inf/all-NaN) does not fit
      * object            -> ``category``
      * everything else (datetime, timedelta, bool, categorical and other
        pandas extension dtypes) is left untouched.

    Range checks are inclusive, so a column whose min/max sit exactly on a
    type's limits (e.g. -128..127) is still downcast to that type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    use_float16 : bool, default False
        Allow float16 as a target. Off by default because float16 loses
        precision and the feather format does not support it.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    # Candidate float targets, narrowest first; float64 is the fallback below.
    float_types = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # pandas extension dtypes (categorical, tz-aware datetime, nullable
        # ints, ...) are not plain NumPy dtypes and are deliberately skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only true integer/float columns are downcast. Datetime ('M'),
        # timedelta ('m'), bool ('b'), etc. keep their dtype instead of being
        # pushed through numeric range checks.
        kind = col_type.kind
        if kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind in 'iu':
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. empty column -> NaN min/max): leave dtype as is.
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN (comparisons
                # with NaN are False): keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Meter readings: labelled training rows and the rows to predict.
train = pd.read_csv("/kaggle/input/ashrae-energy-prediction/train.csv")
test = pd.read_csv("/kaggle/input/ashrae-energy-prediction/test.csv")

# Per-building metadata, joined later on building_id.
building_info = pd.read_csv("/kaggle/input/ashrae-energy-prediction/building_metadata.csv")

# Weather observations for the training and test periods.
weather_info_train = pd.read_csv("/kaggle/input/ashrae-energy-prediction/weather_train.csv")
weather_info_test = pd.read_csv("/kaggle/input/ashrae-energy-prediction/weather_test.csv")

In [None]:
# train = reduce_mem_usage(train)
# test = reduce_mem_usage(test)

# weather_info_train = reduce_mem_usage(weather_info_train)
# weather_info_test = reduce_mem_usage(weather_info_test)
# building_info = reduce_mem_usage(building_info)

In [None]:
# Enrich each meter reading with its building's metadata, then with the
# weather row that shares the same site_id and timestamp. Left joins keep
# every training row even when metadata or weather is missing.
train_all = (
    train
    .merge(building_info, how='left', on='building_id')
    .merge(weather_info_train, how='left', on=['site_id', 'timestamp'])
)
gc.collect()
train_all.head()


In [None]:
# The training weather frame has already been merged into train_all, so drop
# the name to let its memory be reclaimed.
# NOTE: re-running this cell without re-loading the CSVs raises NameError.
del weather_info_train

In [None]:
# Check the correlation between the numerical independent variables and the meter_reading.
# Only numeric columns are correlated: the frame still contains string columns
# (e.g. timestamp, primary_use), on which DataFrame.corr() raises in pandas >= 2.0.
corr_matrix = train_all.select_dtypes(include=np.number).corr()
# Based on the correlation matrix, the following columns won't be included in the model:
# precip_depth_1_hr, sea_level_pressure, wind_direction, dew_temperature.
# Before dropping any columns, there are 16 columns in train_all.
gc.collect()
# Keep the matrix as the cell's last expression so the notebook actually renders it
# (previously gc.collect() was last, so the correlation result was discarded).
corr_matrix

In [None]:
# Remove the weather columns that were ruled out after the correlation check.
unused_weather_columns = [
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'dew_temperature',
]
train_all = train_all.drop(columns=unused_weather_columns)
# 12 columns remain after dropping these 4 weather columns.
# train_all has 20216100 rows before any rows are dropped.

In [None]:
# This is time-series data: we predict meter_reading for each meter type of each
# building, i.e. the trend of every (building, meter) series.
# Stretches of a meter's history that do not represent its trend and will not
# recur in the future should therefore be removed.
# From the EDA: all electricity meters (meter == 0) read 0 until May 20 for
# site_id == 0, which is selected here via building_id <= 104.
# `timestamp` is still a string at this point, so the cutoff below is a string
# comparison (presumably keeping everything from 2016-05-21 onward - verify
# against the timestamp format).
zero_reading_period = (
    (train_all['building_id'] <= 104)
    & (train_all['meter'] == 0)
    & (train_all['timestamp'] <= "2016-05-21")
)
train_all = train_all.drop(index=train_all.loc[zero_reading_period].index)
del zero_reading_period
#train_df = train_df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')
# train_all.shape
# There are 19867540 rows after dropping those rows.

In [None]:
# Replace the raw timestamp column with calendar features.
# Open question: why extract time features instead of using the timestamp as a predictor directly?
# Idea: using an FFT to find the major frequencies in the data might be helpful.
timestamps = pd.to_datetime(train_all.pop("timestamp"))
parts = timestamps.dt

train_all["hour"] = parts.hour  # hour of the day
# The day of the month probably does not change the energy consumption pattern much.
train_all["day"] = parts.day  # day of the month
train_all["dayofweek"] = parts.weekday  # day of the week; same as dt.dayofweek
train_all["month"] = parts.month  # month of the year

del parts, timestamps
gc.collect()

In [None]:
# train_all[train_all['building_id']==104].shape#(5400, 15)
# train_all[train_all['building_id']==105].shape#(8784, 15)
# Since certain timestamps were deleted for some buildings, the time series now has a
# different length for each building. This problem needs to be taken care of!

In [None]:
# Preview the first rows of the current train_all frame.
train_all.head()

In [None]:
# train_all.groupby('site_id')['cloud_coverage'].value_counts()

In [None]:
#  train_all[train_all['site_id']==2].air_temperature.value_counts()
# Mean air temperature for every (site_id, month) pair.
l = train_all.groupby(['site_id', 'month'])['air_temperature'].mean()
# Select the site_id == 0 slice; not the last expression, so it is not displayed.
l[0]
# Distribution of air_temperature readings for site 2 on December 25th.
site2_dec25 = (
    (train_all['site_id'] == 2)
    & (train_all['month'] == 12)
    & (train_all['day'] == 25)
)
train_all.loc[site2_dec25, 'air_temperature'].value_counts()
# train_all[(train_all['site_id']==2) & (train_all['month']==12) & (train_all['day']==25)]

In [None]:
# Target the model is trained to predict.
label_column = 'meter_reading'

# Predictors that are identifiers/labels rather than quantities.
# (The name keeps its original spelling because other cells may reference it.)
categorical_feacture = ['building_id', 'site_id', 'primary_use']

# Predictors treated as numeric: building attributes, weather, calendar features.
numerical_feature = [
    'square_feet', 'year_built', 'floor_count',
    'air_temperature', 'cloud_coverage', 'wind_speed',
    'hour', 'dayofweek', 'month',
]

# Full predictor list: categorical columns first, then numeric ones.
feature_columns = [*categorical_feacture, *numerical_feature]

In [None]:
train_all.isna().sum()
# As we can see, there are many missing values in year_built, floor_count, air_temperature, and cloud_coverage.
# How to deal with the missing values in this case is an interesting question.
# I first considered filling the NaNs in air_temperature and cloud_coverage with the per-site_id mean.
# However, I later found that within a single site_id the values of air_temperature and
# cloud_coverage differ quite a lot from one another.
# As for year_built and floor_count, these are properties of a specific building, and I don't think
# it is reasonable to fill their NaNs with mean values.

# For now, I'll leave the NaN values as they are.

# TODO: later, try filling the NaN values in the weather dataframe by interpolation.

# Filling the NaNs in air_temperature and cloud_coverage with the mean temperature / cloud_coverage
# for the same day of the month seems to work well. From this notebook:
# https://www.kaggle.com/aitude/ashrae-missing-weather-data-handling

In [None]:
# Prepare the dataframe so the model can read it: each column is converted from a
# pandas Series into a NumPy array by the function below.
def convert_to_tensor(s):
    """Convert one column into a NumPy array suitable for model input.

    Parameters
    ----------
    s : pandas.Series or numpy.ndarray
        Column to convert; only its ``dtype`` and values are used.

    Returns
    -------
    numpy.ndarray or None
        * Any NumPy integer, unsigned-integer or float dtype (not only
          int64/float64, so int32 ``.dt`` features and downcast columns are
          handled too): a float32 array in which NaNs are replaced by the mean
          of the non-NaN values. If the column has no non-NaN values, NaNs are
          replaced by 0.0 so the result never contains NaN.
        * ``object`` dtype: the values as an array, unchanged.
        * Any other dtype (datetime, categorical, bool, ...): ``None``.
    """
    dt = s.dtype
    if isinstance(dt, np.dtype) and dt.kind in "iuf":
        a = np.asarray(s).astype("float32")
        observed = a[~np.isnan(a)]
        # Mean of an empty selection would be NaN (plus a RuntimeWarning),
        # which would leave the NaNs in place; fall back to 0.0 instead.
        fill_value = observed.mean() if observed.size else 0.0
        # nan_to_num also maps +/-inf to the largest finite float32 values.
        return np.nan_to_num(a, nan=fill_value)
    if dt == "object":
        return np.array(s)
    return None
# One converted entry per column of train_all, in column order; an entry is
# whatever convert_to_tensor returns for that column.
train_all_dataset = [
    convert_to_tensor(train_all[column_name])
    for column_name in train_all.columns
]
    

In [None]:
# Run a garbage-collection pass; the integer shown as the cell output is
# gc.collect()'s return value.
gc.collect()

In [None]:
# Dump the converted columns for a quick look. Entries are None for columns
# whose dtype convert_to_tensor does not handle.
print(train_all_dataset)

In [None]:
def prepare_train_data(df, feature_cols=None, window=127,
                       unit_col="unit_number", target_col="RUL"):
    """Build fixed-length training windows, one per unit.

    For every distinct value of ``unit_col`` (in order of first appearance)
    the first ``window`` rows of ``feature_cols`` form one sample, and the
    value of ``target_col`` at row position ``window`` is its target.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of all units; rows of a unit are used in their existing order.
    feature_cols : list of str, optional
        Feature columns. When omitted, the notebook-level ``data_cols`` is
        used, as before. NOTE(review): ``data_cols`` is not defined in the
        visible part of this notebook - confirm it exists or pass this
        argument explicitly.
    window : int, default 127
        Number of leading rows per unit used as features.
    unit_col : str, default "unit_number"
        Column identifying the unit each row belongs to.
    target_col : str, default "RUL"
        Column the target value is read from.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, targets)`` where ``data`` has shape
        ``(n_units, window, n_features)`` and ``targets`` has shape
        ``(n_units,)``.

    Raises
    ------
    IndexError
        If a unit has ``window`` rows or fewer, so no target row exists.
    ValueError
        From ``np.stack`` when ``df`` contains no units.
    """
    if feature_cols is None:
        feature_cols = data_cols
    feature_cols = list(feature_cols)

    data_list = []
    target_list = []
    # One groupby pass instead of re-filtering the whole frame for every unit;
    # sort=False keeps the order of first appearance, like unique() did.
    for unit_number, unit in df.groupby(unit_col, sort=False):
        if len(unit) <= window:
            raise IndexError(
                "unit {!r} has {} rows; at least {} are required".format(
                    unit_number, len(unit), window + 1
                )
            )
        data_list.append(np.asarray(unit[feature_cols])[:window, :])
        target_list.append(np.asarray(unit[target_col])[window])
    return (np.stack(data_list), np.array(target_list))

In [None]:
# Among the training columns, building_id, meter, site_id, and primary_use are categorical variables
# and need to be encoded. Since this notebook trains one model per meter type, 'meter' is not
# included as a predictor here.
# OneHotEncoder is used:
ohe = preprocessing.OneHotEncoder()
# The encoder expects a 2-D (n_samples, 1) input.
bidarr = np.array(train_all['building_id']).reshape(-1, 1)
print(bidarr.shape)
# Keep the encoder's sparse output. Calling .toarray() here would allocate a dense
# (n_rows x n_distinct_building_ids) float64 matrix, which does not fit in memory
# for the ~19.9 million rows of train_all. Densify only small row slices when a
# dense batch is needed, e.g. building_id_one_hot[start:stop].toarray().
building_id_one_hot = ohe.fit_transform(bidarr)
gc.collect()