# 1. 머리말 Introduction
건물 효율을 향상시켜 비용과 배출량을 줄이기 위해 상당한 투자가 이루어지고 있다. 
문제는 개선책이 효과가 있는가 하는 것이다.

이번 커널에서는 냉수, 전기, 온수, 증기계량기 등의 분야에서 계측된 건물 에너지 사용의 정확한 모델을 개발하게 된다. 
데이터는 3년 동안 1,000개 이상의 건물에서 나왔다. 이러한 에너지 절약형 투자에 대한 더 나은 추정치로, 대규모 투자자와 금융 기관들은 건물 효율성의 진보를 가능하게 하기 위해 이 분야에 더 많은 투자를 할 것이다.

![](https://storage.googleapis.com/kaggle-competitions/kaggle/9994/logos/thumb76_76.png?t=2019-10-08-17-08-54)

**호스트 정보**
1894년에 설립된 ASHRAE는 난방, 환기, 냉방 및 관련 분야의 기술과 과학을 발전시키는 역할을 한다. ASHRAE 회원들은 전 세계의 빌딩 시스템 설계와 산업 공정 전문가들을 대표한다. 132개국에서 54,000명 이상의 회원이 활동하고 있는 ASHRAE는 연구, 표준 작성, 출판 및 지속적인 교육을 지원하며, 오늘날 미래의 건축 환경을 만들어 가고 있다.

## 1-1. 제시된 과제
-> 테스트 세트의 각 ID에 대해 대상 변수를 예측해야 한다. 
제출 파일에는 header가 포함되어야 한다.

# 2. 패키지 불러오기

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import lightgbm as lgb
import datetime
import sys
import os
import gc #Garbage Collector
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Make sure automatic garbage collection is switched on.
gc.enable()
# Device id handed to the training and prediction cells further down.
device_id = 0  # cpu -> -1, gpu -> 0

## 2-1. Version

In [None]:
# Report the library and interpreter versions this notebook runs on.
for template, version in (
    ('pandas: {}', pd.__version__),
    ('numpy: {}', np.__version__),
    ('Python: {}', sys.version),
):
    print(template.format(version))

# 3. 데이터 로드

In [None]:
%%time
DATA_PATH = "../input/ashrae-energy-prediction/"
print(os.listdir(DATA_PATH))

# Load the raw tables.
building = pd.read_csv(os.path.join(DATA_PATH, 'building_metadata.csv'))
weather_train = pd.read_csv(os.path.join(DATA_PATH, 'weather_train.csv'))
weather_test = pd.read_csv(os.path.join(DATA_PATH, 'weather_test.csv'))

# Attach building metadata (by building_id) and then the weather readings
# (by site_id and timestamp) to every meter row; left joins keep all meter rows.
train_data = (
    pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
    .merge(building, on='building_id', how='left')
    .merge(weather_train, on=['site_id', 'timestamp'], how='left')
)
test_data = (
    pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
    .merge(building, on='building_id', how='left')
    .merge(weather_test, on=['site_id', 'timestamp'], how='left')
)

print("끝!")

> 데이터 형태 및 컬럼 확인

**Weather**
* time of day
* holiday
* weekend
* cloud_coverage + lags
* dew_temperature + lags
* precip_depth + lags
* sea_level_pressure + lags
* wind_direction + lags
* wind_speed + lags


In [None]:
# Show the weather table's column index together with its (rows, columns) shape.
weather_train.keys(),weather_train.shape

**Train**
* max, mean, min, std of the specific building historically
* number of meters
* number of buildings at a siteid

In [None]:
# Show the merged training table's column index together with its (rows, columns) shape.
train_data.keys(),train_data.shape

**Buildings**
* primary_use
* square_feet
* year_built
* floor_count (may be too sparse to use)

In [None]:
# Show the building metadata table's column index together with its (rows, columns) shape.
building.keys(),building.shape

> 결측치 확인

In [None]:
# Per column: True when the merged training table has at least one missing value.
train_data.isnull().any()

> 다운사이징

In [None]:
#Based on this great kernel https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65
def _smallest_int_dtype(mn, mx):
    """Return the narrowest NumPy integer dtype covering [mn, mx], or None if none fits."""
    if mn >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= mn and mx <= limits.max:
            return candidate
    # Reached e.g. when mn/mx are NaN (empty or all-missing column).
    return None


def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    For every numeric column:

    * if it holds non-finite values, the column name is recorded and missing
      values are replaced by ``min - 1`` (a sentinel below every real value);
    * if every value is finite and integer-valued, the column is cast to the
      narrowest signed/unsigned integer dtype that covers its range;
    * otherwise it is cast to ``float32``.

    Non-numeric columns are left untouched. Progress is printed per column.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        A ``(df, NAlist)`` tuple: the same DataFrame object, and the list of
        column names in which non-finite values were found.
    """
    start_mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in df.columns:
        # Only numeric columns can be downcast; strings, datetimes and
        # categoricals are skipped regardless of how pandas stores them.
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",df[col].dtype)
        mx = df[col].max()
        mn = df[col].min()
        print("min for this col: ",mn)
        print("max for this col: ",mx)

        # Integer dtypes cannot hold NA, therefore NA needs to be filled.
        if not np.isfinite(df[col]).all():
            NAlist.append(col)
            # Assign back instead of chained inplace fillna, which does not
            # reach the frame under pandas Copy-on-Write.
            df[col] = df[col].fillna(mn - 1)
            # The sentinel is the new minimum; the dtype choice must see it,
            # otherwise a column with min 0 would be cast to unsigned with -1 in it.
            mn = df[col].min()

        values = df[col]
        # Integer-valued only if every element survives an int64 round trip.
        # (Summing signed differences lets e.g. 0.5 and -0.5 cancel out.)
        # Columns still holding inf/NaN cannot be integers at all.
        is_int = False
        if np.isfinite(values).all():
            is_int = bool((values == values.astype(np.int64)).all())

        target_dtype = _smallest_int_dtype(mn, mx) if is_int else np.float32
        if target_dtype is not None:
            df[col] = values.astype(target_dtype)

        # Print new column type
        print("dtype after: ",df[col].dtype)
        print("******************************")
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return df, NAlist

In [None]:
# Original author's note: "44% reduction" (presumably the memory saved on this
# data — not verifiable from here).
# The second return value (columns that had missing values filled) is discarded.
train, _ = reduce_mem_usage(train_data)
test, _ = reduce_mem_usage(test_data)

In [None]:
# Drop the names of the raw and intermediate tables, then ask the garbage
# collector to reclaim whatever is no longer referenced.
del building, weather_train, weather_test, train_data, test_data
gc.collect()

In [None]:
# Remember the training frame's column names as a plain Python list.
train_columns = train.columns.tolist()  # convert the column index to a list

## 3-1.데이터 확인

In [None]:
# Peek at the first two training rows.
train.head(2)

In [None]:
# Peek at the first two test rows.
test.head(2)

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

In [None]:
# Column dtypes of the training frame after downcasting.
train.dtypes

In [None]:
# Frequency table of every training column, each followed by a blank line.
for column_name in train_columns:
    print(train[column_name].value_counts(), end="\n\n")

In [None]:
# Frequency table of every test column, each followed by a blank line.
for column_name in test.columns:
    print(test[column_name].value_counts(), end="\n\n")

# 4.Feature preprocessing by sklearn pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, QuantileTransformer

In [None]:
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives calendar columns from ``timestamp``.

    ``transform`` parses the ``timestamp`` column to datetimes and adds the
    ``hour``, ``day``, ``weekday`` and ``month`` columns.
    """

    # Calendar fields copied from the parsed timestamp, in the order the
    # columns are added to the frame.
    _DATE_PARTS = ("hour", "day", "weekday", "month")

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, df, y=None):
        """Add the calendar columns to ``df`` and return it.

        The frame passed in is modified directly (no copy is taken), and the
        same object is returned.
        """
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        for part in self._DATE_PARTS:
            df[part] = getattr(df["timestamp"].dt, part)
        return df

In [None]:
# --- Column groups -----------------------------------------------------------
# Columns mapped to a normal distribution by quantile transform.
numeric_features = [
    "square_feet",
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    "floor_count",
]

# Columns min-max scaled; hour/day/weekday/month are the ones added by
# DateFeatureExtractor.
minmax_features = [
    "year_built",
    "hour",
    "day",
    "weekday",
    "month",
]

# Columns encoded as ordinal integer codes.
categorical_features = [
    "primary_use",
    "meter",
    "building_id",
]

# --- One single-step pipeline per group --------------------------------------
numeric_transformer = make_pipeline(
    QuantileTransformer(n_quantiles=100, output_distribution="normal", random_state=0),
)
minmax_transformer = make_pipeline(MinMaxScaler())
categorical_transformer = make_pipeline(OrdinalEncoder())

# --- Full preprocessing pipeline ---------------------------------------------
# The transformers are listed numeric, minmax, categorical; the training and
# prediction cells split the output at
# len(numeric_features) + len(minmax_features), so keep this order.
column_transformer = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, numeric_features),
        ("minmax", minmax_transformer, minmax_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)
preprocessor = make_pipeline(DateFeatureExtractor(), column_transformer)

In [None]:
# Fit the preprocessing pipeline on the training frame and transform it in one go.
preprocessed_train = preprocessor.fit_transform(train)

In [None]:
# First five preprocessed rows, all columns.
preprocessed_train[:5, :]

In [None]:
# Regression target: log(1 + meter_reading). Selecting with a list of columns
# keeps the array two-dimensional (one column), which the 2-D slicing of
# `target` in the next cell relies on.
target = np.log1p(train[["meter_reading"]].values)

In [None]:
# First five target rows.
target[:5, :]

In [None]:
# The preprocessed features and the target have been derived from `train`,
# so drop the name and let the garbage collector reclaim what it can.
del train
gc.collect()

# 5. Chainer regressor model
오픈소스 신경망 프레임워크 - Preferred Networks(일본) 

In [None]:
import chainer
import chainer.functions as F
import chainer.links as L

In [None]:
# Print Chainer's runtime/environment report for this session.
chainer.print_runtime_info()

In [None]:
class MLP(chainer.Chain):
    """Three-layer perceptron over continuous inputs plus embedded categoricals.

    Args:
        n_units: width of the two hidden layers.
        n_out: width of the output layer.
    """

    def __init__(self, n_units=10, n_out=10):
        super().__init__()
        with self.init_scope():
            # One embedding table per categorical column: (vocabulary size, vector size).
            self.embed_primary_use = L.EmbedID(16, 2)
            self.embed_meter = L.EmbedID(4, 2)
            self.embed_building_id = L.EmbedID(1449, 6)
            # Input sizes are left as None so they are inferred on first use.
            self.l1 = L.Linear(None, n_units)  # n_in -> n_units
            self.l2 = L.Linear(None, n_units)  # n_units -> n_units
            self.l3 = L.Linear(None, n_out)  # n_units -> n_out

    def forward(self, numeric_x, categorical_x):
        """Compute the network output for one batch.

        Args:
            numeric_x: continuous inputs, one row per sample.
            categorical_x: integer codes; column 0 feeds the primary_use
                embedding, column 1 the meter embedding and column 2 the
                building_id embedding.

        Returns:
            The output of the last linear layer.
        """
        embedders = (self.embed_primary_use, self.embed_meter, self.embed_building_id)
        embedded = [
            embed(categorical_x[:, column]) for column, embed in enumerate(embedders)
        ]

        # Continuous inputs and the three embeddings side by side.
        hidden = F.concat((numeric_x, *embedded), axis=1)

        # Two hidden layers: linear -> ReLU -> dropout (ratio 0.1).
        for layer in (self.l1, self.l2):
            hidden = F.dropout(F.relu(layer(hidden)), ratio=.1)
        return self.l3(hidden)

In [None]:
def train_and_validate(
    model,
    optimizer,
    train,
    validation,
    n_epoch,
    batchsize,
    device,
):
    """Train ``model`` with a Chainer ``Trainer`` and evaluate it on ``validation``.

    Args:
        model: link to optimize; moved to the GPU when ``device >= 0``.
        optimizer: Chainer optimizer; it is set up on ``model`` here.
        train: dataset iterated (repeating) for the parameter updates.
        validation: dataset evaluated once per pass, without shuffling.
        n_epoch: number of epochs to train for.
        batchsize: mini-batch size for both iterators.
        device: device id; a negative value skips the GPU transfer.

    Returns:
        None. Trainer output goes to the ``out`` directory, including the
        ``loss.png`` plot.
    """
    training = chainer.training
    extensions = training.extensions

    # Model placement and optimizer wiring.
    if device >= 0:
        model.to_gpu(device)
    optimizer.setup(model)

    # Batch iterators over the two datasets.
    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    validation_iter = chainer.iterators.SerialIterator(
        validation, batchsize, repeat=False, shuffle=False
    )

    # Updater and trainer.
    trainer = training.Trainer(
        training.StandardUpdater(train_iter, optimizer, device=device),
        (n_epoch, 'epoch'),
        out='out',
    )

    # Reporting: log, validation loss (reported under the 'val' prefix),
    # console table and loss plot — registered in this order.
    reporting = (
        (extensions.LogReport(), {}),
        (extensions.Evaluator(validation_iter, model, device=device), {'name': 'val'}),
        (
            extensions.PrintReport(
                ['epoch', 'main/loss', 'val/main/loss', 'elapsed_time']
            ),
            {},
        ),
        (
            extensions.PlotReport(
                ['main/loss', 'val/main/loss'], x_key='epoch', file_name='loss.png'
            ),
            {},
        ),
    )
    for extension, options in reporting:
        trainer.extend(extension, **options)

    # Run the training loop.
    trainer.run()

In [None]:
# NOTE(review): keeps only the first 1000 preprocessed rows, so the folds below
# train and validate on this subset only (`target` is not truncated here, but
# the fold indices stay below 1000). Looks like a quick-run/debug setting —
# confirm before a full training run.
preprocessed_train = preprocessed_train[:1000]

# 6.KFold : 교차검증
모든 데이터가 최소 1회 TestSet으로 쓰이도록 합니다 ->모델검증

In [None]:
from sklearn.model_selection import KFold

batchsize = 512
n_epoch = 20
n_splits = 5
seed = 666

kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Preprocessed columns before this index are continuous (fed as float32);
# the remaining ones are the categorical codes (fed as int32).
n_continuous = len(numeric_features) + len(minmax_features)


def _to_dataset(features, labels):
    """Bundle continuous inputs, categorical codes and labels for Chainer."""
    return chainer.datasets.TupleDataset(
        features[:, :n_continuous].astype("f"),
        features[:, n_continuous:].astype("i"),
        labels,
    )


# One freshly initialised MLP is trained per fold and kept for prediction.
models = []
for fold_n, (train_index, valid_index) in enumerate(kf.split(preprocessed_train)):
    gc.collect()

    print()
    print('Fold:', fold_n)
    X_train = preprocessed_train[train_index, :]
    X_valid = preprocessed_train[valid_index, :]
    y_train = target[train_index]
    y_valid = target[valid_index]

    model = MLP(64, 1)
    # L.Classifier is used only as a loss wrapper: mean squared error serves
    # both as the loss and as the reported "accuracy".
    train_and_validate(
        L.Classifier(model, lossfun=F.mean_squared_error, accfun=F.mean_squared_error),
        chainer.optimizers.Adam(),
        _to_dataset(X_train, y_train),
        _to_dataset(X_valid, y_valid),
        n_epoch,
        batchsize,
        device_id,
    )

    models.append(model)

In [None]:
# Training is finished: drop the fold arrays, the preprocessed training matrix
# and the target, then collect garbage.
del X_train, X_valid, y_train, y_valid, preprocessed_train, target
gc.collect()

**Important Features**

In [None]:
# lgb.plot_importance only understands LightGBM models. The models collected
# by the training loop are Chainer MLPs, so plotting them unconditionally
# fails; plot what LightGBM can handle and report the rest instead of crashing.
for model in models:
    if isinstance(model, (lgb.Booster, lgb.LGBMModel)):
        lgb.plot_importance(model)
        plt.show()
    else:
        print(
            'Skipping feature-importance plot: {} is not a LightGBM model.'.format(
                type(model).__name__
            )
        )

# 7.Submission

In [None]:
# `train_columns` was taken from the training frame and therefore contains
# "meter_reading"; the prediction loop selects test[train_columns], so the
# test frame needs a placeholder column of that name.
test["meter_reading"] = 0.0

In [None]:
from tqdm import tqdm

if device_id >= 0:
    import cupy as cp

# Number of test rows preprocessed and predicted per iteration.
step_size = 50000
# Derived from step_size so that changing it cannot leave rows unpredicted.
n_batches = int(np.ceil(test.shape[0] / step_size))

# Loop invariants: the target device and the column index that separates the
# continuous features from the categorical codes.
device = chainer.get_device(device_id)
n_continuous = len(numeric_features) + len(minmax_features)

res = []
for batch_no in tqdm(range(n_batches)):
    gc.collect()
    i = batch_no * step_size
    # Slice the rows first, then pick the columns: only the current batch is
    # copied, not the whole test frame on every iteration.
    batch = test.iloc[i : i + step_size][train_columns]
    preprocessed_batch = device.send(preprocessor.transform(batch))
    numeric_x = preprocessed_batch[:, :n_continuous].astype("f")
    categorical_x = preprocessed_batch[:, n_continuous:].astype("i")

    predictions = []
    # train=False switches dropout off; no_backprop_mode skips building the
    # computational graph, which is not needed for inference.
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        for model in models:
            prediction = model(numeric_x, categorical_x)
            prediction.to_cpu()
            predictions.append(prediction.array)

    # Average over the models actually trained, then invert the log1p that was
    # applied to the training target.
    res.append(np.expm1(sum(predictions) / len(models)))

In [None]:
# Stack the per-batch prediction arrays into one array, in batch order.
res = np.concatenate(res)

In [None]:
# Fill the sample submission with the predictions, floor negative readings at
# zero and write the file without the index column.
submission = pd.read_csv('/kaggle/input/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = res
submission['meter_reading'] = submission['meter_reading'].clip(lower=0)
submission.to_csv('submission.csv', index=False)
submission