In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [None]:
# All CSVs are read from one input directory; keep that path in a single place.
DATA_DIR = '../input/ashrae-energy-prediction'

# NOTE(review): test.csv and sample_submission.csv were commented out in the original
# cell and are not loaded here; no cell visible in this notebook reads them.
building_metadata = pd.read_csv(DATA_DIR + '/building_metadata.csv')
train = pd.read_csv(DATA_DIR + '/train.csv')
weather_test = pd.read_csv(DATA_DIR + '/weather_test.csv')
weather_train = pd.read_csv(DATA_DIR + '/weather_train.csv')

## building_metadata

* year_built, floor_countは欠損値が多い。他のカラムも少ないのでbuilding_metadataだけでデータの補完は難しい

In [None]:
building_metadata.head(5)

In [None]:
building_metadata.info()

In [None]:
building_metadata.describe().T

In [None]:
building_metadata['site_id'].unique()

In [None]:
building_metadata['primary_use'].unique()

In [None]:
# One-hot encode primary_use. The result overwrites building_metadata, so guard on the
# column still being present: re-running this cell is then a no-op instead of a KeyError.
if "primary_use" in building_metadata.columns:
    building_metadata = pd.get_dummies(building_metadata, columns=["primary_use"])

In [None]:
# Annotated heatmap of the pairwise correlations between all building_metadata columns,
# with the colour scale fixed to [-1, 1] and centred on 0.
fig, ax = plt.subplots(figsize=(12, 12))
corr_matrix = building_metadata.corr()
sns.heatmap(corr_matrix, ax=ax, annot=True, center=0, vmin=-1, vmax=1)

In [None]:
# Number of buildings per site_id, largest site first
(
    building_metadata
    .groupby('site_id')['building_id']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Distributions of square_feet, year_built and floor_count: one 20-bin histogram per column
hist_columns = ['square_feet', 'year_built', 'floor_count']
fig, axarr = plt.subplots(1, 3, figsize=(18, 3))
for ax, column in zip(axarr, hist_columns):
    ax.set_title(column)
    ax.hist(building_metadata[column], bins=20)

## train

* meter_readingは極端に大きい値の影響が大きいので、対数変換した値（np.log1p）で学習する
* 1ヶ月未満しか測定データがないbuilding_id、meterもあるので対応方法を考える余地有
* 一定期間0、もしくは0付近の値を取り続けている場合がある。故障や電源が入っていないなどの理由が考えられ、その場合テストデータでも同じ傾向となるとは限らない。複数の対応方法を試してみる価値有
* meter1,2,3はmeter0に比べて設置されているビルが少ない。その分学習精度が落ちると思われる

In [None]:
train.head(5)

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
# meter_reading on its raw scale (left panel) and after log1p (right panel)
fig, axarr = plt.subplots(1, 2, figsize=(12, 3))
ax_raw, ax_log = axarr

ax_raw.hist(train['meter_reading'], bins=20)
ax_raw.set_title('meter_reading')

ax_log.hist(np.log1p(train['meter_reading']), bins=20)
ax_log.set_title('np.log1p(meter_reading)');

In [None]:
# Ten rows with the largest meter_reading values.
# nlargest picks the top rows directly instead of sorting the whole train frame first;
# rows with exactly equal readings may come back in a different relative order than
# a full sort_values(...).head(10) would give.
train.nlargest(10, 'meter_reading')

In [None]:
# Plot the raw meter_reading series of the building/meter pair that holds the maximum
# meter_reading (building_id=1099, meter=2), with the x axis fixed to the year 2016.
b1099_2 = train.query('building_id == 1099 and meter == 2')
display(b1099_2)
plt.plot(pd.to_datetime(b1099_2['timestamp']), b1099_2['meter_reading']);
plt.xlim([pd.Timestamp('2016-01-01 00:00:00'), pd.Timestamp('2016-12-31 23:00:00')])

In [None]:
# Number of distinct buildings per (site_id, meter).
# Only the distinct (building_id, meter) pairs and the building_id -> site_id mapping are
# needed for this count, so join those two small frames instead of merging every reading
# in train with every metadata column. The resulting Series is the same.
building_meters = train[['building_id', 'meter']].drop_duplicates()
building_sites = building_metadata[['building_id', 'site_id']]
building_meters.merge(building_sites, on=['building_id'], how='inner').groupby(['site_id', 'meter'])['building_id'].nunique()

In [None]:
# Record counts per (building_id, meter), ordered from the fewest readings to the most,
# followed by a 50-bin histogram of those counts.
count_stats = (
    train
    .groupby(['building_id', 'meter'])
    .count()
    .sort_values(by='meter_reading')
)
display(count_stats)
plt.hist(x=count_stats['meter_reading'], bins=50);

In [None]:
# Plot log1p(meter_reading) for the building/meter pair with the fewest records
# (building_id=403, meter=0), with the x axis fixed to the year 2016.
b403_0 = train.query('building_id == 403 and meter == 0')
display(b403_0)
plt.plot(pd.to_datetime(b403_0['timestamp']), np.log1p(b403_0['meter_reading']));
plt.xlim([pd.Timestamp('2016-01-01 00:00:00'), pd.Timestamp('2016-12-31 23:00:00')])

In [None]:
# For a given site_id and meter, plot meter_reading per building_id
def plot_meter_reading(site_id, meter):
    """Plot log1p(meter_reading) over 2016 for every building of one site and meter.

    Reads the notebook-level ``train`` and ``building_metadata`` frames. One panel is
    drawn per building_id, five panels per row. All panels share the same x range
    (2016-01-01 00:00 to 2016-12-31 23:00) and the same y range (0 to the largest
    log1p reading in the selection) so buildings can be compared directly.

    Parameters
    ----------
    site_id
        Value matched against ``building_metadata['site_id']``.
    meter
        Value matched against ``train['meter']``.

    Returns
    -------
    None
        Nothing is drawn when no building matches the selection.
    """
    ncols = 5

    start = pd.to_datetime('2016-01-01 00:00:00')
    end = pd.to_datetime('2016-12-31 23:00:00')

    # Filter train by the site's building ids instead of merging the whole frame
    # with the metadata; only train's own columns are needed for plotting.
    site_building_ids = building_metadata.loc[building_metadata['site_id'] == site_id, 'building_id']
    target = train[(train['meter'] == meter) & train['building_id'].isin(site_building_ids)]

    nbuildings = target['building_id'].nunique()
    if nbuildings == 0:
        return

    max_val = np.log1p(target['meter_reading']).max()

    # Ceiling division: a partially filled last row still needs its own row.
    nrows = (nbuildings + ncols - 1) // ncols

    # squeeze=False always yields a 2-D axes array, so one code path serves both the
    # single-row and the multi-row layout.
    fig, axarr = plt.subplots(nrows, ncols, figsize=(18, nrows*3), squeeze=False)
    plt.subplots_adjust(hspace = 0.4)
    axes = axarr.ravel()

    # groupby iterates building ids in ascending order.
    for ax, (building_id, building) in zip(axes, target.groupby('building_id')):
        ax.set_title('building_id=' + str(building_id) + ' meter=' + str(meter))
        ax.plot(pd.to_datetime(building['timestamp']), np.log1p(building['meter_reading']))
        ax.set_xlim([start, end])
        ax.set_ylim([0, max_val])

    # Hide the unused panels of the last row instead of showing empty axes.
    for ax in axes[nbuildings:]:
        ax.set_visible(False)

In [None]:
plot_meter_reading(site_id=0, meter=0)

In [None]:
plot_meter_reading(site_id=1, meter=0)

In [None]:
plot_meter_reading(site_id=0, meter=1)

In [None]:
plot_meter_reading(site_id=2, meter=1)

In [None]:
plot_meter_reading(site_id=6, meter=2)

In [None]:
plot_meter_reading(site_id=7, meter=2)

In [None]:
plot_meter_reading(site_id=1, meter=3)

In [None]:
plot_meter_reading(site_id=2, meter=3)

## weather_train

* site_idによっては取得時刻の件数が少ない場合があるが僅かなので補完は難しくない
* メトリクスによっては特定のsite_idでデータが欠損している

In [None]:
weather_train.head(5)

In [None]:
weather_train.info()

In [None]:
weather_train.describe()

In [None]:
weather_train.groupby('site_id').count()

In [None]:
def plot_weather_metrics(df, metrics, site_ids=None, ncols=4):
    """Plot one weather column over time, one panel per site_id.

    All panels share the same x range (earliest to latest timestamp in ``df``) and the
    same y range. The y range always contains 0 and is extended downwards when the
    column has negative values, so those values are not clipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'site_id'``, ``'timestamp'`` and ``metrics``.
    metrics : str
        Name of the column to plot.
    site_ids : iterable, optional
        Site ids to draw, in panel order. Defaults to ``range(16)``, which is what
        this function plotted before the parameter existed.
    ncols : int, optional
        Number of panels per row (default 4).

    Returns
    -------
    None
    """
    if site_ids is None:
        site_ids = range(16)
    site_ids = list(site_ids)

    # Ceiling division; keep at least one row so plt.subplots gets a valid grid.
    nrows = max(1, (len(site_ids) + ncols - 1) // ncols)
    fig, axarr = plt.subplots(nrows, ncols, figsize=(18, 3*nrows), squeeze=False)
    plt.subplots_adjust(hspace = 0.4)
    axes = axarr.ravel()

    # Parse the timestamps once instead of once per panel.
    timestamps = pd.to_datetime(df['timestamp'])
    start = timestamps.min()
    end = timestamps.max()

    # Shared y range. The lower bound is 0 unless the data goes below it.
    min_val = min(0, df[metrics].min())
    max_val = df[metrics].max()
    # Skip set_ylim for an all-NaN or constant column, where the limits would be
    # NaN or identical.
    has_ylim = bool(np.isfinite(min_val) and np.isfinite(max_val) and max_val > min_val)

    for ax, site in zip(axes, site_ids):
        is_site = df['site_id'] == site
        ax.set_title('site_id=' + str(site))
        ax.plot(timestamps[is_site], df.loc[is_site, metrics])
        ax.set_xlim([start, end])
        if has_ylim:
            ax.set_ylim([min_val, max_val])

    # Hide grid cells that have no site assigned.
    for ax in axes[len(site_ids):]:
        ax.set_visible(False)

In [None]:
plot_weather_metrics(weather_train, 'air_temperature')

In [None]:
plot_weather_metrics(weather_train, 'cloud_coverage')

In [None]:
plot_weather_metrics(weather_train, 'dew_temperature')

In [None]:
plot_weather_metrics(weather_train, 'precip_depth_1_hr')

In [None]:
plot_weather_metrics(weather_train, 'sea_level_pressure')

In [None]:
plot_weather_metrics(weather_train, 'wind_direction')

In [None]:
plot_weather_metrics(weather_train, 'wind_speed')

## weather_test

* weather_trainとの大きな傾向の差異は無し

In [None]:
weather_test.head(5)

In [None]:
weather_test.info()

In [None]:
weather_test.describe()

In [None]:
weather_test.groupby('site_id').count()

In [None]:
plot_weather_metrics(weather_test, 'air_temperature')

In [None]:
plot_weather_metrics(weather_test, 'cloud_coverage')

In [None]:
plot_weather_metrics(weather_test, 'dew_temperature')

In [None]:
plot_weather_metrics(weather_test, 'precip_depth_1_hr')

In [None]:
plot_weather_metrics(weather_test, 'sea_level_pressure')

In [None]:
plot_weather_metrics(weather_test, 'wind_direction')

In [None]:
plot_weather_metrics(weather_test, 'wind_speed')

## クラスタリング

* building_id, meterごとにmeter_readingの統計情報でクラスタリング
* building_id=1099,meter=2のデータはmeter_readingの最大値を取るので誤差への影響が大きい、複数の補正方法を試してみる価値有
* building_id=778,meter=1のデータは第3四分位数まで0であり、最大値が比較的大きい特徴がありそう、こちらも複数の補正方法を試してみる価値有
* meter_readingの連続する0のカウントやその始まり日時などもクラスタリングの特徴量に加えればより細かい分類ができるかもしれない

In [None]:
n_clusters = 4

# Summary statistics of meter_reading per (building_id, meter); these columns are the
# clustering features.
meter_reading_descr = train.groupby(['building_id','meter'])['meter_reading'].describe()
feature_cols = list(meter_reading_descr.columns)
display(meter_reading_descr)

kmeans = KMeans(n_clusters=n_clusters, random_state=0)
clusters = kmeans.fit(meter_reading_descr[feature_cols])
meter_reading_descr['cluster'] = clusters.labels_
print(meter_reading_descr['cluster'].unique())
display(meter_reading_descr.head())
display(meter_reading_descr.groupby('cluster').count())
display(meter_reading_descr.groupby('cluster').mean())
# Keep building_id and meter as columns (instead of dropping the index) so the members
# of each cluster can still be identified when the frame is filtered by cluster later.
meter_reading_descr = meter_reading_descr.reset_index()
display(meter_reading_descr)

# Project onto two principal components for plotting. Use only the statistic columns:
# the cluster label (and the id columns) must not be fed into the PCA.
X = meter_reading_descr[feature_cols]
pca = PCA(n_components=2)
pca.fit(X)
x_pca = pca.transform(X)
pca_df = pd.DataFrame(x_pca)
pca_df['cluster'] = meter_reading_descr['cluster']
plt.figure(figsize=(8,8))
for i in sorted(meter_reading_descr['cluster'].unique()):
    t = pca_df.loc[pca_df['cluster']==i]
    plt.scatter(t[0], t[1], label='cluster=' + str(i))
plt.legend();

In [None]:
meter_reading_descr[meter_reading_descr['cluster']==1]

In [None]:
meter_reading_descr[meter_reading_descr['cluster']==2]

In [None]:
meter_reading_descr[meter_reading_descr['cluster']==3]

In [None]:
# cluster=1 (label as noted by the original author): raw meter_reading of
# building_id=1099, meter=2 with the x axis fixed to the year 2016.
b1099_2 = train.query('building_id == 1099 and meter == 2')
display(b1099_2)
plt.plot(pd.to_datetime(b1099_2['timestamp']), b1099_2['meter_reading']);
plt.xlim([pd.Timestamp('2016-01-01 00:00:00'), pd.Timestamp('2016-12-31 23:00:00')])

In [None]:
# cluster=2 (label as noted by the original author): raw meter_reading of
# building_id=778, meter=1 with the x axis fixed to the year 2016.
b778_1 = train.query('building_id == 778 and meter == 1')
display(b778_1)
plt.plot(pd.to_datetime(b778_1['timestamp']), b778_1['meter_reading']);
plt.xlim([pd.Timestamp('2016-01-01 00:00:00'), pd.Timestamp('2016-12-31 23:00:00')])