## 1.5.1 LightGBM + FE (1.19)

<div style="text-align: right"> <b>Author : Kwang Myung Yu</b></div> 

<div style="text-align: right"> Initial upload: 2022.06.28 </div> 
<div style="text-align: right"> Last update: 2022.06.28</div> 

- https://www.kaggle.com/code/isaienkov/lightgbm-fe-1-19

simplelgbm 작성자가 만든 것임

### 라이브러리 import

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import datetime
import missingno as msno
import warnings; warnings.filterwarnings('ignore')
plt.style.use('ggplot')
%matplotlib inline

# Options for pandas
pd.options.display.max_columns = 150

### Loading data

In [2]:
print(os.listdir('data/ashrae-energy-prediction/'))

['building_metadata.csv', 'sample_submission.csv', 'submission.csv', 'test.csv', 'test_df.pkl', 'train.csv', 'train_df.pkl', 'weather_test.csv', 'weather_train.csv']


In [3]:
def degToCompass(num):
    val=int((num/22.5)+.5)
    arr=[i for i in range(0,16)]
    return arr[(val % 16)]

In [4]:
building_df = pd.read_csv('data/ashrae-energy-prediction/building_metadata.csv')
weather_train = pd.read_csv('data/ashrae-energy-prediction/weather_train.csv')
train = pd.read_csv('data/ashrae-energy-prediction/train.csv')

In [5]:
train = train.merge(building_df, left_on='building_id', right_on = 'building_id', how = 'left')

In [6]:
train = train.merge(weather_train, left_on = ['site_id', 'timestamp'], right_on = ['site_id', 'timestamp'],
                   how = 'left')

In [7]:
train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01 00:00:00,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
1,1,0,2016-01-01 00:00:00,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
2,2,0,2016-01-01 00:00:00,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
3,3,0,2016-01-01 00:00:00,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
4,4,0,2016-01-01 00:00:00,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.7,0.0,0.0


In [8]:
del weather_train

In [9]:
# train["timestamp"] = pd.to_datetime(train["timestamp"])
# train["hour"] = train["timestamp"].dt.hour
# train["day"] = train["timestamp"].dt.day
# train["weekend"] = train["timestamp"].dt.weekday
# train["month"] = train["timestamp"].dt.month

train["timestamp"] = pd.to_datetime(train["timestamp"])
train["weekday"] = train["timestamp"].dt.weekday
train["hour"] = train["timestamp"].dt.hour
train["weekday"] = train['weekday'].astype(np.uint8)
train["hour"] = train['hour'].astype(np.uint8)
train['year_built'] = train['year_built']-1900
train['square_feet'] = np.log(train['square_feet']) # 추가

In [10]:
train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,weekday,hour
0,0,0,2016-01-01,0.0,0,Education,8.91355,108.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,4,0
1,1,0,2016-01-01,0.0,0,Education,7.908387,104.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,4,0
2,2,0,2016-01-01,0.0,0,Education,8.5897,91.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,4,0
3,3,0,2016-01-01,0.0,0,Education,10.072597,102.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,4,0
4,4,0,2016-01-01,0.0,0,Education,11.666565,75.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,4,0


In [11]:
train.shape

(20216100, 18)

In [12]:
def average_imputation(df, column_name):
    imputation = df.groupby(['timestamp'])[column_name].mean()
    
    df.loc[df[column_name].isnull(), column_name] = df[df[column_name].isnull()][[column_name]].apply(lambda x: imputation[df['timestamp'][x.index]].values)
    del imputation
    return df

In [13]:
train = average_imputation(train, 'wind_speed')
train = average_imputation(train, 'wind_direction')

In [14]:
train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,weekday,hour
0,0,0,2016-01-01,0.0,0,Education,8.91355,108.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,4,0
1,1,0,2016-01-01,0.0,0,Education,7.908387,104.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,4,0
2,2,0,2016-01-01,0.0,0,Education,8.5897,91.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,4,0
3,3,0,2016-01-01,0.0,0,Education,10.072597,102.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,4,0
4,4,0,2016-01-01,0.0,0,Education,11.666565,75.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,4,0


보퍼트 풍력계급 : https://ko.wikipedia.org/wiki/%EB%B3%B4%ED%8D%BC%ED%8A%B8_%ED%92%8D%EB%A0%A5_%EA%B3%84%EA%B8%89

In [15]:
beaufort = [(0, 0, 0.3), (1, 0.3, 1.6), (2, 1.6, 3.4), (3, 3.4, 5.5), (4, 5.5, 8), (5, 8, 10.8), (6, 10.8, 13.9), 
          (7, 13.9, 17.2), (8, 17.2, 20.8), (9, 20.8, 24.5), (10, 24.5, 28.5), (11, 28.5, 33), (12, 33, 200)]

for item in beaufort:
    train.loc[(train['wind_speed']>=item[1]) & (train['wind_speed']<item[2]), 'beaufort_scale'] = item[0]

In [16]:
del train['timestamp']

In [17]:
train['wind_direction'] = train['wind_direction'].apply(degToCompass)
train['beaufort_scale'] = train['beaufort_scale'].astype(np.uint8)
train["wind_direction"] = train['wind_direction'].astype(np.uint8)
train["meter"] = train['meter'].astype(np.uint8)
train["site_id"] = train['site_id'].astype(np.uint8)

인코딩

In [18]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train["primary_use"] = le.fit_transform(train["primary_use"])

In [19]:
categoricals = ["site_id", "building_id", "primary_use", "hour", "weekday", "meter",  "wind_direction"]

drop_cols = ["sea_level_pressure", "wind_speed"]

numericals = ["square_feet", "year_built", "air_temperature", "cloud_coverage",
              "dew_temperature", 'precip_depth_1_hr', 'floor_count', 'beaufort_scale']

In [20]:
feat_cols = categoricals + numericals

In [21]:
target = np.log1p(train["meter_reading"])

del train["meter_reading"] 

train = train.drop(drop_cols, axis = 1)

모델링

In [22]:
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

In [23]:
params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': {'rmse'},
            'subsample': 0.25,
            'subsample_freq': 1,
            'learning_rate': 0.4,
            'num_leaves': 20,
            'feature_fraction': 0.9,
            'lambda_l1': 1,  
            'lambda_l2': 1
            }

folds = 4
seed = 666

In [24]:
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
models = []

In [26]:
for train_index, val_index in kf.split(train, train['building_id']):
    X_train = train[feat_cols].iloc[train_index]
    X_val = train[feat_cols].iloc[val_index]
    y_train = target.iloc[train_index]
    y_val = target.iloc[val_index]
    
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categoricals)
    lgb_eval = lgb.Dataset(X_val, y_val, categorical_feature=categoricals)
    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                valid_sets=(lgb_train, lgb_eval),
                early_stopping_rounds=100,
                verbose_eval = 100)
    models.append(gbm)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2447
[LightGBM] [Info] Number of data points in the train set: 15162075, number of used features: 15
[LightGBM] [Info] Start training from score 4.145695
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 1.04326	valid_1's rmse: 1.04425
[200]	training's rmse: 1.01233	valid_1's rmse: 1.01447
[300]	training's rmse: 0.997109	valid_1's rmse: 1.00017
[400]	training's rmse: 0.985126	valid_1's rmse: 0.98934
[500]	training's rmse: 0.971871	valid_1's rmse: 0.977172
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 0.971871	valid_1's rmse: 0.977172
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2437
[LightGBM] [Info] Number of data points in the train set: 15162075, number of used features: 

In [27]:
import gc
del train, train_X, val_X, lgb_train, lgb_eval, train_y, val_y, target
gc.collect()

NameError: name 'train_X' is not defined

예측하기

In [None]:
test = pd.read_csv('data/ashrae-energy-prediction/test.csv')
weather_test = pd.read_csv('data/ashrae-energy-prediction/weather_test.csv')

In [None]:
test = test.merge(building_df, left_on = "building_id", right_on = "building_id", how = "left")
del building_df
gc.collect()
test["primary_use"] = le.transform(test["primary_use"])

test = test.merge(weather_test, left_on = ["site_id", "timestamp"], right_on = ["site_id", "timestamp"], how = "left")
del weather_test
gc.collect()

In [None]:
test["timestamp"] = pd.to_datetime(test["timestamp"])
test["hour"] = test["timestamp"].dt.hour
test["weekday"] = test["timestamp"].dt.weekday
test["weekday"] = test['weekday'].astype(np.uint8)
test["hour"] = test['hour'].astype(np.uint8)
test['year_built'] = test['year_built']-1900
test['square_feet'] = np.log(test['square_feet'])

test = average_imputation(test, 'wind_speed')
test = average_imputation(test, 'wind_direction')

for item in beaufort:
    test.loc[(test['wind_speed']>=item[1]) & (test['wind_speed']<item[2]), 'beaufort_scale'] = item[0]
test['wind_direction'] = test['wind_direction'].apply(degToCompass)

test['wind_direction'] = test['wind_direction'].apply(degToCompass)
test['beaufort_scale'] = test['beaufort_scale'].astype(np.uint8)
test["wind_direction"] = test['wind_direction'].astype(np.uint8)
test["meter"] = test['meter'].astype(np.uint8)
test["site_id"] = test['site_id'].astype(np.uint8)

test = test[feat_cols]

In [None]:
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test.shape[0]/50000)))):
    res.append(np.expm1(sum([model.predict(test.iloc[i:i+step_size]) for model in models])/folds))
    i+=step_size

In [None]:
res = np.concatenate(res)

In [None]:
submission = pd.read_csv('data/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = res
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
submission.to_csv('data/ashrae-energy-prediction/submission.csv', index=False)
submission