In [2]:
# for "2. Data Loading"
import pandas as pd

# for "3-1. Feature Generation"
import numpy as np

# for "3-2. Feature Engineering"
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import RobustScaler, StandardScaler

# for "4. Modeling with Pycaret"
# for "5. Modeling with CatBoostRegressor"
from catboost import CatBoostRegressor
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, StratifiedKFold

In [9]:
# Load the raw training and test tables from the local ./data directory.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [5]:
# Rental deposit (임대보증금) and rent (임대료) contain '-' as a placeholder:
# turn it into NaN, cast each column to float, then fill the missing amounts with 0.
for frame in (train, test):
    for money_col in ['임대보증금', '임대료']:
        frame.loc[frame[money_col] == '-', money_col] = np.nan
        frame[money_col] = frame[money_col].astype(float)
    frame[['임대보증금', '임대료']] = frame[['임대보증금', '임대료']].fillna(0)

# Missing subway-station / bus-stop counts are filled with 0 as well.
cols = ['도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수']
for frame in (train, test):
    frame[cols] = frame[cols].fillna(0)

#### (3) Qualification type (자격유형)

# Fill the missing qualification type of two test complexes with a fixed code each.
for complex_code, fill_value in (('C2411', 'A'), ('C2253', 'C')):
    needs_fill = (test['단지코드'] == complex_code) & test['자격유형'].isna()
    test.loc[needs_fill, '자격유형'] = fill_value

# Remove fully identical rows from both tables.
train = train.drop_duplicates()
test = test.drop_duplicates()

# Columns extracted once per complex (단지코드) to form the per-complex base
# table; 등록차량수 (registered vehicles) is used as the target further down and
# is therefore left out of the test table.
unique_cols = ['총세대수', '지역', '공가수',
               '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
               '도보 10분거리 내 버스정류장 수',
               '단지내주차면수', '등록차량수']

# De-duplicate while 단지코드 is still a regular column. DataFrame.drop_duplicates()
# ignores the index, so calling it after set_index() would merge two different
# complexes that happen to share every value in unique_cols.
train_agg = (
    train[['단지코드'] + unique_cols]
    .drop_duplicates()
    .set_index('단지코드')
)
test_unique_cols = [col for col in unique_cols if col != '등록차량수']
test_agg = (
    test[['단지코드'] + test_unique_cols]
    .drop_duplicates()
    .set_index('단지코드')
)

def reshape_cat_features(data, cast_col, value_col):
    """Pivot one categorical column into per-complex indicator columns.

    Rows are first reduced to one per (단지코드, cast_col) pair and given a
    constant ``counter`` column equal to 1. The frame is then pivoted so that
    each distinct value of ``cast_col`` becomes a column named
    ``"<cast_col>_<value>"``, indexed by 단지코드; combinations that do not
    occur are filled with 0.

    Args:
        data: Long-format frame containing '단지코드' and ``cast_col``.
        cast_col: Name of the categorical column to spread into columns.
        value_col: Column whose values fill the pivoted cells (callers in this
            notebook pass 'counter').

    Returns:
        Wide DataFrame indexed by 단지코드 with one column per category.
    """
    one_per_pair = data.drop_duplicates(['단지코드', cast_col])
    flagged = one_per_pair.assign(counter=1)
    wide = flagged.pivot(index='단지코드', columns=cast_col, values=value_col)
    wide = wide.fillna(0)
    wide.columns.name = None
    prefixed = {category: cast_col + '_' + category for category in wide.columns}
    return wide.rename(columns=prefixed)

# Merge several raw category values into broader groups, applying exactly the
# same mapping to the training and the test table.
supply_type_groups = {
    '공공임대(5년/10년/분납/분양)': ['공공임대(5년)', '공공분양', '공공임대(10년)', '공공임대(분납)'],
    '국민임대/장기전세': ['장기전세', '국민임대'],
}
qualification_groups = {
    '행복주택_공급대상': ['J', 'L', 'K', 'N', 'M', 'O'],
    '국민임대/장기전세_공급대상': ['H', 'B', 'E', 'G'],
    '영구임대_공급대상': ['C', 'I', 'F'],
}
for frame in (train, test):
    for column, groups in (('공급유형', supply_type_groups), ('자격유형', qualification_groups)):
        for merged_label, members in groups.items():
            frame.loc[frame[column].isin(members), column] = merged_label

# Build one feature row per complex: the per-complex base table followed by the
# indicator columns of each pivoted categorical feature.
pivot_cols = ['임대건물구분', '공급유형', '자격유형']

X_train = pd.concat(
    [train_agg]
    + [reshape_cat_features(data=train, cast_col=name, value_col='counter') for name in pivot_cols],
    axis=1,
)

X_test = pd.concat(
    [test_agg]
    + [reshape_cat_features(data=test, cast_col=name, value_col='counter') for name in pivot_cols],
    axis=1,
)

In [6]:
# Remove the apartment (아파트) building-type indicator from both feature frames.
# NOTE(review): presumably dropped as redundant with the remaining building-type indicator — confirm.
X_train = X_train.drop('임대건물구분_아파트', axis=1)
X_test = X_test.drop('임대건물구분_아파트', axis=1)

- 필요성 - 3. 이상치가 보인다

In [7]:
# Count complexes per subway-station count in train and test, then remove the
# training rows whose count is 3 and show the training distribution again.
subway_col = '도보 10분거리 내 지하철역 수(환승노선 수 반영)'

display('Train Data')
display(X_train.groupby([subway_col])['총세대수'].count())
display('=' * 50)
display('Test Data')
display(X_test.groupby([subway_col])['총세대수'].count())
display('=' * 50)

# Rows with a subway-station count of 3 are removed from the training data.
display('After Engineering')
X_train = X_train.loc[X_train[subway_col] != 3]
display(X_train.groupby([subway_col])['총세대수'].count())

'Train Data'

도보 10분거리 내 지하철역 수(환승노선 수 반영)
0.0    379
1.0     38
2.0      5
3.0      1
Name: 총세대수, dtype: int64



'Test Data'

도보 10분거리 내 지하철역 수(환승노선 수 반영)
0.0    139
1.0      8
2.0      3
Name: 총세대수, dtype: int64



'After Engineering'

도보 10분거리 내 지하철역 수(환승노선 수 반영)
0.0    379
1.0     38
2.0      5
Name: 총세대수, dtype: int64

In [8]:
%%time
# Target-encode each listed feature: replace it with the mean / median / std of
# the target (등록차량수) over the training complexes that share its value.
# The statistics come from X_train only and are then looked up for X_test.
encoding_features = ['지역', '도보 10분거리 내 지하철역 수(환승노선 수 반영)']
for f in encoding_features:
    mapping = X_train.groupby(f)['등록차량수'].agg(['mean', 'median', 'std'])
    stat_cols = [f + '_mean', f + '_median', f + '_std']

    # Vectorized lookup. reindex() yields NaN for a category that never occurs in
    # the training data, where a per-row mapping.loc[...] would raise KeyError.
    train_stats = pd.DataFrame(
        mapping.reindex(X_train[f].to_numpy()).to_numpy(),
        columns=stat_cols,
        index=X_train.index,
    )
    test_stats = pd.DataFrame(
        mapping.reindex(X_test[f].to_numpy()).to_numpy(),
        columns=stat_cols,
        index=X_test.index,
    )

    # Append the statistics and drop the raw feature they replace.
    X_train = pd.concat([X_train, train_stats], axis=1).drop(columns=[f])
    X_test = pd.concat([X_test, test_stats], axis=1).drop(columns=[f])

CPU times: user 90.6 ms, sys: 2.86 ms, total: 93.5 ms
Wall time: 91.2 ms


In [9]:
# Candidate numeric columns for scaling: the raw per-complex counts plus the
# target-encoding statistics of the region and subway-count features.
scailing_features = [
    '총세대수',
    '공가수',
    '도보 10분거리 내 버스정류장 수',
    '단지내주차면수',
    '지역_mean',
    '지역_median',
    '지역_std',
    '도보 10분거리 내 지하철역 수(환승노선 수 반영)_mean',
    '도보 10분거리 내 지하철역 수(환승노선 수 반영)_median',
    '도보 10분거리 내 지하철역 수(환승노선 수 반영)_std',
]

In [10]:
# The std of the subway-count encoding is dropped because it looked
# uninformative in the plots. Drop it by name rather than by position so that
# re-running this cell cannot silently remove a different trailing column.
subway_std_col = '도보 10분거리 내 지하철역 수(환승노선 수 반영)_std'
X_train = X_train.drop(columns=[subway_std_col], errors='ignore')
X_test = X_test.drop(columns=[subway_std_col], errors='ignore')

In [11]:
# Robust-scale the region (지역) encoding columns; the scaler is fitted on the
# training rows only and then applied to both frames.
loc_f = ['지역_mean', '지역_median', '지역_std']

scaler = RobustScaler().fit(X_train[loc_f])
X_train.loc[:, loc_f] = scaler.transform(X_train[loc_f])
X_test.loc[:, loc_f] = scaler.transform(X_test[loc_f])

In [12]:
# Standard-scale the remaining numeric columns; again the scaler is fitted on
# the training rows only and then applied to both frames.
scailing_features = [
    '총세대수',
    '공가수',
    '도보 10분거리 내 버스정류장 수',
    '단지내주차면수',
    '도보 10분거리 내 지하철역 수(환승노선 수 반영)_mean',
    '도보 10분거리 내 지하철역 수(환승노선 수 반영)_median',
]

scaler = StandardScaler().fit(X_train[scailing_features])
X_train.loc[:, scailing_features] = scaler.transform(X_train[scailing_features])
X_test.loc[:, scailing_features] = scaler.transform(X_test[scailing_features])

In [15]:
# Separate the training frame into the feature matrix and the target
# (등록차량수, registered vehicles).
X = X_train.drop('등록차량수', axis=1)
y = X_train.loc[:, '등록차량수']

In [14]:
# Skewness of the target as-is vs. after a log1p transform.
y.skew(), np.log1p(y).skew()

(1.365623185127613, -1.0897027585640997)

In [16]:
# Model the target on the log1p scale. It is recomputed from X_train instead of
# from y itself, so re-running this cell cannot apply the transform twice.
y = np.log1p(X_train['등록차량수'])

- Hyperparameter tuning

In [20]:
def objective(trial: Trial) -> float:
    """Optuna objective: train CatBoost with sampled hyper-parameters and score it.

    Reads the module-level ``X`` and ``y``, holds out 20% of the rows for
    validation, fits a ``CatBoostRegressor`` with early stopping and returns the
    mean absolute error on the held-out rows (the value the study minimizes).

    Args:
        trial: Optuna trial used to sample ``max_depth``, ``colsample_bylevel``,
            ``subsample``, ``min_child_samples`` and ``max_bin``.

    Returns:
        MAE of the fitted model on the held-out rows, in the units of ``y``.
    """
    params_cat = {
        # Fixed settings shared by every trial.
        "random_state": 42,
        "learning_rate": 0.01,
        "n_estimators": 10000,
        "verbose": 1,
        "objective": "RMSE",
        # Search space.
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }

    # Seed the split: without random_state every trial is scored on a different
    # hold-out set, so trial values are not comparable with each other and the
    # study cannot be reproduced even though the sampler is seeded.
    X_tr, X_val, y_tr, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = CatBoostRegressor(**params_cat)
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        early_stopping_rounds=1000,
        verbose=False,
    )

    cat_pred = model.predict(X_val)
    log_score = mean_absolute_error(y_val, cat_pred)

    return log_score

In [21]:
# Run a 10-trial search with a seeded TPE sampler, minimizing objective().
sampler = TPESampler(seed=42)
study = optuna.create_study(
    direction="minimize",
    sampler=sampler,
    study_name="cat_opt",
)
study.optimize(objective, n_trials=10)

print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2021-07-26 20:00:08,996][0m A new study created in memory with name: cat_opt[0m
[32m[I 2021-07-26 20:00:18,960][0m Trial 0 finished with value: 0.26085285662617336 and parameters: {'max_depth': 6, 'colsample_bylevel': 0.9901428612819833, 'subsample': 0.8123957592679836, 'min_child_samples': 62, 'max_bin': 246}. Best is trial 0 with value: 0.26085285662617336.[0m
[32m[I 2021-07-26 20:00:19,845][0m Trial 1 finished with value: 0.2139322617183334 and parameters: {'max_depth': 3, 'colsample_bylevel': 0.8116167224336399, 'subsample': 0.9063233020424546, 'min_child_samples': 62, 'max_bin': 413}. Best is trial 1 with value: 0.2139322617183334.[0m
[32m[I 2021-07-26 20:00:21,987][0m Trial 2 finished with value: 0.2476411842349879 and parameters: {'max_depth': 1, 'colsample_bylevel': 0.9939819704323989, 'subsample': 0.8827098485602951, 'min_child_samples': 25, 'max_bin': 254}. Best is trial 1 with value: 0.2139322617183334.[0m
[32m[I 2021-07-26 20:00:23,424][0m Trial 3 finis

Best Score: 0.2139322617183334
Best trial: {'max_depth': 3, 'colsample_bylevel': 0.8116167224336399, 'subsample': 0.9063233020424546, 'min_child_samples': 62, 'max_bin': 413}


In [22]:
# Build the final CatBoost model from the best trial's sampled parameters.
# NOTE(review): best_trial.params holds only the sampled values; the fixed
# settings used inside objective() (learning_rate, n_estimators, random_state,
# objective) are not passed here — confirm that is intended.
cat_p = study.best_trial.params
cat = CatBoostRegressor(**cat_p)

In [29]:
# Stratify the folds on the target by cutting it into 10 equal-width bins, then
# run 6-fold CV for CatBoost: collect the validation MAE of each fold and
# average the per-fold test predictions.
y_cat = pd.cut(y, 10, labels=range(10))
skf = StratifiedKFold(6)

mae_list, preds = [], []
for tr_id, val_id in skf.split(X, y_cat):
    X_tr, y_tr = X.iloc[tr_id], y.iloc[tr_id]
    X_val, y_val = X.iloc[val_id], y.iloc[val_id]

    cat.fit(X_tr, y_tr, verbose=0)

    val_pred = cat.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    mae_list.append(mae)

    pred = cat.predict(X_test)
    preds.append(pred)

print(np.mean(mae_list))
# Undo the log1p transform of the target on the fold-averaged prediction.
cat_pred = np.expm1(np.mean(preds, axis=0))

0.2647754489268484


In [25]:
from sklearn.linear_model import Lasso, Ridge

In [26]:
# Lasso regression with its default settings apart from the seed.
ls = Lasso(random_state = 726)

In [31]:
# Cross-validate the Lasso model on the folds of `skf`: collect the validation
# MAE of each fold and average the per-fold test predictions.
mae_list, preds = [], []
for tr_id, val_id in skf.split(X, y_cat):
    X_tr, y_tr = X.iloc[tr_id], y.iloc[tr_id]
    X_val, y_val = X.iloc[val_id], y.iloc[val_id]

    ls.fit(X_tr, y_tr)

    val_pred = ls.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    mae_list.append(mae)

    pred = ls.predict(X_test)
    preds.append(pred)

print(np.mean(mae_list))
# Undo the log1p transform of the target on the fold-averaged prediction.
ls_pred = np.expm1(np.mean(preds, axis=0))

0.6263003018900815


In [33]:
# Ridge regression with its default settings apart from the seed.
rg = Ridge(random_state = 726)

In [35]:
# Cross-validate the Ridge model on the folds of `skf`: collect the validation
# MAE of each fold and average the per-fold test predictions.
mae_list, preds = [], []
for tr_id, val_id in skf.split(X, y_cat):
    X_tr, y_tr = X.iloc[tr_id], y.iloc[tr_id]
    X_val, y_val = X.iloc[val_id], y.iloc[val_id]

    rg.fit(X_tr, y_tr)

    val_pred = rg.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    mae_list.append(mae)

    pred = rg.predict(X_test)
    preds.append(pred)

print(np.mean(mae_list))
# Undo the log1p transform of the target on the fold-averaged prediction.
rg_pred = np.expm1(np.mean(preds, axis=0))

0.349613576512557


In [36]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor

In [37]:
# Random forest: 1000 trees, depth capped at 6, 'mae' split criterion.
# NOTE(review): scikit-learn 1.0 renamed this criterion to 'absolute_error' and
# 1.2 removed 'mae' — confirm the pinned scikit-learn version.
rf = RandomForestRegressor(random_state = 726, n_estimators = 1000, criterion = 'mae', max_depth = 6)

In [38]:
# Cross-validate the random forest on the folds of `skf`: collect the validation
# MAE of each fold and average the per-fold test predictions.
mae_list, preds = [], []
for tr_id, val_id in skf.split(X, y_cat):
    X_tr, y_tr = X.iloc[tr_id], y.iloc[tr_id]
    X_val, y_val = X.iloc[val_id], y.iloc[val_id]

    rf.fit(X_tr, y_tr)

    val_pred = rf.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    mae_list.append(mae)

    pred = rf.predict(X_test)
    preds.append(pred)

print(np.mean(mae_list))
# Undo the log1p transform of the target on the fold-averaged prediction.
rf_pred = np.expm1(np.mean(preds, axis=0))

0.2765466723421235


In [39]:
# Extra-trees ensemble: 1000 trees, depth capped at 5, 'mae' split criterion.
# NOTE(review): scikit-learn 1.0 renamed this criterion to 'absolute_error' and
# 1.2 removed 'mae' — confirm the pinned scikit-learn version.
etc = ExtraTreesRegressor(random_state = 726, n_estimators = 1000, criterion = 'mae', max_depth = 5)

In [40]:
# Cross-validate the extra-trees model on the folds of `skf`: collect the
# validation MAE of each fold and average the per-fold test predictions.
mae_list, preds = [], []
for tr_id, val_id in skf.split(X, y_cat):
    X_tr, y_tr = X.iloc[tr_id], y.iloc[tr_id]
    X_val, y_val = X.iloc[val_id], y.iloc[val_id]

    etc.fit(X_tr, y_tr)

    val_pred = etc.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    mae_list.append(mae)

    pred = etc.predict(X_test)
    preds.append(pred)

print(np.mean(mae_list))
# Undo the log1p transform of the target on the fold-averaged prediction.
etc_pred = np.expm1(np.mean(preds, axis=0))

0.26738916707864285


In [41]:
# AdaBoost regressor: 1000 estimators, learning rate 0.01, linear loss.
abc = AdaBoostRegressor(random_state = 726, learning_rate = .01, n_estimators = 1000, loss = 'linear')

In [42]:
# Cross-validate the AdaBoost model on the folds of `skf`: collect the
# validation MAE of each fold and average the per-fold test predictions.
mae_list, preds = [], []
for tr_id, val_id in skf.split(X, y_cat):
    X_tr, y_tr = X.iloc[tr_id], y.iloc[tr_id]
    X_val, y_val = X.iloc[val_id], y.iloc[val_id]

    abc.fit(X_tr, y_tr)

    val_pred = abc.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    mae_list.append(mae)

    pred = abc.predict(X_test)
    preds.append(pred)

print(np.mean(mae_list))
# Undo the log1p transform of the target on the fold-averaged prediction.
abc_pred = np.expm1(np.mean(preds, axis=0))

0.32261161703510216


In [43]:
from lightgbm import LGBMRegressor

In [44]:
# LightGBM regressor: 10000 boosting rounds at learning rate 0.01, depth capped at 6.
lgbm = LGBMRegressor(random_state = 726, n_estimators = 10000, max_depth = 6, learning_rate = .01)

In [45]:
# Cross-validate the LightGBM model on the folds of `skf`: collect the
# validation MAE of each fold and average the per-fold test predictions.
mae_list, preds = [], []
for tr_id, val_id in skf.split(X, y_cat):
    X_tr, y_tr = X.iloc[tr_id], y.iloc[tr_id]
    X_val, y_val = X.iloc[val_id], y.iloc[val_id]

    # NOTE(review): LightGBM 4.x no longer accepts `verbose` in fit() — confirm the pinned version.
    lgbm.fit(X_tr, y_tr, verbose=0)

    val_pred = lgbm.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    mae_list.append(mae)

    pred = lgbm.predict(X_test)
    preds.append(pred)

print(np.mean(mae_list))
# Undo the log1p transform of the target on the fold-averaged prediction.
lgbm_pred = np.expm1(np.mean(preds, axis=0))

0.32631968521638555


In [46]:
from sklearn.tree import DecisionTreeRegressor

In [49]:
# Single decision tree: depth capped at 6, 'mse' split criterion.
# NOTE(review): scikit-learn 1.0 renamed this criterion to 'squared_error' and
# 1.2 removed 'mse' — confirm the pinned scikit-learn version.
dt = DecisionTreeRegressor(random_state = 726, criterion = 'mse', max_depth = 6)

In [51]:
# Cross-validate the decision tree on the folds of `skf`: collect the validation
# MAE of each fold and average the per-fold test predictions.
mae_list, preds = [], []
for tr_id, val_id in skf.split(X, y_cat):
    X_tr, y_tr = X.iloc[tr_id], y.iloc[tr_id]
    X_val, y_val = X.iloc[val_id], y.iloc[val_id]

    dt.fit(X_tr, y_tr)

    val_pred = dt.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    mae_list.append(mae)

    pred = dt.predict(X_test)
    preds.append(pred)

print(np.mean(mae_list))
# Undo the log1p transform of the target on the fold-averaged prediction.
dt_pred = np.expm1(np.mean(preds, axis=0))

0.324609926414907


In [54]:
sample = pd.read_csv('sample_submission.csv')

# Equal-weight blend of the extra-trees, AdaBoost, CatBoost and random-forest
# predictions, labelled with the complex code of each X_test row.
blend = pd.Series((etc_pred + abc_pred + cat_pred + rf_pred) / 4, index=X_test.index)

# Align by complex code instead of by row position, so a difference in row
# order between X_test and the submission template cannot mis-assign values.
sample['num'] = blend.reindex(sample['code']).to_numpy()
assert sample['num'].notna().all(), 'submission contains codes with no prediction'

sample.to_csv('cat.csv', index=False)

In [53]:
# Preview the first rows of the submission instead of rendering the whole frame.
sample.head(10)

Unnamed: 0,code,num
0,C1072,677.021199
1,C1128,1103.112293
2,C1456,504.808036
3,C1840,502.099697
4,C1332,965.566019
5,C1563,1429.695599
6,C1794,842.7594
7,C1640,418.323587
8,C1377,360.905548
9,C2072,294.888276
