In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

#한글폰트 설정
import matplotlib.font_manager as fm
path = 'C:\\Users\\myksh\\AppData\\Local\\Microsoft\\Windows\\Fonts\\NanumGothic.ttf'
# path = 'C:\\Users\\myksh\\AppData\\Local\\Microsoft\\Windows\\Fonts\\NanumSquare.ttf'
font_name = fm.FontProperties(fname=path).get_name()
print(font_name)
plt.rc('font', family=font_name)

plt.rcParams['font.family'] = 'NanumGothic'

#마이너스가 깨질 것을 방지
plt.rcParams['axes.unicode_minus'] = False

NanumGothic


# 데이터 로드 및 전처리

In [12]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [13]:
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [14]:
#연대 정리해주는 함수
def tail_year(x):
    if 0<=x<10:
        return '00'
    elif 10<=x<20:
        return '10'
    elif 20<=x<30:
        return '20'
    elif 30<=x<40:
        return '30'
    elif 40<=x<50:
        return '40'
    elif 50<=x<60:
        return '50'
    elif 60<=x<70:
        return '60'
    elif 70<=x<80:
        return '70'
    elif 80<=x<90:
        return '80'
    elif 90<=x<100:
        return '90'
def year_processing(x):
    xx = str(x)
    if xx[:2] == '18':
        return '18' + tail_year(int(xx[2:]))
    elif xx[:2] == '19':
        return '19' + tail_year(int(xx[2:]))
    elif xx[:2] == '20':
        return '20' + tail_year(int(xx[2:]))

In [15]:
#연대별로 변경
train['Year Built'] = train['Year Built'].apply(lambda x:year_processing(x))
train['Year Built'] = train['Year Built'].astype(int)

test['Year Built'] = test['Year Built'].apply(lambda x:year_processing(x))
test['Year Built'] = test['Year Built'].astype(int)

#연대별로 정리
train['Year Remod/Add'] = train['Year Remod/Add'].apply(lambda x:year_processing(x))
train['Year Remod/Add'] = train['Year Remod/Add'].astype(int)

test['Year Remod/Add'] = test['Year Remod/Add'].apply(lambda x:year_processing(x))
test['Year Remod/Add'] = test['Year Remod/Add'].astype(int)

In [16]:
#차고 자리 개수와 차고 면적은 의미가 비슷하므로 자리 개수를 drop
train = train.drop('Garage Cars', axis=1)
test = test.drop('Garage Cars', axis=1)

In [17]:
#2207년 데이터 삭제
train = train.drop(train[train['Garage Yr Blt']>=2022].index)

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cols = ['Exter Qual','Kitchen Qual','Bsmt Qual']
for i in cols:
    train[i] = le.fit_transform(train[i])
    test[i] = le.fit_transform(test[i])

## 파생변수
- 참고 자료 : https://dacon.io/competitions/official/235869/codeshare/4304?page=1&dtype=recent
- 2층 면적 2nd flr SF= 지상층 생활 면적 - 1층 면적
- 2층 여부 2nd flr= 1(지상층 생활 면적 - 1층 면적 > 0), 0(지상층 생활 면적 - 1층 면적 < 0)
- _전체 면적 Total SF = 지상층 생활 면적 + 지하실 면적 + 차고 면적 **(제거)**_ 
- 차고 밖/안 Garage In/Out = 1(지상층 생활 면적 != 1층 면적), 0(지상층 생활 면적 == 1층 면적)
- 리모델링 연도 차 Year Gap Remod = 리모델링 연도 - 완공 연도
- 차고 자리당 면적 Car Area= 차고 면적/차고 자리 개수
- 품질 합 Sum Qual = (전반적 + 부억 + 재료 + 지하실) 품질

In [19]:
def feature_eng(data_):
    data = data_.copy()
    data['Year Gap Remod'] = data['Year Remod/Add'] - data['Year Built']
#     data['Car Area'] = data['Garage Area']/data['Garage Cars']
    data['2nd flr SF'] = data['Gr Liv Area'] - data['1st Flr SF']
    data['2nd flr'] = data['2nd flr SF'].apply(lambda x : 1 if x > 0 else 0)
    data['Total SF'] = data[['Gr Liv Area',"Garage Area", "Total Bsmt SF"]].sum(axis=1)
    data['Sum Qual'] = data[["Exter Qual", "Kitchen Qual", "Overall Qual"]].sum(axis=1)
    data['Garage InOut'] = data.apply(lambda x : 1 if x['Gr Liv Area'] != x['1st Flr SF'] else 0, axis=1)
    return data

train = feature_eng(train)
test = feature_eng(test)

# Modeling

In [20]:
# 대회 규칙의 평가 산식 함수를 그대로 사용
def NMAE(true, pred):
    mae = np.mean(np.abs(true-pred))
    score = mae / np.mean(np.abs(true))
    return score

In [21]:
X = train.drop('target', axis=1)
y = np.log1p(train['target'])

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape

((1079, 18), (270, 18))

In [23]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [24]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [25]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

In [57]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

rf = RandomForestRegressor(max_depth=20, max_features='sqrt', min_samples_split=4, n_estimators=2000, random_state =5)
catboost = CatBoostRegressor(depth = 4, random_state = 42, loss_function = 'MAE', n_estimators = 3000, learning_rate = 0.03, verbose = 0)

In [58]:
score = rmsle_cv(rf)
print("rf score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(catboost)
print("catboost score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(lasso)
print("Lasso score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})" .format(score.mean(), score.std()))

rf score: 0.1428 (0.0262)
catboost score: 0.1387 (0.0215)
Lasso score: 0.1453 (0.0264)
ElasticNet score: 0.1453 (0.0264)
Kernel Ridge score: 0.1450 (0.0190)
Gradient Boosting score: 0.1452 (0.0247)
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification

In [63]:
rf_pred = rf.fit(X_train, y_train)
catboost_pred = catboost.fit(X_train, y_train)
krr_pred = KRR.fit(X_train, y_train)
gb_pred = GBoost.fit(X_train, y_train)
xgb_pred = model_xgb.fit(X_train, y_train)
lgb_pred = model_lgb.fit(X_train, y_train)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [94]:
rf_pred = rf.predict(X_test)
catboost_pred = catboost.predict(X_test)
krr_pred = KRR.predict(X_test)
gb_pred = GBoost.predict(X_test)
xgb_pred = model_xgb.predict(X_test)
lgb_pred = model_lgb.predict(X_test)

# final_pred = ((catboost_pred*0.4) + (rf_pred*0.3) + (xgb_pred*0.3))*0.6 + ((krr_pred*0.5)+(lgb_pred*0.5))*0.4
final_pred = (catboost_pred*0.5) + (xgb_pred * 0.3) + (lgb_pred * 0.2)

In [95]:
NMAE(y_test, final_pred)

0.008367756924140964

In [96]:
NMAE(np.expm1(y_test), np.expm1(final_pred))

0.10017142843223523

In [None]:
# !pip install catboost ngboost

In [38]:
from catboost import CatBoostRegressor, Pool
from ngboost import NGBRegressor

In [106]:
from sklearn.ensemble import VotingRegressor
models = [
    ('catboost', catboost),
#     ('krr', KRR),
#     ('rf', rf),
#     ('gbr', GBoost),
    ('xgb',  model_xgb),
    ('lgbm', model_lgb)
]

voting_rg = VotingRegressor(estimators=models)
voting_rg.fit(X_train, y_train)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




VotingRegressor(estimators=[('catboost',
                             <catboost.core.CatBoostRegressor object at 0x00000252FED29BB0>),
                            ('xgb',
                             XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=0.4603, gamma=0.0468,
                                          gpu_id=-1, importance_type='gain',
                                          interaction_constraints='',
                                          learning_rate=0.05, max_delta_step=0,
                                          max_depth=3, min_child_weight=1.7817,
                                          missing=nan,
                                          monotone_constraints='()',
                                          n_estimators=2200, n_jobs=8,
                                          nth

In [107]:
NMAE(np.expm1(y_test), np.expm1(voting_rg.predict(X_test)))

0.1010011228017121

In [100]:
catboost_pred = catboost.predict(test)
xgb_pred = model_xgb.predict(test)
# lgb_pred = model_lgb.predict(test)
# final_pred = (catboost_pred*0.5) + (xgb_pred * 0.3) + (lgb_pred * 0.2)
final_pred = (catboost_pred*0.5) + (xgb_pred * 0.5)

In [102]:
# final_model = voting_rg
# pred = final_model.predict(test)
sub = pd.read_csv('./data/sample_submission.csv')
sub['target'] = np.expm1(final_pred)
sub

Unnamed: 0,id,target
0,1,350046.091851
1,2,125369.560169
2,3,172916.387713
3,4,258615.267504
4,5,131589.278620
...,...,...
1345,1346,316173.689777
1346,1347,125480.102252
1347,1348,84485.913125
1348,1349,186160.483757


In [103]:
sub.to_csv('./submission_data/voting_submission10.csv', index=False)