## 데이터 불러오기

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [2]:
# XGBoost 라이브러리 설치
!pip install xgboost

import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# LightGBM 라이브러리 설치
!pip install lightgbm

import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split



## EDA 및 전처리

In [98]:
train

Unnamed: 0,exclusive_use_area,floor,year_of_completion,transaction_real_price,transaction_year,transaction_month
0,158.54,13,1983,174000,2014,1
1,127.61,6,1983,157500,2014,1
2,127.61,5,1983,150000,2014,1
3,127.61,9,1983,152000,2014,2
4,84.81,3,1983,116000,2014,2
...,...,...,...,...,...,...
5982,190.47,8,1985,507500,2022,5
5983,126.33,10,1985,380000,2022,6
5984,126.33,10,1985,380000,2022,6
5985,126.33,7,1985,380000,2022,7


In [99]:
test

Unnamed: 0,exclusive_use_area,floor,year_of_completion,transaction_year,transaction_month
0,77.97,2,2021,2023,4
1,59.99,2,2021,2023,4
2,84.81,3,1983,2023,1
3,84.81,9,1983,2023,1
4,84.81,5,1983,2023,2
...,...,...,...,...,...
191,84.96,1,1985,2023,6
192,126.33,1,1985,2023,6
193,190.47,13,1985,2023,6
194,126.33,12,1985,2023,6


### 사용 할 변수만 선택

In [4]:
train = train[['exclusive_use_area', 'transaction_year_month', 'floor', 'year_of_completion', 'transaction_real_price']]
test = test[['exclusive_use_area', 'transaction_year_month', 'floor', 'year_of_completion']]

### `년월` 변수를 연도/월 로 분리

In [5]:
train['transaction_year'] = train['transaction_year_month'].apply(lambda x : int(str(x)[:4]))
train['transaction_month'] = train['transaction_year_month'].apply(lambda x : int(str(x)[4:]))

In [6]:
test['transaction_year'] =test['transaction_year_month'].apply(lambda x : int(str(x)[:4]))
test['transaction_month'] = test['transaction_year_month'].apply(lambda x : int(str(x)[4:]))

In [7]:
train.drop('transaction_year_month',axis= 1 ,inplace = True)
test.drop('transaction_year_month',axis= 1 ,inplace = True)

In [100]:
train

Unnamed: 0,exclusive_use_area,floor,year_of_completion,transaction_real_price,transaction_year,transaction_month
0,158.54,13,1983,174000,2014,1
1,127.61,6,1983,157500,2014,1
2,127.61,5,1983,150000,2014,1
3,127.61,9,1983,152000,2014,2
4,84.81,3,1983,116000,2014,2
...,...,...,...,...,...,...
5982,190.47,8,1985,507500,2022,5
5983,126.33,10,1985,380000,2022,6
5984,126.33,10,1985,380000,2022,6
5985,126.33,7,1985,380000,2022,7


## 모델링

In [9]:
!pip install scikit-learn
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [101]:
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
#, loss_function='MAE'


# 데이터 로드
train_x = train.drop('transaction_real_price', axis=1)
train_y = train['transaction_real_price']

X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.03273, random_state=42)

model_lgb = LGBMRegressor(colsample_bytree=0.9, learning_rate=0.2, max_depth=8, n_estimators=200, subsample=0.8)
model_rf = RandomForestRegressor(n_estimators=150, max_depth=15, random_state=42)
model_xgb = XGBRegressor(learning_rate=0.01, n_estimators=150, max_depth=6, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.005, random_state=42)
model_catboost = CatBoostRegressor(iterations=10000, learning_rate=1, depth=4, loss_function='MAE')
model_ridge = Ridge(alpha=0.1)
model_gb = GradientBoostingRegressor(learning_rate=0.1, n_estimators=150, max_depth=3, random_state=42)
model_svr = SVR(kernel='rbf', C=1.0, epsilon=0.2)
model_adaboost = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(max_depth=100),
    n_estimators=200,
    learning_rate=0.01,
    loss='linear'
)

# 개별 모델들의 리스트를 생성합니다.
voting_models = [
    ('lgb', model_lgb),
    ('rf', model_rf),
    ('xgb', model_xgb),
    ('catboost', model_catboost),
    ('ridge', model_ridge),
    ('gb', model_gb),
    ('svr', model_svr),
    ('adaboost', model_adaboost)
]

# 보팅 앙상블 모델을 생성합니다.
voting_ensemble = VotingRegressor(estimators=voting_models)

# 보팅 앙상블 모델을 학습합니다.
voting_ensemble.fit(X_train, y_train)

# 검증 데이터에 대한 예측을 수행합니다.
val_pred_voting = voting_ensemble.predict(X_val)

# 검증 데이터에 대한 MAE를 계산합니다.
mae_voting = mean_absolute_error(y_val, val_pred_voting)
print("보팅 앙상블 검증 MAE:", mae_voting)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
5001:	learn: 3910.7983007	total: 6.56s	remaining: 6.55s
5002:	learn: 3910.7017655	total: 6.56s	remaining: 6.56s
5003:	learn: 3910.6983380	total: 6.57s	remaining: 6.56s
5004:	learn: 3910.6602231	total: 6.57s	remaining: 6.55s
5005:	learn: 3910.6539684	total: 6.57s	remaining: 6.55s
5006:	learn: 3910.6521853	total: 6.57s	remaining: 6.55s
5007:	learn: 3910.3816443	total: 6.57s	remaining: 6.55s
5008:	learn: 3910.3784701	total: 6.57s	remaining: 6.55s
5009:	learn: 3910.1981518	total: 6.57s	remaining: 6.55s
5010:	learn: 3910.1886288	total: 6.58s	remaining: 6.55s
5011:	learn: 3910.1871475	total: 6.58s	remaining: 6.54s
5012:	learn: 3910.1661873	total: 6.58s	remaining: 6.54s
5013:	learn: 3910.1468283	total: 6.58s	remaining: 6.54s
5014:	learn: 3909.9346467	total: 6.58s	remaining: 6.54s
5015:	learn: 3909.9291879	total: 6.58s	remaining: 6.54s
5016:	learn: 3909.9169537	total: 6.58s	remaining: 6.54s
5017:	learn: 3909.7970114	total: 6.58s	remaining: 6.54

## 예측

In [102]:
# 테스트 데이터에 대한 예측을 수행
pred = voting_ensemble.predict(test)


## 정답파일 제출

In [103]:

submission = pd.read_csv('sample_submission.csv')
submission.head()

Unnamed: 0,id,transaction_real_price
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0


In [104]:
submission['transaction_real_price'] = pred
submission.head(14)

Unnamed: 0,id,transaction_real_price
0,TEST_0000,217673.743115
1,TEST_0001,188109.371478
2,TEST_0002,256284.361232
3,TEST_0003,258599.609684
4,TEST_0004,258045.691187
5,TEST_0005,359263.176908
6,TEST_0006,262710.35552
7,TEST_0007,349184.698923
8,TEST_0008,381269.902981
9,TEST_0009,232837.526448


In [105]:

submission.to_csv('voting_ensemble.csv',index = False)