In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the preprocessed dataset (the path looks like a Colab-mounted Google Drive folder).
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/전처리5.09_진영.csv')

In [None]:
# One-hot encode the `month` column.
# Column labels are derived from the categories the encoder actually found, so
# every label is guaranteed to match its column and the cell no longer fails
# when a month is absent from the data.
# NOTE(review): int(m) assumes month values are numeric (1-12) — confirm.
encoder = OneHotEncoder()
month_onehot = encoder.fit_transform(df[['month']]).toarray()
df_cate = pd.DataFrame(
    month_onehot,
    columns=[f'month_{int(m)}' for m in encoder.categories_[0]],
    index=df.index,  # same row labels as df, so an axis=1 concat aligns row by row
)

In [None]:
# Append the one-hot month columns, then discard the original `month` column.
df = pd.concat([df, df_cate], axis=1).drop(columns='month')

# buildingarea, yearbuilt 삭제

In [None]:
df2 = df.copy()
# Cap upper outliers: any value above its threshold (Q3 + 1.5*IQR) is replaced by the cap.
# Each entry maps column -> (threshold, replacement value).
# `propertycount` keeps the original behavior: compared against 19257.5 but
# replaced with the rounded value 19258.
upper_caps = {
    'price': (2350000, 2350000),
    'distance': (23.35, 23.35),
    'landsize': (1362, 1362),
    'propertycount': (19257.5, 19258),
}
for column, (threshold, cap) in upper_caps.items():
    # Vectorized replacement. NaN > threshold is False, so missing values are
    # left untouched, exactly like the former element-wise loops.
    df2[column] = df2[column].mask(df2[column] > threshold, cap)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler


def calculate_original_scale_metrics(y_test_log, y_pred_log):
    """Score log1p-scale predictions on the original scale.

    Both inputs are mapped back with ``np.expm1`` (the inverse of ``np.log1p``)
    before any metric is computed.

    Args:
        y_test_log: True target values on the log1p scale.
        y_pred_log: Predicted target values on the log1p scale.

    Returns:
        Tuple ``(rmse, mae, r2)`` computed on the back-transformed values.
    """
    actual, predicted = np.expm1(y_test_log), np.expm1(y_pred_log)

    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(actual, predicted)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, predicted),
        r2_score(actual, predicted),
    )

# Features are every column except the target, the raw date and the coordinates.
# Features and target are both log1p-transformed.
X = np.log1p(df2.drop(columns=['price', 'date', 'lattitude', 'longtitude']))
y = np.log1p(df2['price'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
# The rebuilt frames get a fresh default index; y_train / y_test keep theirs.
scaler = MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# Candidate models as (short name, estimator) pairs.
# Every estimator with a random component gets a fixed seed so the comparison
# table is reproducible across kernel restarts (42 matches the seed used for the
# train/test split and the tuned models elsewhere in this notebook).
# NOTE(review): Lasso / ElasticNet run with their default regularization; the
# recorded output shows both with a negative R-square, so they probably need
# tuning before being compared — confirm.
models = [
    ('LR', LinearRegression()),
    ('LASSO', Lasso()),
    ('EN', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor(random_state=42)),
    ('SVR', SVR()),
    ('ABR', AdaBoostRegressor(random_state=42)),
    ('GBR', GradientBoostingRegressor(random_state=42)),
    ('RFR', RandomForestRegressor(random_state=42)),
    ('ETR', ExtraTreesRegressor(random_state=42)),
    ('XGB', XGBRegressor(random_state=42)),
    ('CAT', CatBoostRegressor(random_state=42, silent=True)),
    ('BAG', BaggingRegressor(random_state=42))
]

# Fit every candidate and score it on the held-out split (original price scale).
models_names = [label for label, _ in models]
r2_scores, rmse_scores, mae_scores = [], [], []

for model_name, estimator in models:
    estimator.fit(X_train, y_train)
    y_pred_log = estimator.predict(X_test)
    rmse, mae, r2 = calculate_original_scale_metrics(y_test, y_pred_log)

    # Route each metric into its own list, one entry per model.
    for bucket, score in ((r2_scores, r2), (rmse_scores, rmse), (mae_scores, mae)):
        bucket.append(score)

# Collect the metrics into one table, one row per model.
performance_df = pd.DataFrame(
    dict(
        zip(
            ['Model', 'R-square', 'RMSE', 'MAE'],
            [models_names, r2_scores, rmse_scores, mae_scores],
        )
    )
)

# Show the table ordered by MAE, largest error first.
performance_df.sort_values(by='MAE', ascending=False)

Unnamed: 0,Model,R-square,RMSE,MAE
1,LASSO,-0.057843,542259.883265,409290.595363
2,EN,-0.057843,542259.883265,409290.595363
6,ABR,0.42797,398755.081886,289667.367683
3,KNN,0.650056,311886.419498,221348.253126
4,CART,0.688428,294290.397941,204008.185032
0,LR,0.729732,274090.69544,194581.900311
7,GBR,0.770065,252813.07298,175569.12674
5,SVR,0.769486,253131.089332,175034.736251
9,ETR,0.794621,238932.338766,163064.10248
12,BAG,0.815247,226616.899079,156901.273567


In [None]:
# Simple averaging ensemble (base-model predictions are averaged; no meta-learner is trained)
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that predicts the mean of its base models' predictions.

    Args:
        models: Iterable of estimators. Each one is cloned inside ``fit`` and
            only the clones are trained.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on ``(X, y)`` and return ``self``."""
        self.models_ = list(map(clone, self.models))

        for fitted in self.models_:
            fitted.fit(X, y)

        return self

    def predict(self, X):
        """Return, for each row of ``X``, the mean of the fitted clones' predictions."""
        # One column per base model, then average across the columns.
        per_model = [estimator.predict(X) for estimator in self.models_]
        return np.mean(np.column_stack(per_model), axis=1)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# Hyperparameter search space for CatBoost.
# scipy's randint excludes the upper bound: depth is drawn from 4..9, l2_leaf_reg from 1..9.
param_dist = dict(
    learning_rate=[0.01, 0.05, 0.1],
    depth=sp_randint(4, 10),
    l2_leaf_reg=sp_randint(1, 10),
    iterations=[100, 200, 300],
)

# Base CatBoost regressor with its training log silenced.
catboost = CatBoostRegressor(random_state=42, verbose=False)

# Randomized search: 50 sampled candidates, each scored by 5-fold CV on negative MSE.
# fit() returns the search object itself, so construction and fitting are chained.
random_search = RandomizedSearchCV(
    estimator=catboost,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
).fit(X_train, y_train)

# Report the best hyperparameters found.
print("Best hyperparameters:", random_search.best_params_)

In [None]:
# Hyperparameter search space for XGBoost.
param_dist = dict(
    n_estimators=sp_randint(50, 500),                 # number of trees
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],        # learning rate
    max_depth=[3, 4, 5, 6, 7],                        # tree depth
    min_child_weight=sp_randint(1, 10),               # minimum sum of instance weight needed in a child
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],       # fraction of features sampled for each tree
)

# Base XGBoost regressor.
xgb = XGBRegressor(random_state=42)

# Randomized search: 50 sampled candidates, 5-fold CV on negative MSE, all cores.
# fit() returns the search object itself, so construction and fitting are chained.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
    verbose=1,
    n_jobs=-1,
).fit(X_train, y_train)

# Report the best hyperparameters found.
print("Best hyperparameters:", random_search.best_params_)

In [None]:
# Hyperparameter search space for the random forest.
# `max_features='auto'` was removed in scikit-learn 1.3. For RandomForestRegressor
# it meant "use all features", which `1.0` expresses on old and new versions alike.
param_dist = {
    'n_estimators': sp_randint(50, 500),  # number of trees
    'max_features': [1.0, 'sqrt'],  # features considered per split (1.0 = all)
    'max_depth': [None] + list(range(5, 30, 5)),  # tree depth (None = unlimited)
    'min_samples_split': sp_randint(2, 20),  # minimum samples required to split a node
    'min_samples_leaf': sp_randint(1, 20)  # minimum samples required at a leaf
}

# Base random forest regressor.
rf = RandomForestRegressor(random_state=42)

# Randomized search: 50 sampled candidates, 5-fold CV on negative MSE, all cores.
random_search = RandomizedSearchCV(estimator=rf,
                                   param_distributions=param_dist,
                                   n_iter=50,
                                   scoring='neg_mean_squared_error',
                                   cv=5,
                                   random_state=42,
                                   verbose=1,
                                   n_jobs=-1)

# Run the search.
random_search.fit(X_train, y_train)

# Report the best hyperparameters found.
print("Best hyperparameters:", random_search.best_params_)

In [None]:
# Average the top three models, each configured with the hyperparameters found by random search.
# - max_features=1.0 replaces 'auto' (removed in scikit-learn 1.3); for a forest
#   regressor both mean "use all features".
# - verbose=False keeps CatBoost from printing one log line per boosting iteration,
#   matching how CatBoost is configured in the other cells.
averaged_models = AveragingModels(models = (RandomForestRegressor(random_state=42, max_depth=25, max_features=1.0, min_samples_leaf=3, min_samples_split=2, n_estimators=276),
                                            XGBRegressor(random_state=42, colsample_bytree=0.8, learning_rate=0.05, max_depth=7, min_child_weight=6, n_estimators=465),
                                            CatBoostRegressor(depth=8, iterations=300, l2_leaf_reg=3, learning_rate=0.1, random_state=42, verbose=False)))

In [None]:
# Author's note: the averaged ensemble scored slightly better than CatBoost,
# the best-performing single model.
averaged_models.fit(X_train, y_train)
y_pred_log = averaged_models.predict(X_test.values)

# Metrics are reported on the original (back-transformed) price scale.
rmse, mae, r2 = calculate_original_scale_metrics(y_test, y_pred_log)
for label, value in (("RMSE:", rmse), ("MAE:", mae), ("R-squared:", r2)):
    print(label, value)

0:	learn: 0.4761677	total: 25.9ms	remaining: 7.75s
1:	learn: 0.4515877	total: 40.7ms	remaining: 6.06s
2:	learn: 0.4285633	total: 52.6ms	remaining: 5.21s
3:	learn: 0.4082201	total: 65ms	remaining: 4.81s
4:	learn: 0.3910125	total: 82.7ms	remaining: 4.88s
5:	learn: 0.3749486	total: 110ms	remaining: 5.4s
6:	learn: 0.3590552	total: 137ms	remaining: 5.74s
7:	learn: 0.3454523	total: 161ms	remaining: 5.89s
8:	learn: 0.3348516	total: 186ms	remaining: 6.02s
9:	learn: 0.3248175	total: 211ms	remaining: 6.13s
10:	learn: 0.3148273	total: 236ms	remaining: 6.19s
11:	learn: 0.3080645	total: 251ms	remaining: 6.02s
12:	learn: 0.2997185	total: 271ms	remaining: 5.99s
13:	learn: 0.2935826	total: 291ms	remaining: 5.94s
14:	learn: 0.2869312	total: 313ms	remaining: 5.95s
15:	learn: 0.2808121	total: 334ms	remaining: 5.93s
16:	learn: 0.2751356	total: 353ms	remaining: 5.88s
17:	learn: 0.2708674	total: 374ms	remaining: 5.86s
18:	learn: 0.2671676	total: 390ms	remaining: 5.77s
19:	learn: 0.2631679	total: 413ms	remai

# buildingarea, yearbuilt 활용 (단, 모든 결측치 제거)

In [None]:
# Load the raw Melbourne data file; it supplies the BuildingArea / YearBuilt columns used below.
raw_df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/melb_data.csv')

In [None]:
# Copy df and attach the two raw columns under lowercase names.
# Values are matched to df by index label.
# NOTE(review): assumes df rows still line up with raw_df rows — confirm.
df1 = df.assign(
    buildingarea=raw_df['BuildingArea'],
    yearbuilt=raw_df['YearBuilt'],
)

In [None]:
# Keep only the rows that have no missing value in any column.
by_df = df1.dropna(axis=0, how='any')

In [None]:
# Upper outlier fence (Q3 + 1.5 * IQR) for the building area.
col = 'buildingarea'
q1, q3 = (by_df[col].quantile(level) for level in (0.25, 0.75))
iqr = q3 - q1
print(q3 + 1.5 * iqr)

292.375


In [None]:
# Cap upper outliers: values above Q3 + 1.5*IQR are replaced by that bound.
upper_caps = {
    'price': 2392500.0,
    'distance': 23.35,
    'landsize': 1351.0,
    'propertycount': 18860.0,
    'buildingarea': 292.375,
}
# Series.mask does the replacement in one vectorized pass per column, and
# assign() builds a new frame instead of writing into the dropna() result.
by_df = by_df.assign(**{
    column: by_df[column].mask(by_df[column] > cap, cap)
    for column, cap in upper_caps.items()
})

# Keep only rows whose construction year is above 1196.
# NOTE(review): 1196 looks like a cutoff for an implausible yearbuilt entry — confirm.
by_df = by_df[by_df['yearbuilt'] > 1196]

In [None]:
# Features are every column except the target, the raw date and the coordinates.
# Features and target are both log1p-transformed.
X = np.log1p(by_df.drop(columns=['price', 'date', 'lattitude', 'longtitude']))
y = np.log1p(by_df['price'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
# The rebuilt frames get a fresh default index; y_train / y_test keep theirs.
scaler = MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [None]:

# Candidate models as (short name, estimator) pairs.
# Every estimator with a random component gets a fixed seed so the comparison
# table is reproducible across kernel restarts (42 matches the seed used for the
# train/test split and the tuned models elsewhere in this notebook).
# NOTE(review): Lasso / ElasticNet run with their default regularization; the
# recorded output shows both with a negative R-square, so they probably need
# tuning before being compared — confirm.
models = [
    ('LR', LinearRegression()),
    ('LASSO', Lasso()),
    ('EN', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor(random_state=42)),
    ('SVR', SVR()),
    ('ABR', AdaBoostRegressor(random_state=42)),
    ('GBR', GradientBoostingRegressor(random_state=42)),
    ('RFR', RandomForestRegressor(random_state=42)),
    ('ETR', ExtraTreesRegressor(random_state=42)),
    ('XGB', XGBRegressor(random_state=42)),
    ('CAT', CatBoostRegressor(random_state=42, silent=True)),
    ('BAG', BaggingRegressor(random_state=42))
]

# Fit every candidate and score it on the held-out split (original price scale).
models_names = [label for label, _ in models]
r2_scores, rmse_scores, mae_scores = [], [], []

for model_name, estimator in models:
    estimator.fit(X_train, y_train)
    y_pred_log = estimator.predict(X_test)
    rmse, mae, r2 = calculate_original_scale_metrics(y_test, y_pred_log)

    # Route each metric into its own list, one entry per model.
    for bucket, score in ((r2_scores, r2), (rmse_scores, rmse), (mae_scores, mae)):
        bucket.append(score)

# Collect the metrics into one table, one row per model.
performance_df = pd.DataFrame(
    dict(
        zip(
            ['Model', 'R-square', 'RMSE', 'MAE'],
            [models_names, r2_scores, rmse_scores, mae_scores],
        )
    )
)

# Show the table ordered by MAE, largest error first.
performance_df.sort_values(by='MAE', ascending=False)

Unnamed: 0,Model,R-square,RMSE,MAE
1,LASSO,-0.050928,560691.713917,425229.251792
2,EN,-0.050928,560691.713917,425229.251792
6,ABR,0.630523,332453.802972,245996.863627
3,KNN,0.650917,323148.167379,229586.105098
4,CART,0.679022,309866.891669,214094.077988
0,LR,0.762708,266427.804957,188461.555489
5,SVR,0.805407,241269.151426,167635.532943
7,GBR,0.824889,228872.592959,158814.927025
12,BAG,0.827709,227022.714227,156088.450085
8,RFR,0.846016,214622.280294,147581.48066


In [None]:
# Average the top three models, each left at its default hyperparameters (seed fixed).
averaged_models = AveragingModels(
    (
        ExtraTreesRegressor(random_state=42),
        XGBRegressor(random_state=42),
        CatBoostRegressor(random_state=42, verbose=False),
    )
)

In [None]:
# Author's note: the averaged ensemble did not improve performance here,
# so the next step is to tune the single CatBoost model instead.
averaged_models.fit(X_train, y_train)
y_pred_log = averaged_models.predict(X_test.values)

# Metrics are reported on the original (back-transformed) price scale.
rmse, mae, r2 = calculate_original_scale_metrics(y_test, y_pred_log)
for label, value in (("RMSE:", rmse), ("MAE:", mae), ("R-squared:", r2)):
    print(label, value)

RMSE: 192726.74812466698
MAE: 131824.26463234148
R-squared: 0.8758322839902032


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# Hyperparameter search space for CatBoost.
# scipy's randint excludes the upper bound: depth is drawn from 4..9, l2_leaf_reg from 1..9.
param_dist = dict(
    learning_rate=[0.01, 0.05, 0.1],
    depth=sp_randint(4, 10),
    l2_leaf_reg=sp_randint(1, 10),
    iterations=[100, 200, 300],
)

# Base CatBoost regressor with its training log silenced.
catboost = CatBoostRegressor(random_state=42, verbose=False)

# Randomized search: 50 sampled candidates, each scored by 5-fold CV on negative MSE.
# fit() returns the search object itself, so construction and fitting are chained.
random_search = RandomizedSearchCV(
    estimator=catboost,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
).fit(X_train, y_train)

# Report the best hyperparameters found.
print("Best hyperparameters:", random_search.best_params_)

Best hyperparameters: {'depth': 6, 'iterations': 300, 'l2_leaf_reg': 1, 'learning_rate': 0.1}


In [None]:
# Author's note: tuning lowered the results, so the default CatBoost model is used.
best_model = random_search.best_estimator_

# Predict on the test split and report metrics on the original price scale.
y_pred_log = best_model.predict(X_test)
rmse, mae, r2 = calculate_original_scale_metrics(y_test, y_pred_log)
for label, value in (("RMSE:", rmse), ("MAE:", mae), ("R-squared:", r2)):
    print(label, value)

RMSE: 190533.47261230001
MAE: 131652.0561963088
R-squared: 0.8756150546022569


 # buildingarea 컬럼을 제거하고 yearbuilt 컬럼의 결측치를 행 제거하여 df2 생성


In [None]:
# Copy df and attach the two raw columns under lowercase names.
# Values are matched to df by index label.
# NOTE(review): assumes df rows still line up with raw_df rows — confirm.
df1 = df.assign(
    buildingarea=raw_df['BuildingArea'],
    yearbuilt=raw_df['YearBuilt'],
)

In [None]:
# Drop the buildingarea column, then remove the rows where yearbuilt is missing.
df2 = df1.drop(columns=['buildingarea']).dropna(subset=['yearbuilt'])

In [None]:
# Bare expression: the notebook renders df2 as this cell's output.
df2

Unnamed: 0,rooms,price,date,distance,bedroom2,bathroom,car,landsize,lattitude,longtitude,...,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,yearbuilt
1,2,1035000.0,4/02/2016,2.5,2.0,1.0,0.0,156.0,-37.80790,144.99340,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1900.0
2,3,1465000.0,4/03/2017,2.5,3.0,2.0,0.0,134.0,-37.80930,144.99440,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1900.0
4,4,1600000.0,4/06/2016,2.5,3.0,1.0,2.0,120.0,-37.80720,144.99410,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014.0
6,3,1876000.0,7/05/2016,2.5,4.0,2.0,0.0,245.0,-37.80240,144.99930,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1910.0
7,2,1636000.0,8/10/2016,2.5,2.0,1.0,2.0,256.0,-37.80600,144.99540,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1890.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,4,1245000.0,26/08/2017,16.7,4.0,2.0,2.0,652.0,-37.90562,145.16761,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1981.0
13576,3,1031000.0,26/08/2017,6.8,3.0,2.0,2.0,333.0,-37.85927,144.87904,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1995.0
13577,3,1170000.0,26/08/2017,6.8,3.0,2.0,4.0,436.0,-37.85274,144.88738,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1997.0
13578,4,2500000.0,26/08/2017,6.8,4.0,1.0,5.0,866.0,-37.85908,144.89299,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1920.0
