<a href="https://colab.research.google.com/github/roklp/MLP34/blob/main/2024_03_19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

RF

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

data = pd.read_csv("/content/drive/MyDrive/data/머신러닝/data/final_reordered.csv")

print(data.columns)


In [13]:
print(data.columns)


Index(['매출_금액', '기준_년도', '상권_구분_코드_명', '상권_코드_명', '행정동_코드_명', '시간대_00~06',
       '시간대_06~11', '시간대_11~14', '시간대_14~17', '시간대_17~21', '시간대_21~24',
       '기준_분기_1', '기준_분기_2', '기준_분기_3', '기준_분기_4', '총_가구_수', '총_상주인구_수',
       '총_직장_인구_수', '시간대_별_유동인구_수', '평일_유동인구_수_평균', '주말_유동인구_수_평균',
       '월_평균_소득_금액', '소득_구간_코드', '지출_총금액', '집객시설_수', '가구_대비_인구_비율'],
      dtype='object')


In [None]:
import pandas as pd

data = pd.read_csv("/content/drive/MyDrive/data/머신러닝/data/final_reordered.csv")

data['시간대_매출금액'] = data['시간대_00~06'] + data['시간대_06~11'] + data['시간대_11~14'] + data['시간대_14~17'] + data['시간대_17~21'] + data['시간대_21~24']

data.drop(columns=['시간대_00~06', '시간대_06~11', '시간대_11~14', '시간대_14~17', '시간대_17~21', '시간대_21~24'], inplace=True)

print(data.head())

In [14]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

data = pd.read_csv("/content/drive/MyDrive/data/머신러닝/data/final_reordered.csv")

numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = data.select_dtypes(exclude=[np.number]).columns.tolist()

data['가구_대비_인구_비율'] = data['총_가구_수'] / data['총_상주인구_수']
numeric_columns.append('가구_대비_인구_비율')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

target_column = '매출_금액'

preprocessed_X = preprocessor.fit_transform(X)

print("변환된 데이터의 차원:", preprocessed_X.shape)

변환된 데이터의 차원: (3600, 100)


In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

y = data['매출_금액']

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(rmse, greater_is_better=False)

cv_scores = cross_val_score(rf_model, preprocessed_X, y, cv=kf, scoring=rmse_scorer)

cv_scores_mean = -cv_scores.mean()

r2_scores = cross_val_score(rf_model, preprocessed_X, y, cv=kf, scoring='r2')

r2_scores_mean = r2_scores.mean()

cv_scores_mean, r2_scores_mean

(5131454.083772499, 0.9998918501461374)

In [18]:
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_distributions,
    n_iter=10,
    cv=kf,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1
)

random_search.fit(preprocessed_X, y)

best_params = random_search.best_params_
best_rmse = np.sqrt(-random_search.best_score_)

best_params, best_rmse

({'n_estimators': 500,
  'min_samples_split': 2,
  'min_samples_leaf': 1,
  'max_depth': 50,
  'bootstrap': True},
 5145118.96911789)

In [19]:
# 최적의 하이퍼파라미터를 사용하여 RandomForestRegressor 모델 설정
optimized_rf_model = RandomForestRegressor(
    n_estimators=500,
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=50,
    bootstrap=True,
    random_state=42
)

# KFold 교차 검증을 사용하여 모델의 RMSE와 R^2 계산
rmse_scores = cross_val_score(optimized_rf_model, preprocessed_X, y, cv=kf, scoring=rmse_scorer)
r2_scores = cross_val_score(optimized_rf_model, preprocessed_X, y, cv=kf, scoring='r2')

# 평균 RMSE와 R^2 출력
optimized_rmse_mean = -rmse_scores.mean()
optimized_r2_mean = r2_scores.mean()

optimized_rmse_mean, optimized_r2_mean

(4852283.1663839, 0.9999022827293755)

### RandomForest & GradientBoosting Ensemble

In [20]:
from sklearn.ensemble import GradientBoostingRegressor

# GradientBoostingRegressor 모델을 위한 하이퍼파라미터 탐색 공간 정의
gb_param_distributions = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# RandomizedSearchCV 설정
gb_random_search = RandomizedSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_distributions=gb_param_distributions,
    n_iter=10,  # 탐색할 파라미터 설정 조합의 수
    cv=kf,  # KFold 교차 검증
    scoring='neg_mean_squared_error',  # 평가 지표
    random_state=42,
    n_jobs=-1  # 모든 CPU 코어 사용
)

# 하이퍼파라미터 탐색 실행
gb_random_search.fit(preprocessed_X, y)

# GradientBoostingRegressor의 최적의 파라미터와 그 때의 RMSE 출력
gb_best_params = gb_random_search.best_params_
gb_best_rmse = np.sqrt(-gb_random_search.best_score_)

gb_best_params, gb_best_rmse

({'n_estimators': 300,
  'min_samples_split': 2,
  'min_samples_leaf': 1,
  'max_depth': 5,
  'learning_rate': 0.1},
 4008111.4476043037)

In [21]:
from sklearn.metrics import mean_squared_error, r2_score

# 최적화된 하이퍼파라미터로 GradientBoostingRegressor 모델 훈련
gb_model = GradientBoostingRegressor(
    n_estimators=300,
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=5,
    learning_rate=0.1,
    random_state=42
)

# 모델 훈련
gb_model.fit(preprocessed_X, y)

# 훈련된 모델을 사용하여 예측
y_pred = gb_model.predict(preprocessed_X)

# RMSE와 R^2 계산
rmse = np.sqrt(mean_squared_error(y, y_pred))
r2 = r2_score(y, y_pred)

print("RMSE:", rmse)
print("R^2:", r2)

RMSE: 545903.7386177476
R^2: 0.9999988517792238


### XGBoost

In [22]:
from sklearn.base import BaseEstimator, TransformerMixin

# 새로운 feature engineering 변환기 정의
class AdditionalFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # '평일 대비 주말 유동 인구 비율' 계산하여 추가
        X['주말_대비_평일_유동인구_비율'] = X['주말_유동인구_수_평균'] / X['평일_유동인구_수_평균']
        return X

# 새로운 feature engineering 변환기 인스턴스 생성
additional_features = AdditionalFeatures()

# 데이터에 새로운 지표 추가
X_new_features = additional_features.transform(X.copy())

# 전처리 파이프라인에 새로운 feature engineering 단계 추가
preprocessor_with_new_features = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns + ['가구_대비_인구_비율', '주말_대비_평일_유동인구_비율']),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# 전처리 파이프라인 적용
preprocessed_X_with_new_features = preprocessor_with_new_features.fit_transform(X_new_features)

# 새로운 feature를 추가한 후의 데이터 차원 확인
preprocessed_X_with_new_features.shape

(3600, 102)

In [23]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score

# XGBoost 모델 인스턴스 생성
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# 교차 검증으로 모델의 RMSE와 R^2 계산
rmse_scores = cross_val_score(xgb_model, preprocessed_X_with_new_features, y, cv=kf, scoring=rmse_scorer)
r2_scores = cross_val_score(xgb_model, preprocessed_X_with_new_features, y, cv=kf, scoring='r2')

# 평균 RMSE와 R^2 출력
rmse_mean = -rmse_scores.mean()
r2_mean = r2_scores.mean()

rmse_mean, r2_mean

(20462009.530386362, 0.9983442477363116)

In [27]:
from sklearn.model_selection import RandomizedSearchCV

# 탐색할 하이퍼파라미터의 범위 정의
xgb_param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_child_weight': [1, 2, 3, 4, 5],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
}

# XGBoost 모델 인스턴스 생성
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# RandomizedSearchCV 설정
xgb_random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_param_distributions,
    n_iter=50,  # 탐색할 파라미터 설정 조합의 수
    cv=kf,  # KFold 교차 검증
    scoring='neg_mean_squared_error',  # 평가 지표
    random_state=42,
    n_jobs=2

)

# 하이퍼파라미터 탐색 실행
xgb_random_search.fit(preprocessed_X_with_new_features, y)

# XGBoost의 최적의 파라미터와 그 때의 RMSE 출력
xgb_best_params = xgb_random_search.best_params_
xgb_best_rmse = np.sqrt(-xgb_random_search.best_score_)

xgb_best_params, xgb_best_rmse

({'subsample': 0.9,
  'n_estimators': 400,
  'min_child_weight': 2,
  'max_depth': 8,
  'learning_rate': 0.05,
  'colsample_bytree': 1.0},
 19000647.96897695)

In [28]:
# 최적의 하이퍼파라미터를 사용하여 XGBoost 모델 설정
optimized_xgb_model = xgb.XGBRegressor(
    subsample=0.6,
    n_estimators=500,
    min_child_weight=1,
    max_depth=7,
    learning_rate=0.05,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42
)

# 교차 검증으로 모델의 RMSE 계산
optimized_xgb_rmse_scores = cross_val_score(optimized_xgb_model, preprocessed_X_with_new_features, y, cv=kf, scoring=rmse_scorer)

# 교차 검증으로 모델의 R^2 계산
optimized_xgb_r2_scores = cross_val_score(optimized_xgb_model, preprocessed_X_with_new_features, y, cv=kf, scoring='r2')

# 평균 RMSE와 R^2 출력
optimized_xgb_rmse_mean = -optimized_xgb_rmse_scores.mean()
optimized_xgb_r2_mean = optimized_xgb_r2_scores.mean()

optimized_xgb_rmse_mean, optimized_xgb_r2_mean

(19270820.368527833, 0.9985548404916791)