<a href="https://colab.research.google.com/github/sorrri/plotters/blob/main/sori_LH_Made_Cookies_fin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
from google.colab import files
uploaded = files.upload()

Saving 10.성남시_지하철역.csv to 10.성남시_지하철역.csv


In [121]:
import gc
import pandas as pd
import geopandas as gpd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from shapely.geometry import Point
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import RobustScaler
from scipy.spatial import cKDTree
from xgboost import XGBRegressor
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# 파일 경로 설정
file_paths = {
    "거주인구": "/content/1.성남시_거주인구.csv",
    "상권정보": "/content/2.성남시_상권정보.csv",
    "상가개폐업": "/content/3.성남시_상가개폐업.csv",
    "층별개요": "/content/5.성남시_층별개요.csv",
    "지식산업센터": "/content/7.성남시_지식산업센터.csv",
    "개별공시지가": "/content/8.성남시_개별공시지가.csv",
    "버스정류장": "/content/9.성남시_버스정류장.csv",
    "지하철역": "/content/10.성남시_지하철역.csv"
}

In [98]:
# ✅ TensorFlow retracing 방지
@tf.function(reduce_retracing=True)
def safe_predict(model, data):
    return model.predict(data, batch_size=32)

In [122]:
# ✅ 데이터 로드 및 전처리
df_population = pd.read_csv(file_paths["거주인구"], encoding="utf-8-sig").fillna(0)
df_population["total_population"] = df_population.iloc[:, 2:].sum(axis=1)
df_population = df_population.groupby("gid")["total_population"].sum().reset_index()

df_land_value = pd.read_csv(file_paths["개별공시지가"], encoding="utf-8-sig").fillna(0)
df_land_value_avg = df_land_value.groupby("bjd_nm")["land_value"].mean().reset_index()

df_floors = pd.read_csv(file_paths["층별개요"], encoding="utf-8-sig").fillna(0)
df_floors_grouped = df_floors.groupby("plat_plc")["flr_num"].max().reset_index()

df_shop = pd.read_csv(file_paths["상가개폐업"], encoding="utf-8-sig").fillna(0)
df_shop_grouped = df_shop.groupby("addr")["biz_stts_nm"].count().reset_index()
df_shop_grouped.rename(columns={"biz_stts_nm": "num_business_changes"}, inplace=True)

gdf_industrial = convert_to_gdf(pd.read_csv(file_paths["지식산업센터"], encoding="utf-8-sig"), "lon", "lat")
gdf_industrial = gdf_industrial.merge(df_land_value_avg, left_on="addr", right_on="bjd_nm", how="left")
gdf_industrial = gdf_industrial.merge(df_population, left_on="addr", right_on="gid", how="left")
gdf_industrial = gdf_industrial.merge(df_floors_grouped, left_on="addr", right_on="plat_plc", how="left")
gdf_industrial = gdf_industrial.merge(df_shop_grouped, left_on="addr", right_on="addr", how="left")

  df_floors = pd.read_csv(file_paths["층별개요"], encoding="utf-8-sig").fillna(0)


In [123]:
# ✅ 거리 계산 및 데이터 검증
gdf_bus = convert_to_gdf(pd.read_csv(file_paths["버스정류장"], encoding="utf-8-sig"), "lon", "lat")
gdf_subway = convert_to_gdf(pd.read_csv(file_paths["지하철역"], encoding="utf-8-sig"), "lon", "lat")

def calculate_nearest_distance(source_gdf, target_gdf):
    return [target_gdf.distance(pt).min() * 111 for pt in source_gdf.geometry]

gdf_industrial["bus_distance_km"] = calculate_nearest_distance(gdf_industrial, gdf_bus)
gdf_industrial["subway_distance_km"] = calculate_nearest_distance(gdf_industrial, gdf_subway)


  return [target_gdf.distance(pt).min() * 111 for pt in source_gdf.geometry]

  return [target_gdf.distance(pt).min() * 111 for pt in source_gdf.geometry]


In [124]:
# ✅ 공실률 컬럼 다시 생성
gdf_industrial["vacancy_rate"] = 1 - (gdf_industrial["cpn_in_2406"] / gdf_industrial["tot_cpn"])
gdf_industrial["vacancy_rate"] = gdf_industrial["vacancy_rate"].fillna(0)

In [125]:
# ✅ Feature Engineering 조정 (불필요한 Feature 제거)
gdf_industrial["log_bus_distance"] = np.log1p(gdf_industrial["bus_distance_km"])
gdf_industrial["log_subway_distance"] = np.log1p(gdf_industrial["subway_distance_km"])

gdf_industrial.replace([np.inf, -np.inf], np.nan, inplace=True)
gdf_industrial.fillna(0, inplace=True)

  gdf_industrial.replace([np.inf, -np.inf], np.nan, inplace=True)


In [126]:
# ✅ Feature Selection (RandomForest 기반)
rfr_selector = RandomForestRegressor(n_estimators=300, max_depth=4, random_state=42)
X_selected = gdf_industrial[["log_bus_distance", "log_subway_distance", "total_population", "num_business_changes"]]
y_selected = gdf_industrial["vacancy_rate"]
rfr_selector.fit(X_selected, y_selected)
feature_importances = pd.Series(rfr_selector.feature_importances_, index=X_selected.columns)
selected_features = feature_importances.nlargest(3).index.tolist()

scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_selected[selected_features])
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_selected, test_size=0.2, random_state=42)

In [127]:
# ✅ Stacking Model 최적화
stacking_model = StackingRegressor(
    estimators=[
        ("rf", RandomForestRegressor(n_estimators=300, max_depth=4, random_state=42)),
        ("xgb", XGBRegressor(n_estimators=300, learning_rate=0.03, max_depth=4, random_state=42)),
        ("lgb", lgb.LGBMRegressor(n_estimators=300, learning_rate=0.03, max_depth=4, random_state=42))
    ],
    final_estimator=Ridge(alpha=1.0)
)
stacking_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33
[LightGBM] [Info] Number of data points in the train set: 44, number of used features: 2
[LightGBM] [Info] Start training from score 0.564526
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 0
[LightGBM] [Info] Start training from score 0.538813
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 0
[LightGBM] [Info] Start training from score 0.562135
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 0
[LightGBM] [Info] Start training from score 0.599901
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 0
[LightGBM] [Info] Start t

In [128]:
# ✅ 모델 평가
stacking_pred = stacking_model.predict(X_test)
(mean_absolute_error(y_test, stacking_pred), r2_score(y_test, stacking_pred))

(0.17215072626241856, 0.03292690338740922)