In [6]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

in_path = "/content/drive/MyDrive/상권분석/features_panel.parquet"
df = pd.read_parquet(in_path)

print(df.shape)
df.head()


Mounted at /content/drive
(43624, 20)


Unnamed: 0,dong_code,quarter,biz_code,sales_amt,sales_next,y_growth_rate_nextq,store_cnt,open_cnt,close_cnt,franchise_cnt,close_rate,net_open_rate,franchise_ratio,y_risk_close_rate_nextq,pop_mean,pop_max,pop_weekend_mean,pop_night_mean,y_growth_cls,y_risk_cls
0,11110515,2024Q1,CS100001,3282036149,3609853000.0,0.099882,73,2,2,0,0.027397,0.0,0.0,0.0,,,,,0,0
1,11110515,2024Q2,CS100001,3609852542,3010345000.0,-0.166075,77,4,0,0,0.0,0.051948,0.0,0.053333,,,,,0,0
2,11110515,2024Q3,CS100001,3010345082,4315503000.0,0.433557,75,2,4,0,0.053333,-0.026667,0.0,0.012821,,,,,1,0
3,11110515,2024Q1,CS100003,315907116,420482700.0,0.331033,18,2,2,1,0.111111,0.0,0.055556,0.058824,,,,,1,0
4,11110515,2024Q2,CS100003,420482695,391413000.0,-0.069134,17,0,1,1,0.058824,-0.058824,0.058824,0.125,,,,,0,1


In [7]:
import numpy as np
import pandas as pd


df_ml = df.copy()

df_ml["log_sales"] = np.log1p(df_ml["sales_amt"])

for col in ["pop_mean","pop_max","pop_weekend_mean","pop_night_mean"]:
    if col in df_ml.columns:
        df_ml[col] = df_ml[col].fillna(df_ml[col].median())

num_features = [
    "log_sales",
    "store_cnt",
    "franchise_ratio",
    "net_open_rate",
    "close_rate",
    "pop_mean",
    "pop_weekend_mean",
    "pop_night_mean",
    "pop_max",
]

num_features = [c for c in num_features if c in df_ml.columns]

cat_features = ["biz_code", "dong_code", "quarter"]

X = df_ml[num_features + cat_features].copy()
y_growth = df_ml["y_growth_cls"].astype(int)
y_risk   = df_ml["y_risk_cls"].astype(int)

print(sorted(df_ml["quarter"].unique()))


['2024Q1', '2024Q2', '2024Q3']


In [8]:
train_quarters = ["2024Q1", "2024Q2"]
test_quarters  = ["2024Q3"]

train_idx = df_ml["quarter"].isin(train_quarters)
test_idx  = df_ml["quarter"].isin(test_quarters)

X_train, X_test = X.loc[train_idx], X.loc[test_idx]
yg_train, yg_test = y_growth.loc[train_idx], y_growth.loc[test_idx]
yr_train, yr_test = y_risk.loc[train_idx], y_risk.loc[test_idx]

print("Train:", X_train.shape, "Test:", X_test.shape)


Train: (29233, 12) Test: (14391, 12)


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score


num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_features),
        ("cat", cat_pipe, cat_features),
    ],
    remainder="drop"
)

def make_model():
    return Pipeline(steps=[
        ("prep", preprocess),
        ("clf", LogisticRegression(max_iter=2000, solver="saga", n_jobs=-1))
    ])

model_growth = make_model()
model_growth.fit(X_train, yg_train)

model_risk = make_model()
model_risk.fit(X_train, yr_train)

proba_g = model_growth.predict_proba(X_test)[:, 1]
proba_r = model_risk.predict_proba(X_test)[:, 1]

print("[Growth] ROC-AUC:", roc_auc_score(yg_test, proba_g))
print(classification_report(yg_test, (proba_g >= 0.5).astype(int)))

print("[Risk]   ROC-AUC:", roc_auc_score(yr_test, proba_r))
print(classification_report(yr_test, (proba_r >= 0.5).astype(int)))





[Growth] ROC-AUC: 0.7293198561580696
              precision    recall  f1-score   support

           0       0.90      0.63      0.74     11512
           1       0.32      0.71      0.44      2879

    accuracy                           0.64     14391
   macro avg       0.61      0.67      0.59     14391
weighted avg       0.78      0.64      0.68     14391

[Risk]   ROC-AUC: 0.7149902682884097
              precision    recall  f1-score   support

           0       0.90      0.58      0.70     11470
           1       0.31      0.74      0.43      2921

    accuracy                           0.61     14391
   macro avg       0.60      0.66      0.57     14391
weighted avg       0.78      0.61      0.65     14391





In [10]:
test_view = df_ml.loc[test_idx, ["dong_code","quarter","biz_code","sales_amt","store_cnt"]].copy()

test_view["proba_growth"] = proba_g
test_view["proba_risk"]   = proba_r

alpha = 1.0
test_view["score"] = test_view["proba_growth"] - alpha * test_view["proba_risk"]

test_view.sort_values("score", ascending=False).head(10)


Unnamed: 0,dong_code,quarter,biz_code,sales_amt,store_cnt,proba_growth,proba_risk,score
2054,11140615,2024Q3,CS300011,33371558772,9805,0.813913,2.8471710000000004e-17,0.813913
1323,11140520,2024Q3,CS300032,13278558,20,0.936075,0.1742433,0.761832
26264,11530530,2024Q3,CS300032,1803096,16,0.954796,0.2056925,0.749103
28528,11545690,2024Q3,CS300036,3974031,310,0.813411,0.07548854,0.737922
34106,11620745,2024Q3,CS300031,4392725,6,0.872462,0.1462197,0.726243
5131,11200670,2024Q3,CS300025,13446245,5,0.933748,0.2121281,0.72162
29285,11560550,2024Q3,CS300032,74426033,20,0.907418,0.1892317,0.718187
1546,11140550,2024Q3,CS300003,38407742,27,0.840784,0.1233027,0.717482
21086,11440680,2024Q3,CS200032,3241158,7,0.868264,0.1529739,0.71529
9114,11260550,2024Q3,CS300025,18257981,5,0.890546,0.1811494,0.709397


In [12]:
import joblib

joblib.dump(model_growth, "/content/model_growth.pkl")
joblib.dump(model_risk, "/content/model_risk.pkl")
print("saved models")


saved models
