In [None]:
import pandas as pd
import numpy as np

# Load the feature panel from the current working directory.
df = pd.read_parquet("features_panel.parquet")

# Work on a copy so the frame as loaded stays available in `df`.
df_ml = df.copy()

# log1p(x) = log(1 + x), so a sales_amt of 0 maps to 0 instead of -inf.
df_ml["log_sales"] = np.log1p(df_ml["sales_amt"])

# Median-impute the pop_* columns. Series.median() skips NaN, so a column with no
# observed values has a NaN median and fillna() would silently do nothing. Collect
# such columns and report them rather than implying they were filled.
unfilled_cols = []
for col in ["pop_mean","pop_max","pop_weekend_mean","pop_night_mean"]:
    if col not in df_ml.columns:
        continue
    col_median = df_ml[col].median()
    if pd.isna(col_median):
        unfilled_cols.append(col)
        continue
    df_ml[col] = df_ml[col].fillna(col_median)

if unfilled_cols:
    print("WARNING: no observed values to impute from; left as NaN:", unfilled_cols)

print("loaded:", df_ml.shape)
df_ml.head()


loaded: (43624, 21)


Unnamed: 0,dong_code,quarter,biz_code,sales_amt,sales_next,y_growth_rate_nextq,store_cnt,open_cnt,close_cnt,franchise_cnt,...,net_open_rate,franchise_ratio,y_risk_close_rate_nextq,pop_mean,pop_max,pop_weekend_mean,pop_night_mean,y_growth_cls,y_risk_cls,log_sales
0,11110515,2024Q1,CS100001,3282036149,3609853000.0,0.099882,73,2,2,0,...,0.0,0.0,0.0,,,,,0,0,21.91173
1,11110515,2024Q2,CS100001,3609852542,3010345000.0,-0.166075,77,4,0,0,...,0.051948,0.0,0.053333,,,,,0,0,22.006933
2,11110515,2024Q3,CS100001,3010345082,4315503000.0,0.433557,75,2,4,0,...,-0.026667,0.0,0.012821,,,,,1,0,21.825321
3,11110515,2024Q1,CS100003,315907116,420482700.0,0.331033,18,2,2,1,...,0.0,0.055556,0.058824,,,,,1,0,19.570959
4,11110515,2024Q2,CS100003,420482695,391413000.0,-0.069134,17,0,1,1,...,-0.058824,0.058824,0.125,,,,,0,1,19.856914


In [None]:
import pandas as pd
import numpy as np
import joblib

# Load the feature panel from /content and rebuild the derived column, so this cell
# does not depend on `df_ml` from an earlier cell.
df_ml = pd.read_parquet("/content/features_panel.parquet").copy()

# log1p(x) = log(1 + x), so a sales_amt of 0 maps to 0 instead of -inf.
df_ml["log_sales"] = np.log1p(df_ml["sales_amt"])

# Median-impute the pop_* columns. Series.median() skips NaN, so a column with no
# observed values has a NaN median and fillna() would silently do nothing. Collect
# such columns and report them rather than implying they were filled.
unfilled_cols = []
for col in ["pop_mean","pop_max","pop_weekend_mean","pop_night_mean"]:
    if col not in df_ml.columns:
        continue
    col_median = df_ml[col].median()
    if pd.isna(col_median):
        unfilled_cols.append(col)
        continue
    df_ml[col] = df_ml[col].fillna(col_median)

if unfilled_cols:
    print("WARNING: no observed values to impute from; left as NaN:", unfilled_cols)

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
model_growth = joblib.load("/content/model_growth.pkl")
model_risk   = joblib.load("/content/model_risk.pkl")

# Candidate numeric inputs; any that are absent from the panel are dropped so the
# column selection in recommend_top_industries does not raise KeyError.
# NOTE(review): the loaded models presumably expect a fixed set of input columns —
# confirm that dropping a missing feature here is compatible with how they were fit.
num_features = [
    "log_sales","store_cnt","franchise_ratio","net_open_rate","close_rate",
    "pop_mean","pop_weekend_mean","pop_night_mean","pop_max",
]
num_features = [c for c in num_features if c in df_ml.columns]
# Categorical inputs are not filtered: all three are required to be present.
cat_features = ["biz_code","dong_code","quarter"]

print("df_ml:", df_ml.shape)
print("num_features:", num_features)
print("cat_features:", cat_features)

def recommend_top_industries(dong_code, quarter="2024Q3", top_n=10, alpha=1.0):
    """Rank the rows of one dong_code/quarter by predicted growth minus predicted risk.

    Reads the module-level ``df_ml``, ``num_features``, ``cat_features``,
    ``model_growth`` and ``model_risk``; none of them is modified.

    Parameters
    ----------
    dong_code : str or int
        Matched against ``df_ml["dong_code"]``. Both sides are compared as strings,
        so ``11740620`` and ``"11740620"`` select the same rows whether the column
        holds strings or integers.
    quarter : str
        Matched against ``df_ml["quarter"]`` (e.g. ``"2024Q3"``).
    top_n : int
        Number of highest-scoring rows to return (passed to ``DataFrame.head``).
    alpha : float
        Weight of the risk probability in
        ``score = proba_growth - alpha * proba_risk``.

    Returns
    -------
    pandas.DataFrame or None
        The ``top_n`` rows sorted by ``score`` descending, restricted to the
        identifier, score, probability, ``sales_amt`` and ``store_cnt`` columns that
        exist. ``None`` when no row matches ``dong_code`` and ``quarter``.
    """
    # Coerce the column as well as the argument: with an integer-typed column the
    # comparison against a string is False everywhere and every lookup returned None.
    dong_match = df_ml["dong_code"].astype(str) == str(dong_code)
    quarter_match = df_ml["quarter"] == quarter
    subset = df_ml[dong_match & quarter_match].copy()
    if subset.empty:
        return None

    # The models receive the original (un-coerced) columns.
    X_sub = subset[num_features + cat_features].copy()
    # Column 1 of predict_proba is taken as the positive-class probability
    # (assumes binary classifiers with classes ordered [0, 1] — TODO confirm).
    pg = model_growth.predict_proba(X_sub)[:, 1]
    pr = model_risk.predict_proba(X_sub)[:, 1]

    subset["proba_growth"] = pg
    subset["proba_risk"] = pr
    subset["score"] = subset["proba_growth"] - alpha * subset["proba_risk"]

    # Keep only the report columns that are actually present in the panel.
    cols = ["dong_code","quarter","biz_code","score","proba_growth","proba_risk","sales_amt","store_cnt"]
    cols = [c for c in cols if c in subset.columns]
    return subset.sort_values("score", ascending=False)[cols].head(top_n)

# Example call: ten highest-scoring rows for dong_code 11740620 in 2024Q3, with the
# risk probability weighted equally to the growth probability (alpha=1.0).
recommend_top_industries(
    dong_code="11740620",
    quarter="2024Q3",
    top_n=10,
    alpha=1.0,
)


df_ml: (43624, 21)
num_features: ['log_sales', 'store_cnt', 'franchise_ratio', 'net_open_rate', 'close_rate', 'pop_mean', 'pop_weekend_mean', 'pop_night_mean', 'pop_max']
cat_features: ['biz_code', 'dong_code', 'quarter']




Unnamed: 0,dong_code,quarter,biz_code,score,proba_growth,proba_risk,sales_amt,store_cnt
42886,11740620,2024Q3,CS300003,0.562732,0.823998,0.261266,23914160,7
42928,11740620,2024Q3,CS300031,0.486604,0.73279,0.246185,95276788,18
42871,11740620,2024Q3,CS200032,0.472526,0.777804,0.305278,23940000,14
42867,11740620,2024Q3,CS200030,0.360161,0.725787,0.365626,31210082,25
42925,11740620,2024Q3,CS300028,0.350841,0.740876,0.390035,10540208,10
42864,11740620,2024Q3,CS200029,0.325831,0.83069,0.50486,3158214,17
42902,11740620,2024Q3,CS300011,0.307421,0.548728,0.241307,1548941996,107
42831,11740620,2024Q3,CS200001,0.303261,0.597682,0.294421,59597273,24
42870,11740620,2024Q3,CS200031,0.302178,0.758225,0.456046,33367356,20
42855,11740620,2024Q3,CS200024,0.292214,0.893534,0.60132,1000000,9


In [20]:
# Recompute the recommendation table, then attach the columns of the biz_code
# lookup CSV to it.
rec = recommend_top_industries("11740620", quarter="2024Q3", top_n=10, alpha=1.0)
biz_map = pd.read_csv("/content/biz_code_map.csv", encoding="utf-8-sig")
# Left join: every recommendation row is kept even if its biz_code has no map entry.
pd.merge(left=rec, right=biz_map, on="biz_code", how="left")




Unnamed: 0,dong_code,quarter,biz_code,score,proba_growth,proba_risk,sales_amt,store_cnt,biz_name
0,11740620,2024Q3,CS300003,0.562732,0.823998,0.261266,23914160,7,컴퓨터및주변장치판매
1,11740620,2024Q3,CS300031,0.486604,0.73279,0.246185,95276788,18,가구
2,11740620,2024Q3,CS200032,0.472526,0.777804,0.305278,23940000,14,가전제품수리
3,11740620,2024Q3,CS200030,0.360161,0.725787,0.365626,31210082,25,피부관리실
4,11740620,2024Q3,CS300028,0.350841,0.740876,0.390035,10540208,10,화초
5,11740620,2024Q3,CS200029,0.325831,0.83069,0.50486,3158214,17,네일숍
6,11740620,2024Q3,CS300011,0.307421,0.548728,0.241307,1548941996,107,일반의류
7,11740620,2024Q3,CS200001,0.303261,0.597682,0.294421,59597273,24,일반교습학원
8,11740620,2024Q3,CS200031,0.302178,0.758225,0.456046,33367356,20,세탁소
9,11740620,2024Q3,CS200024,0.292214,0.893534,0.60132,1000000,9,스포츠클럽


In [21]:
# Share of missing values per pop_* column: isnull() yields booleans and mean() is
# the fraction that are True, so 1.0 means the column is entirely NaN.
df_ml.loc[:, ["pop_mean","pop_weekend_mean","pop_night_mean","pop_max"]].isnull().mean()


Unnamed: 0,0
pop_mean,1.0
pop_weekend_mean,1.0
pop_night_mean,1.0
pop_max,1.0
