In [24]:
import pandas as pd
from datetime import datetime, date, timedelta
import pymysql
import os
from dotenv import load_dotenv
import numpy as np

## 前置作業

In [25]:
def create_pymysql_connect():
    """
    Open a pymysql connection using settings read from the environment.

    ``load_dotenv()`` is called first, so the values may live in a ``.env``
    file; do not hard-code them in the notebook.

    Environment variables read:
        MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_IP, MYSQL_DB_NAME,
        and the port from MYSQL_PORTT (the key this notebook has always
        used) or, when that is unset, MYSQL_PORT.

    Returns:
        The connection object returned by ``pymysql.connect``.

    Raises:
        ValueError: if no port variable is set or it is not an integer.
    """

    load_dotenv()

    username = os.getenv("MYSQL_USERNAME")
    password = os.getenv("MYSQL_PASSWORD")
    target_ip = os.getenv("MYSQL_IP")
    db_name = os.getenv("MYSQL_DB_NAME")

    # "MYSQL_PORTT" looks like a typo but existing .env files may rely on it,
    # so it keeps priority; the conventional spelling is accepted as a fallback.
    raw_port = os.getenv("MYSQL_PORTT") or os.getenv("MYSQL_PORT")
    if raw_port is None:
        # int(None) would otherwise fail with an unhelpful TypeError.
        raise ValueError(
            "MySQL port is not configured: set MYSQL_PORTT (or MYSQL_PORT) in .env"
        )
    try:
        target_port = int(raw_port)
    except ValueError as exc:
        raise ValueError(f"MySQL port must be an integer, got {raw_port!r}") from exc

    conn = pymysql.connect(
        host=target_ip,
        port=target_port,
        user=username,
        password=password,
        database=db_name,
        charset='utf8mb4'
    )

    return conn


def E_load_from_sql(table_name: str) -> list[dict]:
    """
    Read every row of ``table_name`` and return it as a list of records.

    A connection is opened with ``create_pymysql_connect`` (settings come
    from ``.env``; do not hard-code them) and is always closed before this
    function returns or raises.

    Args:
        table_name: Name of the table to query. It is interpolated directly
            into the SQL text, so only pass trusted, hard-coded names.

    Returns:
        ``DataFrame.to_dict(orient='records')`` of the query result, i.e. one
        dict per row keyed by column name.

    Raises:
        Exception: wraps any error raised while reading; the original error
            is kept as ``__cause__``.
    """

    conn = create_pymysql_connect()
    sql = f"SELECT * FROM {table_name}"

    try:
        df = pd.read_sql(sql, conn)
        return df.to_dict(orient='records')

    except Exception as e:
        # Chain the original error so the real traceback is not lost.
        raise Exception(f"讀取{table_name}表時發生錯誤：{e}") from e

    finally:
        # Previously the connection was leaked on every call.
        conn.close()

In [26]:
# Pull the six store tables as record lists, then wrap each one in a DataFrame.
(
    data_salon,
    data_hotel,
    data_hospital,
    data_supplies,
    data_restaurant,
    data_shelter,
) = (
    E_load_from_sql(table_name=store_table)
    for store_table in ("salon", "hotel", "hospital", "supplies", "restaurant", "shelter")
)

(
    df_salon,
    df_hotel,
    df_hospital,
    df_supplies,
    df_restaurant,
    df_shelter,
) = (
    pd.DataFrame(data=records)
    for records in (
        data_salon,
        data_hotel,
        data_hospital,
        data_supplies,
        data_restaurant,
        data_shelter,
    )
)

  df = pd.read_sql(sql, conn)
  df = pd.read_sql(sql, conn)
  df = pd.read_sql(sql, conn)
  df = pd.read_sql(sql, conn)
  df = pd.read_sql(sql, conn)
  df = pd.read_sql(sql, conn)


In [27]:
# Stack all store tables, drop rows without a loc_id, and derive city_id
# from the first three characters of loc_id.
df_main = (
    pd.concat(
        [df_salon, df_hotel, df_hospital, df_supplies, df_restaurant, df_shelter],
        ignore_index=True,
    )
    .dropna(subset="loc_id")
    .assign(city_id=lambda stores: stores["loc_id"].str[:3])
)

In [28]:
# Load the "location" table into a DataFrame.
data_loc = E_load_from_sql("location")
df_loc = pd.DataFrame(data_loc)

  df = pd.read_sql(sql, conn)


In [29]:
# Load the "category" table into a DataFrame.
data_category = E_load_from_sql("category")
df_category = pd.DataFrame(data_category)

  df = pd.read_sql(sql, conn)


In [30]:
# Load the "pet_regis" table and derive city_id from the first three
# characters of loc_id.
data_pet = E_load_from_sql("pet_regis")
df_pet = pd.DataFrame(data_pet).assign(
    city_id=lambda pets: pets["loc_id"].str[:3]
)

  df = pd.read_sql(sql, conn)


## 計算單店分數

In [31]:
# Compute the w_area_cat value
def T_calculate_w_area_cat(df: pd.DataFrame, t: int = 30) -> pd.DataFrame:
    """
    Add ``store_count`` and ``w_area_cat`` for each (loc_id, category_id) group.

    ``store_count`` is the number of rows sharing the same loc_id and
    category_id; ``w_area_cat = store_count / (store_count + t)``.

    Args:
        df: Store table with ``loc_id`` and ``category_id`` columns.
        t: Constant added to the denominator of the weight.

    Returns:
        A new DataFrame with ``loc_id`` / ``category_id`` cast to ``str`` and
        the two columns appended. The input frame is left untouched.
    """
    # Work on a copy: the original cast the caller's columns in place.
    stores = df.copy()
    stores[["loc_id", "category_id"]] = stores[["loc_id", "category_id"]].astype(str)

    # Drop results of a previous run so re-executing the cell does not
    # produce store_count_x / store_count_y after the merge.
    stores = stores.drop(columns=["store_count", "w_area_cat"], errors="ignore")

    loc_store_count = (
        stores.groupby(["loc_id", "category_id"]).size().reset_index(name="store_count")
    )
    loc_store_count["w_area_cat"] = loc_store_count["store_count"] / (
        loc_store_count["store_count"] + t
    )

    return stores.merge(loc_store_count, how="left", on=["loc_id", "category_id"])

# Attach store_count / w_area_cat to the store table.
df_main = T_calculate_w_area_cat(df_main)

# Sanity check: show rows whose loc_id is the literal string "None".
df_test = df_main.copy()
df_test.loc[df_test["loc_id"].eq("None")]

Unnamed: 0,id,name,buss_status,loc_id,address,phone,op_hours,category_id,rating,rating_total,newest_review,longitude,latitude,map_url,website,place_id,update_time,city_id,store_count,w_area_cat


In [32]:
def T_calculate_P75_score(df: pd.DataFrame) -> pd.DataFrame:
    """
    Append the 75th percentile of ``rating_total`` at two grouping levels.

    Adds ``P75_district_rating_total`` (grouped by loc_id + category_id) and
    ``P75_city_rating_total`` (grouped by city_id + category_id) through left
    merges, and returns the merged frame.
    """
    levels = (
        (["loc_id", "category_id"], "P75_district_rating_total"),
        (["city_id", "category_id"], "P75_city_rating_total"),
    )

    # Compute both percentile tables from the incoming frame first...
    percentile_tables = [
        df.groupby(group_keys)["rating_total"].quantile(0.75).reset_index(name=out_name)
        for group_keys, out_name in levels
    ]

    # ...then merge them back onto the store table, district level first.
    for (group_keys, _), table in zip(levels, percentile_tables):
        df = df.merge(table, how="left", on=group_keys)

    return df

# Attach the district- and city-level P75 of rating_total.
df_main = T_calculate_P75_score(df_main)

# Sanity check: show rows whose loc_id is the literal string "None".
df_test = df_main.copy()
df_test.loc[df_test["loc_id"].eq("None")]

Unnamed: 0,id,name,buss_status,loc_id,address,phone,op_hours,category_id,rating,rating_total,...,latitude,map_url,website,place_id,update_time,city_id,store_count,w_area_cat,P75_district_rating_total,P75_city_rating_total


In [33]:
def T_calculate_mscore(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add ``m_city_area_cat`` to ``df`` in place and return ``df``.

    The value blends the two P75 columns using ``w_area_cat`` as the weight:
    ``w * P75_district_rating_total + (1 - w) * P75_city_rating_total``.
    """
    weight = df["w_area_cat"]
    district_part = weight * df["P75_district_rating_total"]
    city_part = (1 - weight) * df["P75_city_rating_total"]

    df["m_city_area_cat"] = district_part + city_part

    return df

# Attach m_city_area_cat.
df_main = T_calculate_mscore(df_main)

# Sanity check: show rows whose loc_id is the literal string "None".
df_test = df_main.copy()
df_test.loc[df_test["loc_id"].eq("None")]

Unnamed: 0,id,name,buss_status,loc_id,address,phone,op_hours,category_id,rating,rating_total,...,map_url,website,place_id,update_time,city_id,store_count,w_area_cat,P75_district_rating_total,P75_city_rating_total,m_city_area_cat


In [34]:
def T_calculate_rating_avg(df: pd.DataFrame) -> pd.DataFrame:
    """
    Append ``district_rating_avg``: the mean ``rating`` of each
    (loc_id, category_id) group, left-merged back onto every row.
    """
    group_keys = ["loc_id", "category_id"]

    mean_by_group = (
        df.groupby(group_keys)["rating"]
        .mean()
        .rename("district_rating_avg")
        .reset_index()
    )

    return df.merge(mean_by_group, how="left", on=group_keys)

# Attach district_rating_avg.
df_main = T_calculate_rating_avg(df_main)

# Sanity check: show rows whose loc_id is the literal string "None".
df_test = df_main.copy()
df_test.loc[df_test["loc_id"].eq("None")]

Unnamed: 0,id,name,buss_status,loc_id,address,phone,op_hours,category_id,rating,rating_total,...,website,place_id,update_time,city_id,store_count,w_area_cat,P75_district_rating_total,P75_city_rating_total,m_city_area_cat,district_rating_avg


In [35]:
def T_calculate_avb_score(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add ``avb_score`` to ``df`` in place and return ``df``.

    ``avb_score = (op_hours / 168) * 0.5 + 0.5``, so an ``op_hours`` of 0
    maps to 0.5 and 168 maps to 1.0.
    """
    hours_fraction = df["op_hours"].div(168)
    df["avb_score"] = hours_fraction.mul(0.5).add(0.5)

    return df

# Attach avb_score.
df_main = T_calculate_avb_score(df_main)

# Sanity check: show rows whose loc_id is the literal string "None".
df_test = df_main.copy()
df_test.loc[df_test["loc_id"].eq("None")]

Unnamed: 0,id,name,buss_status,loc_id,address,phone,op_hours,category_id,rating,rating_total,...,place_id,update_time,city_id,store_count,w_area_cat,P75_district_rating_total,P75_city_rating_total,m_city_area_cat,district_rating_avg,avb_score


In [36]:
def T_calculate_store_score(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add ``store_score`` to ``df`` in place and return ``df``.

    The score mixes the store's own ``rating / 5`` with
    ``district_rating_avg / 5``, weighted by ``rating_total`` versus
    ``m_city_area_cat``, and multiplies the result by ``avb_score``.
    """
    review_count = df["rating_total"]
    prior_weight = df["m_city_area_cat"]

    own_term = (df["rating"] / 5) * (review_count / (review_count + prior_weight))
    district_term = (df["district_rating_avg"] / 5) * (
        prior_weight / (prior_weight + review_count)
    )

    df["store_score"] = (own_term + district_term) * df["avb_score"]

    return df

# Attach store_score.
df_main = T_calculate_store_score(df_main)

# Sanity check: show rows whose loc_id is the literal string "None".
df_test = df_main.copy()
df_test.loc[df_test["loc_id"].eq("None")]

Unnamed: 0,id,name,buss_status,loc_id,address,phone,op_hours,category_id,rating,rating_total,...,update_time,city_id,store_count,w_area_cat,P75_district_rating_total,P75_city_rating_total,m_city_area_cat,district_rating_avg,avb_score,store_score


In [63]:
# Display the largest store_score in df_main.
df_main["store_score"].max()

np.float64(1.0)

In [None]:
# Inspect the current column names of df_main.
df_main.columns

# Intended column order for df_main (city_id and category_id moved up next to loc_id).
# NOTE(review): this list does not appear to be applied to df_main in the
# visible cells — confirm whether a `df_main = df_main[new_col]` is missing.
new_col = ['id', 'name', 'buss_status', 'loc_id', 'city_id', 'category_id', 'address', 'phone', 'op_hours',
           'rating', 'rating_total', 'newest_review', 'longitude',
           'latitude', 'map_url', 'website', 'place_id', 'update_time',
           'store_count', 'w_area_cat', 'P75_district_rating_total',
           'P75_city_rating_total', 'm_city_area_cat', 'district_rating_avg',
           'avb_score', 'store_score']

Index(['id', 'name', 'buss_status', 'loc_id', 'address', 'phone', 'op_hours',
       'category_id', 'rating', 'rating_total', 'newest_review', 'longitude',
       'latitude', 'map_url', 'website', 'place_id', 'update_time', 'city_id',
       'store_count', 'w_area_cat', 'P75_district_rating_total',
       'P75_city_rating_total', 'm_city_area_cat', 'district_rating_avg',
       'avb_score', 'store_score'],
      dtype='object')

## 計算區域分數

In [None]:
def T_calculate_category_raw_score(df_pet: pd.DataFrame, df_main: pd.DataFrame) -> pd.DataFrame:
    """
    Compute a raw score per (loc_id, category_id): summed store_score per
    10,000 pets.

    Steps:
      1. Per loc_id, sum ``regis_count`` and ``removal_count`` from ``df_pet``
         and set ``pet_count = regis - removal``.
      2. Per (loc_id, category_id), sum ``store_score`` from ``df_main``.
      3. Left-merge the pet totals onto the score sums and divide
         ``sum_store_score`` by ``pet_count / 10000``.

    Groups with no pet data, or with a ``pet_count`` of exactly 0, get a
    ``category_raw_score`` of 0 instead of NaN / inf.

    Returns:
        DataFrame with columns loc_id, category_id, sum_store_score, regis,
        removal, pet_count, category_raw_score, city_id (first three
        characters of loc_id).
    """
    df_pet_count = df_pet.groupby("loc_id").agg(
        regis=("regis_count", "sum"),
        removal=("removal_count", "sum"),
    ).reset_index()
    df_pet_count["pet_count"] = df_pet_count["regis"] - df_pet_count["removal"]

    sum_store_score = (
        df_main.groupby(["loc_id", "category_id"])["store_score"]
        .sum()
        .reset_index(name="sum_store_score")
    )

    df_ctgry_score = sum_store_score.merge(df_pet_count, how="left", on="loc_id")

    pets_per_10k = df_ctgry_score["pet_count"] / 10000
    # A zero denominator would yield +/-inf, which fillna(0) does not catch
    # and which would distort the percentiles computed downstream. Mask it to
    # NaN so it is treated like missing pet data.
    pets_per_10k = pets_per_10k.where(pets_per_10k != 0)

    df_ctgry_score["category_raw_score"] = (
        df_ctgry_score["sum_store_score"] / pets_per_10k
    ).fillna(0)

    df_ctgry_score["city_id"] = df_ctgry_score["loc_id"].str[:3]

    return df_ctgry_score

# Build the per-(loc_id, category_id) raw score table.
df_ctgry_score = T_calculate_category_raw_score(df_pet, df_main)

# Sanity check: show rows whose loc_id is the literal string "None".
df_test = df_ctgry_score.copy()
df_test.loc[df_test["loc_id"].eq("None")]

Unnamed: 0,loc_id,category_id,sum_store_score,regis,removal,pet_count,ctgry_raw_score,city_id


In [None]:
def normalize_series(x: pd.Series, p10: pd.Series, p90: pd.Series) -> pd.Series:
    """
    Rescale ``x`` to the range [0.5, 9.5] using per-row P10 / P90 anchors.

    x   : raw scores to transform (Series)
    p10 : 10th percentile, same length as and aligned with ``x`` (Series)
    p90 : 90th percentile, same length as and aligned with ``x`` (Series)

    Returns ``0.5 + clip((x - p10) / (p90 - p10), 0, 1) * 9.0``. Rows where
    ``p90 == p10`` or where the ratio is NaN get 0.5.
    """
    spread = p90 - p10
    # Avoid dividing by zero when P90 == P10. where() keeps a float dtype;
    # replace(0, pd.NA) turned the Series into object dtype and triggered a
    # downcasting FutureWarning in the fillna below.
    denom = spread.where(spread != 0)
    ratio = (x - p10) / denom
    # A NaN ratio (zero spread or NaN input) is treated as 0.
    ratio = ratio.fillna(0.0)
    # Clamp to [0, 1].
    ratio = ratio.clip(0, 1)
    # Map to [0.5, 9.5].
    return 0.5 + ratio * 9.0


def T_get_normalize_score(df: pd.DataFrame, col_list: list, col_name: str) -> pd.DataFrame:
    """
    Normalise ``category_raw_score`` within each group of ``col_list``.

    For every group, the 10th and 90th percentiles of ``category_raw_score``
    are broadcast to the rows and passed to ``normalize_series``; the result,
    rounded to 2 decimals, is stored in ``col_name``.

    Args:
        df: Table containing ``category_raw_score`` and the grouping columns.
        col_list: Column names to group by.
        col_name: Name of the output column.

    Returns:
        A copy of ``df`` with ``col_name`` added; ``df`` itself is not modified.
    """
    # df.copy() gives an independent frame. pd.DataFrame(df) does not promise
    # one (copy defaults to False for DataFrame input), so the caller's
    # frame could be affected depending on the pandas version.
    scored = df.copy()

    grouped_raw = scored.groupby(col_list)["category_raw_score"]
    p10 = grouped_raw.transform(lambda s: s.quantile(0.10))
    p90 = grouped_raw.transform(lambda s: s.quantile(0.90))

    normalized = normalize_series(x=scored["category_raw_score"], p10=p10, p90=p90)
    scored[col_name] = normalized.round(2)

    return scored

# Grouping keys: within a city per category, and across all cities per category.
city_col_list = ["city_id", "category_id"]
all_col_list = ["category_id"]

# First normalise within each city...
df_city = T_get_normalize_score(df_ctgry_score, city_col_list, "norm_city")

# ...then across all six municipalities.
df_all = T_get_normalize_score(df_ctgry_score, all_col_list, "norm_all")

  ratio = ratio.fillna(0.0)


In [39]:
def T_merge_city_and_all(df_city: pd.DataFrame, df_all: pd.DataFrame) -> pd.DataFrame:
    """
    Left-merge the ``norm_all`` column of ``df_all`` onto ``df_city``,
    matching rows on loc_id and category_id.
    """
    join_keys = ["loc_id", "category_id"]
    norm_all_only = df_all.loc[:, join_keys + ["norm_all"]]

    return df_city.merge(norm_all_only, how="left", on=join_keys)

# Combine the city-level and overall normalised scores.
df_final = T_merge_city_and_all(df_city, df_all)

# Sanity check: show rows whose loc_id is the literal string "None".
df_test = df_final.copy()
df_test.loc[df_test["loc_id"].eq("None")]

Unnamed: 0,loc_id,category_id,sum_store_score,regis,removal,pet_count,ctgry_raw_score,city_id,norm_city,norm_all


## 計算final score

In [None]:
def T_calculate_ctgry_score(df: pd.DataFrame, weighted_dict: dict) -> pd.DataFrame:
    """
    Add ``category_city_score`` and ``category_all_score`` to ``df`` in place
    and return ``df``.

    Each row's ``norm_city`` / ``norm_all`` is multiplied by the weight that
    ``weighted_dict`` maps its ``category_id`` to; ids missing from the dict
    produce NaN.
    """
    category_weight = df["category_id"].map(weighted_dict)

    for source_col, target_col in (
        ("norm_city", "category_city_score"),
        ("norm_all", "category_all_score"),
    ):
        df[target_col] = df[source_col] * category_weight

    return df

# Weight applied to each category_id (keys are strings to match df_final["category_id"]).
weighted_dict = {
    "1": 0.27,
    "2": 0.09,
    "3": 0.11,
    "4": 0.15,
    "5": 0.13,
    "6": 0.03,
    "7": 0.22,
}

# Attach the weighted category scores.
df_final = T_calculate_ctgry_score(df_final, weighted_dict)

In [None]:
def T_calculate_city_all_score(df: pd.DataFrame) -> pd.DataFrame:
    """
    Append ``city_score`` and ``all_score``: the per-loc_id sums of
    ``category_city_score`` and ``category_all_score``, left-merged back onto
    every row.
    """
    column_pairs = (
        ("category_city_score", "city_score"),
        ("category_all_score", "all_score"),
    )

    # Sum each weighted score per loc_id...
    totals_per_loc = [
        df.groupby("loc_id")[source_col].sum().reset_index(name=total_col)
        for source_col, total_col in column_pairs
    ]

    # ...and merge the totals back, city_score first.
    for totals in totals_per_loc:
        df = df.merge(totals, how="left", on="loc_id")

    return df

# Attach the per-loc_id city_score and all_score totals.
df_final = T_calculate_city_all_score(df=df_final)

In [42]:
def T_calculate_final_score(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add ``final_score`` to ``df`` in place and return ``df``.

    ``final_score = city_score * 0.7 + all_score * 0.3``.
    """
    city_part = df["city_score"] * 0.7
    overall_part = df["all_score"] * 0.3

    df["final_score"] = city_part + overall_part

    return df

# Attach final_score (0.7 * city_score + 0.3 * all_score).
df_final = T_calculate_final_score(df=df_final)

In [None]:
def T_transform_final_to_10(df: pd.DataFrame, new_col: list) -> pd.DataFrame:
    """
    Add ``final_score_10`` to ``df`` in place, then return ``df[new_col]``.

    ``final_score_10`` is ``final_score * (10 / 9.5)`` rounded to 2 decimals,
    so a ``final_score`` of 9.5 becomes 10.
    """
    scale_to_ten = 10 / 9.5
    df["final_score_10"] = df["final_score"].mul(scale_to_ten).round(2)

    # Keep only the requested columns, in the requested order.
    return df[new_col]

# Output column order for the final table.
new_col_sort = [
    # keys
    'loc_id', 'city_id', 'category_id',
    # pet registration counts
    'regis', 'removal', 'pet_count',
    # raw and normalised category scores
    'sum_store_score', 'category_raw_score', 'norm_city', 'norm_all',
    # weighted and aggregated scores
    'category_city_score', 'category_all_score', 'city_score', 'all_score',
    'final_score', 'final_score_10',
]

# Rescale final_score to a 10-point scale and reorder the columns.
df_final = T_transform_final_to_10(df_final, new_col_sort)

16