In [66]:
import os
import time
import ast
import re
from datetime import date
import pymysql

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from opencc import OpenCC

from mods import gmap
from mods import pandas_mod as pdm
from mods import date_mod as dtm

In [67]:
def create_pymysql_connect():
    """
    自動透過pymysql建立連線，回傳conn連線物件。
    所需各項資料請寫入.env檔案中。請勿直接寫於程式中。
    """

    load_dotenv()

    username = os.getenv("MYSQL_USERNAME")
    password = os.getenv("MYSQL_PASSWORD")
    target_ip = os.getenv("MYSQL_IP")
    target_port = int(os.getenv("MYSQL_PORTT"))
    db_name = os.getenv("MYSQL_DB_NAME")

    conn = pymysql.connect(
        host=target_ip,
        port=target_port,
        user=username,
        password=password,
        database=db_name,
        charset='utf8mb4'
    )

    return conn


def E_load_from_sql(table_name: str) -> pd.DataFrame:
    """
    輸入欲查詢的表名table_name，透過pymysql連線資料庫，
    並取得該表後將其轉成dataframe。

    連線所需資訊請寫入.env中，請勿寫入程式中。
    """

    conn = create_pymysql_connect()
    sql = f"SELECT * FROM {table_name}"

    try:
        df = pd.read_sql(sql, conn)
        return df.to_dict(orient='records')

    except Exception as e:
        raise Exception(f"讀取{table_name}表時發生錯誤：{e}")
    

def reassign_id(df, id_col_name, id_str):
    """根據原有最後一筆資料進行自動延續編號
    對於未編號的資料，需要先建立id欄位並且賦予空字串
    df請輸入想要增加編號的df
    id_col_name請輸入id的「欄位名」
    id_str請輸入編號的「前綴字串」"""

    # 先找出原本的id編號（id欄位非空）最大值
    nums = df.loc[df[id_col_name] != "",
                  id_col_name].str.extract(r"(\d+)").astype(int)

    if nums.empty:
        start_num = 1876
    else:
        start_num = nums.max()[0] + 1

    # 計算需要新增的資料數
    empty_id = df[id_col_name] == ""
    empty_id_count = empty_id.sum()

    # 先列出編號list
    new_id = [f"{id_str}{i:04d}" for i in range(
        start_num, start_num + empty_id_count)]

    # 將list放入df欄位
    df.loc[empty_id, id_col_name] = new_id

    return df

In [68]:
type_id_dict = {
    "寵物美容":"sal",
    "寵物餐廳":"res",
    "寵物用品":"supl"
}

type_file_dict = {
    "寵物美容":"salon",
    "寵物餐廳":"restaurant",
    "寵物用品":"supplies"
}

replace_word_dict = {
    "区":"區",
    "霧峯":"霧峰",
    "中壢市":"中壢區",
    "省":"",
    "萬裏":"萬里",
    "區區":"區"
}

store_type = "寵物用品"

In [69]:
folder = r"C:\Users\add41\Documents\Data_Engineer\Project\example_data\gmap_full_search\temp"
file = f"{type_file_dict[store_type]}_TPEI_combine.csv"
path = os.path.join(folder, file)

df_main = pd.read_csv(path, dtype={"phone": str})

print(f"資料筆數：{len(df_main)}")

資料筆數：261


In [70]:
def remove_data(df, keywords):
    del_idx = []

    for keyword in keywords:
        idx = df[df["name"].str.contains(keyword, na=False)].index
        if len(idx) != 0:
            del_idx.extend(idx)

    del_idx = list(set(del_idx))

    df.drop(index=del_idx, axis=0, inplace=True)
    return df


del_keywords = ["停業", "歇業", "暫停營業", "暫停服務", "停止營業", "停止服務"]
df_main = remove_data(df=df_main, keywords=del_keywords)

In [71]:
def pre_trans_addres(df, store_type, replace_word_dict):
    """先新增類別欄位，並簡單將地址做簡轉繁、錯別字處理"""

    # 新增類別
    df["category"] = store_type

    # 處理掉不要的欄位
    df.drop(columns=["in_boundary", "update_date"], axis=1, inplace=True)
    df.dropna(subset=["address"], inplace=True)

    # 將評分和評論數補空值為0
    df_main[["rating", "rating_total"]] = df_main[["rating", "rating_total"]].fillna(0)

    # 先將地址全部做簡體轉繁體
    cc = OpenCC("s2t")
    df["address"] = df["address"].apply(cc.convert)

    # 清理一些地址中會出現的錯別字
    for word in replace_word_dict:
        df["address"] = df["address"].str.replace(word, replace_word_dict[word])

    # 先新增市和區的空欄位
    df["city"] = None
    df["district"] = None

    return df


df_main = pre_trans_addres(df_main, store_type, replace_word_dict)

In [72]:
# 先編寫處理地址的規則
pattern1 = r"([^\d\s]{2}市)([^\d\s]{1,3}區)"
pattern2 = r"灣([^\d\s]{1,2}區)"
pattern3 = r"([^\d\s]{2}區)([^\d\s]{2}市)"

In [73]:
def first_add_trans(df, pattern):
    """處理第一種情況：正常地址格式"""

    mask1 = df["address"].str.contains(pattern, regex=True, na=False)
    extracted1 = df.loc[mask1, "address"].str.extract(pattern)
    df.loc[mask1, ["city", "district"]] = extracted1.values

    return df

df_main = first_add_trans(df_main, pattern1)

  mask1 = df["address"].str.contains(pattern, regex=True, na=False)


In [74]:
def second_add_trans(df, pattern):
    """處理第二種情況：地址格式倒反"""

    mask3 = df["address"].str.contains(pattern, regex=True, na=False)
    extracted3 = df.loc[mask3, "address"].str.extract(pattern)

    df.loc[mask3, "city"] = extracted3[1].values
    df.loc[mask3, "district"] = extracted3[0].values

    return df

df_main = second_add_trans(df_main, pattern3)

  mask3 = df["address"].str.contains(pattern, regex=True, na=False)


In [75]:
def third_add_trans(df, pattern):
    """處理第三種狀況：只有區沒有市"""

    mask2 = df["address"].str.contains(pattern, regex=True, na=False)
    extracted2 = df.loc[mask2, "address"].str.extract(pattern)
    df.loc[mask2, "district"] = extracted2[0].values

    return df

df_main = third_add_trans(df_main, pattern2)

  mask2 = df["address"].str.contains(pattern, regex=True, na=False)


In [76]:
def fourth_add_trans(df, drop_words):
    """清理區中不乾淨的字元"""

    df["district"] = df["district"].str.replace("路竹", "鹿竹")
    pattern4 = "|".join(map(re.escape, drop_words))
    df["district"] = df["district"].str.replace(pattern4, "", regex=True)
    df["district"] = df["district"].str.replace("鹿竹", "路竹")

    return df

drop_words = ["路", "街", "巷", "弄", "段", "道"]
df_main = fourth_add_trans(df_main, drop_words)

In [77]:
def join_loc_id(df):
    """讀取地區資料，並與店家表join，留下loc id，並將city和district欄位去除"""

    # 讀取地區table
    df_loc_data = E_load_from_sql(table_name="location")
    df_loc = pd.DataFrame(data=df_loc_data)
    df_loc_select = df_loc[["loc_id", "city", "district"]]

    # 進行第一次join，根據市/區合併
    df = df.merge(df_loc_select, how="left", on=["city", "district"])

    # 取出loc id為空、沒有市資料的索引
    miss_loc = df["loc_id"].isna()

    # 如果有，進行第二次join
    if len(miss_loc) != 0:
        df_miss = df[miss_loc].drop(columns="loc_id", axis=1)

        df_miss = df_miss.merge(df_loc_select, how="left", on="district")

        df.loc[miss_loc, "loc_id"] = df_miss["loc_id"].values

    # 移除市區欄位
    df.drop(columns=["city", "district"], axis=1, inplace=True)

    return df

df_main = join_loc_id(df_main)

  df = pd.read_sql(sql, conn)


In [78]:
def join_type_id(df):
    """讀取類別資料，並與店家表join，留下類別id，將類別欄位去除"""

    df_catego_data = E_load_from_sql(table_name="category")
    df_catego = pd.DataFrame(data=df_catego_data)

    df = df.merge(df_catego, how="left",
                            left_on="category", right_on="category_name")
    df.drop(columns=["category_name", "category_eng", "category"], axis=1, inplace=True)

    return df

df_main = join_type_id(df_main)

  df = pd.read_sql(sql, conn)


In [79]:
def add_id(df, type_id_dict, store_type):
    """加上類別id"""

    df["id"] = ""
    df = reassign_id(df, "id", type_id_dict[store_type])

    return df

df_main = add_id(df_main, type_id_dict, store_type)

In [80]:
def trans_op_hours(df):
    """將營業時間轉為時數"""
    op_hours_list = []
    for index, row in df.iterrows():
        op_time = row["opening_hours"]
        op_hours = dtm.trans_op_time_to_hours(op_time)
        op_hours_list.append(op_hours)

    df["op_hours"] = op_hours_list
    df.drop(columns="opening_hours", axis=1, inplace=True)

    return df

df_main = trans_op_hours(df_main)

In [81]:
new_col = ["id", 'name', 'buss_status', 'loc_id', 'address', 'phone', "op_hours", 'category_id', 'rating',
           'rating_total', 'newest_review', 'longitude', 'latitude', 'map_url', 'website', 'place_id', 'update_time']

df_main = df_main[new_col]

In [82]:
today = date.today().strftime("%Y%m%d")

finish_file = f"{today}{type_file_dict[store_type]}_TPEI_finish.csv"
finish_path = os.path.join(folder, finish_file)
df_main.to_csv(finish_path, index=False, encoding="utf-8-sig")

loc_na = df_main[df_main["loc_id"].isna()]

print(f"存檔完成，資料筆數：{len(df_main)}")
print(f"地區缺失資料數：{len(loc_na)}")

存檔完成，資料筆數：261
地區缺失資料數：0


In [83]:
len(df_main.columns)

17

In [84]:
df_main.head(25)

Unnamed: 0,id,name,buss_status,loc_id,address,phone,op_hours,category_id,rating,rating_total,newest_review,longitude,latitude,map_url,website,place_id,update_time
0,supl1876,咕咕G寵物城-木柵店,OPERATIONAL,TPE008,116臺灣臺北市文山區興隆路四段70-1號,229398833,70.0,5,4.1,261.0,2025-04-19,121.561562,24.984316,https://maps.google.com/?cid=9590099804699171261,,ChIJVVVVVd4BaDQRvRmjO_bgFoU,2025/11/16 17:31:46
1,supl1877,PetPark,OPERATIONAL,TPE008,11674臺灣臺北市文山區羅斯福路六段302號,229317200,63.0,5,4.6,232.0,2021-05-05,121.539704,24.990713,https://maps.google.com/?cid=707242357215644003,http://www.wonderpet.asia/,ChIJqaK_YQCqQjQRY8nyPy2h0Ak,2025/11/16 17:31:46
2,supl1878,木星寵物,OPERATIONAL,TPE008,116臺灣臺北市文山區木新路三段226號,229368099,70.0,5,4.8,64.0,2025-04-28,121.559249,24.98149,https://maps.google.com/?cid=78979114800680598,,ChIJaRgNdAwBaDQRlo5eIhaXGAE,2025/11/16 17:31:46
3,supl1879,PetPark,OPERATIONAL,TPE008,116臺灣臺北市文山區木新路三段272號,229372900,63.0,5,4.4,189.0,2025-03-23,121.558347,24.98123,https://maps.google.com/?cid=469218073759243828,https://www.wonderpet.asia/,ChIJ6apPwOABaDQRNN6cIFr_ggY,2025/11/16 17:31:46
4,supl1880,晶晶寵物店（原金美滿寵物）,OPERATIONAL,TPE008,116臺灣臺北市文山區興隆路三段261號,222393188,72.0,5,4.3,55.0,2021-04-28,121.558864,24.992368,https://maps.google.com/?cid=13521194320155065958,,ChIJV_b0xAyqQjQRZj7fh5jupLs,2025/11/16 17:31:46
5,supl1881,哈哈窩寵物精品館-北市景興店,OPERATIONAL,TPE008,116臺灣臺北市文山區景興路169號,229337700,59.5,5,4.2,68.0,2021-07-06,121.544583,24.993163,https://maps.google.com/?cid=10431518333823885977,https://www.facebook.com/hahahouse107/,ChIJSRm-kAaqQjQRmSKM86cyxJA,2025/11/16 17:31:46
6,supl1882,喜羊羊寵物生活館(北市景興店),OPERATIONAL,TPE008,116臺灣臺北市文山區景興路169號,229337700,77.0,5,5.0,2.0,2024-09-05,121.544539,24.993116,https://maps.google.com/?cid=9567540067286104302,http://www.cyypet.com.tw/,ChIJj0CozxWrQjQR7rjnDAC7xoQ,2025/11/16 17:31:46
7,supl1883,豆恩凱寵物沙龍,OPERATIONAL,TPE008,116臺灣臺北市文山區木新路二段1號,286612369,54.0,5,4.5,104.0,2025-02-12,121.570835,24.987609,https://maps.google.com/?cid=12388809864078626836,,ChIJ4SSyQHWqQjQRFBBDwu7k7as,2025/11/16 17:31:46
8,supl1884,黛希寵物美容沙龍,OPERATIONAL,TPE008,116臺灣臺北市文山區景華街18號1樓,289319906,60.0,5,4.8,77.0,2024-08-30,121.542148,24.994844,https://maps.google.com/?cid=9574317886448112929,https://line.me/R/ti/p/@zfz5058g,ChIJfe1t-wOqQjQRIZVoDGTP3oQ,2025/11/16 17:31:46
9,supl1885,景興動物醫院,OPERATIONAL,TPE008,116臺灣臺北市文山區景華街92號,229330600,60.0,5,4.5,222.0,2025-08-24,121.545716,24.995151,https://maps.google.com/?cid=12713671958385079329,,ChIJ6Td9tQWqQjQRITRjRkkJcLA,2025/11/16 17:31:46


In [85]:
conn = create_pymysql_connect()
cursor = conn.cursor()

df_main = df_main.replace({pd.NA: None, pd.NaT: None})
data = list(df_main.itertuples(index=False, name=None))

sql = f"INSERT INTO {type_file_dict[store_type]} (id, name, buss_status, loc_id, address, phone, op_hours, category_id, rating, rating_total, newest_review, longitude, latitude, map_url, website, place_id, update_time) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"

try:
    cursor.executemany(sql, data)
    conn.commit()
    print("資料寫入資料庫成功！")
except Exception as e:
    print(f"資料寫入資料褲時發生錯誤：{e}")
    conn.rollback()
finally:
    cursor.close()
    conn.close()

資料寫入資料庫成功！
