In [128]:
import os
import time
import ast
import re
from datetime import date

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from opencc import OpenCC

from mods import gmap
from mods import pandas_mod as pdm
from mods import date_mod as dtm

In [129]:
load_dotenv()

username = os.getenv("MYSQL_USERNAME")
password = os.getenv("MYSQL_PASSWORD")
target_ip = os.getenv("MYSQL_IP")
target_port = int(os.getenv("MYSQL_PORTT"))
db_name = os.getenv("MYSQL_DB_NAME")

engine = create_engine(
    f"mysql+pymysql://{username}:{password}@{target_ip}:{target_port}/{db_name}")

In [130]:
type_id_dict = {
    "寵物美容":"sal",
    "寵物餐廳":"res",
    "寵物用品":"supl"
}

type_file_dict = {
    "寵物美容":"salon",
    "寵物餐廳":"restaurant",
    "寵物用品":"supplies"
}

replace_word_dict = {
    "区":"區",
    "霧峯":"霧峰",
    "中壢市":"中壢區",
    "省":"",
    "萬裏":"萬里",
    "區區":"區"
}

store_type = "寵物美容"

In [131]:
folder = r"C:\Users\add41\Documents\Data_Engineer\Project\example_data\gmap_full_search\temp"
file = f"{type_file_dict[store_type]}_combine.csv"
path = os.path.join(folder, file)

df_main = pd.read_csv(path, dtype={"phone": str})

print(f"資料筆數：{len(df_main)}")

資料筆數：2087


In [132]:
def remove_data(df, keywords):
    del_idx = []

    for keyword in keywords:
        idx = df[df["name"].str.contains(keyword, na=False)].index
        if len(idx) != 0:
            del_idx.extend(idx)

    del_idx = list(set(del_idx))

    df.drop(index=del_idx, axis=0, inplace=True)
    return df


del_keywords = ["停業", "歇業", "暫停營業", "暫停服務", "停止營業", "停止服務"]
df_main = remove_data(df=df_main, keywords=del_keywords)

In [133]:
def pre_trans_addres(df, store_type, replace_word_dict):
    """先新增類別欄位，並簡單將地址做簡轉繁、錯別字處理"""

    # 新增類別
    df["category"] = store_type

    # 處理掉不要的欄位
    df.drop(columns=["in_boundary", "update_date"], axis=1, inplace=True)
    df.dropna(subset=["address"], inplace=True)

    # 將評分和評論數補空值為0
    df_main[["rating", "rating_total"]] = df_main[["rating", "rating_total"]].fillna(0)

    # 先將地址全部做簡體轉繁體
    cc = OpenCC("s2t")
    df["address"] = df["address"].apply(cc.convert)

    # 清理一些地址中會出現的錯別字
    for word in replace_word_dict:
        df["address"] = df["address"].str.replace(word, replace_word_dict[word])

    # 先新增市和區的空欄位
    df["city"] = None
    df["district"] = None

    return df


df_main = pre_trans_addres(df_main, store_type, replace_word_dict)

In [134]:
# 先編寫處理地址的規則
pattern1 = r"([^\d\s]{2}市)([^\d\s]{1,3}區)"
pattern2 = r"灣([^\d\s]{1,2}區)"
pattern3 = r"([^\d\s]{2}區)([^\d\s]{2}市)"

In [135]:
def first_add_trans(df, pattern):
    """處理第一種情況：正常地址格式"""

    mask1 = df["address"].str.contains(pattern, regex=True, na=False)
    extracted1 = df.loc[mask1, "address"].str.extract(pattern)
    df.loc[mask1, ["city", "district"]] = extracted1.values

    return df

df_main = first_add_trans(df_main, pattern1)

  mask1 = df["address"].str.contains(pattern, regex=True, na=False)


In [136]:
def second_add_trans(df, pattern):
    """處理第二種情況：地址格式倒反"""

    mask3 = df["address"].str.contains(pattern, regex=True, na=False)
    extracted3 = df.loc[mask3, "address"].str.extract(pattern)

    df.loc[mask3, "city"] = extracted3[1].values
    df.loc[mask3, "district"] = extracted3[0].values

    return df

df_main = second_add_trans(df_main, pattern3)

  mask3 = df["address"].str.contains(pattern, regex=True, na=False)


In [137]:
def third_add_trans(df, pattern):
    """處理第三種狀況：只有區沒有市"""

    mask2 = df["address"].str.contains(pattern, regex=True, na=False)
    extracted2 = df.loc[mask2, "address"].str.extract(pattern)
    df.loc[mask2, "district"] = extracted2[0].values

    return df

df_main = third_add_trans(df_main, pattern2)

  mask2 = df["address"].str.contains(pattern, regex=True, na=False)


In [138]:
def fourth_add_trans(df, drop_words):
    """清理區中不乾淨的字元"""

    df["district"] = df["district"].str.replace("路竹", "鹿竹")
    pattern4 = "|".join(map(re.escape, drop_words))
    df["district"] = df["district"].str.replace(pattern4, "", regex=True)
    df["district"] = df["district"].str.replace("鹿竹", "路竹")

    return df

drop_words = ["路", "街", "巷", "弄", "段", "道"]
df_main = fourth_add_trans(df_main, drop_words)

In [139]:
def join_loc_id(df):
    """讀取地區資料，並與店家表join，留下loc id，並將city和district欄位去除"""

    # 讀取地區table
    sql_loc = "SELECT * FROM location"
    df_loc = pd.read_sql(sql_loc, engine)
    df_loc_select = df_loc[["loc_id", "city", "district"]]

    # 進行第一次join，根據市/區合併
    df = df.merge(df_loc_select, how="left", on=["city", "district"])

    # 取出loc id為空、沒有市資料的索引
    miss_loc = df["loc_id"].isna()

    # 如果有，進行第二次join
    if len(miss_loc) != 0:
        df_miss = df[miss_loc].drop(columns="loc_id", axis=1)

        df_miss = df_miss.merge(df_loc_select, how="left", on="district")

        df.loc[miss_loc, "loc_id"] = df_miss["loc_id"].values

    # 移除市區欄位
    df.drop(columns=["city", "district"], axis=1, inplace=True)

    return df

df_main = join_loc_id(df_main)

In [140]:
def join_type_id(df):
    """讀取類別資料，並與店家表join，留下類別id，將類別欄位去除"""

    sql_catego = "SELECT * FROM Category"
    df_catego = pd.read_sql(sql_catego, engine)

    df = df.merge(df_catego, how="left",
                            left_on="category", right_on="category_name")
    df.drop(columns=["category_name", "category_eng", "category"], axis=1, inplace=True)

    return df

df_main = join_type_id(df_main)

In [141]:
def add_id(df, type_id_dict, store_type):
    """加上類別id"""

    df["id"] = ""
    df = pdm.reassign_id(df, "id", type_id_dict[store_type])

    return df

df_main = add_id(df_main, type_id_dict, store_type)

In [142]:
def trans_op_hours(df):
    """將營業時間轉為時數"""

    df["opening_hours"] = df["opening_hours"].apply(dtm.trans_ophours_columns)

    op_hours_list = []
    for index, row in df.iterrows():
        op_time = row["opening_hours"]
        op_hours = dtm.trans_op_time_to_hours(op_time)
        op_hours_list.append(op_hours)

    df["op_hours"] = op_hours_list
    df.drop(columns="opening_hours", axis=1, inplace=True)

    return df

df_main = trans_op_hours(df_main)

In [143]:
new_col = ["id", 'name', 'buss_status', 'loc_id', 'address', 'phone', "op_hours", 'category_id', 'rating',
           'rating_total', 'newest_review', 'longitude', 'latitude', 'map_url', 'website', 'place_id', 'update_time']

df_main = df_main[new_col]

In [144]:
today = date.today().strftime("%Y%m%d")

finish_file = f"{today}{type_file_dict[store_type]}_finish.csv"
finish_path = os.path.join(folder, finish_file)
df_main.to_csv(finish_path, index=False, encoding="utf-8")

loc_na = df_main[df_main["loc_id"].isna()]

print(f"存檔完成，資料筆數：{len(df_main)}")
print(f"地區缺失資料數：{len(loc_na)}")

存檔完成，資料筆數：2085
地區缺失資料數：0


In [145]:
df_main.to_sql(name=f"{type_file_dict[store_type]}", con=engine, index=False, if_exists="replace")

2085