In [54]:
import re, os
import pandas as pd
from mako.template import Template

def format_punctuation(text):
    """Convert ASCII punctuation to full-width Chinese punctuation and tidy spaces.

    Processing order:
      1. A pair of ASCII double quotes on a single line becomes a pair of
         curly quotes (U+201C ... U+201D).
      2. ``, : ; ? ! ( )`` are replaced unconditionally by their full-width
         forms (this includes commas or colons inside numbers such as
         ``1,000`` or ``12:30``).
      3. ``.`` becomes the ideographic full stop unless a digit is directly
         before or after it, so decimals (``0.5``) and list markers (``1.``)
         are left alone.
      4. Runs of spaces/tabs collapse to one space; spaces/tabs before
         Chinese punctuation, after Chinese brackets, and between
         CJK/CJK, digit/CJK and CJK/digit neighbours are removed.

    Only spaces and tabs are ever removed; line breaks are kept so that
    Markdown paragraphs and list items are not merged.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    t = text

    # ASCII "..." -> curly quotes. Written with escapes so the replacement
    # cannot silently degrade back into plain ASCII quotes.
    t = re.sub(r'"([^"\n]+)"', "\u201c\\1\u201d", t)

    t = t.replace(",", "，")
    t = t.replace(":", "：")
    t = t.replace(";", "；")
    t = t.replace("?", "？")
    t = t.replace("!", "！")
    t = t.replace("(", "（")
    t = t.replace(")", "）")
    # Full stop, but not a decimal point or a numbered-list dot.
    t = re.sub(r"(?<!\d)\.(?!\d)", "。", t)

    # ===== Remove spaces that Markdown output does not need =====
    # 1) Collapse consecutive spaces/tabs into a single space.
    t = re.sub(r"[ \t]+", " ", t)

    # 2) Drop spaces/tabs next to Chinese punctuation. `[ \t]` is used rather
    #    than `\s` so that newlines (paragraph/list structure) survive.
    t = re.sub(r"[ \t]+([，。；：！？、（）【】《》「」『』])", r"\1", t)  # before punctuation
    t = re.sub(r"([（）【】《》「」『』])[ \t]+", r"\1", t)              # after brackets

    # 3) Drop spaces between CJK-CJK, digit-CJK and CJK-digit neighbours,
    #    e.g. '成 都' -> '成都', '2500 克' -> '2500克'.
    t = re.sub(r"(?<=[\u4e00-\u9fff])[ \t]+(?=[\u4e00-\u9fff])", "", t)
    t = re.sub(r"(?<=\d)[ \t]+(?=[\u4e00-\u9fff])", "", t)
    t = re.sub(r"(?<=[\u4e00-\u9fff])[ \t]+(?=\d)", "", t)

    return t

def format_titles(text):
    """Rewrite list-number markers in ``text`` as ``N. `` and normalize punctuation.

    The following marker styles are converted, in this order:
      1. Backslash-escaped ASCII parentheses around a circled number, e.g. ``\\(①\\)``.
      2. Backslash-escaped full-width parentheses around a circled number, e.g. ``\\（①\\）``.
      3. Full-width parentheses around one or two digits, e.g. ``（1）``.
      4. Full-width parentheses around a circled number, e.g. ``（①）``.
      5. ASCII parentheses around one or two digits, e.g. ``(1)``.
      6. Chinese ordinals followed by ``、``: ``一、`` ... ``十、`` and compound
         forms such as ``十一、`` or ``二十一、`` (1-99).

    Matching is not anchored to line starts: a marker is converted wherever
    it occurs. Finally the text is passed through ``format_punctuation``.

    Args:
        text: The string to rewrite.

    Returns:
        The rewritten, punctuation-normalized string.
    """
    CIRCLED = "①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳㉑㉒㉓㉔㉕㉖㉗㉘㉙㉚"
    CHINESE_DIGITS = "一二三四五六七八九"

    circled_class = f"[{re.escape(CIRCLED)}]"
    digit_class = f"[{CHINESE_DIGITS}]"

    def circled_to_marker(match):
        # A circled number's position in CIRCLED is its value minus one.
        return f"{CIRCLED.index(match.group(1)) + 1}. "

    def digits_to_marker(match):
        return f"{match.group(1)}. "

    def chinese_to_marker(match):
        single = match.group("single")
        if single:
            value = CHINESE_DIGITS.index(single) + 1
        else:
            tens = match.group("tens")
            units = match.group("units")
            # A bare 十 means ten; an optional leading digit multiplies it.
            value = ((CHINESE_DIGITS.index(tens) + 1) if tens else 1) * 10
            value += (CHINESE_DIGITS.index(units) + 1) if units else 0
        return f"{value}. "

    t = text

    # 1) \(①\) -> "1. "  (the backslashes are literal characters in the text)
    t = re.sub(r"\\\(\s*(" + circled_class + r")\s*\\\)", circled_to_marker, t)

    # 2) \（①\） -> "1. "  (literal backslashes before full-width parentheses)
    t = re.sub(r"\\（\s*(" + circled_class + r")\s*\\）", circled_to_marker, t)

    # 3) （1） -> "1. "
    t = re.sub(r"（\s*(\d{1,2})\s*）", digits_to_marker, t)

    # 4) （①） -> "1. "
    t = re.sub(r"（\s*(" + circled_class + r")\s*）", circled_to_marker, t)

    # 5) (1) -> "1. "
    t = re.sub(r"\(\s*(\d{1,2})\s*\)", digits_to_marker, t)

    # 6) 一、 -> "1. ", 十一、 -> "11. ", 二十一、 -> "21. ".
    #    The compound alternative is tried first so that 十一、 is consumed as
    #    a whole instead of leaving a stray 十 in front of "1. ".
    chinese_ordinal = re.compile(
        rf"(?:(?P<tens>{digit_class})?十(?P<units>{digit_class})?|(?P<single>{digit_class}))、"
    )
    t = chinese_ordinal.sub(chinese_to_marker, t)

    # 7) Punctuation normalization. The dot in "N. " follows a digit, so it is
    #    not converted into a Chinese full stop.
    t = format_punctuation(t)

    return t


In [68]:
def read_recipe(cuisine, source):
    """Load the recipes of one source file into a DataFrame.

    Reads ``./src/{cuisine}/{cuisine}_{source}.md`` (decoded as UTF-8) and
    splits it on level-1 headings (``# ``, recipe classes) and level-2
    headings (``## ``, recipe names).

    The name in every ``# `` / ``## `` heading is taken to be the text up to
    the first space after the heading marker, so each heading name must be
    followed by a space in the file; everything after that space becomes the
    class content or the recipe detail respectively.

    Args:
        cuisine: Cuisine folder / file-name prefix.
        source: Source name used as the file-name suffix.

    Returns:
        A DataFrame with columns ``class``, ``menu_name``, ``menu_detail``,
        ``cuisine`` and ``source``; one row per ``## `` section.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a heading cannot be split into a name and its content
            (no space after the name).
    """
    data_file = f"./src/{cuisine}/{cuisine}_{source}.md"
    # Explicit UTF-8: the platform default encoding is not reliable for
    # Chinese text.
    with open(data_file, "r", encoding="utf-8") as f:
        content = f.read()

    chunks = content.split("\n# ")
    # The first chunk still carries its leading "# ", so it splits into three
    # parts; later chunks start directly with the class name.
    class_array = [chunks[0].split(" ", 2)[1:]]
    class_array.extend(chunk.split(" ", 1) for chunk in chunks[1:])

    data_list = []
    for class1, menu_content in class_array:
        # Text before the first "## " heading is not a recipe and is skipped.
        for menu in menu_content.split("\n## ")[1:]:
            menu_name, menu_detail = menu.split(" ", 1)
            data_list.append((class1, menu_name, menu_detail))

    columns = ["class", "menu_name", "menu_detail"]
    df = pd.DataFrame(data=data_list, columns=columns)
    df["cuisine"] = cuisine
    df["source"] = source
    return df

def read_recipes(cuisine, sources):
    """Load and stack the recipes of several sources of one cuisine.

    Args:
        cuisine: Cuisine folder / file-name prefix passed to ``read_recipe``.
        sources: Iterable of source names; one file is read per source.

    Returns:
        A single DataFrame with the rows of every source in the given order
        and a fresh 0..n-1 index. With no sources, an empty DataFrame with
        the columns ``class``, ``menu_name``, ``menu_detail``, ``cuisine``
        and ``source`` is returned.
    """
    frames = [read_recipe(cuisine, source) for source in sources]
    if not frames:
        # pd.concat raises on an empty list; hand back an empty frame instead.
        return pd.DataFrame(
            columns=["class", "menu_name", "menu_detail", "cuisine", "source"]
        )
    # ignore_index=True: each per-source frame is indexed from 0, and keeping
    # those labels would make label lookups such as df.loc[idx, ...] return
    # several rows instead of one.
    return pd.concat(frames, ignore_index=True)

def save_md(df):
    """Write one Markdown file per recipe and one index file per class.

    For every distinct ``menu_name`` a file
    ``./src/{cuisine}/{class}/{menu_name}.md`` is written that contains one
    ``## {source}版本`` section per row of that recipe. The cuisine is taken
    from the first row of ``df`` and the class folder from the first row of
    each recipe.

    For every distinct ``class`` an ``index.md`` is written into the class
    folder, holding a five-column Markdown table of links to the recipes of
    that class.

    Args:
        df: DataFrame with the columns ``class``, ``menu_name``,
            ``menu_detail``, ``cuisine`` and ``source``. An empty frame
            writes nothing.

    Returns:
        None.
    """
    if df.empty:
        return

    cuisine = df["cuisine"].iloc[0]

    for menu_name in df["menu_name"].drop_duplicates():
        df1 = df.loc[df["menu_name"] == menu_name]
        class1 = df1["class"].iloc[0]
        # Iterate positionally rather than through df1.loc[idx, ...]: with
        # repeated index labels a label lookup returns several rows and the
        # unpacking would yield column names instead of values.
        md_content = "".join(
            f"## {source}版本\n\n{menu_detail}\n\n"
            for menu_detail, source in zip(df1["menu_detail"], df1["source"])
        )
        out_folder = f"./src/{cuisine}/{class1}"
        md_file = f"{out_folder}/{menu_name}.md"
        # makedirs also creates missing parents and tolerates existing folders.
        os.makedirs(out_folder, exist_ok=True)
        with open(md_file, "w", encoding="utf-8") as f:
            f.write(md_content)

    for class1 in df["class"].drop_duplicates():
        df1 = df.loc[df["class"] == class1]
        out_folder = f"./src/{cuisine}/{class1}"
        index_file = f"{out_folder}/index.md"

        index_content = f"# {class1}\n\n"
        index_content += "| | | | | |\n"
        index_content += "|--- |--- |--- |--- |---|\n"
        menu_names = list(df1["menu_name"].drop_duplicates())
        for n, menu_name in enumerate(menu_names, start=1):
            index_content += f"| [{menu_name}]({menu_name}.md)"
            if n % 5 == 0:
                index_content += "|\n"
        # Close a trailing partial row so the table (and file) ends cleanly.
        if len(menu_names) % 5 != 0:
            index_content += "|\n"

        # The class folder may not exist yet if none of its recipes was
        # filed under it by the loop above.
        os.makedirs(out_folder, exist_ok=True)
        with open(index_file, "w", encoding="utf-8") as f:
            f.write(index_content)

def gen_summary(cuisine_dict):
    """Render ``./src/SUMMARY.md`` from the recipes of every configured cuisine.

    For each cuisine the source files are loaded with ``read_recipes`` and
    reduced to a list of ``(class, [menu_name, ...])`` pairs, with classes
    and recipe names de-duplicated in order of first appearance. The
    resulting ``{cuisine: [(class, menu_names), ...]}`` mapping is passed as
    ``data_dict`` to the Mako template ``./src/templates/summary.md``.

    Args:
        cuisine_dict: Mapping of cuisine name to its list of source names.

    Returns:
        None.
    """
    data_dict = {}
    for cuisine, sources in cuisine_dict.items():
        df = read_recipes(cuisine, sources)
        data_dict[cuisine] = [
            (class1, list(df.loc[df["class"] == class1, "menu_name"].drop_duplicates()))
            for class1 in df["class"].drop_duplicates()
        ]

    mytemplate = Template(filename='./src/templates/summary.md')
    # Render before opening the target: opening with "w" truncates the file,
    # so a rendering error would otherwise leave an empty SUMMARY.md behind.
    rendered = mytemplate.render(data_dict=data_dict)
    with open("./src/SUMMARY.md", "w", encoding="utf-8") as f:
        f.write(rendered)
    

In [None]:
# Cuisine name -> list of source names to load. Uncomment the entries to
# process; with every entry disabled the dict is empty, the loop below does
# not run, and `df` is left undefined by this cell.
cuisine_dict = {
    # "sichuan":      ["北京饭店","北京饭店分册","锦江宾馆", "四川饭店", "陈松如"],
    # "shandong":     ["胡丽妹", "丰泽园"],
    # "huaiyang":     ["北京饭店","北京饭店分册"],
    # "guangdong":    ["北京饭店","北京饭店分册"],
    # "anhui":        ["中国名菜谱"],
    # "qingzhen":     ["杨永和"],
    # "shanghai":     ["李伯荣家常菜", "李伯荣宴席菜"],
    # "tanjia":       ["北京饭店", "北京饭店分册"]

}
# `df` is rebound on every iteration, so after the loop it holds the recipes
# of the last cuisine only.
for cuisine, sources in cuisine_dict.items():
    df = read_recipes(cuisine, sources)
    # save_md(df)
# gen_summary(cuisine_dict)


In [89]:
# Data check: collect every non-empty line of the first "###" section of each
# recipe (skipping the heading line itself) that does not start with a digit
# 1-9, i.e. lines that are not written as numbered list items.
idx_list = []
# Iterate over the column values instead of df.loc[idx, ...]: a label lookup
# returns a Series (which has no .split) when index labels repeat.
for menu_detail in df["menu_detail"]:
    sections = menu_detail.split("###")
    if len(sections) < 2:
        # No "###" section in this recipe; nothing to check.
        continue
    section_lines = [line for line in sections[1].split("\n")[1:] if line != ""]
    idx_list += [line for line in section_lines if line[0] not in "123456789"]
idx_list

['杏仁200克,大米100克,白糖300克。  ',
 '杏仁150克,大米50克,洋粉10克,白糖300克,杏仁精0.5克,金糕50克。  ',
 '核桃仁 300 克, 大米 100 克, 白糖 300 克, 小枣 100 克, 清水 1 千克。  ']

In [76]:
# Show the recipes whose detail text splits into more than four pieces on
# "### ", i.e. contains more than three "### " headings.
df.loc[df["menu_detail"].apply(lambda detail: detail.count("### ") + 1) > 4]

Unnamed: 0,class,menu_name,menu_detail,cuisine,source
137,鸡类,龙胎藏凤,"\n\n### 原料 \n\n1. 主料:生猪肚1个,仔母鸡1只(1250克左右)。 \...",sichuan,北京饭店分册
200,猪肉类,酒米酿肥肠,\n\n### 原料 \n\n1. 主料:肥肠头500克(二至三节)。 \n\n2. 配...,sichuan,北京饭店分册
