# 文件合并

In [1]:
import pandas as pd

# 定义要合并的 Excel 文件列表
excel_files = [
    r"C:\Users\wangz\Downloads\processed\part_1_translated.xlsx",
    r"C:\Users\wangz\Downloads\processed\part_2_translated.xlsx",
    r"C:\Users\wangz\Downloads\processed\part_3_translated.xlsx",
    r"C:\Users\wangz\Downloads\processed\part_4_translated.xlsx",
    r"C:\Users\wangz\Downloads\processed\part_5_translated.xlsx",
    r"C:\Users\wangz\Downloads\processed\part_6_translated.xlsx",
    r"C:\Users\wangz\Downloads\processed\part_7_translated.xlsx",
    r"C:\Users\wangz\Downloads\processed\part_8_translated.xlsx",
    r"C:\Users\wangz\Downloads\processed\part_9_translated.xlsx",
    r"C:\Users\wangz\Downloads\processed\part_10_translated.xlsx",
    r"C:\Users\wangz\Downloads\processed\part_11_translated.xlsx",
]

# 读取并合并所有 Excel 文件
df_list = [pd.read_excel(file) for file in excel_files]
merged_df = pd.concat(df_list, ignore_index=True)

# 保存合并后的 DataFrame 到新的 Excel 文件
merged_df.to_excel(r"D:\Projects\ai-translator\src\multilangInitData20250210空_translated.xlsx", index=False)


FileCreateError: [WinError 2] 系统找不到指定的文件。: 'C:\\Users\\wangz\\AppData\\Local\\Temp\\tmp2dbjpq31'

# 文件拆分

In [1]:
import os
import pandas as pd

def split_excel(input_file, output_folder, chunk_size=100000):
    """
    将一个Excel文件按指定行数拆分为多个Excel文件，并存储到指定文件夹。

    Args:
        input_file (str): 输入的Excel文件路径。
        output_folder (str): 输出文件夹路径。
        chunk_size (int): 每个拆分Excel文件包含的最大行数，默认10万。
    """
    # 检查输出文件夹是否存在，不存在则创建
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # 读取Excel文件（假设默认是Sheet1）
    print("正在读取Excel文件...")
    excel_data = pd.read_excel(input_file, sheet_name=0)

    # 获取总行数
    total_rows = excel_data.shape[0]
    print(f"Excel文件总行数: {total_rows} 行")

    # 按chunk_size进行分割
    for i in range(0, total_rows, chunk_size):
        chunk = excel_data.iloc[i:i + chunk_size]  # 提取数据块
        output_file = os.path.join(output_folder, f"part_{i // chunk_size + 1}.xlsx")
        chunk.to_excel(output_file, index=False, engine='openpyxl')  # 保存为Excel
        print(f"已保存: {output_file}")
    
    print("拆分完成！")

# 示例用法
if __name__ == "__main__":
    input_file = r"C:\Users\wangz\Downloads\开放平台内容多语.xlsx"  # 输入文件路径
    output_folder = r"D:\Projects\ai-translator\src\开放平台内容多语"  # 输出文件夹路径
    split_excel(input_file, output_folder)


正在读取Excel文件...
Excel文件总行数: 302099 行
已保存: D:\Projects\ai-translator\src\开放平台内容多语\part_1.xlsx
已保存: D:\Projects\ai-translator\src\开放平台内容多语\part_2.xlsx
已保存: D:\Projects\ai-translator\src\开放平台内容多语\part_3.xlsx
已保存: D:\Projects\ai-translator\src\开放平台内容多语\part_4.xlsx
拆分完成！


# 友互通翻译

In [2]:
from openai import OpenAI
from dotenv import load_dotenv
import zhconv

load_dotenv()
client = OpenAI()
target_columns = [
    # "繁体中文",
    "印尼语",
    "匈牙利语",
    "葡萄牙语",
    "泰语",
    "土耳其语",
    "越南语",
    "俄语",
    "阿拉伯语",
    "芬兰语",
    "丹麦语",
    "荷兰语",
    "波兰语",
    "法语",
    "德语",
    "日语",
    "挪威语",
    "希伯来语",
    "韩语",
    "西班牙语",
    "捷克语",
    "意大利语",
    "瑞典语",
    "希腊语",
    "马来语",
    "斯洛伐克语",
    "柬埔寨语",
    "罗马尼亚语",
    "克罗地亚语",
    "乌兹别克语",
    "缅甸语"
]

simple_column_name = "简体中文(源)"
english_column_name = "English"
trans_column_name = "繁体中文"

for target_column in target_columns:
    print(target_column)
    dictionary = {}

    def generate_text(index, data):
        if not pd.isnull(data[target_column]):
            dictionary[str(data[english_column_name])] = data[target_column]
            return index, data[target_column]
        if str(data[english_column_name]) in dictionary.keys():
            return index, dictionary[str(data[english_column_name])]
        if target_column == trans_column_name:
            return index, zhconv.convert(str(data[simple_column_name]), 'zh-tw')
        completion = client.chat.completions.create(
            model='gpt-4o-mini',
            messages=[
                {"role": "user", "content": f"Translate the following sentence or word from English to {simple_column_name}: {str(data[english_column_name])}, please directly translate it and do not output any extra content"},
                {"role": "assistant", "content": str(data[simple_column_name])},
                {"role": "user", "content": f"Translate the following sentence or word from English to {trans_column_name}: {str(data[english_column_name])}, please directly translate it and do not output any extra content"},
                {"role": "assistant", "content": zhconv.convert(str(data[simple_column_name]), 'zh-tw')},
                {"role": "user", "content": f"Translate the following sentence or word from English to {target_column}: {str(data[english_column_name])}, please directly translate it and do not output any extra content"}
            ],
            # messages=[
            #     {"role": "user", "content": f"Translate the following sentence from Chinese to {english_column_name}: {str(data[simple_column_name])}, please directly translate it and do not output any extra content"},
            #     {"role": "assistant", "content": str(data[english_column_name])},
            #     {"role": "user", "content": f"Translate the following sentence from Chinese to {trans_column_name}: {str(data[simple_column_name])}, please directly translate it and do not output any extra content"},
            #     {"role": "assistant", "content": str(data[trans_column_name])},
            #     {"role": "user", "content": f"Translate the following sentence from Chinese to {target_column}: {str(data[simple_column_name])}, please directly translate it and do not output any extra content"}
            # ],
            temperature=0
        )
        dictionary[str(data[english_column_name])] = completion.choices[0].message.content
        return index, completion.choices[0].message.content

    from tqdm import tqdm
    from concurrent.futures import ThreadPoolExecutor, as_completed
    import pandas as pd
    to_fix = pd.read_excel(r"/Users/princepride/Documents/Project/ai-translator/src/友户通-新增词条-需要翻译-33种-20250530.xlsx")

    with ThreadPoolExecutor(max_workers=1000) as executor:
        futures = {executor.submit(generate_text, index, row) for index, row in to_fix.iterrows()}

        for future in tqdm(as_completed(futures), total=len(futures)):
            index, output = future.result()

            if output is not None:
                to_fix.at[index, target_column] = output

    # 保存最终结果到 Excel 文件
    to_fix.to_excel(r"/Users/princepride/Documents/Project/ai-translator/src/友户通-新增词条-需要翻译-33种-20250530.xlsx", index=False)

印尼语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.68s/it]


匈牙利语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.43s/it]


葡萄牙语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


泰语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


土耳其语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.26s/it]


越南语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.31s/it]


俄语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.58s/it]


阿拉伯语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.52s/it]


芬兰语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.36s/it]


丹麦语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.72s/it]


荷兰语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.37s/it]


波兰语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.45s/it]


法语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.19s/it]


德语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.02s/it]


日语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.60s/it]


挪威语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.52s/it]


希伯来语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


韩语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.08s/it]


西班牙语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.51s/it]


捷克语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


意大利语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:00<00:00,  1.00it/s]


瑞典语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.00s/it]


希腊语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.26s/it]


马来语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.41s/it]


斯洛伐克语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.06s/it]


柬埔寨语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.24s/it]


罗马尼亚语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.61s/it]


克罗地亚语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.21s/it]


乌兹别克语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.06s/it]


缅甸语


  to_fix.at[index, target_column] = output
100%|██████████| 1/1 [00:01<00:00,  1.52s/it]


# PDF翻译

In [1]:
from dotenv import load_dotenv
load_dotenv()

!pdf2zh "/Users/princepride/Downloads/2503.06923v1.pdf" -li en -lo zh -s openai

not in git repo
Namespace(files=['/Users/princepride/Downloads/2503.06923v1.pdf'], debug=False, pages=None, vfont='', vchar='', lang_in='en', lang_out='zh', service='openai', output='', thread=4, interactive=False, share=False, flask=False, celery=False, authorized=None, prompt=None, compatible=False, onnx=None, serverport=None, dir=False, config=None, babeldoc=False, skip_subset_fonts=False, ignore_cache=False)
[2;36m[07/08/25 10:01:07][0m[2;36m [0m[34mINFO    [0m INFO:pdf2zh.high_level:use font:  ]8;id=402860;file:///opt/miniconda3/envs/ai-trans/lib/python3.12/site-packages/pdf2zh/high_level.py\[2mhigh_level.py[0m]8;;\[2m:[0m]8;id=684286;file:///opt/miniconda3/envs/ai-trans/lib/python3.12/site-packages/pdf2zh/high_level.py#423\[2m423[0m]8;;\
[2;36m                    [0m         [35m/Users/princepride/.cache/babeldo[0m [2m                 [0m
[2;36m                    [0m         [35mc/fonts/[0m[95mSourceHanSerifCN-Regular.[0m [2m                 [0m

In [2]:
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
client = OpenAI(base_url="https://generativelanguage.googleapis.com/v1beta/", api_key="")
target_column = "印尼语"

def generate_text(index, data):
    if not pd.isnull(data[target_column]):
        return index, data[target_column]
    completion = client.chat.completions.create(
        model='gemini-1.5-flash',
        messages=[
            {"role": "user", "content": f"Translate English to Chinese: {str(data['English(译)'])}, please directly translate it and do not output any extra content"},
            {"role": "assistant", "content": str(data['简体中文(源)'])},
            {"role": "user", "content": f"Translate English to Traditional Chinese: {str(data['English(译)'])}, please directly translate it and do not output any extra content"},
            {"role": "assistant", "content": str(data['繁体中文（译）'])},
            {"role": "user", "content": f"Translate English to {target_column}: {str(data['English(译)'])}, please directly translate it and do not output any extra content"}
        ],
        temperature=0
    )
    return index, completion.choices[0].message.content

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
to_fix = pd.read_excel(r"D:\Projects\ai-translator\src\友互通\友互通-全.xlsx")

with ThreadPoolExecutor(max_workers=1000) as executor:
    futures = {executor.submit(generate_text, index, row) for index, row in to_fix.iterrows()}

    for future in tqdm(as_completed(futures), total=len(futures)):
        index, output = future.result()

        if output is not None:
            to_fix.at[index, target_column] = output

# 保存最终结果到 Excel 文件
to_fix.to_excel(r"D:\Projects\ai-translator\src\友互通\友互通-全.xlsx", index=False)

(0, 'matahari\n')

# 术语表合并

In [3]:
import pandas as pd

original_path = r"D:\Projects\ai-translator\src\models\API\chatgpt-4o-mini\glossary.xlsx"
new_path = r"C:\Users\wangz\Downloads\用友技术类常用术语表 for MT-20250320.xlsx"
output_path = "glossary.xlsx"

def merge_translation_tables(original_path, new_path, output_path, original_language, target_language):
    # 读取原始术语表和新增术语表
    original_df = pd.read_excel(original_path)
    new_df = pd.read_excel(new_path)
    
    # 确保列名存在
    if original_language not in original_df.columns or target_language not in original_df.columns:
        raise ValueError("Original language and target language columns must exist in the original table")
    if original_language not in new_df.columns or target_language not in new_df.columns:
        raise ValueError("Original language and target language columns must exist in the new table")
    
    # 过滤掉原始术语为空的行
    original_df = original_df.dropna(subset=[original_language])
    new_df = new_df.dropna(subset=[original_language])
    
    # 找出新增的术语（原术语表中不存在的）
    existing_terms = set(original_df[original_language].astype(str))
    filtered_new_df = new_df[~new_df[original_language].astype(str).isin(existing_terms)]
    
    # 合并数据表
    merged_df = pd.concat([original_df, filtered_new_df], ignore_index=True)
    
    # 保存合并后的数据
    merged_df.to_excel(output_path, index=False)
    
    print(f"术语表已合并，保存至: {output_path}")
    return merged_df

merge_translation_tables(original_path, new_path, output_path, "Chinese", "English")

术语表已合并，保存至: new_glossary.xlsx


Unnamed: 0,Chinese,English,Thai
0,会计服务,Accounting,บริการด้านบัญชี
1,企业绩效,Enterprise Performance Management,ผลการดำเนินธุรกิจ
2,业财综合分析,Financial Analytics,การวิเคราะห์การเงินแบบบูรณาการ
3,数智合同,Contract Lifecycle Management,สัญญาอัจฉริยะดิจิทัล
4,采购供应,Material Mgmt,การจัดหาซัพพลาย
...,...,...,...
5048,缓存机制,Caching Mechanism,
5049,管理类角色,Administrative Role,
5050,业务类角色,Business Role,
5051,应用发布,Application Release,
