#### 英文数据集

In [1]:
from datasets import load_dataset

## AI-MO___numina_math-co_t dataset, loaded from a directory under the local
## Hugging Face datasets cache; printing it shows the splits and their columns.
dataset = load_dataset(r"/root/.cache/huggingface/datasets/AI-MO___numina_math-co_t")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['source', 'problem', 'solution', 'messages'],
        num_rows: 859494
    })
    test: Dataset({
        features: ['source', 'problem', 'solution', 'messages'],
        num_rows: 100
    })
})


In [2]:
# Show one record of the train split (index 5) to inspect its fields.
print(dataset['train'][5])

{'source': 'cn_k12', 'problem': 'In $\\triangle ABC$, the lengths of the sides opposite to angles $A$, $B$, and $C$ are $a$, $b$, and $c$ respectively. Given that $\\cos \\frac{C}{2} = \\frac{\\sqrt{5}}{3}$ and $a \\cos B + b \\cos A = 2$, find the maximum area of $\\triangle ABC$.', 'solution': 'Since $\\cos \\frac{C}{2} = \\frac{\\sqrt{5}}{3}$, we have $\\cos C = 2\\cos^2 \\frac{C}{2} - 1 = 2 \\left(\\frac{\\sqrt{5}}{3}\\right)^2 - 1 = \\frac{1}{9}$.\n\nUsing the cosine law, we have $a \\cos B + b \\cos A = 2$ can be written as\n\n$a \\frac{a^2 + c^2 - b^2}{2ac} + b \\frac{c^2 + b^2 - a^2}{2bc} = 2$\n\nSimplifying the equation, we obtain $c = 2$.\n\nNow, we have $4 = a^2 + b^2 - 2ab \\cos C \\geq 2ab - 2ab \\frac{1}{9} = \\frac{16}{9}ab$, which implies $ab \\leq \\frac{9}{4}$. The equality holds when $a = b = \\frac{3}{2}$.\n\nUsing the sine law, we have $\\sin C = \\sqrt{1 - \\cos^2 C} = \\sqrt{1 - \\left(\\frac{1}{9}\\right)^2} = \\frac{4\\sqrt{5}}{9}$.\n\nThe area of $\\triangle A

## 英文有概率翻译成中文

In [3]:
import torch
from modelscope import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
cache_dir = "/root/autodl-tmp/model/GLM4-9bchat"  # change this to your own cache directory

############################################
# 1. Load the GLM4-9b-chat model and tokenizer
############################################
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    "ZhipuAI/glm-4-9b-chat", trust_remote_code=True, cache_dir=cache_dir
)

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    "ZhipuAI/glm-4-9b-chat",
    cache_dir=cache_dir,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
# Move the weights to the selected device, then put the model in evaluation mode.
model = model.to(device)
model.eval()

print("Model and tokenizer loaded on device:", device)

############################################
# 2. Load the dataset
############################################
ds2 = load_dataset("AI-MO/NuminaMath-CoT")
# ds2 contains two splits: train and test
print(ds2)
# DatasetDict({
#     train: Dataset({
#         features: ['source', 'problem', 'solution', 'messages'],
#         num_rows: 859494
#     })
#     test: Dataset({
#         features: ['source', 'problem', 'solution', 'messages'],
#         num_rows: 100
#     })
# })

############################################
# 3. 定义翻译函数 translate_text
############################################
def translate_text(text, max_new_tokens=1024):
    """Translate English text into Chinese with ZhipuAI/glm-4-9b-chat.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Text to translate. A list is flattened into a single string by
            joining the ``str()`` of its truthy items with spaces; any other
            non-string value is converted with ``str()``.
        max_new_tokens: Upper bound on the number of tokens generated for the
            translation. The prompt is not counted against this budget.

    Returns:
        The decoded generation with surrounding whitespace stripped, or ``""``
        when the input is empty or whitespace-only (the model is not called).
    """
    # Flatten list inputs into one string.
    if isinstance(text, list):
        text = " ".join(str(x) for x in text if x)

    # Anything that is still not a string is coerced.
    if not isinstance(text, str):
        text = str(text)

    text = text.strip()
    if not text:
        return ""

    # Chat prompt: a system instruction followed by the text to translate.
    messages = [
        {
            "role": "system",
            "content": "你是一位翻译助手，请将以下英文文本翻译成中文，保留数学公式与代码不变。",
        },
        {"role": "user", "content": text},
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    ).to(device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # Use max_new_tokens rather than max_length: max_length also counts the
        # prompt tokens, so a long input would leave little or no room for the
        # translation. Decoding is greedy (do_sample=False), which makes a
        # top_k setting redundant.
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # Drop the prompt tokens so that only the generated continuation is decoded.
    output_ids = outputs[:, prompt_length:]
    translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return translated_text.strip()

############################################
# 4. 批量翻译函数，用于dataset.map()
############################################
def translate_batch(samples):
    """Translate one batch, in the shape expected by a batched ``dataset.map()``.

    ``samples`` maps each column name to a list of values. Every value of the
    "source", "problem", "solution" and "messages" columns is passed through
    ``translate_text``; the results are returned as "<column>_zh" lists holding
    one element per input row.
    """
    column_names = ("source", "problem", "solution", "messages")

    # Look every column up before translating anything.
    columns = {name: samples[name] for name in column_names}
    translated = {f"{name}_zh": [] for name in column_names}

    # Walk the batch row by row, translating the columns of each row in order.
    for row_index in range(len(columns["source"])):
        for name in column_names:
            translated[f"{name}_zh"].append(translate_text(columns[name][row_index]))

    return translated

############################################
# 5. Translate the train/test splits in batches
############################################
# NOTE: the dataset is large, so a single map() over it is extremely expensive
# in time and resources. Test on a small amount of data first!
# For example, translate only the first 100 rows:
# ds2_small = ds2["train"].select(range(100))

# This maps over the full train split directly. Decide carefully based on the
# hardware you actually have!
print("Starting translation of train split... (this might take a LONG time!)")
ds2_train_zh = ds2["train"].map(
    translate_batch,
    batched=True,
    batch_size=2,  # tune up/down depending on GPU memory and speed
    desc="Translating train split"
)

print("Starting translation of test split...")
ds2_test_zh = ds2["test"].map(
    translate_batch,
    batched=True,
    batch_size=2,
    desc="Translating test split"
)

# Merge the translated splits back into a DatasetDict. `ds2.copy()` must not be
# used here: it resolves to dict.copy(), which returns a plain dict without
# save_to_disk(), so the save below would fail after all the translation work.
ds2_zh = DatasetDict({**ds2, "train": ds2_train_zh, "test": ds2_test_zh})

############################################
# 6. Print some translation results & save
############################################

# Preview the first 20 train rows (fewer if the split is smaller).
print("\n===== Sample translations from train split =====")
for i in range(min(20, len(ds2_zh["train"]))):
    # map() keeps the original columns, so one row holds both languages.
    row = ds2_zh["train"][i]
    print(f"---- Sample {i} ----")
    print("Original [source]:", row['source'])
    print("Translated [source_zh]:", row['source_zh'])
    print()
    print("Original [problem]:", row['problem'])
    print("Translated [problem_zh]:", row['problem_zh'])
    print("================================================\n")

# Save to local disk
save_path = "/root/autodl-tmp/code/2025service_creativity/process_dataset/NuminaMath-CoT-zh"
ds2_zh.save_to_disk(save_path)
print(f"Translated dataset saved to: {save_path}")


Loading tokenizer...


KeyboardInterrupt: 

In [None]:
# Save the translated dataset to disk. `ds2_zh` is created by the translation
# cell, so that cell must have run to completion before this one.
# NOTE(review): this path differs from the save path used at the end of the
# translation cell — confirm which location is intended.
save_path = "/root/autodl-tmp/NuminaMath-CoT-zh"
ds2_zh.save_to_disk(save_path)
print(f"Saved translated dataset to {save_path}")
