## 准备，拉取最新文件

In [14]:
# prep for notebook
import sys
from pathlib import Path
from pprint import pprint

# Locate the project root, make it importable, and make it the working directory.
#
# On the first run the kernel starts in a sub-directory of the project (the
# original comment calls it the "test" directory), so the root is its parent.
# This cell then changes the working directory to the root, which means a
# re-run starts *from* the root. The `module` package imported by the next cell
# is presumably a directory at the project root and is used here as the marker
# that tells the two situations apart.
import os

notebook_dir = Path().resolve()
if (notebook_dir / "module").is_dir():
    project_root = notebook_dir          # already at the project root (cell re-run)
else:
    project_root = notebook_dir.parent   # first run: one level below the root

# Avoid stacking duplicate entries when the cell is executed more than once.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Derive the working directory from the notebook location rather than from a
# machine-specific absolute path, so the relative ./cache and ./data paths used
# by later cells resolve on any checkout.
os.chdir(project_root)

In [15]:
import src.config as config
import module.utils as utils
from module.replacer import DescTemplateReplacer
from module.paratranz_filler import ParatranzFiller

In [16]:
import re

def text_proc(s: str) -> str:
    """Trim blanks from both ends of ``s`` and normalise non-breaking spaces.

    A "blank" is an ordinary space or a non-breaking space (U+00A0). Leading
    and trailing runs made of any mix of the two are removed in a single pass;
    stripping one character kind at a time leaves residue when the kinds are
    interleaved (space, NBSP, space, text would keep its last leading space).
    Non-breaking spaces remaining inside the text are converted to ordinary
    spaces.

    Args:
        s: Text to clean.

    Returns:
        The trimmed text with every U+00A0 replaced by a plain space.
    """
    edge_blanks = '[ \xa0]+'
    s = re.sub('^' + edge_blanks, '', s)
    # `$` is kept (instead of str.strip) so blanks sitting right before a final
    # newline are still removed, as the regex-based original did.
    s = re.sub(edge_blanks + '$', '', s)
    return s.replace('\xa0', ' ')

In [30]:
import re

def mask_numbers(text: str) -> str:
    """Replace every number, and the literal ``N/A``, with ``<<NUM>>``.

    Matched forms:
    - integers with an optional sign: 123, -456, +789
    - decimals with an optional sign: 3.14, -2.5, +0.99
    - the literal string N/A

    A dot counts as part of a number only when digits follow it, so sentence
    punctuation survives: "Size 3." becomes "Size <<NUM>>." rather than
    swallowing the full stop.

    Args:
        text: Input text.

    Returns:
        The text with every match replaced by the placeholder.
    """
    # [+-]?        optional sign
    # \d+          integer part
    # (?:\.\d+)?   optional fractional part; the dot requires trailing digits
    # |N/A         or the literal N/A
    pattern = r'[+-]?\d+(?:\.\d+)?|N/A'

    return re.sub(pattern, '<<NUM>>', text)


In [None]:
# Fetch the latest files through the project helper. What is downloaded and
# where it is stored is defined in module.utils and is not visible in this
# notebook; presumably it populates ./cache, which later cells read — TODO confirm.
utils.download_files()

### 其他处理

In [11]:
# Convert a `key=value` .map file from ./cache into a JSON document in ./data.
import json

filename = 'manufacturers'
desc = '制造商'

data = dict()
with open(f"./cache/{filename}.map", 'r', encoding=config.ENCODE) as file:
    for line in file:
        # Split on the first '=' only, so values may themselves contain '='.
        t_en, sep, t_cn = line.partition('=')
        # Blank or malformed lines carry no '='; storing them would add a bogus
        # key (the raw line, newline included) mapped to an empty string.
        if not sep:
            continue
        data[t_en] = t_cn.removesuffix('\n')
with open(f"./data/{filename}.json", 'w', encoding=config.ENCODE) as file:
    json.dump({
        'name': filename,
        'desc': desc,
        'data': data
    }, file, indent=4, ensure_ascii=False)

## 获取键

In [36]:
# TODO refactor
# Collect (cn, en) line pairs from every item description whose text uses the
# key/value separators in both languages.
reader = utils.TextReader(
    base_path='./cache'
)
ids = reader.find_ids_by_pattern(r'^item_desc.*$', True)
raw_data = dict()   # id -> list of (cn_line, en_line) pairs
raw_list = list()   # every pair in encounter order, duplicates kept
raw_set  = set()    # distinct pairs
for item_id in ids:
    texts = reader.get(item_id)
    if texts is None or texts['cn'] is None or texts['en'] is None:
        continue
    if '：' not in texts['cn'] or ': ' not in texts['en']:
        continue
    # Lines are separated by the two-character sequence backslash + 'n',
    # not by a real newline; split each text once instead of once per line.
    cn_lines = texts['cn'].split('\\n')
    en_lines = texts['en'].split('\\n')
    # Pair lines by position. zip stops at the shorter side, so an English
    # text with fewer lines than the Chinese one no longer raises IndexError.
    lines = [
        (text_proc(cn_line), text_proc(en_line))
        for cn_line, en_line in zip(cn_lines, en_lines)
    ]
    if lines:
        raw_data[item_id] = lines
        raw_list += lines
        raw_set.update(lines)


## 按键获取值

In [None]:
# Load the whitelist of keys.
with open('./data/keys.json', 'r', encoding=config.ENCODE) as file:
    key_data = json.load(file)['data']
    keys = set(key_data.keys())

# Tally, for each whitelisted English key, how often every number-masked
# English value is paired with every number-masked Chinese value:
#   data[key_en][val_en][val_cn] -> occurrence count
data = dict()
for text_cn, text_en in raw_list:
    # Only lines holding exactly one separator in both languages are usable.
    if not (text_en.count(': ') == 1 and text_cn.count('：') == 1):
        continue

    raw_key_en, _, raw_val_en = text_en.partition(': ')
    raw_key_cn, _, raw_val_cn = text_cn.partition('：')

    key_en = text_proc(raw_key_en)
    key_cn = text_proc(raw_key_cn)  # computed as before; not used below
    if key_en not in keys:
        continue

    val_en = mask_numbers(text_proc(raw_val_en))
    val_cn = mask_numbers(text_proc(raw_val_cn))

    counts = data.setdefault(key_en, dict()).setdefault(val_en, dict())
    counts[val_cn] = counts.get(val_cn, 0) + 1

with open('./cache/values.json', 'w', encoding=config.ENCODE) as file:
    json.dump(data, file, ensure_ascii=False, indent=4)
    

In [41]:
# Once values.json has been edited, collapse it into general_values: every
# English value keeps only the first Chinese candidate listed under it.
with open('./cache/values.json', 'r', encoding=config.ENCODE) as file:
    raw_value_data = json.load(file)

value_data = {
    key: {
        value_key: list(candidates.keys())[0]
        for value_key, candidates in values.items()
    }
    for key, values in raw_value_data.items()
}

with open('./data/general_values.json', 'w', encoding=config.ENCODE) as file:
    json.dump(value_data, file, ensure_ascii=False, indent=4)

## 生成测试文本

In [44]:
# Write a random sample of English item descriptions to a JSON test file as a
# list of {'key', 'original'} records.
import random

sample_num = 300

reader = utils.TextReader(base_path='cache')
ids = reader.find_ids_by_pattern(r'^item_desc.*$', True)

# Keep only ids that actually have English text. reader.get() can return None
# and 'en' can be None (the key-collection cell guards against both), which
# would otherwise raise TypeError or emit a null "original".
candidates = []
for item_id in ids:
    texts = reader.get(item_id)
    if texts is None or texts['en'] is None:
        continue
    candidates.append({
        'key': item_id,
        'original': texts['en']
    })

# random.sample raises ValueError when asked for more items than exist, so cap
# the request at the number of usable entries.
p_data = random.sample(candidates, min(sample_num, len(candidates)))

with open('cache/paratranz_test.json', 'w', encoding=config.ENCODE) as file:
    json.dump(p_data, file)
    