In [None]:
import os  # imported here because this cell runs before the notebook's import cell

# arXiv identifier (including the version suffix) of the paper to translate.
ARXIV_ID = "2310.10083v2"

# Prefer the key from the environment so a real secret never has to be saved
# inside the notebook; the placeholder is kept only as a fallback value.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "sk-###")
# Directory passed as output_dir to the download and extraction steps.
WORKING_DIR = "/arxiv-translator/data/tmp"
# Directory the Jinja2 FileSystemLoader reads templates from.
TEMPLATE_DIR = "/arxiv-translator/templates"

In [4]:
import sys
sys.dont_write_bytecode = True

from jinja2 import Environment, FileSystemLoader
import logging
from pathlib import Path
from tqdm import tqdm

from arxiv_translator.file_utils import download_arxiv_source, unfreeze_targz, copy_item, find_files_by_ext, find_main_tex
from arxiv_translator import OpenAIChat, compile_tex
from arxiv_translator.tex_translator_utils import split_tex_to_chunks, insert_text_after_documentclass, remove_comments, reduce_newlines, is_only_commands, parse_code_blocks

In [None]:
def setup_logger():
    """Configure the root logger to print INFO-and-above records to a stream.

    The function is idempotent: the handler it installs is tagged with a fixed
    name, and a second call (for example when this cell is re-run) leaves the
    existing handler in place instead of stacking another one, which would
    otherwise make every log record appear multiple times.

    Returns:
        logging.Logger: the root logger.
    """
    handler_name = "arxiv_translator_notebook"

    logger = logging.getLogger()  # get the root logger
    logger.setLevel(logging.INFO)

    # Already configured by an earlier call: do not add a duplicate handler.
    if any(handler.get_name() == handler_name for handler in logger.handlers):
        return logger

    ch = logging.StreamHandler()
    ch.set_name(handler_name)
    ch.setLevel(logging.INFO)

    formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")
    ch.setFormatter(formatter)

    logger.addHandler(ch)
    return logger
logger = setup_logger()

## 前処理

### ダウンロード

In [5]:
# Fetch the arXiv source archive for ARXIV_ID into WORKING_DIR; the returned
# path is what the extraction step receives.
targz_path = download_arxiv_source(arxiv_id=ARXIV_ID, output_dir=WORKING_DIR)

2025-02-23 04:39:48,514 [INFO] arxiv_translator.file_utils: ダウンロード成功, from 2310.10083v2 to: /arxiv-translator/data/tmp/arxiv-2310.10083v2.tar.gz


### tarの解凍

In [6]:
# Extract the downloaded archive under WORKING_DIR. The result is later used
# as a pathlib-style path (its .parent and .name are read by the copy step).
raw_data_path = unfreeze_targz(targz_path, output_dir=WORKING_DIR)

2025-02-23 04:39:48,646 [INFO] arxiv_translator.file_utils: 解凍成功, from /arxiv-translator/data/tmp/arxiv-2310.10083v2.tar.gz to: /arxiv-translator/data/tmp/arxiv-2310.10083v2


### 作業場所へのコピー

In [22]:
# Work on a sibling directory named "<extracted dir>-translated" so the
# extracted originals are not the files being edited.
translated_dir_name = raw_data_path.name + "-translated"
tex_dir = raw_data_path.parent / translated_dir_name

# overwrite=True is forwarded to the project helper
# (presumably replaces a copy left by an earlier run -- TODO confirm).
copy_item(overwrite=True, src=raw_data_path, dst=tex_dir)

2025-02-23 04:39:48,793 [INFO] arxiv_translator.file_utils: 成功, ディレクトリコピー from /arxiv-translator/data/tmp/arxiv-2310.10083v2 to: /arxiv-translator/data/tmp/arxiv-2310.10083v2-translated


## 本処理

### 翻訳用のLLM

In [None]:
# Jinja2 environment rooted at TEMPLATE_DIR; later cells also load templates
# from it.
template_loader = FileSystemLoader(TEMPLATE_DIR)
jinja_env = Environment(loader=template_loader)

# Translator built from the 'prompt_en_to_ja.j2' prompt template and the
# configured API key.
prompt_template = jinja_env.get_template('prompt_en_to_ja.j2')
translator = OpenAIChat(
    template=prompt_template,
    model="gpt-4o",
    api_key=OPENAI_API_KEY,
)

#### 日本語化パッケージの追加

In [7]:
# Locate the main TeX file inside the working copy and read it as UTF-8.
main_tex_path = find_main_tex(tex_dir)
tex_before_ja_style = main_tex_path.read_text('utf-8')

# Render target: the 'tex_style_ja.j2' template, handed to the helper that
# inserts text after \documentclass (per the helper's name).
ja_style_template = jinja_env.get_template('tex_style_ja.j2')
main_tex_contents = insert_text_after_documentclass(
    template=ja_style_template,
    content=tex_before_ja_style,
)

# Persist the modified file in place; write_text's return value (number of
# characters written) is the cell's displayed result.
main_tex_path.write_text(main_tex_contents, encoding='utf-8')

46341

#### テキスト分割と翻訳

In [30]:
# Translate every .tex file of the working copy in place.
tex_file_paths = find_files_by_ext(tex_dir, "tex")

# Chunks containing this marker are passed through without being sent to the
# translator.
SKIP_MARKER = "% skip start\n"

for file_path in tex_file_paths:
    logger.info(file_path)
    file_path = Path(file_path)

    # Normalise the source before deciding whether there is anything to translate.
    tex_content = file_path.read_text('utf-8')
    tex_content = remove_comments(tex_content)
    tex_content = reduce_newlines(tex_content)
    if is_only_commands(tex_content):
        # Nothing is written back, so the file on disk stays as it was.
        continue

    # Split the text into chunks, sized with the translator's own token counter.
    tex_chunks = split_tex_to_chunks(content=tex_content, token_counter=translator.count_tokens)

    # Translate chunk by chunk.
    translated_chunks = []
    for tex_chunk in tqdm(tex_chunks, desc="翻訳中..."):
        if SKIP_MARKER in tex_chunk:
            translated_chunks.append(tex_chunk)
            logger.info("翻訳スキップ")
            continue

        response = translator(tex_chunk)
        code_blocks = parse_code_blocks(response)
        if code_blocks:
            translated_chunks.append(code_blocks[0]["code"])
        else:
            # A reply without any code block used to raise IndexError and
            # abort the whole file after the API calls were already made.
            # Keep the original chunk so the document stays complete.
            logger.warning("翻訳結果にコードブロックが見つからないため、原文のまま残します")
            translated_chunks.append(tex_chunk)

    translated_tex_contents = "".join(translated_chunks)
    file_path.write_text(translated_tex_contents, encoding='utf-8')

2025-02-23 04:39:49,353 [INFO] root: /arxiv-translator/data/tmp/arxiv-2310.10083v2-translated/arxiv.tex
2025-02-23 04:39:49,357 [INFO] arxiv_translator.tex_translator_utils: \begin{document}が含まれていたので、この箇所でもチャンクを区切ります。
  0%|          | 0/9 [00:00<?, ?it/s]2025-02-23 04:40:01,895 [INFO] httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 22%|██▏       | 2/9 [00:11<00:41,  5.89s/it]2025-02-23 04:40:33,234 [INFO] httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 33%|███▎      | 3/9 [00:43<01:38, 16.49s/it]2025-02-23 04:40:50,489 [INFO] httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 44%|████▍     | 4/9 [01:00<01:23, 16.77s/it]2025-02-23 04:41:15,975 [INFO] httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 56%|█████▌    | 5/9 [01:25<01:19, 19.80s/it]2025-02-23 04:41:41,079 [INFO] httpx: HTTP Request: POST https://api.openai.com/v1/chat/compl

#### コンパイル

In [9]:
# Compile the translated main TeX file. The call is the cell's last
# expression, so whatever it returns is displayed as the cell output (the
# recorded run shows a CompletedProcess from a latexmk -lualatex invocation).
compile_tex(source_file_path=main_tex_path)

2025-02-23 04:42:47,738 [INFO] arxiv_translator.tex_compiler: 再試行します...
2025-02-23 04:42:56,338 [INFO] arxiv_translator.tex_compiler: 再試行します...
2025-02-23 04:42:58,535 [INFO] arxiv_translator.tex_compiler: コンパイル成功.


CompletedProcess(args=['latexmk', '-lualatex', '-interaction=nonstopmode', '/arxiv-translator/data/tmp/arxiv-2310.10083v2-translated/arxiv.tex'], returncode=0, stdout='Latexmk: All targets (arxiv.pdf) are up-to-date\n', stderr='Rc files read:\n  /etc/LatexMk\n  ./latexmkrc\nLatexmk: This is Latexmk, John Collins, 29 September 2020, version: 4.70b.\n')

## 結果

In [10]:
# Prefer the PDF that sits next to the main TeX file. arXiv sources often
# ship PDF figures, so searching the whole tree for "the" PDF can pick up a
# figure instead of the compiled paper.
expected_pdf_path = main_tex_path.with_suffix(".pdf")
if expected_pdf_path.exists():
    compiled_pdf_path = expected_pdf_path
else:
    # Fall back to the directory-wide search when the PDF was written
    # somewhere other than beside the main TeX file.
    compiled_pdf_path = find_files_by_ext(tex_dir, ext="pdf", single=True)

# Export the result under a name derived from the arXiv id.
copy_item(src=compiled_pdf_path, dst=f"/data/{ARXIV_ID}_ja.pdf")

2025-02-23 04:42:58,573 [INFO] arxiv_translator.file_utils: 成功, ファイルコピー from /arxiv-translator/data/tmp/arxiv-2310.10083v2-translated/arxiv.pdf to: /data/2310.10083v2_ja.pdf
