In [1]:
import sys

# sys.path entries are literal directories; a trailing "/*" is never glob-expanded,
# so the previous ".../src/*" entry matched nothing. The imports in this cell are
# written as `src.<module>`, so the directory that CONTAINS `src` has to be on the path.
REPO_ROOT = "/repos/arxiv-translator"
if REPO_ROOT not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(REPO_ROOT)

from src.file_utils import unfreeze_targz, copy_item, copy_pdf_file
from src.openai_chat import OpenAIChat
from src.tex_compiler import find_main_tex, compile_tex
from src.tex_translator_utils import split_tex_to_chunks, insert_after_documentclass, extract_quoted_text

In [2]:
import yaml
from tqdm import tqdm
import concurrent.futures

In [3]:
# Identifier of the arXiv source bundle to process. The value here looks like a
# placeholder ('#' in place of digits) — set it to a real ID/version before running.
# Every /data/... path used in the following cells is derived from this name.
ARXIV_NAME = "arXiv-####.#####v#"

### 1. tarの解凍

In [4]:
# Step 1: unpack the downloaded source archive into the raw-data directory.
unfreeze_targz(
    targz_path=f"/data/0_tar_gz_data/{ARXIV_NAME}.tar.gz",
    output_dir="/data/1_raw_data/",
)

### 2. 作業

#### 作業場所へのコピー

In [5]:
# Work on a copy so the extracted originals under 1_raw_data stay untouched;
# overwrite=True is passed so a re-run replaces any previous working copy.
copy_item(
    src=f"/data/1_raw_data/{ARXIV_NAME}",
    dst=f"/data/2_working_data/{ARXIV_NAME}",
    overwrite=True,
)

#### 書き換え処理

In [6]:
# Project settings: the TeX snippet to inject and the translation prompt template.
with open("/config/configs.yml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Every backslash in the snippet is doubled before it is handed to
# insert_after_documentclass.
# NOTE(review): presumably that helper uses the text as a regex replacement
# template, where a lone backslash would be read as an escape — confirm.
inserting_pre_text = config["tex"]["pre_text"].replace("\\", "\\\\")
template = config["prompt"]["translate"]["en_to_ja"]

# Credentials are read from a separate file and bound to their own name, so one
# variable is not reused for both the settings and the secrets payload.
with open("/config/api_keys.yml", "r", encoding="utf-8") as secrets_file:
    secrets = yaml.safe_load(secrets_file)

api_key = secrets["OPENAI_API_KEY"]


In [8]:
# Chat client shared by the following cells: its count_tokens method sizes the
# chunks, and calling it on a chunk performs the translation request.
openai_chat = OpenAIChat(
    api_key=api_key,
    model="gpt-4o",
    template=template,
)

#### テキスト分割

In [9]:
# Locate the main .tex file of the working copy.
main_tex_path = find_main_tex(source_dir=f"/data/2_working_data/{ARXIV_NAME}")

# Read it and inject the configured snippet after \documentclass in one step.
with open(main_tex_path, 'r', encoding='utf-8') as tex_file:
    tex_contents = insert_after_documentclass(
        contents=tex_file.read(),
        inserting_pre_text=inserting_pre_text,
    )

# Split into chunks sized with the chat client's own token counter.
# NOTE(review): 5600 is presumably a per-request token budget — confirm against the model limits.
tex_chunks = split_tex_to_chunks(
    contents=tex_contents,
    chunk_size=5600,
    token_counter=openai_chat.count_tokens,
)

#### 翻訳

In [17]:
def process_chunk(chunk):
    """Translate one TeX chunk and return the extracted result.

    Sends ``chunk`` to the module-level ``openai_chat`` client and passes the
    response through ``extract_quoted_text``.

    Args:
        chunk: One element of ``tex_chunks``.

    Returns:
        Whatever ``extract_quoted_text`` returns for the model's response.
    """
    return extract_quoted_text(openai_chat(chunk))

translated_chunks = []
with concurrent.futures.ThreadPoolExecutor() as pool:
    # Executor.map runs the chunks concurrently but yields results in input
    # order, so the translated pieces can be concatenated as-is.
    ordered_results = pool.map(process_chunk, tex_chunks)
    translated_chunks.extend(tqdm(ordered_results, total=len(tex_chunks)))

translated_tex_contents = "".join(translated_chunks)

# Overwrite the working copy's main .tex file with the translated text.
with open(main_tex_path, "w", encoding="utf-8") as tex_file:
    tex_file.write(translated_tex_contents)

#### コンパイル

In [11]:
# Compile the (now translated) main .tex file of the working copy.
compile_tex(
    source_file_path=main_tex_path,
)

### 3. 結果

In [28]:
# Step 3: publish the result — copy the PDF from the working directory to the output area.
copy_pdf_file(
    f"/data/2_working_data/{ARXIV_NAME}",
    f"/data/3_output_data/{ARXIV_NAME}.pdf",
)