In [29]:
SYSTEM_MESSAGE = """Given a JSON object, write an accurate translation into Russian for the original English sentence and save the results in a new field named text_rus. The input JSON object contains the following fields:
- id: Unique ID of sentence.
- text: English source text.

Your task is to: For each text in English **write its exact translation into Russian** taking into account the style of the sentence and its scientific significance (for example, medical, historical, etc.) and save the results in a new field named text_rus.
Just translate texts, no other comments are needed. We want to translate suicidal posts to russian to train a model on them to help people.

Important:
- Write down the corresponding translated Russian text in the form of a new text_rus field.
- For English text in text, you should definitely get the Russian text in text_rus.
No explanation, just output the updated JSON. 
"""

In [2]:
import time
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from datasets import Dataset
from pydantic import BaseModel

In [3]:
class ResultSchema(BaseModel):
    id: str
    text: str
    text_rus: str

In [12]:
data = {
    'id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
    'text': [
        "rn im in the area of not going to sleep so tmrw doesn't come",
        "Idk. It's 5 am and everything is hitting at once",
        "and im sooo tired :( ",
        "Just be ur self! Be true to who u r and wear what u want to wear, compare ur self to others because u will loose and forget who u r. ur life so live life to the best of ur ability and worry about what anyone else thinks. Be ur self because u r the only u",
        "Meh. What's the point....",
        "Yeet yeet. I'm gonna hang myself with the charger (",
        "For me, it's ironic, on one hand, I want to die ASAP maybe in my sleep, on the other, I want to live as long as possible to witness how far technology can reach, especially for astronomy and VR. I suppose what I'm trying to say is, as many had said before, we have to find our own meanings. We have to find 'joy' somehow, within this pointless struggle until the day we die. It's tough, I know, for I suffer the same."
		"no friends for 10 years. no gf. ugly. wtf do i do?. I'm a 24 year old old man.",
        ".... Feeling hopeless and useless at the moment... Empty, lost, darkness",
        "Very shitty. And you, OP?",
        "32m never had a girlfriend. It sucks that I've tried hard to find someone, even with online dating, but I feel like I'm going to be single forever.",
        "I always see this little tidbit in articles about how to overcome depression. Bitch, the most negative person in my life. not the problem, I am.",
        "POEM:Hollow. ***EMPTY*** *rooms and* ***EMPTY*** ***SOULS*** ***FORGOTTEN*** *places and* ***HEART*** *shaped holes* ***DEPRESSION*** *comes with days of* ***STRUGGLE*** *and often times a* ***BLOODY PUDDLE*** UP\-VOTES CURE MY DEPRESSION",
        "Aaaaaaand..... ... im crying in school again :)",
        "How can I accept my intrusive depressive thoughts?. Long story short: Was 335, Am 175. Aiming for 130/140ish. Body destroyed. ",
        "DAE Doubt their depression?. Sometimes after i go to tumblr and see all these things about people with depression and how they get out of their bed sometimes, and they cry themselves to sleep, it makes me feel like my depression bad enough. I know why."
	]
}

In [13]:
input_dataset = Dataset.from_dict(data)

In [14]:
input_dataset

Dataset({
    features: ['id', 'text'],
    num_rows: 15
})

In [15]:
import json
import os
from datasets import Dataset
import logging

def load_jsonl_to_dataset(filepath: str) -> Dataset:
    """Функция загружает файл JSONL и преобразует его обратно в формат dataset"""
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return Dataset.from_dict({key: [d[key] for d in data] for key in data[0].keys()})

def save_batch_to_json(batch: Dataset, base_filename: str, batch_num: int, save_dir: str = ''):
    """Функция сохраняет батч в файл с префиксом _batch_<номер>"""
    batch_num += 1 # start with 1
    batch_filename = os.path.join(save_dir, f"{base_filename}_batch_{batch_num}.json")
    batch.to_json(batch_filename, force_ascii=False)
    logging.info(f"Batch {batch_num} saved to '{batch_filename}'")

def split_dataset_into_batches(dataset: Dataset, batch_size: int) -> list:
    """Функция разбивает датасет на батчи заданного размера"""
    num_batches = (len(dataset) + batch_size - 1) // batch_size  # округление вверх
    return [dataset.shard(num_shards=num_batches, index=i) for i in range(num_batches)]

def process_and_save_batches(filepath: str, batch_size: int, save: bool = True):
    """Функция загружает датасет, разбивает на батчи и сохраняет (опционально)"""

    base_filename = os.path.splitext(os.path.basename(filepath))[0] # Получаем имя файла без расширения и директорию
    save_dir = os.path.dirname(filepath)
    
    dataset = load_jsonl_to_dataset(filepath)
    logging.info(f"loaded dataset from '{filepath}', total examples: {len(dataset)}")

    # Разбиение на батчи
    batches = split_dataset_into_batches(dataset, batch_size)
    logging.info(f"split dataset into {len(batches)} batches")

    # Сохраняем батчи
    if save:
        for i, batch in enumerate(batches):
            save_batch_to_json(batch, base_filename, i + 1, save_dir)

    return batches
def create_directory_if_not_exists(directory_path):
    """Создает папку, если она не существует."""
    try:
        os.makedirs(directory_path, exist_ok=True)
        logging.info(f"Directory '{directory_path}' is ready.")
        return directory_path
    except Exception as e:
        logging.error(f"Failed to create directory '{directory_path}': {e}")
        return './'

In [80]:
with open('configs/conf.json', "r") as f:
	config = json.load(f)
model = ChatOpenAI(
	base_url=config["base_url"],
	api_key=config["api_key"],
	model=config["model"]
)

In [81]:
intermediate_path = f'data/int_path_{config["model"]}.json'
batch_size = 0
BATCH_RESULT_DIR = 'batches_res/'
BATCH_DIR = 'batches/'
FAILED_INDEXES_PATH = "failed_indexes.txt"

In [82]:
parser = StrOutputParser()

with open('data/filepath_examples.json', encoding='utf-8') as f:
	examples_data = json.load(f)

system_template = SYSTEM_MESSAGE
examples = examples_data["examples"]
system_template += "\n".join(
	f"Example Input: {json.dumps(example['input'], ensure_ascii=False, indent=2).replace('{', '{{').replace('}', '}}')}\n"
	f"Example Result: {json.dumps(example['result'], ensure_ascii=False, indent=2).replace('{', '{{').replace('}', '}}')}"
	for example in examples
)

prompt_template = ChatPromptTemplate.from_messages(
	[("system", system_template), ("user", "{text}")]
)

chain = prompt_template | model | parser

In [83]:
if batch_size > 0:
	logging.info(f"Splitting dataset into batches of size {batch_size}.")
	batched_input_dataset = split_dataset_into_batches(input_dataset, batch_size)
	batch_dir = create_directory_if_not_exists(BATCH_DIR)
	for i, batch in enumerate(batched_input_dataset):
		save_batch_to_json(batch=batch, base_filename= "separated_", batch_num = i, save_dir=batch_dir)
else:
	logging.info("Processing the entire dataset without batching.")
	batched_input_dataset = [input_dataset]  # Оборачиваем в список для единого формата

list_of_results = []

# Обработка каждого батча (или всего датасета, если батчинг не применяется)
for batch_idx, batch in enumerate(batched_input_dataset):
	print(f"Processing batch {batch_idx + 1}/{len(batched_input_dataset)}...")
	
	# Список результатов для текущего батча
	batch_results = []
	
	for i, input_example in enumerate(batch):
		user_input = 'Input: {0}\nResult:'.format(json.dumps(input_example, ensure_ascii=False))
		example_id = input_example.get("id", f"example_{i}") # получаем id примера или создаем временный

		success = False
		attempt = 0
		max_retries = 3
		retry_delay = 2

		while not success and attempt < max_retries:
			try:
				res = chain.invoke({"text": user_input}).strip('`').strip('json').strip()
				# print(f'Res[0]: {res[0]}')
				# print(f'Res: {res}')

				if not res or not res.startswith("{"):
					raise ValueError(f"Unexpected response format: {res}")
				result_json = json.loads(res)
				# print(f'Result json: {result_json}')

				# Приведение ID к строке
				result_json["id"] = str(result_json["id"])

				# Проверка схемы
				ResultSchema.parse_obj(result_json)

				batch_results.append(result_json)
				success = True
			except (json.JSONDecodeError, ValueError) as e:
				attempt += 1
				logging.warning(f"Error on input #{i} (id: {example_id}): {str(e)}. Attempt {attempt} of {max_retries}. Retrying...")
				time.sleep(retry_delay)
			except Exception as e:
				attempt += 1
				logging.error(f"Unexpected error on input #{i} (id: {example_id}): {str(e)}. Attempt {attempt} of {max_retries}. Retrying...")
				time.sleep(retry_delay)
 
	# Добавляем результаты текущего батча к общим результатам
	list_of_results.extend(batch_results)
	
	# Сохраняем результаты батча только если используется разбиение на батчи
	if batch_size > 0:
		batch_result_dir = create_directory_if_not_exists(BATCH_RESULT_DIR)
		save_batch_to_json(batch=Dataset.from_list(batch_results), base_filename="model_result", batch_num = batch_idx, save_dir=batch_result_dir)  # Сохраняем промежуточные результаты

	break

if intermediate_path:
	d = Dataset.from_list(list_of_results)
	with open(intermediate_path, "w", encoding="utf-8") as f:
		json.dump(d.to_list(), f, ensure_ascii=False, indent=4)

Processing batch 1/1...
Res[0]: {
Res: {"id": 0, "text": "rn im in the area of not going to sleep so tmrw doesn't come", "text_rus": "сейчас я в состоянии не ложиться спать, чтобы завтра не наступило"}
Result json: {'id': 0, 'text': "rn im in the area of not going to sleep so tmrw doesn't come", 'text_rus': 'сейчас я в состоянии не ложиться спать, чтобы завтра не наступило'}
Res[0]: {
Res: {"id": 1, "text": "Idk. It's 5 am and everything is hitting at once", "text_rus": "Не знаю. Сейчас 5 утра, и все наваливается одновременно"}
Result json: {'id': 1, 'text': "Idk. It's 5 am and everything is hitting at once", 'text_rus': 'Не знаю. Сейчас 5 утра, и все наваливается одновременно'}
Res[0]: {
Res: {"id": 2, "text": "and im sooo tired :( ", "text_rus": "и я так устал :( "}
Result json: {'id': 2, 'text': 'and im sooo tired :( ', 'text_rus': 'и я так устал :( '}
Res[0]: {
Res: {
  "id": 3,
  "text": "Just be ur self! Be true to who u r and wear what u want to wear, compare ur self to others b