In [1]:
import json
import re
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path

import pandas as pd
from datasets import Dataset
from omegaconf import OmegaConf

from src.core.translate import Translator
from src.utils.schemas import (GeneralTranslationResultSchema,
                               RationaleTranslationResultSchema)

In [2]:
# Compiled pattern matching a run of two or more ASCII letters delimited by
# word boundaries; a single isolated Latin letter does not match.
# NOTE: the identifier is misspelled ("enhlsih"); it is kept unchanged because
# contains_english() looks the pattern up under this exact name.
regex_check_enhlsih = re.compile(
	pattern=r'\b[a-zA-Z]{2,}\b',
)

In [3]:
def contains_english(text: str) -> bool:
	"""
	Report whether ``text`` contains at least one match of the module-level
	``regex_check_enhlsih`` pattern.

	Args:
		text (str): String to scan.

	Returns:
		bool:		True when the pattern matches anywhere in ``text``, False otherwise.
	"""
	match = regex_check_enhlsih.search(text)
	return match is not None

def read_file(path: str) -> str:
	"""
	Read a whole text file and return its contents.

	The file is decoded as UTF-8 regardless of the platform default, and the
	underlying handle is closed before the function returns.

	Args:
		path (str): Location of the file to read.

	Returns:
		str:		Full text content of the file.

	Raises:
		OSError:	If the file cannot be opened or read.
	"""
	return Path(path).read_text(encoding="utf-8")
	
def read_json(path: str):
	"""
	Parse a JSON file and return the decoded Python object.

	The file is opened as UTF-8 inside a context manager so the handle is
	closed even when parsing fails.

	Args:
		path (str): Location of the JSON file.

	Returns:
		The object produced by ``json.load`` (dict, list, str, number, ...).

	Raises:
		OSError:				If the file cannot be opened.
		json.JSONDecodeError:	If the content is not valid JSON.
	"""
	with Path(path).open(encoding="utf-8") as json_file:
		return json.load(json_file)

In [4]:
# Load the pipeline configuration once, then bind one name per stage section.
config = OmegaConf.load('configs/conf.yaml')
(
    general_translation_config,
    general_translation_correction_config,
    rationales_translation_config,
    rationales_translation_correction_config,
) = (
    config.general_translation,
    config.general_translation_correction,
    config.rationales_translation,
    config.rationales_translation_correction,
)

In [5]:
# Show the model settings as a sanity check without echoing credentials:
# cell outputs are saved inside the notebook file, so a printed API key is
# stored (and shared) together with it.
_SECRET_MARKERS = ("key", "token", "secret", "password")
model_config_preview = {
    field_name: "***REDACTED***" if any(marker in field_name.lower() for marker in _SECRET_MARKERS) else value
    for field_name, value in read_json(general_translation_config.model_config_path).items()
}
print(model_config_preview)

{'base_url': 'https://bothub.chat/api/v2/openai/v1', 'api_key': '***REDACTED***', 'model': 'qwen-2.5-72b-instruct'}


In [6]:
def _build_translator(section_config) -> Translator:
    """
    Build a Translator from one stage's section of the loaded configuration.

    Args:
        section_config: Config node exposing ``prompt_path``,
            ``model_config_path``, ``filepath_examples``, ``batch_size``,
            ``batch_result_dir`` and ``batches``.

    Returns:
        Translator: Instance constructed with the prompt text, model settings
        and example data read from the files named in ``section_config``.
    """
    return Translator(
        system_message=read_file(section_config.prompt_path),
        model_config=read_json(section_config.model_config_path),
        example_data=read_json(section_config.filepath_examples),
        batch_size=section_config.batch_size,
        batch_result_dir=section_config.batch_result_dir,
        batch_dir=section_config.batches,
    )


# One translator per pipeline stage; each is built the same way and differs
# only in the config section it reads.
general_translator = _build_translator(general_translation_config)
general_translator_corrector = _build_translator(general_translation_correction_config)
rational_translator = _build_translator(rationales_translation_config)
rational_translator_corrector = _build_translator(rationales_translation_correction_config)

# Intermediate-result file names for this run. A single formatted timestamp is
# shared by both names: the default str(datetime.now()) contains spaces and
# colons, which are invalid in Windows file names and awkward in shells.
_run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
general_translation_int_path = f"int_path_general_translator_{_run_timestamp}.json"
rational_translation_int_path = f"int_path_rational_translator_{_run_timestamp}.json"

## Translating

In [21]:
# Accumulator for the merged translation records; the translation loops below
# append new records to it and update existing ones in place.
all_translations = list()

In [22]:
# Load the source records and keep only the first 20 of them.
# The context manager closes the file handle, and the encoding is explicit so
# the result does not depend on the platform default.
with Path("all_data.json").open(encoding="utf-8") as source_file:
	general_data = json.load(source_file)[:20]

In [23]:
# Translate both post fields and merge the accepted translations into
# all_translations, one record per id.
# NOTE(review): after this loop, translation_result and texts_containing_english
# hold the values from the LAST field only ("response_post"); any later cell that
# reads them does not see the rows flagged for "seeker_post".
for name in ["seeker_post", "response_post"]:
	general_dataset = [{"id": item['id'], "text": item[name]} for item in general_data]
	general_input_dataset = Dataset.from_list(general_dataset)
	print(general_input_dataset[0])
	translation_result = general_translator.translate(general_input_dataset, GeneralTranslationResultSchema)
	texts_containing_english = Dataset.from_list(translation_result).filter(lambda text: contains_english(text['text_rus']))
	print("Texts containing English: ", texts_containing_english["text_rus"])

	# Drop the flagged rows by id instead of list.remove(row): remove() relies on
	# the Dataset round-trip returning dicts exactly equal to the originals and
	# raises ValueError otherwise. Slice assignment keeps mutating the same list
	# object, as the original remove() calls did.
	rejected_ids = set(texts_containing_english["id"])
	translation_result[:] = [t for t in translation_result if t["id"] not in rejected_ids]

	# Index the existing records by id (first occurrence wins) so the merge is a
	# dictionary lookup rather than a scan of all_translations per row.
	translations_by_id = {}
	for existing in all_translations:
		translations_by_id.setdefault(existing["id"], existing)

	for t in translation_result:
		record = translations_by_id.get(t["id"])
		if record is None:
			# First time this id is seen: create its record and register it.
			record = {"id": t["id"]}
			all_translations.append(record)
			translations_by_id[t["id"]] = record
		record.update({f"{name}_rus": t["text_rus"], f"{name}_en": t["text"]})

	if texts_containing_english:
		# Context manager closes the file; explicit UTF-8 because the payload is
		# written with ensure_ascii=False and contains Cyrillic text.
		with Path(f"wrong_translations_{name}").open("w", encoding="utf-8") as wrong_file:
			json.dump(texts_containing_english.to_dict(), wrong_file, ensure_ascii=False, indent=4)

{'id': '65m92s_dgbdk7z', 'text': "Help. Help me. I dunno what I'm doing anymore"}


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 73.03ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1637.12ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1891.03ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 2197.12ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 2037.06ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 2136.68ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 2563.76ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1727.47ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 2165.36ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1691.93ba/s]

Processing batch 1/10...



Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 92.47ba/s]


Processing batch 2/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 302.29ba/s]


Processing batch 3/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 16.55ba/s]


Processing batch 4/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 242.25ba/s]


Processing batch 5/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 24.56ba/s]


Processing batch 6/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 56.40ba/s]


Processing batch 7/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 47.50ba/s]


Processing batch 8/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 103.79ba/s]


Processing batch 9/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 223.32ba/s]


Processing batch 10/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 205.89ba/s]
Filter: 100%|██████████| 20/20 [00:00<00:00, 801.13 examples/s]


Texts containing English:  ['Неправильный диагноз - ADHD vs Биполярная депрессия. В течение последних пяти лет я страдал от неверного диагноза. У меня ADHD и депрессия, но мне поставили диагноз биполярное расстройство. Кто-нибудь еще пережил это? Какие были последствия? Как вы с этим справлялись и какие действия предприняли?', 'Чего я, на хуй, Depressed. Я даже не помню, когда и за что, но я просто чувствую себя пустым. Например, я прекрасно провожу время, и вдруг мне просто хочется вернуться домой и лечь спать. Это настолько долбаное глупое. Я просто хочу быть счастливым, но даже не уверен, почему. Может, потому что я немного полный? Или из-за стресса? Или потому что у меня нет девушки? Но это глупая причина для депрессии. Почему я так поступаю с собой? Как, на хуй, поправить это? Зачем я пишу этот мусор, думая, что люди его прочтут?']
{'id': '65m92s_dgbdk7z', 'text': "That's pretty vague, do you not know what you're doing in regards to a specific section of your life? Like school or 

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1197.69ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 2049.00ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1720.39ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1984.06ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 789.29ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 605.76ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1887.63ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1681.08ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1122.07ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1744.72ba/s]


Processing batch 1/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 271.14ba/s]


Processing batch 2/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 725.66ba/s]


Processing batch 3/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 355.24ba/s]


Processing batch 4/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1057.30ba/s]


Processing batch 5/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 191.49ba/s]


Processing batch 6/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 247.33ba/s]


Processing batch 7/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 521.42ba/s]


Processing batch 8/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1066.17ba/s]


Processing batch 9/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 696.38ba/s]


Processing batch 10/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 932.07ba/s]
Filter: 100%|██████████| 20/20 [00:00<00:00, 2198.33 examples/s]

Texts containing English:  []





In [24]:
# Persist the merged post translations as indented JSON with non-ASCII
# characters written verbatim.
with Path("posts_translations_qwen72.json").open("w", encoding="utf-8") as out_file:
	out_file.write(json.dumps(all_translations, ensure_ascii=False, indent=4))

### Correcting translation

In [18]:
# Build the corrector input: drop the "text" column, then rename "text_rus"
# to "text".
# NOTE(review): texts_containing_english is reassigned on every pass of the
# translation loop, so at this point it only holds the rows flagged for the
# last field processed there.
input_dataset_correction = (
	texts_containing_english
	.remove_columns(["text"])
	.rename_column("text_rus", "text")
)

In [19]:
# Send the flagged rows through the general-translation corrector, passing
# GeneralTranslationResultSchema as the second argument.
translation_result_corrected = general_translator_corrector.translate(input_dataset_correction, GeneralTranslationResultSchema)

In [20]:
# Display the corrector output for inspection.
translation_result_corrected

[]

In [21]:
# Display the corrected rows whose "text_rus" still matches the English check.
(
	Dataset
	.from_list(translation_result_corrected)
	.filter(lambda row: contains_english(row['text_rus']))
)

Dataset({
    features: [],
    num_rows: 0
})

In [22]:
# Append every corrected row that no longer matches the English check to
# translation_result.
translation_result.extend(
    corrected
    for corrected in translation_result_corrected
    if not contains_english(corrected['text_rus'])
)

In [23]:
# Save the current translation_result list to the intermediate file for the
# general translator.
with Path(general_translation_int_path).open("w", encoding="utf-8") as int_file:
	json.dump(obj=translation_result, fp=int_file, ensure_ascii=False, indent=4)

## Translating rationales

In [25]:
# Translate the rationales of each dimension, validate them against the
# translated response text, and merge the accepted ones into all_translations.
# NOTE(review): translation_result_rationales, incorrectly_translated and
# translated_rationales are reassigned on every pass, so after the loop they
# describe only the last dimension ("interpretations_rationales").
for name in ["emotional_reactions_rationales", "explorations_rationales", "interpretations_rationales"]:
	# Keep only the source rows that have a non-empty value for this dimension.
	all_data = Dataset.from_list(general_data).filter(lambda x: x[name])

	# Index the translated records by id (first occurrence wins).
	translations_by_id = {}
	for translated in all_translations:
		translations_by_id.setdefault(translated['id'], translated)

	# Pair every source row with ITS OWN translation by id. Pairing the two lists
	# positionally is unsafe: all_translations is not guaranteed to be in the same
	# order as general_data (records are appended when first seen), so a positional
	# zip can attach the Russian text of one post to the English text of another.
	# Rows without a translated response are skipped instead of raising KeyError.
	dataset_rationales = [
		{
			"id": item['id'],
			"text_eng": item["response_post"],
			"text_rus": translations_by_id[item['id']]['response_post_rus'],
			'rationales_eng': item[name],
		}
		for item in all_data
		if 'response_post_rus' in translations_by_id.get(item['id'], {})
	]
	input_dataset_rationales = Dataset.from_list(dataset_rationales)
	print(input_dataset_rationales[0])
	translation_result_rationales = rational_translator.translate(input_dataset_rationales, RationaleTranslationResultSchema)

	# Split the results in one pass. A row is rejected when the number of
	# '|'-separated rationales differs between languages, or when any translated
	# rationale is not a verbatim substring of the translated response.
	# Each row lands in exactly one list, so rejected rows are never duplicated.
	incorrectly_translated = []
	translated_rationales = []
	for item in translation_result_rationales:
		rats = item['rationales_rus'].strip('|').split('|')
		count_mismatch = len(item['rationales_eng'].strip('|').split("|")) != len(rats)
		not_verbatim = any(r not in item['text_rus'] for r in rats)
		if count_mismatch or not_verbatim:
			incorrectly_translated.append(item)
		else:
			translated_rationales.append(item)

	print("Incorrectly translated: ", incorrectly_translated)
	if incorrectly_translated:
		# Context manager closes the file; explicit UTF-8 because the payload is
		# written with ensure_ascii=False and contains Cyrillic text.
		with Path(f"wrong_translations_{name}").open("w", encoding="utf-8") as wrong_file:
			json.dump(incorrectly_translated, wrong_file, ensure_ascii=False, indent=4)

	# Attach the accepted rationales to the matching record; ids that have no
	# record in all_translations are ignored, as before.
	for t in translated_rationales:
		record = translations_by_id.get(t["id"])
		if record is not None:
			record.update({f"{name}_rus": t["rationales_rus"], f"{name}_en": t["rationales_eng"]})

Filter: 100%|██████████| 20/20 [00:00<00:00, 886.38 examples/s]


{'id': '7oi3es_ds9oti2', 'text_eng': "Is that really so bad? Maybe it was the smart decision because you needed that time to read recover. You're being kind to yourself when you need it and that's important. Hope you feel better soon.", 'text_rus': 'Разве это настолько плохо? Возможно, это было разумное решение, потому что вам нужно было это время, чтобы оправиться. Вы проявляете доброту к себе, когда это необходимо, и это важно. Надеюсь, вы скоро почувствуете себя лучше.', 'rationales_eng': 'Hope you feel better soon|'}


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 45.72ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1236.16ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1602.71ba/s]


Processing batch 1/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 931.03ba/s]


Processing batch 2/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 219.44ba/s]


Processing batch 3/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 746.18ba/s]


Incorrectly translated:  [{'id': '7gipvi_dqje246', 'text_rus': 'Я не хочу, чтобы ты прыгал. Я не хочу, чтобы ты умирал, ни так, ни иначе. Я не знаю точно, что ты переживаешь, но судя по всему, я бы посоветовал сосредоточиться на тех вещах в твоей жизни, которые тебе нравятся.', 'text_eng': "I don't want you to jump. I don't want you to die, in either one. I don't know exactly what you're going through but from the sounds of it, I'd suggest focusing on the parts in your life that you enjoy.", 'rationales_eng': "I don't want you to jump. I don't want you to die|", 'rationales_rus': 'Я не хочу, чтобы ты прыгал|Я не хочу, чтобы ты умирал, ни так, ни иначе|'}]


Filter: 100%|██████████| 20/20 [00:00<00:00, 6557.70 examples/s]


{'id': '65m92s_dgbdk7z', 'text_eng': "That's pretty vague, do you not know what you're doing in regards to a specific section of your life? Like school or work?", 'text_rus': 'Это довольно расплывчато, ты не знаешь, что делать в какой-то конкретной области своей жизни? Например, в школе или на работе?', 'rationales_eng': "do you not know what you're doing in regards to a specific section of your life? Like school or work?|"}


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 916.79ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1909.11ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1814.93ba/s]


Processing batch 1/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 290.79ba/s]


Processing batch 2/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 874.18ba/s]


Processing batch 3/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 385.36ba/s]


Incorrectly translated:  [{'id': '65m92s_dgbdk7z', 'text_rus': 'Это довольно расплывчато, ты не знаешь, что делать в какой-то конкретной области своей жизни? Например, в школе или на работе?', 'text_eng': "That's pretty vague, do you not know what you're doing in regards to a specific section of your life? Like school or work?", 'rationales_eng': "do you not know what you're doing in regards to a specific section of your life? Like school or work?|", 'rationales_rus': 'ты не знаешь, что делать в какой-то конкретной области своей жизни|Например, в школе или на работе|'}]


Filter: 100%|██████████| 20/20 [00:00<00:00, 5278.84 examples/s]


{'id': '6b2cmc_dhj8tcb', 'text_eng': "I think it's social anxiety , that creates paranoid feelings , unless I'm wrong but that's how I feel", 'text_rus': 'Я думаю, это социальная тревога, что создает параноидальные чувства, если я не ошибаюсь, но так я чувствую.', 'rationales_eng': "unless I'm wrong but that's how I feel|"}


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1339.18ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1733.18ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1751.28ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1639.68ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1715.46ba/s]


Processing batch 1/5...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 350.52ba/s]


Processing batch 2/5...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 454.32ba/s]


Processing batch 3/5...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 43.57ba/s]


Processing batch 4/5...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 309.61ba/s]


Processing batch 5/5...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 614.55ba/s]

Incorrectly translated:  [{'id': '8iz0as_dyvq1ne', 'text_rus': 'Не думаешь ли ты, что ты в петле. Фастфуд может вызывать депрессию. И быть в депрессии, вероятно, заставляет тебя больше есть фастфуд? У меня тоже было такое проблема какое-то время. Даже если это случайно, я, скорее всего, переем и потом буду чувствовать себя плохо. И снова буду переедать, чтобы попытаться поднять себе настроение.', 'text_eng': "By any chance do you think you're in a loop. Junk food can make you depressed. And being depressed probably makes you eat more junk food? For a while that was my problem too. Even if its just random, ill probably binge and feel bad later. Only to indulge in more later to try to cheer up.", 'rationales_eng': 'For a while that was my problem too. Even if its just random, ill probably binge and feel bad later. Only to indulge in more later to try to cheer up.|', 'rationales_rus': 'У меня тоже было такое проблема какое-то время|Даже если это случайно, я, скорее всего, переем и потом б




In [26]:
# Copy the three level scores from the source rows onto the matching translated
# records; for every dimension whose level is 0, set both rationale fields to
# an empty string.
dimensions = ("emotional_reactions", "explorations", "interpretations")

# First record seen for each id, mirroring a first-match scan of the list.
first_record_by_id = {}
for record in all_translations:
    first_record_by_id.setdefault(record["id"], record)

for source_row in general_data:
    record = first_record_by_id.get(source_row["id"])
    if record is None:
        continue
    # Levels are written first so their keys are inserted before any newly
    # created rationale keys.
    record.update({f"{dimension}_level": source_row[f"{dimension}_level"] for dimension in dimensions})
    for dimension in dimensions:
        if source_row[f"{dimension}_level"] == 0:
            record.update({f"{dimension}_rationales_rus": "", f"{dimension}_rationales_en": ""})

In [27]:
# Write the fully merged records (posts, rationales, levels) to disk as
# indented JSON with non-ASCII characters kept verbatim.
serialized_translations = json.dumps(all_translations, ensure_ascii=False, indent=4)
with open("all_translations_qwen72.json", mode="w", encoding="utf-8") as out_file:
	out_file.write(serialized_translations)

### Correcting rationales

In [29]:
# Select the rationale rows whose translated rationales still match the
# English check.
rationales_containing_english = (
	Dataset
	.from_list(translation_result_rationales)
	.filter(lambda row: contains_english(row['rationales_rus']))
)

Filter: 100%|██████████| 30/30 [00:00<00:00, 11455.67 examples/s]


In [30]:
# Display the flagged rationale rows for inspection.
rationales_containing_english

Dataset({
    features: ['id', 'text_rus', 'text_eng', 'rationales_eng', 'rationales_rus'],
    num_rows: 0
})

In [31]:
# Take every flagged row out of the working result list: locate the first
# element equal to the row and delete it (ValueError if it is not present).
for flagged_row in rationales_containing_english:
    position = translation_result_rationales.index(flagged_row)
    del translation_result_rationales[position]

In [32]:
# Column-oriented input for the rationale corrector: "id", "text_rus" and
# "text_eng" are copied as-is, while the flagged "rationales_rus" values are
# stored under the "rationales_eng" key.
keys = ['id', 'text_rus', 'text_eng', 'rationales_rus']
dataset_correction_rationales = {}
for column in keys[:-1]:
    dataset_correction_rationales[column] = [row[column] for row in rationales_containing_english]
dataset_correction_rationales['rationales_eng'] = [row['rationales_rus'] for row in rationales_containing_english]

In [33]:
# Wrap the column dictionary in a Dataset; this rebinds input_dataset_correction,
# which an earlier cell used for the general-translation corrector input.
input_dataset_correction = Dataset.from_dict(dataset_correction_rationales)

In [34]:
# Send the flagged rationale rows through the rationale corrector, passing
# RationaleTranslationResultSchema as the second argument. This rebinds
# translation_result_corrected from the general-translation section.
translation_result_corrected = rational_translator_corrector.translate(input_dataset_correction, RationaleTranslationResultSchema)

In [35]:
# Display the rationale corrector output for inspection.
translation_result_corrected

[]

In [36]:
# Display the corrected rows whose "text_rus" still matches the English check.
(
	Dataset
	.from_list(translation_result_corrected)
	.filter(lambda row: contains_english(row['text_rus']))
)

Dataset({
    features: [],
    num_rows: 0
})

In [37]:
# Re-attach each corrected row to the flagged row it came from.
# The join is done by id rather than by position: a positional zip silently
# combines fields from different posts whenever the corrector returns fewer
# rows than it was given or returns them in another order.
flagged_by_id = {}
for rationale in rationales_containing_english:
    flagged_by_id.setdefault(rationale['id'], rationale)

for item in translation_result_corrected:
    rationale = flagged_by_id.get(item['id'])
    if rationale is None:
        # Corrected row with an id that was never flagged: nothing to pair it with.
        continue
    # NOTE(review): the corrected text is read from item['text_rus'] and stored as
    # 'rationales_rus'. Confirm the corrector really returns the corrected
    # rationales under 'text_rus' and not under 'rationales_rus'.
    if not contains_english(item['text_rus']):
        translation_result_rationales.append({
            'id': item['id'],
            'rationales_eng': rationale['rationales_eng'],
            'rationales_rus': item['text_rus'],
            'text_rus': rationale['text_rus'],
            'text_eng': rationale['text_eng'],
        })

In [38]:
# Round-trip the rationale results through a Dataset and save the resulting
# list of rows to the intermediate file for the rationale translator.
d = Dataset.from_list(translation_result_rationales)
with Path(rational_translation_int_path).open("w", encoding="utf-8") as int_file:
	json.dump(obj=d.to_list(), fp=int_file, ensure_ascii=False, indent=4)

In [39]:
# Re-validate the rationale rows: a row is rejected when any of its
# '|'-separated translated rationales is not a verbatim substring of the
# translated response text.
# Each row is placed in exactly one of the two lists, so a row with several
# unmatched rationales is no longer recorded several times.
incorrectly_translated = []
translated_rationales = []
for item in translation_result_rationales:
    rats = item['rationales_rus'].split('|')
    if any(r not in item['text_rus'] for r in rats):
        incorrectly_translated.append(item)
    else:
        translated_rationales.append(item)

In [None]:
# Save the rationale results again (same target file as the earlier save cell),
# after round-tripping them through a Dataset.
d = Dataset.from_list(translation_result_rationales)
serialized_rationales = json.dumps(d.to_list(), ensure_ascii=False, indent=4)
with open(rational_translation_int_path, mode="w", encoding="utf-8") as int_file:
	int_file.write(serialized_rationales)