In [165]:
import json
import re
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path

import pandas as pd
from datasets import Dataset
from omegaconf import OmegaConf

from src.core.translate import Translator
from src.utils.schemas import (GeneralTranslationResultSchema,
                               RationaleTranslationResultSchema)

In [166]:
# Compiled once at module level: matches a run of two or more ASCII letters
# that is delimited by word boundaries on both sides.
regex_check_enhlsih = re.compile(
	r"\b[a-zA-Z]{2,}\b"
)

In [167]:
def contains_english(text: str) -> bool:
	"""
	Tell whether ``text`` holds at least one match of the module-level
	``regex_check_enhlsih`` pattern.

	Args:
		text (str): String to scan.

	Returns:
		bool:		True when the pattern matches somewhere in ``text``, False otherwise.
	"""
	match = regex_check_enhlsih.search(text)
	return match is not None

def read_file(path: str, encoding: str = "utf-8") -> str:
	"""
	Read a whole text file and return its contents.

	The file is opened and closed inside ``Path.read_text`` so no handle is
	leaked, and the encoding is explicit instead of depending on the locale.

	Args:
		path (str):		Path of the file to read.
		encoding (str):	Text encoding used to decode the file (default UTF-8).

	Returns:
		str:			Full contents of the file.
	"""
	return Path(path).read_text(encoding=encoding)
	
def read_json(path: str):
	"""
	Parse a JSON file and return the decoded Python object.

	The file is opened in a ``with`` block so the handle is closed even when
	parsing fails, and it is decoded as UTF-8 rather than the locale encoding.

	Args:
		path (str): Path of the JSON file to read.

	Returns:
		The object produced by ``json.load`` (dict, list, str, number, ...).
	"""
	with Path(path).open(encoding="utf-8") as json_file:
		return json.load(json_file)

In [168]:
# Load the pipeline configuration from the YAML file and alias the four
# sections that are used to build the translators further down.
config = OmegaConf.load('configs/conf.yaml')
# First-pass translation of the posts, and its correction pass.
general_translation_config = config.general_translation
general_translation_correction_config = config.general_translation_correction
# First-pass translation of the rationales, and its correction pass.
rationales_translation_config = config.rationales_translation
rationales_translation_correction_config = config.rationales_translation_correction

In [169]:
def _build_translator(section) -> Translator:
    """
    Build a ``Translator`` from one section of the loaded configuration.

    Args:
        section: Config section exposing ``prompt_path``, ``model_config_path``,
            ``filepath_examples``, ``batch_size``, ``batch_result_dir`` and
            ``batches``.

    Returns:
        Translator: Instance configured from the files and values the section points to.
    """
    return Translator(
        system_message=read_file(section.prompt_path),
        model_config=read_json(section.model_config_path),
        example_data=read_json(section.filepath_examples),
        batch_size=section.batch_size,
        batch_result_dir=section.batch_result_dir,
        batch_dir=section.batches,
    )


# One translator per pipeline stage; all four are built the same way and
# differ only in the config section they read.
general_translator = _build_translator(general_translation_config)
general_translator_corrector = _build_translator(general_translation_correction_config)
rational_translator = _build_translator(rationales_translation_config)
rational_translator_corrector = _build_translator(rationales_translation_correction_config)

# Use a single timestamp for both intermediate files so they can be matched
# to the same run. It is formatted explicitly because str(datetime.now())
# contains spaces and colons, and colons are not valid in Windows file names.
_run_timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
general_translation_int_path = f"int_path_general_translator_{_run_timestamp}.json"
rational_translation_int_path = f"int_path_rational_translator_{_run_timestamp}.json"

## Translating

In [170]:
# Accumulator for the merged translations: one dict per item id, filled in
# by the translation cells below.
all_translations = list()

In [171]:
# Load the source records. The file is opened in a `with` block so the handle
# is closed after parsing, and decoded as UTF-8 instead of the locale encoding.
with Path("all_data.json").open(encoding="utf-8") as data_file:
	general_data = json.load(data_file)

In [None]:
# Translate both post fields, set aside translations that still contain
# English, and merge the remaining ones into `all_translations` by item id.
for name in ["seeker_post", "response_post"]:
	general_dataset = [{"id": item['id'], "text": item[name]} for item in general_data]
	general_input_dataset = Dataset.from_list(general_dataset)
	print(general_input_dataset[0])
	translation_result = general_translator.translate(general_input_dataset, GeneralTranslationResultSchema)

	# Kept as a Dataset because the correction cell below calls Dataset
	# methods on `texts_containing_english`.
	texts_containing_english = Dataset.from_list(translation_result).filter(lambda text: contains_english(text['text_rus']))
	print("Texts containing English: ", [row["text_rus"] for row in texts_containing_english])

	# Filter with the same predicate instead of list.remove(): removal relied
	# on rows that went through a Dataset round trip still comparing equal to
	# the original dicts, and raised ValueError otherwise.
	translation_result = [t for t in translation_result if not contains_english(t["text_rus"])]

	# Index existing entries by id (first occurrence wins, as a linear scan
	# would) so each merge is a dict lookup rather than a nested loop.
	translations_by_id = {}
	for item in all_translations:
		translations_by_id.setdefault(item["id"], item)

	for t in translation_result:
		# Fixed: the "new entry" branch used to write the key f"{name}t_en".
		fields = {f"{name}_rus": t["text_rus"], f"{name}_en": t["text"]}
		existing = translations_by_id.get(t["id"])
		if existing is None:
			new_entry = {"id": t["id"], **fields}
			all_translations.append(new_entry)
			translations_by_id[t["id"]] = new_entry
		else:
			existing.update(fields)

	if len(texts_containing_english) > 0:
		# A Dataset is not JSON serializable; dump its rows as a list of dicts.
		with open(f"wrong_translations_{name}", "w", encoding="utf-8") as wrong_file:
			json.dump(texts_containing_english.to_list(), wrong_file, ensure_ascii=False, indent=4)

In [173]:
# Persist the merged post translations as indented, non-ASCII-escaped JSON.
with Path("posts_translations.json").open("w", encoding="utf-8") as out_file:
	out_file.write(json.dumps(all_translations, ensure_ascii=False, indent=4))

### Correcting translation

In [18]:
# Feed the flagged Russian output back in as the new source text: drop the
# original "text" column, then rename "text_rus" to "text".
input_dataset_correction = (
    texts_containing_english
    .remove_columns(["text"])
    .rename_column("text_rus", "text")
)

In [19]:
# Run the correction translator over the flagged texts.
translation_result_corrected = general_translator_corrector.translate(
    input_dataset_correction,
    GeneralTranslationResultSchema,
)

In [20]:
# Display the corrector output for inspection.
translation_result_corrected

[]

In [21]:
# Show the corrected rows whose Russian text still matches the English pattern.
(
    Dataset.from_list(translation_result_corrected)
    .filter(lambda row: contains_english(row['text_rus']))
)

Dataset({
    features: [],
    num_rows: 0
})

In [22]:
# Keep only the corrections that no longer contain English and add them to
# the current translation result.
# NOTE(review): this appends in place, so re-running the cell adds the same rows again.
translation_result.extend(
    corrected
    for corrected in translation_result_corrected
    if not contains_english(corrected['text_rus'])
)

In [23]:
# Save the current post translation result to the intermediate file.
with Path(general_translation_int_path).open("w", encoding="utf-8") as out_file:
	out_file.write(json.dumps(translation_result, ensure_ascii=False, indent=4))

## Translating rationales

In [159]:
# For each rationale type: pair every source row with the Russian translation
# of its response post, translate the rationales, validate them, and merge the
# valid ones into `all_translations`.
for name in ["emotional_reactions_rationales", "explorations_rationales", "interpretations_rationales"]:
	# Keep only the source rows that have a non-empty rationale of this type.
	all_data = Dataset.from_list(general_data).filter(lambda x: x[name])

	# Join by id, not by position: a positional zip pairs a row with the
	# wrong translation as soon as one translation is missing.
	translations_by_id = {}
	for item in all_translations:
		translations_by_id.setdefault(item["id"], item)

	dataset_rationales = []
	for item in all_data:
		translated = translations_by_id.get(item["id"])
		if translated is None or "response_post_rus" not in translated:
			# No usable translation of the response post for this id.
			continue
		dataset_rationales.append({
			"id": item['id'],
			"text_eng": item["response_post"],
			"text_rus": translated['response_post_rus'],
			'rationales_eng': item[name],
		})
	if not dataset_rationales:
		print(f"No translated response posts available for {name}; skipping.")
		continue

	input_dataset_rationales = Dataset.from_list(dataset_rationales)
	print(input_dataset_rationales[0])
	translation_result_rationales = rational_translator.translate(input_dataset_rationales, RationaleTranslationResultSchema)

	# A result is rejected when the number of '|'-separated rationales differs
	# from the English side, or when a translated rationale is not a substring
	# of the translated post. Each rejected item is recorded once.
	incorrectly_translated = []
	for item in translation_result_rationales:
		rats = item['rationales_rus'].strip('|').split('|')
		expected_count = len(item['rationales_eng'].strip('|').split("|"))
		count_mismatch = expected_count != len(rats)
		missing_span = any(r not in item['text_rus'] for r in rats)
		if count_mismatch or missing_span:
			incorrectly_translated.append(item)

	translated_rationales = [x for x in translation_result_rationales if x not in incorrectly_translated]
	print("Incorrectly translated: ", incorrectly_translated)
	if incorrectly_translated:
		with open(f"wrong_translations_{name}", "w", encoding="utf-8") as wrong_file:
			json.dump(incorrectly_translated, wrong_file, ensure_ascii=False, indent=4)

	# Attach the valid rationales to the existing entry with the same id;
	# results without a matching entry are ignored.
	for t in translated_rationales:
		target = translations_by_id.get(t["id"])
		if target is not None:
			target.update({f"{name}_rus": t["rationales_rus"], f"{name}_en": t["rationales_eng"]})

Filter: 100%|██████████| 15/15 [00:00<00:00, 723.52 examples/s]


{'id': '7oi3es_ds9oti2', 'text_eng': "Is that really so bad? Maybe it was the smart decision because you needed that time to read recover. You're being kind to yourself when you need it and that's important. Hope you feel better soon.", 'text_rus': 'Действительно ли это так плохо? Возможно, это было умное решение, потому что вам нужно было это время, чтобы восстановиться. Вы добры к себе, когда это необходимо, и это важно. Надеюсь, вам скоро станет лучше.', 'rationales_eng': 'Hope you feel better soon|'}


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 67.65ba/s]


Processing batch 1/1...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 297.98ba/s]


Incorrectly translated:  []


Filter: 100%|██████████| 15/15 [00:00<00:00, 4795.68 examples/s]


{'id': '65m92s_dgbdk7z', 'text_eng': "That's pretty vague, do you not know what you're doing in regards to a specific section of your life? Like school or work?", 'text_rus': 'Это довольно расплывчато, ты не знаешь, что делаешь в отношении конкретной части своей жизни? Как школа или работа?', 'rationales_eng': "do you not know what you're doing in regards to a specific section of your life? Like school or work?|"}


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1237.62ba/s]


Processing batch 1/1...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 619.91ba/s]


Incorrectly translated:  []


Filter: 100%|██████████| 15/15 [00:00<00:00, 4052.99 examples/s]


{'id': '6b2cmc_dhj8tcb', 'text_eng': "I think it's social anxiety , that creates paranoid feelings , unless I'm wrong but that's how I feel", 'text_rus': 'Я думаю, это социальная тревожность, которая создаёт параноидальные чувства, если я не ошибаюсь, но вот так я себя чувствую', 'rationales_eng': "unless I'm wrong but that's how I feel|"}


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1211.88ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1315.65ba/s]


Processing batch 1/2...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 131.63ba/s]


Processing batch 2/2...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 203.54ba/s]

Incorrectly translated:  []





In [160]:
# Copy the three level scores from the source data onto the matching
# translation entry, and blank out the rationale fields of every dimension
# whose level is 0.
first_translation_by_id = {}
for entry in all_translations:
    # First occurrence wins, matching a front-to-back scan.
    first_translation_by_id.setdefault(entry["id"], entry)

for a in general_data:
    item = first_translation_by_id.get(a["id"])
    if item is None:
        continue
    levels = {
        "emotional_reactions_level": a["emotional_reactions_level"],
        "explorations_level": a["explorations_level"],
        "interpretations_level": a["interpretations_level"],
    }
    item.update(levels)
    for dimension in ("emotional_reactions", "explorations", "interpretations"):
        if levels[f"{dimension}_level"] == 0:
            item.update({f"{dimension}_rationales_rus": "", f"{dimension}_rationales_en": ""})

In [161]:
# Persist the fully merged translations as indented, non-ASCII-escaped JSON.
with Path("all_translations.json").open("w", encoding="utf-8") as out_file:
	out_file.write(json.dumps(all_translations, ensure_ascii=False, indent=4))

### Correcting rationales

In [29]:
# Select the rationale translations whose Russian rationale still matches the
# English pattern.
rationales_containing_english = (
    Dataset.from_list(translation_result_rationales)
    .filter(lambda row: contains_english(row['rationales_rus']))
)

Filter: 100%|██████████| 30/30 [00:00<00:00, 11455.67 examples/s]


In [30]:
# Display the flagged rationale translations for inspection.
rationales_containing_english

Dataset({
    features: ['id', 'text_rus', 'text_eng', 'rationales_eng', 'rationales_rus'],
    num_rows: 0
})

In [31]:
# Drop every rationale translation that still contains English.
# The list is filtered in place with the predicate that selects the flagged
# rows, instead of calling list.remove() per flagged row: remove() depends on
# rows that went through a Dataset round trip comparing equal to the original
# dicts, and raises ValueError when this cell is run a second time.
translation_result_rationales[:] = [
    t for t in translation_result_rationales
    if not contains_english(t['rationales_rus'])
]

In [32]:
# Build the column-oriented input for the rationale corrector. The flagged
# Russian rationale is supplied under the 'rationales_eng' key, i.e. it becomes
# the text to be translated again.
keys = ['id', 'text_rus', 'text_eng', 'rationales_rus']
dataset_correction_rationales = {}
for k in keys:
    column_name = 'rationales_eng' if k == 'rationales_rus' else k
    dataset_correction_rationales[column_name] = [d[k] for d in rationales_containing_english]

In [33]:
# Turn the column dict into a Dataset for the rationale corrector.
# NOTE(review): this rebinds `input_dataset_correction`, which the post
# correction step also assigns; a distinct name would avoid stale state.
input_dataset_correction = Dataset.from_dict(dataset_correction_rationales)

In [34]:
# Run the rationale correction translator over the flagged rationales.
translation_result_corrected = rational_translator_corrector.translate(
    input_dataset_correction,
    RationaleTranslationResultSchema,
)

In [35]:
# Display the rationale corrector output for inspection.
translation_result_corrected

[]

In [36]:
# Show the corrected rows whose rationale still contains English.
# The check targets 'rationales_rus' (the field the corrector is meant to
# fix); 'text_rus' is the whole translated post, not the rationale.
# NOTE(review): assumes the corrector output has a 'rationales_rus' field like
# the first-pass rationale translator output; confirm against the schema.
(
    Dataset.from_list(translation_result_corrected)
    .filter(lambda row: contains_english(row['rationales_rus']))
)

Dataset({
    features: [],
    num_rows: 0
})

In [37]:
# Merge accepted corrections back into the rationale results.
# Corrected rows are matched to the flagged originals by id rather than by
# position, so a dropped or reordered corrector result cannot be paired with
# the wrong original (presumably ids are unique within the flagged set; verify).
flagged_by_id = {row['id']: row for row in rationales_containing_english}
for item in translation_result_corrected:
    rationale = flagged_by_id.get(item['id'])
    if rationale is None:
        continue
    # Validate and store the corrected rationale itself; 'text_rus' is the
    # whole translated post and must not be written into 'rationales_rus'.
    # NOTE(review): assumes the corrector output exposes 'rationales_rus'; confirm.
    corrected_rationale = item['rationales_rus']
    if not contains_english(corrected_rationale):
        translation_result_rationales.append({
            'id': item['id'],
            'rationales_eng': rationale['rationales_eng'],
            'rationales_rus': corrected_rationale,
            'text_rus': rationale['text_rus'],
            'text_eng': rationale['text_eng'],
        })

In [38]:
# Round-trip the rationale results through a Dataset and save the rows to the
# intermediate file as indented, non-ASCII-escaped JSON.
d = Dataset.from_list(translation_result_rationales)
with Path(rational_translation_int_path).open("w", encoding="utf-8") as out_file:
	out_file.write(json.dumps(d.to_list(), ensure_ascii=False, indent=4))

In [39]:
# Re-check the rationale results: a result is rejected when any of its
# '|'-separated Russian rationales is not a substring of the translated post.
incorrectly_translated = []
for item in translation_result_rationales:
    rats = item['rationales_rus'].split('|')
    # any() records a rejected item once, however many rationales are missing.
    if any(r not in item['text_rus'] for r in rats):
        incorrectly_translated.append(item)

translated_rationales = [x for x in translation_result_rationales if x not in incorrectly_translated]

In [None]:
# Write the rationale results to the intermediate file again (same target
# path as the earlier dump), as indented, non-ASCII-escaped JSON.
d = Dataset.from_list(translation_result_rationales)
with Path(rational_translation_int_path).open("w", encoding="utf-8") as out_file:
	out_file.write(json.dumps(d.to_list(), ensure_ascii=False, indent=4))