In [1]:
import json
import re
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path

import pandas as pd
from datasets import Dataset
from omegaconf import OmegaConf

from src.core.translate import Translator
from src.utils.schemas import (GeneralTranslationResultSchema,
                               RationaleTranslationResultSchema)
from collections import defaultdict
from copy import copy

In [2]:
regex_check_enhlsih = re.compile(r'\b[a-zA-Z]{2,}\b')

In [3]:
def contains_english(text: str) -> bool:
	"""
	Checks if the given text contains English words.

	Args:
		text (str): Text to check.

	Returns:
		bool:		True if English words are found, False otherwise.
	"""
	return bool(re.search(regex_check_enhlsih, text))

def read_file(path: str):
	return Path(path).open().read()
	
def read_json(path: str):
	return json.load(Path(path).open())

In [4]:
config = OmegaConf.load('configs/conf.yaml')
general_translation_config = config.general_translation
general_translation_correction_config = config.general_translation_correction
rationales_translation_config = config.rationales_translation
rationales_translation_correction_config = config.rationales_translation_correction

In [5]:
print(read_json(general_translation_config.model_config_path))

{'model_uri': 'gpt://b1gjag92sd76ed87o4h8/yandexgpt-lite', 'api_key': 'AQVNxEuHAQ777bIGDAXgumFxuRBtnB0A__JeWWJJ', 'url': 'https://llm.api.cloud.yandex.net/foundationModels/v1/completion'}


In [6]:
general_translator = Translator(
    system_message=read_file(general_translation_config.prompt_path), 
    model_config=read_json(general_translation_config.model_config_path), 
    example_data=read_json(general_translation_config.filepath_examples), 
    batch_size=general_translation_config.batch_size,
    batch_result_dir=general_translation_config.batch_result_dir,
    batch_dir=general_translation_config.batches,
    model_type="yandex_gpt"
)

general_translator_corrector = Translator(
    system_message=read_file(general_translation_correction_config.prompt_path), 
    model_config=read_json(general_translation_correction_config.model_config_path), 
    example_data=read_json(general_translation_correction_config.filepath_examples), 
    batch_size=general_translation_correction_config.batch_size,
    batch_result_dir=general_translation_correction_config.batch_result_dir,
    batch_dir=general_translation_correction_config.batches,
    model_type="yandex_gpt"
)

rational_translator = Translator(
    system_message=read_file(rationales_translation_config.prompt_path), 
    model_config=read_json(rationales_translation_config.model_config_path), 
    example_data=read_json(rationales_translation_config.filepath_examples), 
    batch_size=rationales_translation_config.batch_size,
    batch_result_dir=rationales_translation_config.batch_result_dir,
    batch_dir=rationales_translation_config.batches,
    model_type="yandex_gpt"
)

rational_translator_corrector = Translator(
    system_message=read_file(rationales_translation_correction_config.prompt_path), 
    model_config=read_json(rationales_translation_correction_config.model_config_path), 
    example_data=read_json(rationales_translation_correction_config.filepath_examples), 
    batch_size=rationales_translation_correction_config.batch_size,
    batch_result_dir=rationales_translation_correction_config.batch_result_dir,
    batch_dir=rationales_translation_correction_config.batches,
    model_type="yandex_gpt"
)

general_translation_int_path = f"int_path_general_translator_{datetime.now()}.json"
rational_translation_int_path = f"int_path_rational_translator_{datetime.now()}.json"

## Translating

In [7]:
all_translations = []

In [8]:
general_data = json.load(Path("all_data.json").open())[:5]

In [9]:
def index(a_list, value):
    try:
        return a_list.index(value)
    except ValueError:
        return -1

In [10]:
incorrect_translations = defaultdict()

In [11]:
for name in ["seeker_post", "response_post"]:
	general_dataset = [{"id": item['id'], "text": item[name]} for item in general_data]
	general_input_dataset = Dataset.from_list(general_dataset)

	print(general_input_dataset[0])

	translation_result = general_translator.translate(general_input_dataset, GeneralTranslationResultSchema)
	incorrect_translations[name] = list(filter(lambda x: contains_english(x['text_rus']), translation_result))
	texts_containing_english = Dataset.from_list(incorrect_translations[name])
	
	print("Texts containing English: ", texts_containing_english)

	all_translations_ids = [item["id"] for item in all_translations]
	for t in texts_containing_english:
		present_idx = index(all_translations_ids, t['id'])
		translation_result.remove(t)
		if present_idx != -1:
			del all_translations[present_idx]

	all_translations_ids = [item["id"] for item in all_translations]
	for t in translation_result:
		present_idx = index(all_translations_ids, t['id'])
		if present_idx != -1:
			all_translations[present_idx].update({f"{name}_rus": t["text_rus"], f"{name}_en": t["text"]})
		else:
			all_translations.append({"id": t["id"], f"{name}_rus": t["text_rus"], f"{name}_en": t["text"]})
	if texts_containing_english:
		json.dump(texts_containing_english.to_dict(), Path(f"wrong_translations_{name}_yandex").open("w"), ensure_ascii=False, indent=4)

{'id': '65m92s_dgbdk7z', 'text': "Help. Help me. I dunno what I'm doing anymore"}


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 372.83ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1064.81ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1067.25ba/s]

Processing batch 1/3...



Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 557.16ba/s]


Processing batch 2/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 551.88ba/s]


Processing batch 3/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 325.01ba/s]


Texts containing English:  Dataset({
    features: [],
    num_rows: 0
})
{'id': '65m92s_dgbdk7z', 'text': "That's pretty vague, do you not know what you're doing in regards to a specific section of your life? Like school or work?"}


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1133.90ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1276.80ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1805.55ba/s]


Processing batch 1/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 724.53ba/s]


Processing batch 2/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 269.89ba/s]


Processing batch 3/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 704.81ba/s]

Texts containing English:  Dataset({
    features: [],
    num_rows: 0
})





In [12]:
with open("posts_translations_correct_yandex.json", "w", encoding="utf-8") as f:
	json.dump(all_translations, f, ensure_ascii=False, indent=4)

### Correcting translation

In [13]:
incorrect_translations

defaultdict(None, {'seeker_post': [], 'response_post': []})

In [None]:
for name, items in incorrect_translations.items():
    keys = ['id', 'text_rus', 'text']
    dataset_correction = {k: [d[k] for d in items] for k in keys}
    dataset_correction['text'] = dataset_correction.pop('text_rus')
    if dataset_correction['id']:
        all_translations_ids = [item["id"] for item in all_translations]
        input_dataset_correction = Dataset.from_dict(dataset_correction)
        translation_result_corrected = general_translator_corrector.translate(input_dataset_correction, GeneralTranslationResultSchema)
        incorrect_translations[name] = list(filter(lambda x: contains_english(x['text_rus']), translation_result_corrected))
        for item in translation_result_corrected:
            if not contains_english(item['text_rus']):
                present_idx = index(all_translations_ids, item['id'])
                if present_idx != -1:
                    all_translations[present_idx].update({f"{name}_rus": t["text_rus"], f"{name}_en": t["text"]})

In [15]:
incorrect_translations

defaultdict(None, {'seeker_post': [], 'response_post': []})

In [16]:
with open("all_posts_translations", "w", encoding="utf-8") as f:
	json.dump(all_translations, f, ensure_ascii=False, indent=4)

## Translating rationales

In [17]:
incorrectly_translated_rationales = defaultdict()

In [None]:
for name in ["emotional_reactions_rationales", "explorations_rationales", "interpretations_rationales"]:
	all_data = Dataset.from_list(general_data).filter(lambda x: x[name]).sort("id")
	translation_result_filtered = list(filter(lambda x: x['id'] in all_data['id'], all_translations))
	translation_result_filtered = sorted(translation_result_filtered, key=lambda x: x['id'])
	dataset_rationales = [{"id": item['id'], "text_eng": item["response_post"], "text_rus": item1['response_post_rus'], 'rationales_eng': item[name]} for item, item1 in zip(all_data, translation_result_filtered)]
	input_dataset_rationales = Dataset.from_list(dataset_rationales)

	print(input_dataset_rationales[0])

	translation_result_rationales = rational_translator.translate(input_dataset_rationales, RationaleTranslationResultSchema)
	all_translations_ids = [item["id"] for item in all_translations]
	
	incorrectly_translated = [] 
	for item in translation_result_rationales:
		rats = item['rationales_rus'].strip('|').split('|')
		if len(item['rationales_eng'].strip('|').split("|")) != len(rats):
			incorrectly_translated.append(item)
		for r in rats:
			if item['text_rus'].find(r) == -1:
				incorrectly_translated.append(item)
	
	incorrectly_translated_rationales[name] = incorrectly_translated

	translated_rationales = list(filter(lambda x : x not in incorrectly_translated, translation_result_rationales))
	print("Incorrectly translated: ", incorrectly_translated)
	if incorrectly_translated:
		json.dump(incorrectly_translated, Path(f"wrong_translations_{name}_yandex_32batch").open("w"), ensure_ascii=False, indent=4)

	for t in translated_rationales:
		present_idx = index(all_translations_ids, t['id'])
		if present_idx != -1:
			all_translations[present_idx].update({f"{name}_rus": t["rationales_rus"], f"{name}_en": t["rationales_eng"]})

In [37]:
for a in general_data:
	all_translations_ids = [item["id"] for item in all_translations]
	present_idx = index(all_translations_ids, a['id'])
	if present_idx != -1:
		all_translations[present_idx].update({"emotional_reactions_level": a["emotional_reactions_level"], "explorations_level": a["explorations_level"], "interpretations_level": a["interpretations_level"]})
		if a["emotional_reactions_level"] == 0:
			all_translations[present_idx].update({"emotional_reactions_rationales_rus": "", "emotional_reactions_rationales_en": ""})
		if a["explorations_level"] == 0:
			all_translations[present_idx].update({"explorations_rationales_rus": "", "explorations_rationales_en": ""})
		if a["interpretations_level"] == 0:
			all_translations[present_idx].update({"interpretations_rationales_rus": "", "interpretations_rationales_en": ""})

In [38]:
with open("all_translations_yandex_correct.json", "w", encoding="utf-8") as f:
	json.dump(all_translations, f, ensure_ascii=False, indent=4)

### Correcting rationales

In [39]:
incorrectly_translated_rationales

defaultdict(None,
            {'emotional_reactions_rationales': [{'id': '50sgzd_d76kxsp',
               'text_rus': 'Это совершенно нормально, большинство людей время от времени испытывают такое. Это пройдёт, когда — вот в чём вопрос. Хорошая новость в том, что ты можешь поработать над этим «когда», по крайней мере, у меня это работает. Я думаю, ключ в том, чтобы принимать всё, что ты чувствуешь, не бороться с этим и не пытаться изменить, даже пытаться заставить себя плакать. Суть в том, чтобы действительно изменить своё мышление с «у меня депрессия, я не могу её вылечить, она убивает меня» на «я снова грущу без причины, человеческий мозг — куча дерьма и чертовски глючный, я буду плакать весь день, чтобы дать ему то, что он хочет»... ',
               'text_eng': "I'm sorry to hear that you're feeling this way. You're not alone.",
               'rationales_eng': "I'm sorry to hear that you're feeling this way|You're not alone.",
               'rationales_rus': 'Мне жаль слышать, чт

In [40]:
for name, items in incorrectly_translated_rationales.items():
	keys = ['id', 'text_rus', 'text_eng', 'rationales_rus']
	dataset_correction_rationales = {k: [d[k] for d in items] for k in keys}
	dataset_correction_rationales['rationales_eng'] = dataset_correction_rationales.pop('rationales_rus')
	if dataset_correction_rationales['id']:
		all_translations_ids = [item["id"] for item in all_translations]
		input_dataset_correction = Dataset.from_dict(dataset_correction_rationales)
		translation_result_corrected = rational_translator_corrector.translate(input_dataset_correction, RationaleTranslationResultSchema)
		for item in translation_result_corrected:
			base_item = list(filter(lambda x: x['id'] == item['id'], items))[0]
			correct = True
			rats = item['rationales_rus'].strip('|').split('|')
			if len(base_item['rationales_eng'].strip('|').split("|")) != len(rats):
				correct = False
			for r in rats:
				if item['text_rus'].find(r) == -1:
					correct = False
			if correct:
				present_idx = index(all_translations_ids, item['id'])
				incorrectly_translated_rationales[name].remove(base_item)
				if present_idx != -1:
					all_translations[present_idx].update({f"{name}_rus": t["rationales_rus"], f"{name}_en": base_item["rationales_eng"]})
			else:
				items[index(items, base_item)] = item

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 278.38ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 733.14ba/s]




Processing batch 1/2...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 422.30ba/s]


Processing batch 2/2...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 825.00ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1337.04ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1880.01ba/s]


Processing batch 1/2...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 693.39ba/s]


Processing batch 2/2...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 96.52ba/s]


In [41]:
incorrectly_translated_rationales

defaultdict(None,
            {'emotional_reactions_rationales': [{'id': '50sgzd_d76kxsp',
               'text_rus': 'Это совершенно нормально, большинство людей время от времени испытывают такое. Это пройдёт, когда — вот в чём вопрос. Хорошая новость в том, что ты можешь поработать над этим «когда», по крайней мере, у меня это работает. Я думаю, ключ в том, чтобы принимать всё, что ты чувствуешь, не бороться с этим и не пытаться изменить, даже пытаться заставить себя плакать. Суть в том, чтобы действительно изменить своё мышление с «у меня депрессия, я не могу её вылечить, она убивает меня» на «я снова грущу без причины, человеческий мозг — куча дерьма и чертовски глючный, я буду плакать весь день, чтобы дать ему то, что он хочет»... ',
               'text_eng': "I'm sorry to hear that you're feeling this way. You're not alone.",
               'rationales_eng': 'Мне жаль слышать, что ты так себя чувствуешь|Ты не один.',
               'rationales_rus': 'Мне жаль слышать, что ты так

In [42]:
with open("all_translations_yandex_corrected.json", "w", encoding="utf-8") as f:
	json.dump(all_translations, f, ensure_ascii=False, indent=4)