In [19]:
import json
import re
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path

import pandas as pd
from datasets import Dataset
from omegaconf import OmegaConf

from src.core.translate import Translator
from src.utils.schemas import (GeneralTranslationResultSchema,
                               RationaleTranslationResultSchema)
from collections import defaultdict
from copy import copy

In [2]:
regex_check_enhlsih = re.compile(r'\b[a-zA-Z]{2,}\b')

In [3]:
def contains_english(text: str) -> bool:
	"""
	Checks if the given text contains English words.

	Args:
		text (str): Text to check.

	Returns:
		bool:		True if English words are found, False otherwise.
	"""
	return bool(re.search(regex_check_enhlsih, text))

def read_file(path: str):
	return Path(path).open().read()
	
def read_json(path: str):
	return json.load(Path(path).open())

In [51]:
config = OmegaConf.load('configs/conf.yaml')
general_translation_config = config.general_translation
general_translation_correction_config = config.general_translation_correction
rationales_translation_config = config.rationales_translation
rationales_translation_correction_config = config.rationales_translation_correction

In [52]:
print(read_json(general_translation_config.model_config_path))

{'base_url': 'https://bothub.chat/api/v2/openai/v1', 'api_key': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6IjU2N2RkOTMzLWYyMmEtNDdiMC1iNjJjLWFmODY4ZmNjNmQ4MCIsImlzRGV2ZWxvcGVyIjp0cnVlLCJpYXQiOjE3NDUwNDQwMDAsImV4cCI6MjA2MDYyMDAwMH0.g_9O0ivm62kR2nKfoLRcpcT4M1NxgHRQyc-4nwiIC7U', 'model': 'qwen-2.5-72b-instruct'}


In [53]:
general_translator = Translator(
    system_message=read_file(general_translation_config.prompt_path), 
    model_config=read_json(general_translation_config.model_config_path), 
    example_data=read_json(general_translation_config.filepath_examples), 
    batch_size=general_translation_config.batch_size,
    batch_result_dir=general_translation_config.batch_result_dir,
    batch_dir=general_translation_config.batches,
    model_type="openai"
)

general_translator_corrector = Translator(
    system_message=read_file(general_translation_correction_config.prompt_path), 
    model_config=read_json(general_translation_correction_config.model_config_path), 
    example_data=read_json(general_translation_correction_config.filepath_examples), 
    batch_size=general_translation_correction_config.batch_size,
    batch_result_dir=general_translation_correction_config.batch_result_dir,
    batch_dir=general_translation_correction_config.batches,
    model_type="openai"
)

rational_translator = Translator(
    system_message=read_file(rationales_translation_config.prompt_path), 
    model_config=read_json(rationales_translation_config.model_config_path), 
    example_data=read_json(rationales_translation_config.filepath_examples), 
    batch_size=rationales_translation_config.batch_size,
    batch_result_dir=rationales_translation_config.batch_result_dir,
    batch_dir=rationales_translation_config.batches,
    model_type="openai"
)

rational_translator_corrector = Translator(
    system_message=read_file(rationales_translation_correction_config.prompt_path), 
    model_config=read_json(rationales_translation_correction_config.model_config_path), 
    example_data=read_json(rationales_translation_correction_config.filepath_examples), 
    batch_size=rationales_translation_correction_config.batch_size,
    batch_result_dir=rationales_translation_correction_config.batch_result_dir,
    batch_dir=rationales_translation_correction_config.batches,
    model_type="openai"
)

general_translation_int_path = f"int_path_general_translator_{datetime.now()}.json"
rational_translation_int_path = f"int_path_rational_translator_{datetime.now()}.json"

## Translating

In [64]:
all_translations = []

In [65]:
general_data = json.load(Path("all_data.json").open())[:20]

In [66]:
def index(a_list, value):
    try:
        return a_list.index(value)
    except ValueError:
        return -1

In [67]:
incorrect_translations = defaultdict()

In [69]:
for name in ["seeker_post", "response_post"]:
	general_dataset = [{"id": item['id'], "text": item[name]} for item in general_data]
	general_input_dataset = Dataset.from_list(general_dataset)

	print(general_input_dataset[0])

	translation_result = general_translator.translate(general_input_dataset, GeneralTranslationResultSchema)
	incorrect_translations[name] = list(filter(lambda x: contains_english(x['text_rus']), translation_result))
	texts_containing_english = Dataset.from_list(incorrect_translations[name])
	
	print("Texts containing English: ", texts_containing_english)

	all_translations_ids = [item["id"] for item in all_translations]
	for t in texts_containing_english:
		present_idx = index(all_translations_ids, t['id'])
		translation_result.remove(t)
		if present_idx != -1:
			del all_translations[present_idx]

	all_translations_ids = [item["id"] for item in all_translations]
	for t in translation_result:
		present_idx = index(all_translations_ids, t['id'])
		if present_idx != -1:
			all_translations[present_idx].update({f"{name}_rus": t["text_rus"], f"{name}_en": t["text"]})
		else:
			all_translations.append({"id": t["id"], f"{name}_rus": t["text_rus"], f"{name}_en": t["text"]})
	if texts_containing_english:
		json.dump(texts_containing_english.to_dict(), Path(f"wrong_translations_{name}_yandex").open("w"), ensure_ascii=False, indent=4)

{'id': '65m92s_dgbdk7z', 'text': "Help. Help me. I dunno what I'm doing anymore"}


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 463.56ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1503.87ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1862.48ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1961.79ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1712.66ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 2155.35ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 994.85ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1707.08ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 2045.00ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 2470.14ba/s]

Processing batch 1/10...



Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 686.80ba/s]


Processing batch 2/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 260.81ba/s]


Processing batch 3/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 503.76ba/s]


Processing batch 4/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 684.90ba/s]


Processing batch 5/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 249.62ba/s]


Processing batch 6/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 660.10ba/s]


Processing batch 7/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 201.99ba/s]


Processing batch 8/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 880.42ba/s]


Processing batch 9/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 243.66ba/s]


Processing batch 10/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 310.37ba/s]


Texts containing English:  Dataset({
    features: [],
    num_rows: 0
})
{'id': '65m92s_dgbdk7z', 'text': "That's pretty vague, do you not know what you're doing in regards to a specific section of your life? Like school or work?"}


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1642.25ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 929.38ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 2024.28ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 810.81ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1541.46ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1202.15ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1340.46ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1033.08ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1908.24ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 2262.30ba/s]


Processing batch 1/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 273.74ba/s]


Processing batch 2/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 550.94ba/s]


Processing batch 3/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 48.02ba/s]


Processing batch 4/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 261.10ba/s]


Processing batch 5/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 415.98ba/s]


Processing batch 6/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 297.89ba/s]


Processing batch 7/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 256.08ba/s]


Processing batch 8/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 252.78ba/s]


Processing batch 9/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 532.75ba/s]


Processing batch 10/10...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 339.56ba/s]

Texts containing English:  Dataset({
    features: ['id', 'text', 'text_rus'],
    num_rows: 1
})





In [90]:
with open("posts_translations_correct_yandex.json", "w", encoding="utf-8") as f:
	json.dump(all_translations, f, ensure_ascii=False, indent=4)

### Correcting translation

In [71]:
incorrect_translations

defaultdict(None,
            {'seeker_post': [],
             'response_post': [{'id': '7oi3es_ds9oti2',
               'text': "Is that really so bad? Maybe it was the smart decision because you needed that time to read recover. You're being kind to yourself when you need it and that's important. Hope you feel better soon.",
               'text_rus': 'Правда, это так плохо? Возможно, это было разумное решение, потому что вам нужно было это время для восстановления. Вы заботитесь о себе, когда это необходимо, и это важно. Надеюсь, вам станет лучше soon.'}]})

In [72]:
for name, items in incorrect_translations.items():
    keys = ['id', 'text_rus', 'text']
    dataset_correction = {k: [d[k] for d in items] for k in keys}
    dataset_correction['text'] = dataset_correction.pop('text_rus')
    if dataset_correction['id']:
        all_translations_ids = [item["id"] for item in all_translations]
        input_dataset_correction = Dataset.from_dict(dataset_correction)
        translation_result_corrected = general_translator_corrector.translate(input_dataset_correction, GeneralTranslationResultSchema)
        incorrect_translations[name] = list(filter(lambda x: contains_english(x['text_rus']), translation_result_corrected))
        for item in translation_result_corrected:
            if not contains_english(item['text_rus']):
                present_idx = index(all_translations_ids, item['id'])
                if present_idx != -1:
                    all_translations[present_idx].update({f"{name}_rus": t["text_rus"], f"{name}_en": t["text"]})

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 265.90ba/s]


Processing batch 1/1...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 257.49ba/s]


In [73]:
incorrect_translations

defaultdict(None, {'seeker_post': [], 'response_post': []})

In [74]:
with open("all_posts_translations", "w", encoding="utf-8") as f:
	json.dump(all_translations, f, ensure_ascii=False, indent=4)

## Translating rationales

In [91]:
incorrectly_translated_rationales = defaultdict()

In [108]:
for name in ["emotional_reactions_rationales", "explorations_rationales", "interpretations_rationales"]:
	all_data = Dataset.from_list(general_data).filter(lambda x: x[name]).sort("id")
	translation_result_filtered = list(filter(lambda x: x['id'] in all_data['id'], all_translations))
	translation_result_filtered = sorted(translation_result_filtered, key=lambda x: x['id'])
	dataset_rationales = [{"id": item['id'], "text_eng": item["response_post"], "text_rus": item1['response_post_rus'], 'rationales_eng': item[name]} for item, item1 in zip(all_data, translation_result_filtered)]
	input_dataset_rationales = Dataset.from_list(dataset_rationales)

	print(input_dataset_rationales[0])

	translation_result_rationales = rational_translator.translate(input_dataset_rationales, RationaleTranslationResultSchema)
	all_translations_ids = [item["id"] for item in all_translations]
	
	incorrectly_translated = [] 
	for item in translation_result_rationales:
		rats = item['rationales_rus'].strip('|').split('|')
		if len(item['rationales_eng'].strip('|').split("|")) != len(rats):
			incorrectly_translated.append(item)
		for r in rats:
			if item['text_rus'].find(r) == -1:
				incorrectly_translated.append(item)
	
	incorrectly_translated_rationales[name] = incorrectly_translated

	translated_rationales = list(filter(lambda x : x not in incorrectly_translated, translation_result_rationales))
	print("Incorrectly translated: ", incorrectly_translated)
	if incorrectly_translated:
		json.dump(incorrectly_translated, Path(f"wrong_translations_{name}_yandex_32batch").open("w"), ensure_ascii=False, indent=4)

	for t in translated_rationales:
		present_idx = index(all_translations_ids, t['id'])
		if present_idx != -1:
			all_translations[present_idx].update({f"{name}_rus": t["rationales_rus"], f"{name}_en": t["rationales_eng"]})

Filter: 100%|██████████| 20/20 [00:00<00:00, 4802.27 examples/s]


{'id': '50sgzd_d76kxsp', 'text_eng': "Thats totally normal, most people get that from time to time. It will pass, the when is the real question. Good news is you can work on the when, at least it works for me. I think the key is to accept whatever you are feeling, not fight it and not try to change it, even try to get yourself to cry. The point is really to change your mindset from 'i have depression i can't cure it, it is killing me' to 'i'm sad again for no reason, human brain is a pile of shit and buggy as hell, I'm gonny cry all day to give it what it wants'..", 'text_rus': "Это совершенно нормально, большинство людей испытывают это время от времени. Это пройдет, вопрос в том, когда. Хорошая новость в том, что ты можешь работать над этим 'когда', по крайней мере, это работает для меня. Думаю, ключевое — принимать те чувства, которые ты испытываешь, не бороться с ними и не пытаться их изменить, даже пытаться заплакать. Суть в том, чтобы изменить свое мышление от 'у меня депрессия, я

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 66.06ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1020.51ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1689.89ba/s]


Processing batch 1/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 248.68ba/s]


Processing batch 2/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 274.50ba/s]


Processing batch 3/3...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 183.79ba/s]


Incorrectly translated:  [{'id': '7gipvi_dqje246', 'text_rus': 'Я не хочу, чтобы ты прыгал. Я не хочу, чтобы ты умирал, ни так, ни иначе. Я не знаю точно, что с тобой происходит, но по тому, как ты говоришь, я бы посоветовал сосредоточиться на тех вещах в жизни, которые тебе нравятся.', 'text_eng': "I don't want you to jump. I don't want you to die, in either one. I don't know exactly what you're going through but from the sounds of it, I'd suggest focusing on the parts in your life that you enjoy.", 'rationales_eng': "I don't want you to jump. I don't want you to die|", 'rationales_rus': 'Я не хочу, чтобы ты прыгал|Я не хочу, чтобы ты умирал, ни так, ни иначе|'}]


Filter: 100%|██████████| 20/20 [00:00<00:00, 6880.42 examples/s]


{'id': '65m92s_dgbdk7z', 'text_eng': "That's pretty vague, do you not know what you're doing in regards to a specific section of your life? Like school or work?", 'text_rus': 'Это довольно расплывчато, ты не знаешь, что делаешь в какой-то конкретной области своей жизни? Например, в школе или на работе?', 'rationales_eng': "do you not know what you're doing in regards to a specific section of your life? Like school or work?|"}


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1081.01ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 643.50ba/s]


Processing batch 1/2...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 15.20ba/s]


Processing batch 2/2...


Creating json from Arrow format: 0ba [00:00, ?ba/s]


Incorrectly translated:  [{'id': '65m92s_dgbdk7z', 'text_rus': 'Это довольно расплывчато, ты не знаешь, что делаешь в какой-то конкретной области своей жизни? Например, в школе или на работе?', 'text_eng': "That's pretty vague, do you not know what you're doing in regards to a specific section of your life? Like school or work?", 'rationales_eng': "do you not know what you're doing in regards to a specific section of your life? Like school or work?|", 'rationales_rus': 'ты не знаешь, что делаешь в какой-то конкретной области своей жизни?|например, в школе или на работе?'}, {'id': '65m92s_dgbdk7z', 'text_rus': 'Это довольно расплывчато, ты не знаешь, что делаешь в какой-то конкретной области своей жизни? Например, в школе или на работе?', 'text_eng': "That's pretty vague, do you not know what you're doing in regards to a specific section of your life? Like school or work?", 'rationales_eng': "do you not know what you're doing in regards to a specific section of your life? Like school or

Filter: 100%|██████████| 20/20 [00:00<00:00, 949.89 examples/s]


{'id': '50sgzd_d76kxsp', 'text_eng': "Thats totally normal, most people get that from time to time. It will pass, the when is the real question. Good news is you can work on the when, at least it works for me. I think the key is to accept whatever you are feeling, not fight it and not try to change it, even try to get yourself to cry. The point is really to change your mindset from 'i have depression i can't cure it, it is killing me' to 'i'm sad again for no reason, human brain is a pile of shit and buggy as hell, I'm gonny cry all day to give it what it wants'..", 'text_rus': "Это совершенно нормально, большинство людей испытывают это время от времени. Это пройдет, вопрос в том, когда. Хорошая новость в том, что ты можешь работать над этим 'когда', по крайней мере, это работает для меня. Думаю, ключевое — принимать те чувства, которые ты испытываешь, не бороться с ними и не пытаться их изменить, даже пытаться заплакать. Суть в том, чтобы изменить свое мышление от 'у меня депрессия, я

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 49.72ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 466.66ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 578.37ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 546.13ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 258.75ba/s]


Processing batch 1/5...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 94.46ba/s]


Processing batch 2/5...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 341.53ba/s]


Processing batch 3/5...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 630.15ba/s]


Processing batch 4/5...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 637.04ba/s]


Processing batch 5/5...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 312.29ba/s]

Incorrectly translated:  [{'id': '96zkq7_e44llwe', 'text_rus': 'Общие ответы, которые слышались за годы, обычно звучат примерно так: упражнения, солнечный свет, разговоры, терапия, ласка животных, здоровое питание, медитации и запись одного благодарного момента ежедневно. С искренностью могу сказать, что ни один из этих способов не помог мне, но помог другим. Занятие хобби в те дни, когда ты можешь функционировать, может помочь. Мне это помогает немного.', 'text_eng': 'The generic answers heard over the years tend to go something like the following; exercise, sunlight, talking, therapy, stroking animals, eating healthily, meditation and writing one thing thankful for daily. I say with sincerity that any of those have worked for me but helped others. Keeping busy with hobbies on the days able to function might help. It does slightly for me.', 'rationales_eng': 'I say with sincerity that any of those have worked for me but helped others. Keeping busy with hobbies on the days able to func




In [110]:
for a in general_data:
	all_translations_ids = [item["id"] for item in all_translations]
	present_idx = index(all_translations_ids, a['id'])
	if present_idx != -1:
		all_translations[present_idx].update({"emotional_reactions_level": a["emotional_reactions_level"], "explorations_level": a["explorations_level"], "interpretations_level": a["interpretations_level"]})
		if a["emotional_reactions_level"] == 0:
			all_translations[present_idx].update({"emotional_reactions_rationales_rus": "", "emotional_reactions_rationales_en": ""})
		if a["explorations_level"] == 0:
			all_translations[present_idx].update({"explorations_rationales_rus": "", "explorations_rationales_en": ""})
		if a["interpretations_level"] == 0:
			all_translations[present_idx].update({"interpretations_rationales_rus": "", "interpretations_rationales_en": ""})

In [111]:
with open("all_translations_yandex_correct.json", "w", encoding="utf-8") as f:
	json.dump(all_translations, f, ensure_ascii=False, indent=4)

### Correcting rationales

In [112]:
incorrectly_translated_rationales

defaultdict(None,
            {'emotional_reactions_rationales': [{'id': '7gipvi_dqje246',
               'text_rus': 'Я не хочу, чтобы ты прыгал. Я не хочу, чтобы ты умирал, ни так, ни иначе. Я не знаю точно, что с тобой происходит, но по тому, как ты говоришь, я бы посоветовал сосредоточиться на тех вещах в жизни, которые тебе нравятся.',
               'text_eng': "I don't want you to jump. I don't want you to die, in either one. I don't know exactly what you're going through but from the sounds of it, I'd suggest focusing on the parts in your life that you enjoy.",
               'rationales_eng': "I don't want you to jump. I don't want you to die|",
               'rationales_rus': 'Я не хочу, чтобы ты прыгал|Я не хочу, чтобы ты умирал, ни так, ни иначе|'}],
             'explorations_rationales': [{'id': '65m92s_dgbdk7z',
               'text_rus': 'Это довольно расплывчато, ты не знаешь, что делаешь в какой-то конкретной области своей жизни? Например, в школе или на работе?',
  

In [113]:
for name, items in incorrectly_translated_rationales.items():
	keys = ['id', 'text_rus', 'text_eng', 'rationales_rus']
	dataset_correction_rationales = {k: [d[k] for d in items] for k in keys}
	dataset_correction_rationales['rationales_eng'] = dataset_correction_rationales.pop('rationales_rus')
	if dataset_correction_rationales['id']:
		all_translations_ids = [item["id"] for item in all_translations]
		input_dataset_correction = Dataset.from_dict(dataset_correction_rationales)
		translation_result_corrected = rational_translator_corrector.translate(input_dataset_correction, RationaleTranslationResultSchema)
		for item in translation_result_corrected:
			base_item = list(filter(lambda x: x['id'] == item['id'], items))[0]
			correct = True
			rats = item['rationales_rus'].strip('|').split('|')
			if len(base_item['rationales_eng'].strip('|').split("|")) != len(rats):
				correct = False
			for r in rats:
				if item['text_rus'].find(r) == -1:
					correct = False
			if correct:
				present_idx = index(all_translations_ids, item['id'])
				incorrectly_translated_rationales[name].remove(base_item)
				if present_idx != -1:
					all_translations[present_idx].update({f"{name}_rus": t["rationales_rus"], f"{name}_en": base_item["rationales_eng"]})
			else:
				items[index(items, base_item)] = item

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 33.61ba/s]


Processing batch 1/1...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1193.26ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1436.90ba/s]


Processing batch 1/1...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 293.27ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 367.37ba/s]


Processing batch 1/1...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 943.60ba/s]


In [114]:
incorrectly_translated_rationales

defaultdict(None,
            {'emotional_reactions_rationales': [{'id': '7gipvi_dqje246',
               'text_rus': 'Я не хочу, чтобы ты прыгал. Я не хочу, чтобы ты умирал, ни так, ни иначе. Я не знаю точно, что с тобой происходит, но по тому, как ты говоришь, я бы посоветовал сосредоточиться на тех вещах в жизни, которые тебе нравятся.',
               'text_eng': "I don't want you to jump. I don't want you to die, in either one. I don't know exactly what you're going through but from the sounds of it, I'd suggest focusing on the parts in your life that you enjoy.",
               'rationales_eng': 'Я не хочу, чтобы ты прыгал|Я не хочу, чтобы ты умирал, ни так, ни иначе|',
               'rationales_rus': 'Я не хочу, чтобы ты прыгал|Я не хочу, чтобы ты умирал, ни так, ни иначе'}],
             'explorations_rationales': [{'id': '65m92s_dgbdk7z',
               'text_rus': 'Это довольно расплывчато, ты не знаешь, что делаешь в какой-то конкретной области своей жизни? Например, в школ

In [115]:
with open("all_translations_yandex_corrected.json", "w", encoding="utf-8") as f:
	json.dump(all_translations, f, ensure_ascii=False, indent=4)