# Перевод датасета ESConv
https://github.com/thu-coai/Emotional-Support-Conversation

In [2]:
import sys
import os.path as osp
import os

# Directory the notebook process was started from.
current_dir = os.getcwd()

# Extend the import search path with the parent directory, the working
# directory itself and its "notebooks" / "data" sub-directories
# (presumably so the project-local ``src.*`` imports resolve — confirm).
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.append(parent_dir)
sys.path.append(current_dir + '/notebooks')
sys.path.append(current_dir)
sys.path.append(current_dir + '/data')

In [None]:
import json
from pathlib import Path
import re

from datasets import Dataset
from omegaconf import OmegaConf
from copy import deepcopy
import pandas as pd

from src.core.translate import Translator
from src.utils.schemas import GeneralTranslationResultSchema

вспомогательные функции

In [347]:
# A run of two or more ASCII letters delimited by word boundaries.
regex_check_enhlsih = re.compile(r'\b[a-zA-Z]{2,}\b')


def contains_english(text: str) -> bool:
    """Tell whether ``text`` holds at least one ASCII-letter word of length >= 2.

    Used to spot translations that still carry untranslated English words.
    """
    return regex_check_enhlsih.search(text) is not None

def read_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is read as UTF-8 (the encoding ``save_json`` writes) instead of
    the platform default, and the handle is closed as soon as parsing ends.

    Args:
        path: Location of the JSON file (``str`` or ``Path``).

    Returns:
        The Python object produced by ``json.load``.
    """
    with Path(path).open(encoding="utf-8") as source:
        return json.load(source)

def save_json(obj, path):
    """Serialize ``obj`` as pretty-printed UTF-8 JSON into ``path``.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``) with a
    4-space indent. The file is opened in a ``with`` block so the data is
    flushed and the handle closed before the function returns.

    Args:
        obj: Any JSON-serializable object.
        path: Destination file (``str`` or ``Path``); overwritten if present.
    """
    with Path(path).open("w", encoding="utf-8") as target:
        json.dump(obj, target, indent=4, ensure_ascii=False)

def read_file(path: str) -> str:
    """Return the full text content of the file at ``path``.

    The file is decoded as UTF-8 rather than the platform default, and the
    handle is closed once the content has been read.

    Args:
        path: Location of the text file.

    Returns:
        The decoded file content as a single string.
    """
    return Path(path).read_text(encoding="utf-8")

def index(a_list, value):
    """Return the position of the first ``value`` in ``a_list``, or -1 if absent."""
    try:
        position = a_list.index(value)
    except ValueError:
        position = -1
    return position
    
def get_sort_key(id_string):
    """Turn an ``"<a>_<b>"`` id into the integer pair ``(a, b)`` for numeric sorting."""
    major, minor = (int(part) for part in id_string.split('_'))
    return major, minor

In [7]:
esconv_data = read_json("../ESConv.json")

In [324]:
# esconv_data[2]['dialog']

In [8]:
len(esconv_data)

1300

Соберём данные в нужном формате для перевода. Для начала переведём все тексты сообщений из диалогов.

In [325]:
dataset = []

In [326]:
# Flatten every dialog into one record per message; the id is
# "<dialog index>_<message index>" and the text is the message content.
dataset.extend(
    {
        "id": f"{dialog_idx}_{turn_idx}",
        "text": turn["content"],
    }
    for dialog_idx, dialog_item in enumerate(esconv_data)
    for turn_idx, turn in enumerate(dialog_item['dialog'])
)

In [328]:
dataset[0]

{'id': '0_0', 'text': 'Hello\n'}

In [327]:
len(dataset)

38365

In [14]:
save_json(dataset, "../esconv_data.json")

подгружаем конфиг для перевода, в этом случае используем батч из 32 текстов

In [146]:
config = OmegaConf.load('../configs/conf.yaml')
general_translation_config = config.general_translation

In [329]:
# Build the translator from the `general_translation` config section:
# the system prompt is read as plain text, the model settings and the
# examples are read as JSON, and the batch settings are passed through as-is.
general_translator = Translator(
    system_message=read_file(general_translation_config.prompt_path), 
    model_config=read_json(general_translation_config.model_config_path), 
    example_data=read_json(general_translation_config.filepath_examples), 
    batch_size=general_translation_config.batch_size,
    batch_result_dir=general_translation_config.batch_result_dir,
    batch_dir=general_translation_config.batches,
    model_type="openai"
)

In [None]:
print(read_json(general_translation_config.model_config_path))

In [331]:
all_translations = []

In [332]:
# Only the first message is translated here (dataset[:1]).
general_input_dataset = Dataset.from_list(dataset[:1])

translation_result = general_translator.translate(general_input_dataset, GeneralTranslationResultSchema)
# NOTE(review): the file name mentions "yandex_gpt" while the translator is
# created with model_type="openai" — confirm the name is still accurate.
save_json(translation_result, "translation_result_esconv_yandex_gpt_1st_try.json")

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 31.35ba/s]


Processing batch 1/1...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 924.47ba/s]


In [333]:
# Translations whose Russian text still contains an English word.
incorrect_translations = [
    result
    for result in translation_result
    if contains_english(result['text_rus'])
]
texts_containing_english = Dataset.from_list(incorrect_translations)

In [334]:
incorrect_translations

[]

In [335]:
texts_containing_english

Dataset({
    features: [],
    num_rows: 0
})

In [336]:
len(translation_result)

1

In [337]:
# Drop every translation that was flagged as still containing English.
#
# Rows are matched by their "id" instead of `translation_result.remove(row)`:
# `remove` needs the row coming back from the `Dataset` to compare equal to
# the original dict and raises ValueError when no equal element is found.
# Matching on the id avoids both problems and is a single pass over the list.
all_translations_ids = [item["id"] for item in translation_result]
flagged_ids = {t["id"] for t in texts_containing_english}
# Slice assignment mutates the existing list object, as `remove` did.
translation_result[:] = [
    item for item in translation_result if item["id"] not in flagged_ids
]

In [338]:
save_json(translation_result, "esconv_translations1.json")

In [339]:
# Write the flagged translations to disk only when there are any.
if texts_containing_english:
	save_json(texts_containing_english.to_dict(), "wrong_translations_esconv1.json")

In [340]:
all_translated = read_json("esconv_translations1.json")

In [341]:
all_translated

[{'id': '0_0', 'text': 'Hello\n', 'text_rus': 'Привет\n'}]

In [348]:
alltr_sorted = sorted(all_translated, key=lambda x: get_sort_key(x['id']))

In [349]:
len(alltr_sorted)

1

In [350]:
# Ids of every message in the flat source list.
allids = {message['id'] for message in dataset}
len(allids)

38365

In [351]:
# NOTE(review): despite the name, this collects *translated* items whose id
# does not occur among the source ids; it does not list source messages that
# lack a translation — confirm which check is intended.
not_translated = [item for item in alltr_sorted if item['id'] not in allids]

In [352]:
len(not_translated)

0

In [353]:
# De-duplicate by id, keeping the first occurrence and the sorted order.
seen_ids = set()
alltr_normalized = []
for item in alltr_sorted:
    item_id = item['id']
    if item_id in seen_ids:
        continue
    seen_ids.add(item_id)
    alltr_normalized.append(item)

In [354]:
len(alltr_normalized)

1

In [355]:
save_json(alltr_normalized, "esconv_dialog_content_translations1.json")

In [None]:
# Reload the inputs through the shared `read_json` helper instead of
# repeating `json.load(Path(...).open())` inline.
# NOTE(review): the flat message list was saved earlier as
# "../esconv_data.json" but is read here from "esconv_data.json" — confirm
# which location is intended.
dataset = read_json("esconv_data.json")
# This file already holds all of the translations; the cells above translate
# a single message only to demonstrate the workflow.
trans = read_json("esconv_unique_sorted_38359_all.json")
esconv_data = read_json("../ESConv.json")

Также в файле содержится много другой информации, поэтому переводим и другие поля.

In [382]:
# Lookup tables for per-message annotations.
# 'strategy' maps the English strategy label to its Russian translation;
# 'feedback' is the set of feedback values, stored as strings and left as-is.
ANNOTATIONS = {
    'strategy': {
        'Affirmation and Reassurance': 'Подтверждение и заверение',
        'Information': 'Информирование',
        'Others': 'Другое',
        'Providing Suggestions': 'Предоставление советов',
        'Question': 'Вопрос',
        'Reflection of feelings': 'Отражение чувств',
        'Restatement or Paraphrasing': 'Перефразирование',
        'Self-disclosure': 'Самораскрытие'
    },
    'feedback': {'1', '2', '3', '4', '5'}
}

In [383]:
# English `problem_type` label -> Russian translation.
# Keys are looked up verbatim, so their mixed capitalisation must be kept
# (presumably it mirrors the labels used in ESConv.json — confirm).
PROBLEM_TYPES = {
    'Alcohol Abuse': 'Злоупотребление алкоголем',
    'Appearance Anxiety': 'Тревога из-за внешности',
    'Issues with Children': 'Проблемы с детьми',
    'Issues with Parents': 'Проблемы с родителями',
    'Procrastination': 'Прокрастинация',
    'School Bullying': 'Школьная травля',
    'Sleep Problems': 'Проблемы со сном',
    'academic pressure': 'Учебное давление',
    'breakup with partner': 'Расставание с партнёром',
    'conflict with parents': 'Конфликт с родителями',
    'job crisis': 'Кризис на работе',
    'ongoing depression': 'Затяжная депрессия',
    'problems with friends': 'Проблемы с друзьями'
}

In [384]:
# English `emotion_type` label -> Russian translation.
EMOTION_TYPES = {
    'anger': 'гнев',
    'anxiety': 'тревога',
    'depression': 'депрессия',
    'disgust': 'отвращение',
    'fear': 'страх',
    'guilt': 'вина',
    'jealousy': 'ревность',
    'nervousness': 'нервозность',
    'pain': 'боль',
    'sadness': 'грусть',
    'shame': 'стыд'
}

In [385]:
# English `experience_type` label -> Russian translation.
EXPERIENCE_TYPES = {
    'Current Experience': 'Текущий опыт',
    'Previous Experience': 'Предыдущий опыт'
}

In [None]:
situations = []
seeker_question1 = []
seeker_question2 = []
supporter_question1 = []
supporter_question2 = []

# Source field name -> list that receives its non-empty values.
collectors = {
    'situation': situations,
    'seeker_question1': seeker_question1,
    'seeker_question2': seeker_question2,
    'supporter_question1': supporter_question1,
    'supporter_question2': supporter_question2,
}
for i, item in enumerate(esconv_data):
    for field, bucket in collectors.items():
        text = item[field]
        # Empty values are skipped; the id is the dialog's position in the file.
        if text:
            bucket.append({'id': i, 'text': text})

In [357]:
# Field name -> list of {'id', 'text'} records to send to the translator.
to_translate = dict(
    situation=situations,
    seeker_question1=seeker_question1,
    seeker_question2=seeker_question2,
    supporter_question1=supporter_question1,
    supporter_question2=supporter_question2,
)

In [356]:
# Reload the config and rebuild the translator with the same settings as for
# the dialog messages: prompt as text, model settings and examples as JSON.
config = OmegaConf.load('../configs/conf.yaml')
general_translation_config = config.general_translation
general_translator = Translator(
    system_message=read_file(general_translation_config.prompt_path), 
    model_config=read_json(general_translation_config.model_config_path), 
    example_data=read_json(general_translation_config.filepath_examples), 
    batch_size=general_translation_config.batch_size,
    batch_result_dir=general_translation_config.batch_result_dir,
    batch_dir=general_translation_config.batches,
    model_type="openai"
)

In [None]:
for key, values in to_translate.items():
    # Only the first two records of each field are sent for translation here.
    general_input_dataset = Dataset.from_list(values[:2])
    translation_result = general_translator.translate(
        general_input_dataset,
        GeneralTranslationResultSchema,
    )

    # id -> Russian text, for direct lookup when reassembling the dataset.
    data = {}
    for t in translation_result:
        data[t['id']] = t['text_rus']

    save_json(translation_result, f"{key}_translations_list.json")
    save_json(data, f"{key}_translations.json")

    # Compare the ids of all records of this field with the ids that came
    # back: first the ones without a translation, then unexpected ones.
    expected_ids = {str(record['id']) for record in values}
    returned_ids = {record['id'] for record in translation_result}
    print(expected_ids - returned_ids)
    print(returned_ids - expected_ids)

In [None]:
# Load the id -> Russian text mappings saved for each free-text field.
sk1, sk2, sp1, sp2, sit = map(
    read_json,
    (
        'seeker_question1_translations.json',
        'seeker_question2_translations.json',
        'supporter_question1_translations.json',
        'supporter_question2_translations.json',
        'situation_translations.json',
    ),
)

In [389]:
# Dialog index (the id part before the first "_") -> Russian texts of its
# messages, in the order they appear in `trans`.
grouped = {}

for item in trans:
    dialog_key = item["id"].partition("_")[0]
    grouped.setdefault(dialog_key, []).append(item['text_rus'])

In [390]:
len(esconv_data), len(grouped)

(1300, 1300)

собираем переведенные данные в исходный формат

In [None]:
# Rebuild every ESConv record with Russian labels and texts, keeping the
# English originals of the situation and of each message next to them.
new_esconv_data = []

# Free-text field -> its translations, keyed by the dialog index as a string.
translated_fields = {
    'situation': sit,
    'seeker_question1': sk1,
    'seeker_question2': sk2,
    'supporter_question1': sp1,
    'supporter_question2': sp2,
}

# `dialog_idx` replaces the former loop variable `id`, which shadowed the
# builtin for the rest of the session.
for dialog_idx, item1 in enumerate(esconv_data):
    dialog_key = str(dialog_idx)

    # Look the translations up by dialog key rather than pairing dialogs with
    # `grouped.values()` by position, which is only right if `trans` lists
    # the dialogs in exactly the same order as `esconv_data`.
    tr = grouped[dialog_key]
    # Messages are matched to translations by position, so a missing or
    # duplicated translation would shift every following message.
    if len(tr) != len(item1['dialog']):
        raise ValueError(
            f"dialog {dialog_key}: {len(item1['dialog'])} messages "
            f"but {len(tr)} translations"
        )

    new_item = deepcopy(item1)
    new_item['experience_type'] = EXPERIENCE_TYPES[item1['experience_type']]
    new_item['emotion_type'] = EMOTION_TYPES[item1['emotion_type']]
    new_item['problem_type'] = PROBLEM_TYPES[item1['problem_type']]
    new_item['situation_eng'] = item1['situation']

    # Only non-empty fields were sent for translation, so only those have an
    # entry in the lookup; empty ones keep their original (empty) value.
    for field, lookup in translated_fields.items():
        if item1[field]:
            new_item[field] = lookup[dialog_key]

    for turn, source_turn, text_rus in zip(new_item['dialog'], item1['dialog'], tr):
        turn['content_eng'] = source_turn['content']
        turn['content'] = text_rus
        annotation = turn['annotation']
        if annotation and 'strategy' in annotation:
            annotation['strategy'] = ANNOTATIONS['strategy'][source_turn['annotation']['strategy']]

    new_esconv_data.append(new_item)

In [None]:
save_json(new_esconv_data, "all_esconv_translated_witheng.json")

собираем в датасет формата huggingface

In [None]:
data = read_json('all_esconv_translated_witheng.json')

In [None]:
# Flatten the translated dialogs into one row per message; every row repeats
# the dialog-level metadata.
rows = []

question_columns = (
    "seeker_question1",
    "seeker_question2",
    "supporter_question1",
    "supporter_question2",
)
# (output column, key inside survey_score["seeker"])
seeker_score_columns = (
    ("seeker_initial_emotion_intensity", "initial_emotion_intensity"),
    ("seeker_relevance", "relevance"),
    ("seeker_empathy", "empathy"),
    ("seeker_final_emotion_intensity", "final_emotion_intensity"),
)

for item in data:
    # NOTE(review): the id is built only from the three type labels, so
    # different dialogs with the same labels share one dialog_id — confirm
    # that this is intended.
    dialog_id = f"{item['experience_type']}_{item['emotion_type']}_{item['problem_type']}"

    # NOTE(review): "situation_eng" (and "content_eng" below) must be present
    # in the loaded records, otherwise this raises KeyError — confirm the file
    # being read was produced with those fields.
    metadata = {
        "dialog_id": dialog_id,
        "experience_type": item["experience_type"],
        "emotion_type": item["emotion_type"],
        "problem_type": item["problem_type"],
        "situation": item["situation"],
        "situation_eng": item["situation_eng"],
    }
    # Missing questions and scores default to an empty string.
    for column in question_columns:
        metadata[column] = item.get(column, "")

    survey = item.get("survey_score", {})
    seeker_scores = survey.get("seeker", {})
    for column, score_key in seeker_score_columns:
        metadata[column] = seeker_scores.get(score_key, "")
    metadata["supporter_relevance"] = survey.get("supporter", {}).get("relevance", "")

    rows.extend(
        dict(
            metadata,
            turn_id=turn_idx,
            speaker=turn["speaker"],
            text=turn["content"],
            text_eng=turn["content_eng"],
            strategy=turn["annotation"].get("strategy", ""),
            feedback=turn["annotation"].get("feedback", ""),
        )
        for turn_idx, turn in enumerate(item["dialog"])
    )

df = pd.DataFrame(rows)

In [None]:
dataset = Dataset.from_pandas(df)
dataset[8]