# Перевести датасет с когнитивными искажениями
https://github.com/psytechlab/empathy_dataset_transfer/issues/33

In [30]:
%load_ext autoreload
%autoreload 2

%precision 3

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'%.3f'

In [2]:
import sys
import os.path as osp
import os

# Extend the import search path with the parent directory and a few folders
# relative to the current working directory (presumably so that the `src`
# package imported below resolves — TODO confirm the expected working directory).
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))

sys.path.extend([
    parent_dir,
    current_dir + '/notebooks',
    current_dir,
    current_dir + '/data',
])

In [64]:
import json
from pathlib import Path
import re
import ast
import numpy as np
from datasets import Dataset
from omegaconf import OmegaConf
from copy import deepcopy
from collections import defaultdict
import pandas as pd
from src.core.translate import Translator
from src.utils.schemas import GeneralTranslationResultSchema, RationaleTranslationResultSchema

In [None]:
# Matches non-empty strings that consist solely of Cyrillic letters, digits,
# whitespace and the listed punctuation/quote characters; any other character
# (for example a Latin letter) makes the match fail.
regex_check_russian = re.compile(r'^[а-яА-ЯёЁ0-9\s,.!?;:\-—()«»„“"\'’]+$')
def read_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is opened as UTF-8 (the encoding ``save_json`` writes) and is
    closed deterministically by the context manager.
    """
    with Path(path).open(encoding="utf-8") as source:
        return json.load(source)

def read_file(path: str):
    """Return the full text content of the file at ``path``.

    The file is decoded as UTF-8 and closed as soon as it has been read.
    """
    return Path(path).read_text(encoding="utf-8")

def save_json(obj, path):
    """Serialize ``obj`` as pretty-printed UTF-8 JSON into the file at ``path``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``). The
    context manager guarantees the data is flushed and the handle is closed
    even if serialization fails.
    """
    with Path(path).open("w", encoding="utf-8") as target:
        json.dump(obj, target, indent=4, ensure_ascii=False)

def is_russian_text(text):
    """Return True when ``text`` matches ``regex_check_russian`` in full."""
    matched = regex_check_russian.fullmatch(text)
    return matched is not None

### Подготовка данных

In [7]:
# Load the annotated dataset; missing distorted spans become empty strings.
df = pd.read_csv("../Annotated_data.csv")
df['Distorted part'] = df['Distorted part'].fillna('')
# Swap non-breaking spaces for regular ones, then replace double spaces
# with single ones (one pass only).
df['Patient Question'] = [
    question.replace("\xa0", " ").replace("  ", " ")
    for question in df['Patient Question']
]

In [6]:
df.head()

Unnamed: 0,Id_Number,Patient Question,Distorted part,Dominant Distortion,Secondary Distortion (Optional)
0,4500,"Hello, I have a beautiful,smart,outgoing and a...",The voice are always fimilar (someone she know...,Personalization,
1,4501,Since I was about 16 years old I’ve had these ...,I feel trapped inside my disgusting self and l...,Labeling,Emotional Reasoning
2,4502,So I’ve been dating on and off this guy for a...,,No Distortion,
3,4503,My parents got divorced in 2004. My mother has...,,No Distortion,
4,4504,I don’t really know how to explain the situati...,I refused to go because I didn’t know if it wa...,Fortune-telling,Emotional Reasoning


In [7]:
def split_distorted_by_max_matches(question, distorted):
    """Greedily cut ``distorted`` into pieces that occur in ``question``.

    Scanning left to right, each piece is grown one character at a time for as
    long as its lower-cased form is a substring of the lower-cased question.
    Characters that do not occur in the question at all are skipped.

    Returns the pieces in order, stripped of surrounding whitespace, with
    whitespace-only pieces removed. Pieces keep the casing of ``distorted``.
    """
    haystack = question.lower()
    needle = distorted.strip()
    total = len(needle)
    pieces = []
    start = 0
    while start < total:
        end = start
        # Extend the candidate while it is still found in the question.
        while end < total and needle[start:end + 1].lower() in haystack:
            end += 1
        if end > start:
            pieces.append(needle[start:end])
            start = end
        else:
            # Current character is absent from the question: skip it.
            start += 1
    return [piece.strip() for piece in pieces if piece.strip()]


In [8]:
# For every question, keep the distorted span as-is when it occurs verbatim;
# otherwise try to rebuild it from the pieces found in the question.
parts = df['Distorted part'].tolist()
qus = df['Patient Question'].tolist()
new_parts = []
i = 0     # number of spans found verbatim
inc = []  # questions whose span could not be restored
for question, part in zip(qus, parts):
    if part in question:
        i += 1
        new_parts.append([part])
        continue
    segments = split_distorted_by_max_matches(question, part)
    # The helper matches case-insensitively, so re-check each piece exactly.
    if any(segment not in question for segment in segments):
        print(question)
        print()
        inc.append(question)
        new_parts.append([])
    else:
        new_parts.append(segments)

From a grandmother in Canada: My grandchildren ages 7 and 8 have been telling me for almost 2 years that they are afraid of mommy, my Grandson age 7 started soiling his pants, so I took him to the doctors and he said he’s afraid of mommy and she smacks them quite a bit in the head, I got the CPS involved and they sent the children to counselling. The counsellor seen the children one time and informed the children’s father the children are lying and are making stories up to get mommy and daddy back together.

Hi, Doctor, I think I’m continued dive in the depression. I can’t feel very well. When I trying to get rid of my bad thoughts which is continuously coming to my mind –I can’t. When I trying to concentrate on my work I can’t. Every time I think about ” what happened next or what if I can’t do anything in my life.” I literally struggled to live a normal Happy life. 'Please help me to overcome my meaningless fear and the big one of the big problems for me is I’m not very Happy with my

In [9]:
# Manually add two more questions to the exclusion list `inc`; rows whose
# 'Patient Question' is in `inc` are dropped from `df` in the next cell.
# NOTE(review): presumably picked by manual inspection — the reason is not recorded here.
inc.append('Im 17 years old. ive lied about having an eating disorder and have been hospitalized many times. ive lied about trauma and have been hospitalized for it as well.')
inc.append("From a teen in the U.S.: So to begin, I’ve been diagnosed and recieved treatment for depression and an eating disorder, seen 2 therapists, and struggled with self-harm in the past 3 years. Lately I’ve noticed a few behaviors that are bit strange to me. Here’s a list of the behaviors that are concerning me: – I excessive clean my room – When I’m the only one home, I organize the pantry, clean and straighten up the countertops, and try to put everything in order – When I babysit, after the kids have gone to bed, I wipe down all the counter and table tops, straighten paperwork/books and put them in the upper corners of the table, straighten the TV remotes so they are perpendicular to the TV, and put all dishes in the sink – I used to bring all my horse equipment home every weekend to clean it (I don’t ride anymore so this isn’t happening anymore) – I regularly go through all my school work and try to organize it based on class and date – I’m picking at my skin a lot. I pick at my chapped lips for hours at a time, as well as ripping open the skin around my fingernails, and picking off all scabs multiple times – I have very limited personal relationships with people and struggle to make friends – I had 2 childhood friends that I kept for a long time (one for 8 years, one for 10 years) the 8 year friendship ended recently and I really don’t feel like I miss her – When my family goes out of town for long periods of time I don’t really miss them but when they call I tell them I miss them because I feel rude if I say that I don’t – I hardly spend time with my friend (the 10 year friendship I mentioned) I see her maybe once every couple months – ' – I lie a lot and am extremely secretive")

In [10]:
# Attach the restored spans, then drop (in place) every row whose question
# was put on the exclusion list.
df['restored_distorted_part'] = new_parts
excluded_rows = df.index[df['Patient Question'].isin(inc)]
df.drop(index=excluded_rows, inplace=True)

In [82]:
df.head()

Unnamed: 0,Id_Number,Patient Question,Distorted part,Dominant Distortion,Secondary Distortion (Optional),restored_distorted_part
0,4500,"Hello, I have a beautiful,smart,outgoing and a...",The voice are always fimilar (someone she know...,Personalization,,[The voice are always fimilar (someone she kno...
1,4501,Since I was about 16 years old I’ve had these ...,I feel trapped inside my disgusting self and l...,Labeling,Emotional Reasoning,[I feel trapped inside my disgusting self and ...
2,4502,So I’ve been dating on and off this guy for a...,,No Distortion,,[]
3,4503,My parents got divorced in 2004. My mother has...,,No Distortion,,[]
4,4504,I don’t really know how to explain the situati...,I refused to go because I didn’t know if it wa...,Fortune-telling,Emotional Reasoning,[I refused to go because I didn’t know if it w...


In [83]:
df.shape

(2525, 6)

In [101]:
# Sanity check: every restored span must occur verbatim in its patient question.
# The question column is addressed by name instead of by position, so the check
# keeps working if the column order changes.
annot_list = []
for question, annot in zip(df['Patient Question'], df['restored_distorted_part']):
    annot_list.append(annot)
    missing = [span for span in annot if span not in question]
    if missing:
        raise ValueError(
            f"Restored spans not found in the patient question: {missing!r}"
        )

In [None]:
# df.to_csv("annotated_data_restored.csv", index=False, sep="|")

In [120]:
# Reload the dataset from the "|"-separated CSV and turn the ids into strings.
df = pd.read_csv("annotated_data_restored.csv", sep="|")
df['Id_Number'] = df['Id_Number'].astype(str)

In [122]:
# Draw the 20 evaluation rows with a fixed seed so that re-running the notebook
# compares the models on the same texts.
samples = df.sample(20, random_state=42)

In [123]:
# Convert the sampled rows into {'id', 'text'} records.
samples_data = [
    {'id': sample_id, 'text': question}
    for sample_id, question in zip(samples['Id_Number'], samples['Patient Question'])
]

In [124]:
samples_data[0]

{'id': '949',
 'text': 'From a teen in the U.S.: I have an extremely low self esteem for no apparent reason; my mom and my family always go out of their way to tell me im beautiful, and sometimes i feel that way, but sometimes i feel horrible, which is selfish when i think about it. I also feel really self conscious about my personality, since it feels like no one likes me, maybe thinks im annoying or that im not worth talking to them.'}

In [125]:
len(samples_data)

20

### Проверяем модели

Выбираем лучшую модель для перевода: сначала протестируем модели-кандидаты на 20 случайных примерах (размер батча — 2).

In [134]:
# Candidate model identifiers to compare; each one is written into
# model_config['model'] by the loop in the next cell.
models = [
    'qwen3-235b-a22b',
	'qwen3-32b',
	'llama-3.3-70b-instruct',
	'gpt-4o',
	'gpt-oss-20b',
	'claude-sonnet-4',
	'gemini-2.5-pro',
	'grok-4'
]

In [None]:
# Translate the same samples with every candidate model and store each model's
# output in its own JSON file for a side-by-side comparison.
for modelname in models:
    # The config is reloaded on each iteration and then overridden for the current model.
    config = OmegaConf.load('../configs/conf.yaml')
    general_translation_config = config.general_translation
    # NOTE(review): the full-run cells use 'translation_promp_cognitive_distortions.txt'
    # (without the final "t" in "prompt") — confirm which file name actually exists.
    general_translation_config.prompt_path = "../configs/prompts/translation_prompt_cognitive_distortions.txt"
    general_translation_config.batch_size = 2
    general_translation_config.batches = f"batches_general_translation_cognitive_{modelname}"
    general_translation_config.batch_result_dir = f"batches_res_general_translation_cognitive_{modelname}"

    model_config = read_json(general_translation_config.model_config_path)
    model_config['model'] = modelname
    print(model_config)

    translator_kwargs = dict(
        system_message=read_file(general_translation_config.prompt_path),
        model_config=model_config,
        example_data=read_json(general_translation_config.filepath_examples),
        batch_size=general_translation_config.batch_size,
        batch_result_dir=general_translation_config.batch_result_dir,
        batch_dir=general_translation_config.batches,
        model_type="openai",
    )
    general_translator = Translator(**translator_kwargs)

    general_input_dataset = Dataset.from_list(samples_data)
    translation_result = general_translator.translate(general_input_dataset, GeneralTranslationResultSchema)
    save_json(translation_result, f"translation_result_{modelname}_cognitive_distortions_20.json")

In [135]:
# Load every model's result file and merge the translations per text id:
# one record per id with a `translation_<model>` field for each model.
jsons = [
    read_json(f"translation_result_{modelname}_cognitive_distortions_20.json")
    for modelname in models
]

data = {}
for modelname, model_json in zip(models, jsons):
    for entry in model_json:
        record = data.setdefault(
            entry['id'],
            {'id': entry['id'], 'text_en': entry['text']},
        )
        record[f'translation_{modelname}'] = entry['text_rus']

In [138]:
# One row per source text; translations a model did not return show up as NaN
# and are replaced with empty strings.
df_translations = pd.DataFrame(list(data.values())).replace(np.nan, "")
df_translations.head()

Unnamed: 0,id,text_en,translation_qwen3-235b-a22b,translation_qwen3-32b,translation_llama-3.3-70b-instruct,translation_gpt-4o,translation_gpt-oss-20b,translation_claude-sonnet-4,translation_gemini-2.5-pro,translation_grok-4
0,949,From a teen in the U.S.: I have an extremely l...,От подростка из США: У меня крайне низкая само...,От подростка в США: у меня предельно низкая са...,Из письма подростка из США: У меня очень низка...,От подростка из США: У меня чрезвычайно заниже...,Из подростка из США: У меня крайне низкая само...,От подростка из США: У меня крайне низкая само...,От подростка из США: У меня крайне низкая само...,От подростка из США: У меня крайне низкая само...
1,1784,I am a 12 year old who has not been diagnosed ...,"Мне 12 лет, и у меня не поставлено никаких диа...","Мне 12 лет, у меня не поставлен диагноз заболе...","Мне 12 лет, и у меня нет никаких диагностирова...","Мне 12 лет, и мне не поставлен диагноз никаких...","Мне 12 лет, и у меня не диагностировано никаки...","Мне 12 лет, и у меня не диагностировано никаки...","Мне 12 лет, и у меня не диагностировано никаки...","Мне 12 лет, и у меня не диагностировано никаки..."
2,1133,For the past two years or so I have noticed my...,"В течение последних двух лет я замечаю, что мо...","В течение последних двух лет я заметила, что м...",За последние два года или около того я заметил...,На протяжении последних двух лет или около тог...,"За последние два года я заметил, что мои эмоци...",,"Последние года два я замечаю, что мои эмоции б...",За последние два года или около того я заметил...
3,1538,My best friend of 2 years is going through a l...,Мой лучший друг уже 2 года переживает сложный ...,Мой лучший друг в течение двух лет переживает ...,Мой лучший друг за последние 2 года проходит ч...,"Мой лучший друг, с которым мы дружим 2 года, п...","Мой лучший друг, с которым я дружу два года, с...",,"У моего лучшего друга, с которым мы дружим два...",Мой лучший друг двух лет переживает много всег...
4,384,I have a problem. I don’t feel like I have a h...,У меня проблема. Я не испытываю ощущение счаст...,"У меня проблема. Мне не кажется, что у меня сч...","У меня есть проблема. Мне не кажется, что у ме...","У меня проблема. Я не чувствую, что у меня впе...","У меня проблема. Я не чувствую, что у меня ест...","У меня есть проблема. Я не чувствую, что впере...","У меня проблема. Я не чувствую, что меня ждёт ...","У меня проблема. Я не чувствую, что впереди ме..."


In [140]:
df_translations.shape

(20, 10)

In [None]:
# df_translations.to_csv("cognitive_distortions_translations20.csv", index=False, sep="|")

### Перевод текстов

выбрали для перевода текстов модель grok-4:

In [None]:
# Translator for the full dataset, using the model selected above.
modelname = 'grok-4'

config = OmegaConf.load('../configs/conf.yaml')
general_translation_config = config.general_translation
# NOTE(review): the model-comparison cell uses 'translation_prompt_cognitive_distortions.txt'
# (with the final "t" in "prompt") — confirm which file name actually exists.
general_translation_config.prompt_path = "../configs/prompts/translation_promp_cognitive_distortions.txt"
general_translation_config.batch_size = 4
general_translation_config.batches = f"batches_general_translation_cognitive_{modelname}_all1"
general_translation_config.batch_result_dir = f"batches_res_general_translation_cognitive_{modelname}_all1"

model_config = read_json(general_translation_config.model_config_path)
model_config['model'] = modelname

translator_kwargs = dict(
    system_message=read_file(general_translation_config.prompt_path),
    model_config=model_config,
    example_data=read_json(general_translation_config.filepath_examples),
    batch_size=general_translation_config.batch_size,
    batch_result_dir=general_translation_config.batch_result_dir,
    batch_dir=general_translation_config.batches,
    model_type="openai",
)
general_translator = Translator(**translator_kwargs)

In [None]:
# Turn every dataset row into an {'id', 'text'} record and translate them all.
samples_data = [
    {'id': sample_id, 'text': question}
    for sample_id, question in zip(df['Id_Number'], df['Patient Question'])
]
general_input_dataset = Dataset.from_list(samples_data)
translation_result = general_translator.translate(
    general_input_dataset,
    GeneralTranslationResultSchema,
)

In [None]:
# save_json(translation_result, f"translation_result_{modelname}_cognitive_distortions.json")

In [None]:
# Ids sent for translation (stringified) and ids present in the returned result.
dataset_ids = {str(sample['id']) for sample in samples_data}
tr_ids = {translated['id'] for translated in translation_result}

In [41]:
# Ids that were sent for translation but are absent from the result.
more = dataset_ids.difference(tr_ids)
len(more)

0

In [42]:
len(tr_ids - dataset_ids)

0

In [None]:
# Collect the translations that fail the Russian-only check and print them
# for manual inspection; `i` ends up holding their count.
to_correct = [
    item for item in translation_result
    if not is_russian_text(item['text_rus'])
]
i = len(to_correct)
separator = '***************' * 10
for item in to_correct:
    print(item['text_rus'])
    print(separator)

In [None]:
# Queue the source samples that received no translation at all.
# `more` holds stringified ids (dataset_ids is built with str()), so each sample
# id is normalised with str() before the lookup; without that, non-string ids
# would never match.
to_correct.extend(
    sample for sample in samples_data
    if str(sample['id']) in more
)

In [None]:
# Strip the rejected translation from every queued record. Records taken from
# translation_result are the same dict objects, so they lose the key there too.
for item in to_correct:
    item.pop('text_rus', None)

In [None]:
len(to_correct)

In [None]:
# Unique ids of the records queued for re-translation.
to_correct_ids = {record['id'] for record in to_correct}

In [None]:
# NOTE(review): `item` comes from translation_result, yet it is removed from
# samples_data. list.remove() raises ValueError when no equal dict is present,
# and the enumerate index `i` is unused — confirm which list was meant to be
# filtered and whether a missing entry should be tolerated.
for i, item in enumerate(translation_result):
    if item['id'] in to_correct_ids:
        samples_data.remove(item)

In [None]:
save_json(translation_result, "alltr_normalized_cognitive_distortions.json")

### Перевод подстрок

теперь переведем подстроки, для этого используем модель gpt-4o

In [65]:
# Load the saved text translations and index them by id.
alltr_normalized = read_json("alltr_normalized_cognitive_distortions.json")
alltr_normalized_dict = dict((record['id'], record) for record in alltr_normalized)

In [68]:
# Build the input for span translation: one record per row whose stored span
# list is not "['']"; the spans are joined with "|" into a single string.
rationales_data = [
    {
        'id': row.Id_Number,
        'text_eng': row['Patient Question'],
        'text_rus': alltr_normalized_dict[str(row.Id_Number)]['text_rus'],
        'rationales_eng': "|".join(ast.literal_eval(row.restored_distorted_part)),
    }
    for _, row in df.iterrows()
    if row.restored_distorted_part != "['']"
]

In [72]:
# Records with at least one English span that is missing from the English text.
# Each offending record is listed once, however many of its spans fail.
incorrectly_formed = [
    item
    for item in rationales_data
    if any(
        rationale not in item['text_eng']
        for rationale in item['rationales_eng'].strip('|').split('|')
    )
]

In [73]:
incorrectly_formed

[]

In [69]:
len(rationales_data)

1592

In [71]:
rationales_data[0]

{'id': 4500,
 'text_eng': 'Hello, I have a beautiful,smart,outgoing and amazing five year old little girl. Yesterday she came to me and said mom can you take me to the doctor. I ask her what was wrong and she replied: I hear voices in my ears but I dont see the people saying it. She says it happened during school doing a reading circle. She thought someone called her stupid and let the teacher know. The teacher said no one said anything. It happened again when my husband was talking to my other children, she said I heard daddy say shut up, but he didnt really say it. The voice are always fimilar (someone she knows) Im very concerned about this and hope it has nothing to do with my pregnancy while on active duty.',
 'text_rus': 'Здравствуйте, у меня есть красивая, умная, общительная и удивительная пятилетняя девочка. Вчера она подошла ко мне и сказала: мама, ты можешь отвезти меня к врачу. Я спросила, что случилось, и она ответила: я слышу голоса в ушах, но не вижу людей, которые это го

In [None]:
# Translator for the distorted spans (rationales).
modelname = 'gpt-4o'

config = OmegaConf.load('../configs/conf.yaml')
rationales_translation_config = config.rationales_translation
rationales_translation_config.batch_size = 4
rationales_translation_config.batches = f"batches_general_translation_cognitive_{modelname}_all_rationales"
rationales_translation_config.batch_result_dir = f"batches_res_general_translation_cognitive_{modelname}_all_rationales"

model_config = read_json(rationales_translation_config.model_config_path)
model_config['model'] = modelname

translator_kwargs = dict(
    system_message=read_file(rationales_translation_config.prompt_path),
    model_config=model_config,
    example_data=read_json(rationales_translation_config.filepath_examples),
    batch_size=rationales_translation_config.batch_size,
    batch_result_dir=rationales_translation_config.batch_result_dir,
    batch_dir=rationales_translation_config.batches,
    model_type="openai",
)
rational_translator = Translator(**translator_kwargs)

In [None]:
# NOTE(review): rationales_data is passed as a plain list here, whereas the other
# translate() calls in this notebook pass a datasets.Dataset — confirm that
# Translator.translate accepts both.
translation_result_rationales = rational_translator.translate(rationales_data, RationaleTranslationResultSchema)

In [None]:
# Records with at least one translated span that is missing from the Russian
# text. Each offending record is listed once, however many of its spans fail.
incorrectly_translated = []
for item in translation_result_rationales:
    rats = item['rationales_rus'].strip('|').split('|')
    # Disabled stricter check on the number of spans:
    # if len(item['rationales_eng'].strip('|').split("|")) != len(rats):
    #     incorrectly_translated.append(item)
    if any(r not in item['text_rus'] for r in rats):
        incorrectly_translated.append(item)

In [161]:
# Remove the rejected span translation from every flagged record (in place).
for record in incorrectly_translated:
    record.pop('rationales_rus', None)

In [None]:
# Keep only the records that were not flagged as incorrectly translated.
translated_rationales = [
    record for record in translation_result_rationales
    if record not in incorrectly_translated
]
len(translated_rationales)

In [None]:
save_json(translated_rationales, 'cognitive_distortions_translated.json')

In [167]:
def index(a_list, value):
    """Return the position of the first occurrence of ``value`` in ``a_list``.

    Returns -1 instead of raising when the value is absent.
    """
    try:
        position = a_list.index(value)
    except ValueError:
        position = -1
    return position

In [168]:
# Count translated records whose id does not belong to any input record.
# Both sides are normalised with str(), matching the reverse check in the
# following cell, and a set replaces the repeated linear list scans.
all_data_ids = [str(item["id"]) for item in rationales_data]
known_ids = set(all_data_ids)
i = sum(1 for t in translation_result_rationales if str(t['id']) not in known_ids)

In [169]:
i

0

In [170]:
# Input records that have no counterpart in the translation result;
# `k` holds their count.
all_translation_ids = [str(item["id"]) for item in translation_result_rationales]
to_correct = [
    record for record in rationales_data
    if index(all_translation_ids, str(record['id'])) == -1
]
k = len(to_correct)

In [171]:
k

0

### Проверка и сбор данных

После перевода подстрок и текстов соберём все данные и проверим, корректны ли они.

In [74]:
# `cogn_data`: records whose 'text' field is used below to restore `text_eng`.
# `tranlations` (sic): the translated span records; the misspelled name is kept
# because the following cells reference it.
cogn_data = read_json('cognitive_distortions.json')
tranlations = read_json('cognitive_distortions_translations_1592.json')

Так как в процессе перевода модели могут изменять некоторые данные внутри возвращаемых словарей, восстановим исходные английские тексты из оригинальных данных и убедимся, что тексты корректны.

In [75]:
# Overwrite `text_eng` of every translated record with the original text from
# cogn_data. A lookup table replaces the nested scan; setdefault keeps the first
# record for an id, matching the original "first match wins" behaviour.
source_by_id = {}
for ff in cogn_data:
    source_by_id.setdefault(str(ff['id']), ff)

for tr in tranlations:
    source = source_by_id.get(tr['id'])
    if source is not None:
        tr['text_eng'] = source['text']

In [76]:
# save_json(tranlations, 'cognitive_distortions_translations_1592.json')

In [77]:
# Span (rationale) translations: raw records, their ids, and an id -> record map.
translations_rat = read_json('cognitive_distortions_translations_1592.json')
translations_rat_ids = [record['id'] for record in translations_rat]
translations_rat_dict = dict(zip(translations_rat_ids, translations_rat))

# Full-text translations: same three views.
translations = read_json('alltr_normalized_cognitive_distortions.json')
translations_ids = [record['id'] for record in translations]
translations_dict = dict(zip(translations_ids, translations))

In [78]:
len(translations), len(translations_rat)

(2525, 1592)

In [79]:
# Assemble the final records. Rows with translated spans take everything from the
# span-translation record; the remaining rows fall back to the text-only
# translation and get empty span fields. Rows found in neither source are skipped.
# The id variable no longer shadows the builtin `id`, and membership is checked
# against the dicts instead of scanning the id lists.
final_data = []
for _, row in df.iterrows():
    sample_id = str(row.Id_Number)
    if sample_id in translations_rat_dict:
        span_record = translations_rat_dict[sample_id]
        final_data.append({
            'id': sample_id,
            'patient_question': span_record['text_eng'],
            'patient_question_rus': span_record['text_rus'],
            'distorted_part': span_record['rationales_eng'].split('|'),
            'distorted_part_rus': span_record['rationales_rus'].split('|'),
            'dominant_distortion': row['Dominant Distortion'],
            'secondary_distortion': row['Secondary Distortion (Optional)'],
        })
    elif sample_id in translations_dict:
        text_record = translations_dict[sample_id]
        final_data.append({
            'id': sample_id,
            'patient_question': text_record['text'],
            'patient_question_rus': text_record['text_rus'],
            # NOTE(review): empty string here versus a list in the branch above.
            'distorted_part': '',
            'distorted_part_rus': '',
            'dominant_distortion': row['Dominant Distortion'],
            'secondary_distortion': row['Secondary Distortion (Optional)'],
        })

In [80]:
len(final_data)

2525

In [81]:
final_data_df = pd.DataFrame(final_data)

In [83]:
set(final_data_df.id.unique()) - set([str(i) for i in df.Id_Number.unique()])

set()

In [84]:
set([str(i) for i in df.Id_Number.unique()]) - set(final_data_df.id.unique()) 

set()

In [None]:
# final_data_df.to_csv("cognitive_distortions_translations.csv", sep="|")

In [86]:
final_data_df.head(3)

Unnamed: 0,id,patient_question,patient_question_rus,distorted_part,distorted_part_rus,dominant_distortion,secondary_distortion
0,4500,"Hello, I have a beautiful,smart,outgoing and a...","Здравствуйте, у меня есть красивая, умная, общ...",[The voice are always fimilar (someone she kno...,"[Голоса всегда знакомы (кто-то, кого она знает...",Personalization,
1,4501,Since I was about 16 years old I’ve had these ...,"С 16 лет у меня бывают такие «приступы», когда...",[I feel trapped inside my disgusting self and ...,[Я чувствую себя запертым в своем отвратительн...,Labeling,Emotional Reasoning
2,4502,So I’ve been dating on and off this guy for al...,"Итак, я встречаюсь с этим парнем то встречаюсь...",,,No Distortion,


In [5]:
# df.to_csv("cognitive_distortions_gpt4_synthetic.csv", sep="|", index=False)

### Перевод синтетических данных
https://huggingface.co/datasets/halilbabacan/cognitive_distortions_gpt4

In [None]:
from deepmultilingualpunctuation import PunctuationModel

In [87]:
df_synt = pd.read_csv("cognitive_distortions_gpt4_synthetic.csv", sep="|")

In [None]:
# Drop incomplete rows in place, collect the texts, and create the punctuation model.
df_synt.dropna(inplace=True)
texts = df_synt['text'].tolist()
model = PunctuationModel()

In [None]:
# `tqdm` is not imported anywhere else in this notebook; import it here and fall
# back to a plain iterator when the package is not installed.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **kwargs):
        return iterable

# Map each distinct text to its punctuation-restored version. Duplicates are
# processed once (dict.fromkeys keeps first-seen order).
restored = {}
for t in tqdm(dict.fromkeys(texts)):
    restored[t] = model.restore_punctuation(t)

In [None]:
# Look up the punctuation-restored version of every text.
df_synt['restored_punct'] = df_synt['text'].apply(restored.__getitem__)

In [None]:
# df_synt.to_csv('cognitive_distortions_gpt4_synthetic.csv', sep="|")

In [None]:
df_synt = pd.read_csv("cognitive_distortions_gpt4_synthetic.csv", sep="|")

In [None]:
# Translator for the synthetic dataset.
modelname = 'grok-4'

config = OmegaConf.load('../configs/conf.yaml')
general_translation_config = config.general_translation
# NOTE(review): the model-comparison cell uses 'translation_prompt_cognitive_distortions.txt'
# (with the final "t" in "prompt") — confirm which file name actually exists.
general_translation_config.prompt_path = "../configs/prompts/translation_promp_cognitive_distortions.txt"
general_translation_config.batch_size = 32
general_translation_config.batches = f"batches_general_translation_cognitive_{modelname}_syn"
general_translation_config.batch_result_dir = f"batches_res_general_translation_cognitive_{modelname}_syn"

model_config = read_json(general_translation_config.model_config_path)
model_config['model'] = modelname

translator_kwargs = dict(
    system_message=read_file(general_translation_config.prompt_path),
    model_config=model_config,
    example_data=read_json(general_translation_config.filepath_examples),
    batch_size=general_translation_config.batch_size,
    batch_result_dir=general_translation_config.batch_result_dir,
    batch_dir=general_translation_config.batches,
    model_type="openai",
)
general_translator = Translator(**translator_kwargs)

In [97]:
# Remove repeated (text, label) pairs and expose the saved index column as `id`.
df_synt_no_dup = (
    df_synt
    .drop_duplicates(subset=['text', 'label'])
    .rename(columns={'Unnamed: 0': 'id'})
)

In [98]:
df_synt_no_dup.shape

(1473, 4)

In [111]:
# Translation input for the synthetic data: the punctuation-restored text is used.
samples_data = [
    {'id': row['id'], 'text': row['restored_punct']}
    for _, row in df_synt_no_dup.iterrows()
]
samples_data[182]

{'id': 242, 'text': "I couldn't solve the math problem. I'm not good at math."}

In [57]:
# Wrap the records in a datasets.Dataset and run the translation with the
# translator configured above.
general_input_dataset = Dataset.from_list(samples_data)
translation_result = general_translator.translate(general_input_dataset, GeneralTranslationResultSchema)

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 123.90ba/s]


Processing batch 1/1...


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 854.76ba/s]


In [None]:
# save_json(translation_result1, f"cognitive_distortions_synthetic-{modelname}.json")

In [109]:
# Load the saved synthetic translations and index them by id.
translation_result = read_json(f"cognitive_distortions_synthetic-{modelname}.json")
translation_result_dict = dict(
    (record['id'], record) for record in translation_result
)
translation_result[0]

{'id': '0',
 'text': 'John walked past me without saying a word. He must be angry at me for something.',
 'text_rus': 'Джон прошел мимо меня, не сказав ни слова. Он, должно быть, сердится на меня за что-то.'}

In [None]:
set(translation_result_dict) - set(df_synt_no_dup.id.astype(str).unique())

In [None]:
set(df_synt_no_dup.id.astype(str).unique()) - set(translation_result_dict)

In [117]:
# Join the synthetic rows with their translations, matching on the source text.
# Rows without a translation are collected in `inc`.
trs_dict = {record['text']: record['text_rus'] for record in translation_result}
alldata_synthetic = []
inc = []
for _, row in df_synt_no_dup.iterrows():
    source_text = row.restored_punct
    if source_text not in trs_dict:
        inc.append({
            "id": row['id'],
            "text": source_text,
        })
        continue
    alldata_synthetic.append({
        "id": row['id'],
        "text_eng": source_text,
        "text_rus": trs_dict[source_text],
        "label": row.label,
    })

In [118]:
len(inc)

0

In [119]:
pd.DataFrame(alldata_synthetic).sample(3)

Unnamed: 0,id,text_eng,text_rus,label
186,246,I didn't get the grant. I'll never get funding...,Я не получил грант. Я никогда не получу финанс...,Overgeneralization
1208,1542,"My boss praised my performance, but I'm fixate...","Мой начальник похвалил мое выступление, но я з...",Mental Filter
1325,1720,My colleagues should always appreciate my work...,Мои коллеги всегда должны ценить мою работу. Е...,Should Statements


In [None]:
# pd.DataFrame(alldata_synthetic).to_csv("cognitive_distortions_gpt4_translated.csv", sep="|")