In [None]:
from mtranslate import translate
from tqdm import tqdm
import jsonlines


def translation(to_translate, file, target_lang='ru', checkpoint_every=1000):
    """Translate every caption in ``to_translate`` and checkpoint progress to disk.

    Args:
        to_translate: Mapping of an image key to an iterable of captions.
            (The callers in this notebook pass a dict keyed by image file name.)
        file: Output path without extension; checkpoints are written to
            ``f'{file}.jsonl'``.
        target_lang: Target language code forwarded to ``translate``.
            Defaults to ``'ru'``.
        checkpoint_every: Number of processed images between checkpoints.
            Defaults to ``1000``.

    Returns:
        dict with the same keys as ``to_translate``; each value is the list of
        translated captions, in the original caption order.

    Raises:
        ValueError: If ``checkpoint_every`` is smaller than 1.
    """
    if checkpoint_every < 1:
        raise ValueError("checkpoint_every must be a positive integer")

    translated = {}

    def save_checkpoint():
        # mode='w' rewrites the file, so it always holds exactly one record:
        # the cumulative dictionary of everything translated so far.
        with jsonlines.open(f'{file}.jsonl', mode='w') as writer:
            writer.write(translated)

    for done, (image_id, captions) in enumerate(tqdm(to_translate.items()), start=1):
        translated[image_id] = [translate(caption, target_lang) for caption in captions]
        if done % checkpoint_every == 0:
            save_checkpoint()

    # Flush the tail: without this, the images processed after the last
    # multiple of ``checkpoint_every`` would never reach the file.
    # Empty input still writes nothing, as before.
    if len(translated) % checkpoint_every != 0:
        save_checkpoint()

    return translated

In [None]:
from pycocotools.coco import COCO


def get_coco_captions(coco_annotations_file):
    """Collect the captions of every image listed in a COCO annotation file.

    Args:
        coco_annotations_file: Path to the annotation JSON handed to ``COCO``.

    Returns:
        dict mapping each image's ``file_name`` to the list of ``caption``
        values found in that image's annotations.
    """
    coco = COCO(coco_annotations_file)
    captions_by_file = {}

    for img_id in coco.getImgIds():
        # loadImgs returns a list; a single id yields a single entry.
        name = coco.loadImgs(img_id)[0]['file_name']

        # Resolve the image's annotation ids, then the annotation records.
        records = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
        captions_by_file[name] = [record['caption'] for record in records]

    return captions_by_file

In [None]:
# Load the training-split captions: dict of image file name -> list of captions.
# The annotation path is relative to the notebook's working directory.
coco_captions_train = get_coco_captions('coco_dataset/annotations/captions_train2014.json')

In [None]:
# Load the validation-split captions: dict of image file name -> list of captions.
# The annotation path is relative to the notebook's working directory.
coco_captions_val = get_coco_captions('coco_dataset/annotations/captions_val2014.json')

In [None]:
# Number of distinct image file names with captions in the train and val splits.
print(len(coco_captions_train), len(coco_captions_val))

In [None]:
# Translate all training captions; translation() checkpoints its progress to
# "coco_train_trainslation.jsonl" while it runs.
# NOTE(review): the file stem is spelled "trainslation"; the later write and
# read cells use the same spelling, so rename all of them together or not at all.
coco_train_ru = translation(coco_captions_train, "coco_train_trainslation")

In [None]:
# Persist the complete training translation as a single JSON Lines record.
# mode='w' replaces whatever the file held before (e.g. an earlier checkpoint).
with jsonlines.open("coco_train_trainslation.jsonl", mode='w') as writer:
    writer.write(coco_train_ru)

In [None]:
import jsonlines

# Sanity check: read the first record back from disk and show how many entries
# it has. Presumably this is the dictionary written by the previous cell, so
# the count should equal the number of training images — TODO confirm.
with jsonlines.open("coco_train_trainslation.jsonl") as reader:
    f = reader.read()
len(f)

In [None]:
# Translate all validation captions; translation() checkpoints its progress to
# "coco_val_translation.jsonl" while it runs.
# NOTE(review): unlike the training split, no follow-up cell that writes
# coco_val_ru to disk is visible in this view — confirm the result is saved.
coco_val_ru = translation(coco_captions_val, "coco_val_translation")