# Install dependencies

In [1]:
!pip install transformers==4.22.1
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.22.1
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 35.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.5 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 67.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.12.1 transformers-4.22.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 M

# Imports

In [2]:
import json

import pandas as pd
import torch
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

# Set seed

In [3]:
# Fixed seed for PyTorch's random number generator.
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fb32ba9b410>

# Define tokenizer and model

We're using the distilled version of Meta's [NLLB](https://about.fb.com/news/2022/07/new-meta-ai-model-translates-200-languages-making-technology-more-accessible/) model because of memory constraints.

In [4]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The tokenizer and the model weights are loaded from the same checkpoint.
checkpoint = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)

Downloading:   0%|          | 0.00/564 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/846 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

# Translation test

In [5]:
# Two short English sentences used as a quick sanity check of the translation setup.
article = [
    "This is a translation test from english to nepali",
    "This is a translation test from english to nepali language",
]
inputs = tokenizer(article, return_tensors="pt", padding=True).to(device)

# Id the tokenizer assigns to the "npi_Deva" language code; passed to generate()
# as the forced first token of every output sequence.
nepali_bos_id = tokenizer.lang_code_to_id["npi_Deva"]
translated_tokens = model.generate(**inputs, forced_bos_token_id=nepali_bos_id, max_length=30)

# The decoded strings are the cell's displayed result.
tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

['यो अंग्रेजीबाट नेपालीमा अनुवादको परीक्षा हो',
 'यो अंग्रेजीबाट नेपाली भाषामा अनुवादको परीक्षा हो']

# Work with COCO dataset

## Download data

In [None]:
! wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
! unzip annotations_trainval2014.zip

--2022-11-06 15:05:45--  http://images.cocodataset.org/annotations/annotations_trainval2014.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.36.81
Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.36.81|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252872794 (241M) [application/zip]
Saving to: ‘annotations_trainval2014.zip’


2022-11-06 15:06:07 (11.3 MB/s) - ‘annotations_trainval2014.zip’ saved [252872794/252872794]



## Prepare dataset

In [None]:
class AnnotationDataset(Dataset):
    """Thin `Dataset` wrapper around an in-memory sequence of annotation records."""

    def __init__(self, annotation_list):
        """Keep a reference to `annotation_list`; the records are not copied."""
        self.annotations = annotation_list

    def __len__(self) -> int:
        """Return how many annotation records the dataset holds."""
        size = len(self.annotations)
        return size

    def __getitem__(self, idx):
        """Return the record stored at position `idx`, unchanged."""
        record = self.annotations[idx]
        return record

In [None]:
# Which COCO 2014 split to translate: "val" here, set to "train" for the training captions.
phase = "val"
annotation_path = f"annotations/captions_{phase}2014.json"
with open(annotation_path, "r") as annotation_file:
    # Parsed content of the captions file for the selected split.
    data = json.load(annotation_file)

In [None]:
batch_size = 64
annotations = data["annotations"]

# Only the first 100 examples are used here as a quick test; when running on the
# entire dataset, build the dataset from `annotations` instead of `annotations_sub`.
annotations_sub = annotations[:100]
dataset = AnnotationDataset(annotations_sub)
# No shuffling: this is inference only, so batches are read in source order and the
# translated annotations end up in the same order as the original ones.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

## Translate from English to Nepali

In [None]:
# Collected output records, one dict per translated caption.
nepali_annotations = []

# The forced first-token id for "npi_Deva" is the same for every batch, so look it up once.
nepali_bos_id = tokenizer.lang_code_to_id["npi_Deva"]

for batch_idx, batch in enumerate(dataloader):
    # Progress message on every 100th batch (including the first).
    if not batch_idx % 100:
        print("Runnning batch: ", batch_idx)

    # Tokenize the batch of captions with padding and move the tensors to the model's device.
    encoded = tokenizer(batch["caption"], return_tensors="pt", padding=True).to(device)

    # NOTE(review): max_length=30 bounds the generated sequences; confirm that this is
    # enough for the longest captions.
    generated = model.generate(**encoded, forced_bos_token_id=nepali_bos_id, max_length=30)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)

    # Pair every translation with the ids of the annotation it came from; .item() turns
    # the batched id entries into plain Python values.
    for image_id, annotation_id, caption in zip(batch["image_id"], batch["id"], decoded):
        nepali_annotations.append(
            {"image_id": image_id.item(), "id": annotation_id.item(), "caption": caption}
        )



In [None]:
# Sanity check: every annotation in `dataset` must have produced exactly one translation.
assert len(nepali_annotations) == len(dataset), "number of translations differs from number of annotations"
len(nepali_annotations)

## Compare translation results

In [None]:
# Take the image of the first annotation in the subset as the reference image ...
test_annotation = annotations_sub[0]
test_id = test_annotation["image_id"]

# ... and display every annotation in the subset that refers to that image.
list(filter(lambda annotation: annotation["image_id"] == test_id, annotations_sub))

In [None]:
# Translated annotations that belong to the reference image selected via `test_id`.
matching_translations = [entry for entry in nepali_annotations if test_id == entry["image_id"]]
matching_translations

## Write Nepali annotations to JSON file

In [None]:
# Swap the "annotations" entry of `data` for the translated records, in place;
# all other keys of `data` are left untouched.
data.update({"annotations": nepali_annotations})

In [None]:
# The output file name carries the same split name as the input file.
output_path = f"nepali_captions_{phase}2014.json"
with open(output_path, "w") as output_file:
    # Serialize the whole `data` dict, including the replaced "annotations" entry.
    json.dump(data, output_file)