In [1]:
import os
import random
import time
import itertools
import json

import numpy as np
import concurrent.futures
import tqdm
from googletrans import Translator

In [2]:
# The project root is taken to be the current working directory
PROJECT_DIR = os.getcwd()
# Processed artefacts are read from / written to the "data-processed" folder under the root
processed_data_dir = os.path.join(PROJECT_DIR, "data-processed")
# Bare expression so the notebook displays the resolved path
processed_data_dir

'/home/sakshigupta/Desktop/meme-generation/data-processed'

In [3]:
# Read back the captions that were previously identified as non-English
non_english_captions_file = os.path.join(processed_data_dir, "non_english_captions.npy")
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can execute
# arbitrary code -- only load .npy files produced by this project.
non_english_captions = np.load(non_english_captions_file, allow_pickle=True)
print(len(non_english_captions))

29820


In [4]:
# Ref - https://www.packetswitch.co.uk/what-is-concurrent-futures-and-how-can-it-boost-your-python-performance/ (for parallel processing)

# Create one module-level translator object; translate_to_english uses it for every caption
translator = Translator()

# Define a function to translate a caption to English
def translate_to_english(caption):
    """Translate one caption to English using the module-level ``translator``.

    Args:
        caption: The caption text to translate.

    Returns:
        A ``(caption, translated_text)`` tuple. ``translated_text`` is ``None``
        when the translation call raised an exception, so the caller can tell
        which captions still need to be retried.
    """
    try:
        return caption, translator.translate(caption).text
    except Exception:
        # Best-effort: report the failure and signal it with None instead of raising.
        # Catching `Exception` rather than using a bare `except` lets
        # KeyboardInterrupt / SystemExit propagate so the run can still be stopped.
        print("Error")
        return caption, None

# Accumulates the results: maps each original caption to its English translation
# (filled in by the parallel translation loop)
translated_captions = {}

In [5]:
# Get the unique non-english captions text
unique_non_english_captions = set(non_english_captions)
batch_size = 500
max_workers = 20  # concurrent translation requests per batch
retry_wait_seconds = 10  # pause after a batch in which at least one caption failed
max_stalled_rounds = 30  # stop after this many consecutive batches with no successful translation

stalled_rounds = 0

while len(unique_non_english_captions) > 0:
    # Check the length of the set
    print(f'The length of non-english captions remaining for translation: {len(unique_non_english_captions)}')

    # Take up to batch_size captions from the set (islice stops early when fewer remain)
    batch_data = list(itertools.islice(unique_non_english_captions, batch_size))

    # Translate the batch in parallel; every request has finished once the list is built
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(translate_to_english, batch_data))

    # Store the translated captions and drop them from the pending set.
    # Failed captions (translation is None) stay in the set and are retried later.
    translated_in_batch = 0
    for caption, translated_caption in results:
        if translated_caption is not None:
            translated_captions[caption] = translated_caption
            unique_non_english_captions.remove(caption)
            translated_in_batch += 1

    had_failures = translated_in_batch < len(batch_data)

    if translated_in_batch > 0:
        stalled_rounds = 0
    else:
        # Nothing in this batch could be translated. Without a limit the loop would
        # spin forever on captions that can never be translated.
        stalled_rounds += 1
        if stalled_rounds >= max_stalled_rounds:
            print(
                f"Stopping: no caption was translated in {stalled_rounds} consecutive batches; "
                f"{len(unique_non_english_captions)} captions are left untranslated"
            )
            break

    # If there was an error in any caption, wait before starting the next batch
    if had_failures:
        print(f"Waiting for {retry_wait_seconds} seconds")
        time.sleep(retry_wait_seconds)

# Note: The printed output contains some error messages because the googletrans library is a
# workaround for the Google Translate API. Google blocks multiple requests from the same client,
# so we wait for some time before retrying the captions that were not translated.
# We have a large number of tokens to translate, and using the official API would be expensive.

The length of non-english captions remaining for translation: 29547
The length of non-english captions remaining for translation: 29047
The length of non-english captions remaining for translation: 28547
The length of non-english captions remaining for translation: 28047
The length of non-english captions remaining for translation: 27547
The length of non-english captions remaining for translation: 27047
The length of non-english captions remaining for translation: 26547
The length of non-english captions remaining for translation: 26047
The length of non-english captions remaining for translation: 25547
The length of non-english captions remaining for translation: 25047
The length of non-english captions remaining for translation: 24547
The length of non-english captions remaining for translation: 24047
The length of non-english captions remaining for translation: 23547
The length of non-english captions remaining for translation: 23047
The length of non-english captions remaining for

In [5]:
# Persist the caption -> translation mapping as JSON so the API does not have to be hit again
translated_captions_file = os.path.join(processed_data_dir, "optimized_translated_captions.json")
with open(translated_captions_file, "w") as output_file:
    json.dump(translated_captions, output_file)

In [None]:
# Superseded approach (kept for reference): it restarted the translation from the first
# caption that failed, even though captions later in the same batch may already have been
# translated successfully by another worker thread.
# offset = len(translated_captions)
# batch_size = 500

# while True:
#     print(f"Offset: {offset}")
#     # Creates a ThreadPoolExecutor instance as a context manager, which manages the life cycle of
#     #  a pool of worker threads that will be used to execute tasks concurrently.
#     with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:

#         results = list(executor.map(translate_to_english, non_english_captions[offset:offset+batch_size]))

#         for i, result in enumerate(results):
#             if result is None:
#                 time.sleep(30)
#                 break
            
#             translated_captions[offset + i] = result

#     offset = len(translated_captions)
    
#     if offset >= len(non_english_captions):
#         break  
    

In [17]:
# Map each index key back to its original (non-translated) caption and build a new dictionary
# that stores original caption -> translated caption pairs
# translated_captions_new = {}
# for key, value in tqdm.tqdm(translated_captions.items()):
#     non_english_caption = non_english_captions[key]
#     translated_captions_new[non_english_caption] = value

# # Save as JSON
# with open(os.path.join(PROJECT_DIR, "translated_captions.json"), "w") as f:
#     json.dump(translated_captions_new, f)

100%|██████████| 28422/28422 [00:00<00:00, 702800.98it/s]
