# Synchronous Document Translation
Quickstart reference: [Synchronous document translation REST API](https://learn.microsoft.com/en-us/azure/ai-services/translator/document-translation/quickstarts/synchronous-rest-api)


<img src="https://techcommunity.microsoft.com/t5/image/serverpage/image-id/552407i90F58669E536FA1A/image-size/large?v=v2&px=999">

> Supported language codes: https://aka.ms/TranslatorLanguageCodes

<img src="webapp2.jpg">

In [39]:
import datetime
import os
import sys
import time

import gradio as gr
import pandas as pd
import requests
from azure.ai.translation.text import TextTranslationClient, TranslatorCredential
from azure.core.exceptions import HttpResponseError
from dotenv import load_dotenv
from IPython.display import FileLink

In [2]:
sys.version

'3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]'

In [3]:
print(f"Today is {datetime.datetime.today().strftime('%d-%b-%Y %H:%M:%S')}")

Today is 11-Apr-2024 07:18:06


## Azure AI Translator credentials

In [4]:
# Load the Azure AI Translator settings from the local "azure.env" file
load_dotenv("azure.env")

azure_ai_translator_key = os.getenv("AZURE_AI_TRANSLATION_KEY")
azure_ai_translator_endpoint = os.getenv("AZURE_AI_TRANSLATION_ENDPOINTDOCUMENT")
azure_ai_translator_region = os.getenv("AZURE_AI_TRANSLATION_REGION")

# Fail fast with a readable message: without this check a missing key or
# endpoint stays None and only surfaces later as an opaque error when the
# request URL / headers are built.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ("AZURE_AI_TRANSLATION_KEY", azure_ai_translator_key),
        ("AZURE_AI_TRANSLATION_ENDPOINTDOCUMENT", azure_ai_translator_endpoint),
    )
    if not env_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing Azure AI Translator setting(s): "
        + ", ".join(_missing_settings)
        + ". Define them in azure.env or in the environment."
    )

In [5]:
# Folder holding the documents to translate, and folder receiving the results
source_dir, target_dir = "source", "translated"

# Create both folders up front; an already existing folder is left untouched
for folder in (source_dir, target_dir):
    os.makedirs(folder, exist_ok=True)

## Translation function

In [6]:
def translate(input_file, sourceLanguage, targetLanguage, output_file=None,
              content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"):
    """
    Document translation

    Sends one document to the synchronous document translation endpoint and
    writes the returned bytes to ``output_file``.

    Args:
        input_file: Path of the document to translate.
        sourceLanguage: Language code of the document (e.g. "en").
        targetLanguage: Language code to translate into (e.g. "fr").
        output_file: Path of the translated document. When omitted, the
            notebook-level ``output_file`` variable is used, which is how
            earlier versions of this function located their output path.
        content_type: MIME type sent with the document. Defaults to Word
            (.docx); see https://aka.ms/dtsync-content-type for the list.

    Raises:
        ValueError: No output path was given and no notebook-level
            ``output_file`` is defined.
        requests.HTTPError: The service answered with a 4xx/5xx status.
    """
    if output_file is None:
        # Backward compatibility: existing cells set `output_file` globally
        # and call translate() with three arguments only.
        output_file = globals().get("output_file")
    if output_file is None:
        raise ValueError(
            "No output path: pass output_file=... or define a notebook-level output_file."
        )

    start = time.time()
    print(f"Translating the document from {sourceLanguage} to {targetLanguage} ...")

    params = {
        "sourceLanguage": sourceLanguage,
        "targetLanguage": targetLanguage,
        "api-version": "2023-11-01-preview",
    }

    # Join with exactly one slash so the endpoint works with or without a
    # trailing "/" in the configuration.
    path = "translator/document:translate"
    url = azure_ai_translator_endpoint.rstrip("/") + "/" + path

    headers = {"Ocp-Apim-Subscription-Key": azure_ai_translator_key}

    with open(input_file, "rb") as document:
        # Multipart payload: (file name, file object, content type)
        data = {
            "document": (os.path.basename(input_file), document, content_type)
        }

        # Send the POST request
        response = requests.post(url, headers=headers, files=data, params=params)

    # Stop here on an error status; otherwise the error payload would be
    # saved as if it were the translated document.
    response.raise_for_status()

    # Write the response content to a file
    with open(output_file, "wb") as output_document:
        output_document.write(response.content)

    # Format as HH:MM:SS.ffffff without relying on the string form of a float
    elapsed = time.time() - start
    whole_seconds = int(elapsed)
    microseconds = int((elapsed - whole_seconds) * 1_000_000)
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f"\nDone. Elapsed time: {hours:02d}:{minutes:02d}:{seconds:02d}.{microseconds:06d}")

## Test 1

In [7]:
sourceLanguage = "en"
targetLanguage = "fr"

In [8]:
input_file = os.path.join(source_dir, "worddocument.docx")
output_file = os.path.join(target_dir, "worddocument_translated_" + targetLanguage + ".docx")

In [9]:
source_link = FileLink(path=os.path.join(
    source_dir, os.path.basename(input_file)))
source_link

In [10]:
translate(input_file, sourceLanguage, targetLanguage)

Translating the document from en to fr ...

Done. Elapsed time: 00:00:00.421438


In [11]:
translated_link = FileLink(path=os.path.join(
    target_dir, os.path.basename(output_file)))
translated_link

## Test 2

In [12]:
sourceLanguage = "en"
targetLanguage = "it"

In [13]:
input_file = os.path.join(source_dir, "worddocument.docx")
output_file = os.path.join(target_dir, "worddocument_translated_" + targetLanguage + ".docx")

In [14]:
translate(input_file, sourceLanguage, targetLanguage)

Translating the document from en to it ...

Done. Elapsed time: 00:00:00.373859


In [15]:
translated_link = FileLink(path=os.path.join(
    target_dir, os.path.basename(output_file)))
translated_link

## Test 3

In [16]:
sourceLanguage = "en"
targetLanguage = "ar"

In [17]:
input_file = os.path.join(source_dir, "worddocument.docx")
output_file = os.path.join(target_dir, "worddocument_translated_" + targetLanguage + ".docx")

In [18]:
translate(input_file, sourceLanguage, targetLanguage)

Translating the document from en to ar ...

Done. Elapsed time: 00:00:00.433402


In [19]:
translated_link = FileLink(path=os.path.join(
    target_dir, os.path.basename(output_file)))
translated_link

## Test 4

In [20]:
sourceLanguage = "en"
targetLanguage = "zh-Hans"

In [21]:
input_file = os.path.join(source_dir, "worddocument.docx")
output_file = os.path.join(target_dir, "worddocument_translated_" + targetLanguage + ".docx")

In [22]:
translate(input_file, sourceLanguage, targetLanguage)

Translating the document from en to zh-Hans ...

Done. Elapsed time: 00:00:00.426323


In [23]:
translated_link = FileLink(path=os.path.join(
    target_dir, os.path.basename(output_file)))
translated_link

## Webapp

In [24]:
# Credential built from the subscription key and region read from the environment
credential = TranslatorCredential(azure_ai_translator_key, azure_ai_translator_region)

# Text translation client, used below to list the supported languages.
# NOTE(review): it reuses the endpoint variable that translate() uses for
# document translation -- confirm that one endpoint serves both APIs.
text_translator = TextTranslationClient(
    endpoint=azure_ai_translator_endpoint, credential=credential)

In [25]:
# List the languages supported by the service, per operation scope
try:
    response = text_translator.get_languages()

    # Each scope is guarded against None before taking its length
    language_scopes = (
        ("translate operation", response.translation),
        ("transliterate operation", response.transliteration),
        ("dictionary operations", response.dictionary),
    )
    for scope_label, scope_languages in language_scopes:
        print(
            f"Number of supported languages for {scope_label}: {len(scope_languages) if scope_languages is not None else 0}"
        )

    print()

    if response.translation is not None:
        print("Translation Languages:")
        # enumerate() supplies the 1-based row number
        for i, (key, value) in enumerate(response.translation.items(), start=1):
            print(f"{i} {key} -- name: {value.name} ({value.native_name})")

except HttpResponseError as exception:
    # Show the service error details when present, then re-raise so the
    # failure is not hidden from the notebook run
    if exception.error is not None:
        print(f"Error Code: {exception.error.code}")
        print(f"Message: {exception.error.message}")
    raise

Number of supported languages for translate operation: 135
Number of supported languages for transliterate operation: 42
Number of supported languages for dictionary operations: 50

Translation Languages:
1 af -- name: Afrikaans (Afrikaans)
2 am -- name: Amharic (አማርኛ)
3 ar -- name: Arabic (العربية)
4 as -- name: Assamese (অসমীয়া)
5 az -- name: Azerbaijani (Azərbaycan)
6 ba -- name: Bashkir (Bashkir)
7 bg -- name: Bulgarian (Български)
8 bho -- name: Bhojpuri (भोजपुरी)
9 bn -- name: Bangla (বাংলা)
10 bo -- name: Tibetan (བོད་སྐད་)
11 brx -- name: Bodo (बड़ो)
12 bs -- name: Bosnian (Bosanski)
13 ca -- name: Catalan (Català)
14 cs -- name: Czech (Čeština)
15 cy -- name: Welsh (Cymraeg)
16 da -- name: Danish (Dansk)
17 de -- name: German (Deutsch)
18 doi -- name: Dogri (डोगरी)
19 dsb -- name: Lower Sorbian (Dolnoserbšćina)
20 dv -- name: Divehi (ދިވެހިބަސް)
21 el -- name: Greek (Ελληνικά)
22 en -- name: English (English)
23 es -- name: Spanish (Español)
24 et -- name: Estonian (Eesti)
25

In [26]:
# One row per translation language: code, English name, native name.
# Columns are named explicitly so the frame keeps its schema even when the
# service returned no translation languages; otherwise later lookups on
# 'Language_Code' would fail on a column-less frame.
language_columns = ['Language_Code', 'Language_Name', 'Native_Name']

data = [
    {'Language_Code': key,
     'Language_Name': value.name,
     'Native_Name': value.native_name}
    for key, value in (response.translation or {}).items()
]

df_languages = pd.DataFrame(data, columns=language_columns)
df_languages

Unnamed: 0,Language_Code,Language_Name,Native_Name
0,af,Afrikaans,Afrikaans
1,am,Amharic,አማርኛ
2,ar,Arabic,العربية
3,as,Assamese,অসমীয়া
4,az,Azerbaijani,Azərbaycan
...,...,...,...
130,yua,Yucatec Maya,Yucatec Maya
131,yue,Cantonese (Traditional),粵語 (繁體)
132,zh-Hans,Chinese Simplified,中文 (简体)
133,zh-Hant,Chinese Traditional,繁體中文 (繁體)


In [27]:
# Lookup table: language code -> English language name
language_dict = dict(zip(df_languages['Language_Code'], df_languages['Language_Name']))
print(language_dict)

{'af': 'Afrikaans', 'am': 'Amharic', 'ar': 'Arabic', 'as': 'Assamese', 'az': 'Azerbaijani', 'ba': 'Bashkir', 'bg': 'Bulgarian', 'bho': 'Bhojpuri', 'bn': 'Bangla', 'bo': 'Tibetan', 'brx': 'Bodo', 'bs': 'Bosnian', 'ca': 'Catalan', 'cs': 'Czech', 'cy': 'Welsh', 'da': 'Danish', 'de': 'German', 'doi': 'Dogri', 'dsb': 'Lower Sorbian', 'dv': 'Divehi', 'el': 'Greek', 'en': 'English', 'es': 'Spanish', 'et': 'Estonian', 'eu': 'Basque', 'fa': 'Persian', 'fi': 'Finnish', 'fil': 'Filipino', 'fj': 'Fijian', 'fo': 'Faroese', 'fr': 'French', 'fr-CA': 'French (Canada)', 'ga': 'Irish', 'gl': 'Galician', 'gom': 'Konkani', 'gu': 'Gujarati', 'ha': 'Hausa', 'he': 'Hebrew', 'hi': 'Hindi', 'hne': 'Chhattisgarhi', 'hr': 'Croatian', 'hsb': 'Upper Sorbian', 'ht': 'Haitian Creole', 'hu': 'Hungarian', 'hy': 'Armenian', 'id': 'Indonesian', 'ig': 'Igbo', 'ikt': 'Inuinnaqtun', 'is': 'Icelandic', 'it': 'Italian', 'iu': 'Inuktitut', 'iu-Latn': 'Inuktitut (Latin)', 'ja': 'Japanese', 'ka': 'Georgian', 'kk': 'Kazakh', 'km

In [28]:
# Alphabetical list of language names
language_full_names = sorted(language_dict.values())
len(language_full_names)

135

In [29]:
# Alphabetical list of language codes
language_codes = sorted(language_dict)

# Inverse lookup: language name -> language code
reverse_language_names = dict(zip(language_dict.values(), language_dict.keys()))

In [30]:
def get_language_code(language_name):
    """
    Return the language code registered for ``language_name`` in
    ``reverse_language_names``, or None when the name is not present.
    """
    language_code = reverse_language_names.get(language_name, None)
    return language_code

In [31]:
# Quick test
get_language_code("French")

'fr'

In [32]:
def get_format_from_extension(source_ext):
    """
    Map a file extension to the content type sent with a document.

    The lookup ignores letter case, so ".DOCX" and ".docx" are equivalent.

    Args:
        source_ext: File extension including the leading dot (e.g. ".pptx").

    Returns:
        The content type string, or None when the extension is unknown or
        ``source_ext`` is not a string.
    """
    tab_separated = "text/tab-separated-values"
    # NOTE(review): three MIME types joined by "@", kept exactly as before --
    # confirm the service accepts this literal value.
    mime_archive = "message/rfc822@application/x-mimearchive@multipart/related"
    xliff = "application/xliff+xml"

    format_mapping = {
        '.txt': "text/plain",
        '.tsv': tab_separated,
        '.tab': tab_separated,
        '.csv': "text/csv",
        '.html': "text/html",
        '.htm': "text/html",
        '.mhtml': mime_archive,
        '.mht': mime_archive,
        '.pptx': "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        '.xlsx': "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        '.docx': "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        '.msg': "application/vnd.ms-outlook",
        '.xlf': xliff,
        '.xliff': xliff,
        # Misspelled keys from the previous version, kept so any caller that
        # relied on them still gets the same answer.
        '.txv': tab_separated,
        '.mthml': mime_archive,
        '.mthm': mime_archive,
    }

    if not isinstance(source_ext, str):
        return None
    return format_mapping.get(source_ext.lower(), None)

In [33]:
# Test
source_ext = '.pptx'
get_format_from_extension(source_ext)

'application/vnd.openxmlformats-officedocument.presentationml.presentation'

In [34]:
def document_translation_fn(source_document, name_source_lang, name_target_lang):
    """
    Synchronous document translation

    Translates one document and stores the result in ``target_dir`` as
    ``<original name>_<target language name><original extension>``.

    Args:
        source_document: Path of the document to translate, or a file-like
            object exposing the path as ``.name`` (some Gradio versions pass
            uploads that way).
        name_source_lang: Display name of the source language ("English").
            A name without a known code is sent as None.
        name_target_lang: Display name of the target language ("Italian").

    Returns:
        Path of the translated document.

    Raises:
        ValueError: ``name_target_lang`` has no known language code.
        requests.HTTPError: The service answered with a 4xx/5xx status.
    """
    # Accept both a plain path and an uploaded-file object
    if isinstance(source_document, (str, os.PathLike)):
        source_path = os.fspath(source_document)
    else:
        source_path = source_document.name

    # Get the code
    source_lang = get_language_code(name_source_lang)
    target_lang = get_language_code(name_target_lang)
    if target_lang is None:
        raise ValueError(f"Unknown target language: {name_target_lang!r}")

    # Output file
    source_file, source_ext = os.path.splitext(os.path.basename(source_path))
    output_file = os.path.join(target_dir, source_file + "_" + str(name_target_lang) + source_ext)
    os.makedirs(target_dir, exist_ok=True)

    fileformat = get_format_from_extension(source_ext)

    # requests leaves None-valued parameters out of the query string.
    # NOTE(review): presumably the service then detects the source language
    # itself -- confirm against the API reference.
    params = {
        "sourceLanguage": source_lang,
        "targetLanguage": target_lang,
        "api-version": "2023-11-01-preview",
    }

    headers = {"Ocp-Apim-Subscription-Key": azure_ai_translator_key}

    # Join with exactly one slash so the endpoint works with or without a
    # trailing "/" in the configuration.
    path = "translator/document:translate"
    url = azure_ai_translator_endpoint.rstrip("/") + "/" + path

    with open(source_path, "rb") as document:
        # Multipart payload: (file name, file object, content type)
        # Find list of supported content types here: https://aka.ms/dtsync-content-type
        data = {
            "document": (os.path.basename(source_path),
                         document,
                         fileformat)
        }

        # Send the POST request
        response = requests.post(url, headers=headers, files=data, params=params)

    # Stop here on an error status; otherwise the error payload would be
    # returned to the user as if it were the translated document.
    response.raise_for_status()

    # Write the response content to a file
    with open(output_file, "wb") as output_document:
        output_document.write(response.content)

    return output_file

In [35]:
# Quick test
document_translation_fn("source/worddocument.docx", "English", "Italian")

'translated/worddocument_Italian.docx'

In [37]:
# Logo rendered as the HTML description shown under the app title
image_url = "https://th.bing.com/th/id/OIP.ngRD8aNJwXZ2B0eol5tFlwAAAA?rs=1&pid=ImgDetMain"
logo = f"<center> <img src= {image_url} width=100px></center>"

# Input widgets: the document, then two pickers offering the same language names
document_input = gr.components.File(label="Document to translate")
source_language_input = gr.components.Dropdown(
    label="Source language", choices=language_full_names)
target_language_input = gr.components.Dropdown(
    label="Target language", choices=language_full_names)

doc_translator_webapp = gr.Interface(
    fn=document_translation_fn,
    inputs=[document_input, source_language_input, target_language_input],
    outputs=gr.File(label="Translated text"),
    cache_examples=False,
    title="Synchronous document translation with Azure AI Translator",
    theme="rottenlittlecreature/Moon_Goblin",
    description=logo,
)

# share=True also publishes the app under a temporary public URL
doc_translator_webapp.launch(share=True)

themes/theme_schema@0.0.3.json: 100%|██████████| 13.3k/13.3k [00:00<00:00, 22.8MB/s]


Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://fdef60e4b0cb8906a5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


