# Synchronous Document Translation
Quickstart reference: https://learn.microsoft.com/en-us/azure/ai-services/translator/document-translation/quickstarts/synchronous-rest-api


<img src="https://techcommunity.microsoft.com/t5/image/serverpage/image-id/552407i90F58669E536FA1A/image-size/large?v=v2&px=999">

> Supported language codes: https://aka.ms/TranslatorLanguageCodes

In [34]:
import datetime
import gradio as gr
import os
import requests
import sys
import time

from dotenv import load_dotenv
from IPython.display import FileLink

In [2]:
sys.version

'3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]'

In [3]:
print(f"Today is {datetime.datetime.today().strftime('%d-%b-%Y %H:%M:%S')}")

Today is 10-Apr-2024 13:35:42


## Azure AI Translator credentials

In [4]:
# Pull the Azure AI Translator settings from the local "azure.env" file into
# the process environment, then read them back as notebook-level variables.
load_dotenv("azure.env")

# Subscription key, sent as the "Ocp-Apim-Subscription-Key" request header.
key = os.environ.get("AZURE_AI_TRANSLATION_KEY")
# Document-translation endpoint. The route "translator/document:translate" is
# appended to it by plain string concatenation, so it has to end with "/".
endpoint = os.environ.get("AZURE_AI_TRANSLATION_ENDPOINTDOCUMENT")
# Resource region (read here; not used by the cells shown in this notebook).
region = os.environ.get("AZURE_AI_TRANSLATION_REGION")

In [5]:
# Working folders: documents to translate are read from `source_dir`,
# translated documents are written to `target_dir`.
source_dir, target_dir = "source", "translated"

# Create both folders up front; exist_ok keeps the cell safe to re-run.
os.makedirs(source_dir, exist_ok=True)
os.makedirs(target_dir, exist_ok=True)

## Translation function

In [6]:
def translate(input_file, sourceLanguage, targetLanguage, output_file=None,
              content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"):
    """
    Translate one document with the synchronous Document Translation REST API.

    The document is posted as multipart form data to
    ``endpoint + "translator/document:translate"`` (``endpoint`` and ``key``
    are the notebook-level credentials) and the response body is written to
    ``output_file``.

    Args:
        input_file: Path of the document to translate.
        sourceLanguage: Language code of the source document (e.g. "en").
        targetLanguage: Language code to translate into (e.g. "fr").
        output_file: Path the translated document is written to. When omitted,
            the notebook-level ``output_file`` variable is used if it exists
            (this is how the earlier cells call the function); otherwise the
            path ``<target_dir>/<name>_translated_<targetLanguage><ext>`` is
            derived from ``input_file``.
        content_type: MIME type declared for the uploaded document. Defaults
            to the Word (.docx) type. Supported types are listed at
            https://aka.ms/dtsync-content-type

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status. The
            output file is not written in that case, so an error payload is
            never saved under a document file name.
    """
    if output_file is None:
        # Backward compatibility: existing cells set `output_file` at notebook
        # level before calling translate() with three arguments.
        output_file = globals().get("output_file")
    if output_file is None:
        stem, ext = os.path.splitext(os.path.basename(input_file))
        output_file = os.path.join(target_dir, f"{stem}_translated_{targetLanguage}{ext}")

    start = time.time()
    print(f"Translating the document from {sourceLanguage} to {targetLanguage} ...")

    params = {
        "sourceLanguage": sourceLanguage,
        "targetLanguage": targetLanguage,
        "api-version": "2023-11-01-preview",
    }

    path = "translator/document:translate"
    url = endpoint + path

    headers = {"Ocp-Apim-Subscription-Key": key}

    with open(input_file, "rb") as document:
        # Multipart part: (file name, file object, content type)
        data = {
            "document": (os.path.basename(input_file), document, content_type)
        }

        # Send the POST request while the file is still open
        response = requests.post(url, headers=headers, files=data, params=params)

    # Fail loudly on an error status instead of saving the error body as a document
    response.raise_for_status()

    # Write the response content to a file
    with open(output_file, "wb") as output_document:
        output_document.write(response.content)

    elapsed = time.time() - start
    # Format the fractional part numerically; slicing str(elapsed % 1) breaks
    # when the fraction is printed in scientific notation (e.g. 5e-05).
    clock = time.strftime("%H:%M:%S", time.gmtime(elapsed))
    microseconds = int((elapsed % 1) * 1_000_000)
    print(f"\nDone. Elapsed time: {clock}.{microseconds:06d}")

## Test 1

In [7]:
sourceLanguage = "en"
targetLanguage = "fr"

In [8]:
input_file = os.path.join(source_dir, "worddocument.docx")
output_file = os.path.join(target_dir, "worddocument_translated_" + targetLanguage + ".docx")

In [9]:
source_link = FileLink(path=os.path.join(
    source_dir, os.path.basename(input_file)))
source_link

In [10]:
translate(input_file, sourceLanguage, targetLanguage)

Translating the document from en to fr ...

Done. Elapsed time: 00:00:00.402915


In [11]:
translated_link = FileLink(path=os.path.join(
    target_dir, os.path.basename(output_file)))
translated_link

## Test 2

In [12]:
sourceLanguage = "en"
targetLanguage = "it"

In [13]:
input_file = os.path.join(source_dir, "worddocument.docx")
output_file = os.path.join(target_dir, "worddocument_translated_" + targetLanguage + ".docx")

In [14]:
translate(input_file, sourceLanguage, targetLanguage)

Translating the document from en to it ...

Done. Elapsed time: 00:00:00.381246


In [15]:
translated_link = FileLink(path=os.path.join(
    target_dir, os.path.basename(output_file)))
translated_link

## Test 3

In [16]:
sourceLanguage = "en"
targetLanguage = "ar"

In [17]:
input_file = os.path.join(source_dir, "worddocument.docx")
output_file = os.path.join(target_dir, "worddocument_translated_" + targetLanguage + ".docx")

In [18]:
translate(input_file, sourceLanguage, targetLanguage)

Translating the document from en to ar ...

Done. Elapsed time: 00:00:00.381465


In [19]:
translated_link = FileLink(path=os.path.join(
    target_dir, os.path.basename(output_file)))
translated_link

## Test 4

In [20]:
sourceLanguage = "en"
targetLanguage = "zh-Hans"

In [21]:
input_file = os.path.join(source_dir, "worddocument.docx")
output_file = os.path.join(target_dir, "worddocument_translated_" + targetLanguage + ".docx")

In [22]:
translate(input_file, sourceLanguage, targetLanguage)

Translating the document from en to zh-Hans ...

Done. Elapsed time: 00:00:00.372039


In [23]:
translated_link = FileLink(path=os.path.join(
    target_dir, os.path.basename(output_file)))
translated_link

## Webapp

In [24]:
# Translator language code -> display name shown in the web app dropdowns.
# The display names are also used for the reverse lookup (name -> code), so
# every name must be unique.
language_dict = {
    "af": "Afrikaans",
    "sq": "Albanian",
    "am": "Amharic",
    "ar": "Arabic",
    "hy": "Armenian",
    "as": "Assamese",
    "az": "Azerbaijani",
    "bn": "Bangla",
    "ba": "Bashkir",
    "eu": "Basque",
    "bho": "Bhojpuri",
    "brx": "Bodo",
    "bs": "Bosnian",
    "bg": "Bulgarian",
    "yue": "Cantonese",
    "ca": "Catalan",
    "lzh": "Chinese (Classical)",
    "zh-Hans": "Chinese (Simplified)",
    "zh-Hant": "Chinese (Traditional)",
    "sn": "Chishona",
    "hr": "Croatian",
    "cs": "Czech",
    "da": "Danish",
    "prs": "Dari",
    "dv": "Divehi",
    "doi": "Dogri",
    "nl": "Dutch",
    "en": "English",
    "et": "Estonian",
    "fo": "Faroese",
    "fj": "Fijian",
    "fil": "Filipino",
    "fi": "Finnish",
    "fr": "French",
    "fr-ca": "French (Canada)",
    "gl": "Galician",
    "ka": "Georgian",
    "de": "German",
    "el": "Greek",
    "gu": "Gujarati",
    "ht": "Haitian Creole",
    "ha": "Hausa",
    "he": "Hebrew",
    "hi": "Hindi",
    "mww": "Hmong Daw",
    "hu": "Hungarian",
    "is": "Icelandic",
    "ig": "Igbo",
    "id": "Indonesian",
    # "ikt" is the code for Inuinnaqtun; Inuktitut itself is "iu" / "iu-Latn".
    "ikt": "Inuinnaqtun",
    "iu": "Inuktitut (Syllabics)",
    "iu-Latn": "Inuktitut (Latin)",
    "ga": "Irish",
    "it": "Italian",
    "ja": "Japanese",
    "kn": "Kannada",
    "ks": "Kashmiri",
    "kk": "Kazakh",
    "km": "Khmer",
    "rw": "Kinyarwanda",
    "tlh-Latn": "Klingon (Latin)",
    "tlh-Piqd": "Klingon (pIqaD)",
    "gom": "Konkani",
    "ko": "Korean",
    # "kmr" is Northern Kurdish (Kurmanji); "ku" is Central Kurdish (Sorani).
    "kmr": "Kurdish (Kurmanji)",
    "ku": "Kurdish (Sorani)",
    "ky": "Kyrgyz",
    "lo": "Lao",
    "lv": "Latvian",
    "lt": "Lithuanian",
    "ln": "Lingala",
    "dsb": "Lower Sorbian",
    "lug": "Luganda",
    "mk": "Macedonian",
    "mai": "Maithili",
    "mg": "Malagasy",
    "ms": "Malay",
    "ml": "Malayalam",
    "mt": "Maltese",
    "mi": "Māori",
    "mr": "Marathi",
    "mn-Cyrl": "Mongolian (Cyrillic)",
    "mn-Mong": "Mongolian (Traditional Mongolian)",
    "my": "Myanmar (Burmese)",
    "ne": "Nepali",
    "nb": "Norwegian (Bokmål)",
    "nya": "Nyanja (Chichewa)",
    "or": "Odia (Oriya)",
    "ps": "Pashto",
    "fa": "Persian",
    "pl": "Polish",
    "pt": "Portuguese",
    "pt-pt": "Portuguese (Portugal)",
    "pa": "Punjabi",
    "otq": "Querétaro Otomi",
    "ro": "Romanian",
    "run": "Rundi",
    "ru": "Russian",
    "sm": "Samoan",
    "sr-Cyrl": "Serbian (Cyrillic)",
    "sr-Latn": "Serbian (Latin)",
    "st": "Sesotho",
    "nso": "Sesotho sa Leboa",
    "tn": "Setswana",
    "sd": "Sindhi",
    "si": "Sinhala",
    "sk": "Slovak",
    "sl": "Slovenian",
    "so": "Somali",
    "es": "Spanish",
    "sw": "Swahili",
    "sv": "Swedish",
    "ty": "Tahitian",
    "ta": "Tamil",
    "tt": "Tatar",
    "te": "Telugu",
    "th": "Thai",
    "bo": "Tibetan",
    "ti": "Tigrinya",
    "to": "Tongan",
    "tr": "Turkish",
    "tk": "Turkmen",
    "uk": "Ukrainian",
    "hsb": "Upper Sorbian",
    "ur": "Urdu",
    "ug": "Uyghur",
    "uz": "Uzbek",
    "vi": "Vietnamese",
    "cy": "Welsh",
    "xh": "Xhosa",
    "yo": "Yoruba",
    "yua": "Yucatec Maya",
    "zu": "Zulu",
}

In [25]:
# Alphabetical list of display names, used as the dropdown choices.
language_full_names = sorted(language_dict.values())
len(language_full_names)

133

In [26]:
# Alphabetical list of the language codes.
language_codes = sorted(language_dict)

# Inverse lookup table: display name -> language code.
reverse_language_names = dict(
    (full_name, code) for code, full_name in language_dict.items()
)

In [27]:
def get_language_code(language_name):
    """
    Look up the language code for a display name.

    Returns the code stored in ``reverse_language_names`` for
    ``language_name``, or ``None`` when the name is not in the table.
    """
    if language_name in reverse_language_names:
        return reverse_language_names[language_name]
    return None

In [28]:
# Quick test
get_language_code("French")

'fr'

In [29]:
def get_format_from_extension(source_ext):
    """
    Map a file extension to the content type declared when uploading a document.

    Args:
        source_ext: File extension including the leading dot (e.g. ".docx").
            Matching is case-insensitive.

    Returns:
        The content-type string for the extension, or ``None`` when the
        extension is not in the table or ``source_ext`` is not a string.
    """
    tsv = "text/tab-separated-values"
    html = "text/html"
    mhtml = "message/rfc822@application/x-mimearchive@multipart/related"
    xliff = "application/xliff+xml"

    format_mapping = {
        '.txt': "text/plain",
        '.tsv': tsv,
        '.tab': tsv,
        '.csv': "text/csv",
        '.html': html,
        '.htm': html,
        '.mhtml': mhtml,
        '.mht': mhtml,
        '.pptx': "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        '.xlsx': "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        '.docx': "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        '.msg': "application/vnd.ms-outlook",
        '.xlf': xliff,
        '.xliff': xliff,
        # Misspelled keys from the earlier version of this table, kept so that
        # any caller relying on them keeps getting the same answer.
        '.txv': tsv,
        '.mthml': mhtml,
        '.mthm': mhtml,
    }

    if not isinstance(source_ext, str):
        return None
    # Extensions such as ".DOCX" must resolve like their lowercase form.
    return format_mapping.get(source_ext.lower())

In [30]:
source_ext = '.docx'
get_format_from_extension(source_ext)

'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

In [31]:
def document_translation_fn(source_document, name_source_lang, name_target_lang):
    """
    Synchronous document translation (callback of the web app).

    Posts ``source_document`` to ``endpoint + "translator/document:translate"``
    and saves the response body in ``target_dir`` as
    ``<name>_<name_target_lang><ext>``.

    Args:
        source_document: Path of the document to translate.
        name_source_lang: Display name of the source language (a value of
            ``language_dict``). When empty or ``None`` no ``sourceLanguage``
            parameter is sent, as before.
        name_target_lang: Display name of the target language (a value of
            ``language_dict``). Required.

    Returns:
        Path of the translated document.

    Raises:
        ValueError: If a language name is unknown, the target language is
            missing, or the file extension has no known content type.
        requests.HTTPError: If the service answers with a 4xx/5xx status. No
            output file is written in that case.
    """
    # Resolve the display names to language codes
    target_lang = get_language_code(name_target_lang)
    if target_lang is None:
        raise ValueError(f"Unknown or missing target language: {name_target_lang!r}")

    source_lang = get_language_code(name_source_lang)
    if name_source_lang and source_lang is None:
        raise ValueError(f"Unknown source language: {name_source_lang!r}")

    # Output file: same base name and extension, suffixed with the target language name
    source_file, source_ext = os.path.splitext(os.path.basename(source_document))
    output_file = os.path.join(target_dir, source_file + "_" + str(name_target_lang) + source_ext)

    fileformat = get_format_from_extension(source_ext)
    if fileformat is None:
        raise ValueError(f"Unsupported document type: {source_ext!r}")

    params = {
        "targetLanguage": target_lang,
        "api-version": "2023-11-01-preview",
    }
    if source_lang is not None:
        # requests drops None-valued parameters anyway; make the omission explicit
        params["sourceLanguage"] = source_lang

    headers = {"Ocp-Apim-Subscription-Key": key}

    path = "translator/document:translate"
    url = endpoint + path

    with open(source_document, "rb") as document:
        # Multipart part: (file name, file object, content type)
        # Supported content types: https://aka.ms/dtsync-content-type
        data = {
            "document": (os.path.basename(source_document), document, fileformat)
        }

        # Send the POST request while the file is still open
        response = requests.post(url, headers=headers, files=data, params=params)

    # Do not save (and hand back to the user) an error payload as a document
    response.raise_for_status()

    # Write the response content to a file
    os.makedirs(target_dir, exist_ok=True)
    with open(output_file, "wb") as output_document:
        output_document.write(response.content)

    return output_file

In [32]:
# Quick test
document_translation_fn("source/worddocument.docx", "English", "Italian")

'translated/worddocument_Italian.docx'

In [33]:
# Logo rendered (as HTML) in the description area of the web app.
image_url = "https://th.bing.com/th/id/OIP.ngRD8aNJwXZ2B0eol5tFlwAAAA?rs=1&pid=ImgDetMain"
logo = f"<center> <img src= {image_url} width=100px></center>"

# Gradio UI around document_translation_fn: one file upload plus two language
# dropdowns in, one downloadable file out.
doc_translator_webapp = gr.Interface(
    title="Synchronous document translation with Azure AI Translator",
    description=logo,
    fn=document_translation_fn,
    inputs=[
        gr.components.File(label="Document to translate"),
        gr.components.Dropdown(choices=language_full_names,
                               label="Source language"),
        gr.components.Dropdown(choices=language_full_names,
                               label="Target language"),
    ],
    outputs=gr.File(label="Translated text"),
    cache_examples=False,
)

# share=True also publishes a temporary public URL; anyone holding that link
# can run translations that are billed to the key loaded above.
doc_translator_webapp.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://fcd1e4b0422e55c4cf.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


