# Translation of documents with Azure Open AI

In [1]:
import datetime
import openai
import re
import sys
import tiktoken
import time

from docx import Document
from dotenv import load_dotenv
from IPython.display import FileLink
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pypdf import PdfReader

In [2]:
sys.version

'3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]'

In [3]:
print('Today is:', datetime.datetime.today().strftime ('%d-%b-%Y %H:%M:%S'))

Today is: 21-Sep-2023 08:23:00


In [4]:
load_dotenv("azure.env")

# Azure Open AI
openai.api_type: str = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")

print("Open AI version:", openai.__version__)

Open AI version: 0.28.0


In [5]:
text = "Azure Open AI is great!"

In [6]:
def text_infos(text):
    """
    Get string information
    """
    nb_char = len(text)
    print("Number of characters =", nb_char)
    nb_words = len(re.findall(r"\w+", text))
    print("Number of words =", nb_words)

    encoding = tiktoken.get_encoding("cl100k_base")
    nb_tokens = len(encoding.encode(text))
    print("Number of tokens =", nb_tokens)

    return nb_char, nb_words, nb_tokens

In [7]:
text_infos(text)

Number of characters = 23
Number of words = 5
Number of tokens = 6


(23, 5, 6)

## Translation with Azure Open AI

In [8]:
model = "text-davinci-003"

In [9]:
def azure_openai_translation(text, language="French"):
    """
    Translation of a text using Azure Open AI
    """
    prompt = f"You are a translator expert. You should translate the following text into {language}:\n{text}"

    response = openai.Completion.create(
        engine=model,
        prompt=prompt,
        temperature=0,
        max_tokens=3000,
        top_p=1,
    )

    result = response["choices"][0]["text"]

    return result

In [10]:
smalltext = "Fintech Plus Sync had a successful Q2 with a revenue of 125 million, a 25% increase \
year over year, and a gross profit margin of 58%. Their net income rose to 16 million, total assets\
reached 1.5 billion, and their debt to equity ratio stood at 1.5. They saw organic user growth and had\
a LTV CAC ratio of 3.5%. They have a value at risk model in place and are expecting a 8% quarter over\
quarter revenue growth in the next quarter, driven by blockchain and AI. Additionally, they are excited\
about their subsidiary's upcoming IPO, which is expected to raise 200 million."

print("\033[1;31;34m", smalltext)

[1;31;34m Fintech Plus Sync had a successful Q2 with a revenue of 125 million, a 25% increase year over year, and a gross profit margin of 58%. Their net income rose to 16 million, total assetsreached 1.5 billion, and their debt to equity ratio stood at 1.5. They saw organic user growth and hada LTV CAC ratio of 3.5%. They have a value at risk model in place and are expecting a 8% quarter overquarter revenue growth in the next quarter, driven by blockchain and AI. Additionally, they are excitedabout their subsidiary's upcoming IPO, which is expected to raise 200 million.


In [11]:
text_infos(smalltext)

Number of characters = 567
Number of words = 104
Number of tokens = 140


(567, 104, 140)

In [12]:
print("\033[1;31;34m", azure_openai_translation(smalltext, "french"))

[1;31;34m 

Fintech Plus Sync a connu un Q2 réussi avec un chiffre d'affaires de 125 millions de dollars, une augmentation de 25 % par rapport à l'année précédente et un taux de marge brute de 58 %. Leur bénéfice net a augmenté à 16 millions de dollars, leurs actifs totaux ont atteint 1,5 milliard de dollars et leur ratio dette / équité s'élevait à 1,5. Ils ont constaté une croissance organique des utilisateurs et un ratio LTV CAC de 3,5 %. Ils ont mis en place un modèle de risque de valeur et s'attendent à une croissance du chiffre d'affaires de 8 % trimestre sur trimestre au cours du prochain trimestre, soutenue par la blockchain et l'IA. De plus, ils sont enthousiasmés par l'introduction en bourse prochaine de leur filiale, qui devrait lever 200 millions de dollars.


In [13]:
print("\033[1;31;34m", azure_openai_translation(smalltext, "italian"))

[1;31;34m 

Fintech Plus Sync ha avuto un Q2 di successo con un fatturato di 125 milioni, un aumento del 25% anno su anno e un margine di profitto lordo del 58%. Il loro reddito netto è aumentato a 16 milioni, gli attivi totali hanno raggiunto 1,5 miliardi e il loro rapporto debito / patrimonio netto era pari a 1,5. Hanno visto una crescita organica degli utenti e hanno avuto un rapporto LTV CAC di 3,5%. Hanno un modello di rischio di valore in atto e si aspettano una crescita del fatturato del 8% trimestre su trimestre nel prossimo trimestre, guidata dalla blockchain e dall'IA. Inoltre, sono entusiasti dell'imminente IPO della loro controllata, che dovrebbe raccogliere 200 milioni.


In [14]:
print("\033[1;31;34m", azure_openai_translation(smalltext, "japanese"))

[1;31;34m 

フィンテックプラスシンクは、年間25％の増加と58％の粗利益率を持つ、成功したQ2を持っていました。彼らの収益は1億2500万ドルに上昇し、資産総額は15億ドルに達し、債務対資本比率は1.5となりました。彼らは有機的なユーザー増加を見ており、LTV CAC比率は3.5％でした。彼らはバリューアットリスクモデルを導入しており、次の四半期にはブロックチェーンとAIによる収益8％の増加を期待しています。さらに、彼らは子会社のIPOが200万ドルを調達することを期待していることを興奮しています。


In [15]:
print("\033[1;31;34m", azure_openai_translation(smalltext, "chinese"))

[1;31;34m 

Fintech Plus Sync在第二季度取得了成功，收入达1.25亿，同比增长25％，毛利率达58％。其净收入增加至1600万，总资产达15亿，债务股权比率为1.5。他们看到了有机用户增长，LTV CAC比率为3.5％。他们有一个价值风险模型，预计下一季度收入将以区块链和人工智能为推动力增长8％。此外，他们对其子公司即将上市的IPO充满期待，预计将筹集2亿美元。


In [16]:
print("\033[1;31;34m", azure_openai_translation(smalltext, "spanish"))

[1;31;34m 

Fintech Plus Sync tuvo un exitoso segundo trimestre con una facturación de 125 millones de dólares, un aumento del 25% interanual, y un margen bruto de beneficio del 58%. Su beneficio neto aumentó a 16 millones, sus activos totales alcanzaron los 1.5 mil millones, y su ratio deuda/equidad se situó en 1.5. Vieron un crecimiento orgánico de usuarios y tuvieron un ratio LTV CAC de 3.5%. Tienen un modelo de riesgo de valor en su lugar y esperan un crecimiento de la facturación trimestral del 8% en el próximo trimestre, impulsado por la tecnología blockchain y la Inteligencia Artificial. Además, están entusiasmados con la próxima salida a bolsa de su filial, que se espera que recaude 200 millones de dólares.


In [17]:
print("\033[1;31;34m", azure_openai_translation(smalltext, "german"))

[1;31;34m 

Fintech Plus Sync hatte ein erfolgreiches Q2 mit einem Umsatz von 125 Millionen, einem Anstieg um 25% im Vergleich zum Vorjahr und einer Bruttogewinnspanne von 58%. Ihr Nettoergebnis stieg auf 16 Millionen, die Gesamtvermögenswerte erreichten 1,5 Milliarden und ihr Verschuldungsgrad betrug 1,5. Sie sahen ein organisches Wachstum der Nutzer und hatten ein LTV CAC-Verhältnis von 3,5%. Sie haben ein Value at Risk-Modell implementiert und erwarten ein Wachstum des Umsatzes im nächsten Quartal um 8%, getrieben von Blockchain und KI. Darüber hinaus freuen sie sich auf den bevorstehenden Börsengang ihrer Tochtergesellschaft, der 200 Millionen einbringen soll.


## Translation with Azure Open AI using a PDF file

In [18]:
my_pdf_file  = "constitution.pdf"

!ls $my_pdf_file -lh

-rwxrwxrwx 1 root root 166K Sep 12 09:16 constitution.pdf


### Get the number of pages

In [19]:
reader = PdfReader(my_pdf_file)
print("Total number of pages =", len(reader.pages))

Total number of pages = 32


### Get the pages of the PDF document into a string

In [20]:
# nb_pages_to_extract = 10
nb_pages_to_extract = len(reader.pages)

In [21]:
print("Extracting pages form the PDF document...\n")
start = time.time()

extracted_text = [
    reader.pages[idx].extract_text() for idx in range(nb_pages_to_extract)
]

print("Done")
elapsed = time.time() - start
elapsed_time_str = time.strftime(
    "%H:%M:%S.{}".format(str(elapsed % 1)[2:])[:15], time.gmtime(elapsed)
)
print(f"Elapsed time: {elapsed_time_str}")

Extracting pages form the PDF document...

Done
Elapsed time: 00:00:01.810477


In [22]:
text_infos(str(extracted_text))

Number of characters = 79724
Number of words = 12912
Number of tokens = 23228


(79724, 12912, 23228)

### Let's create some chunk from the extract text

In [23]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)

In [24]:
chunks = text_splitter.create_documents(extracted_text)
number_chunks = len(chunks)
print("Total number of chunks =", number_chunks)

Total number of chunks = 33


### Some preview of some chunks

In [25]:
type(chunks[0])

langchain.schema.document.Document

In [26]:
print(chunks[0].page_content)

Constitution du 4 octobre 1958
Dernière mise à jour des données de ce texte : 01 décembre 2009
Titre premier : De la souveraineté (Articles 2 à 4)
Titre II : Le Président de la République (Articles 5 à 19)
Titre III : Le Gouvernement (Articles 20 à 23)
Titre IV : Le Parlement (Articles 24 à 33)
Titre V : Des rapports entre le Parlement et le Gouvernement (Articles 34 à 51)
Titre VI : Des traités et accords internationaux (Articles 52 à 55)
Titre VII : Le Conseil constitutionnel (Articles 56 à 63)
Titre VIII : De l'autorité judiciaire (Articles 64 à 66)
Titre IX : La Haute Cour de Justice. (Articles 67 à 68)
Titre X : De la responsabilité pénale des membres du Gouvernement (Articles 68-1 à 68-3)
Titre XI : Le Conseil économique et social. (Articles 69 à 71)
Titre XII : Des collectivités territoriales (Articles 72 à 75)
Titre XIII : De la Communauté. (Articles 76 à 77) (abrogé)
Titre XIII : Dispositions transitoires relatives à la Nouvelle-Calédonie (Articles 76 à 
77)
Titre XIV : Des ac

## Translation from French to English

### Now we can run the Azure Open AI translation function for each chunk

In [27]:
target_language = "English"

In [28]:
start = time.time()
translated_document = []

print("Starting to translate the document into", target_language, "\n")

for nb_chunk in range(number_chunks):
    dt = datetime.datetime.today().strftime("%d-%b-%Y %H:%M:%S")
    print(f"{dt} Translation of chunk: {nb_chunk + 1} / {number_chunks}")
    chunk_text_to_translate = chunks[nb_chunk].page_content
    trans_chunk = azure_openai_translation(chunk_text_to_translate, target_language)
    translated_document.append(trans_chunk)

print("\nDone")
elapsed = time.time() - start
elapsed_time_str = time.strftime(
    "%H:%M:%S.{}".format(str(elapsed % 1)[2:])[:15], time.gmtime(elapsed)
)
print(f"Elapsed time: {elapsed_time_str}")

Starting to translate the document into English 

21-Sep-2023 08:23:43 Translation of chunk: 1 / 33
21-Sep-2023 08:23:46 Translation of chunk: 2 / 33
21-Sep-2023 08:23:53 Translation of chunk: 3 / 33
21-Sep-2023 08:24:09 Translation of chunk: 4 / 33
21-Sep-2023 08:24:10 Translation of chunk: 5 / 33
21-Sep-2023 08:24:28 Translation of chunk: 6 / 33
21-Sep-2023 08:24:35 Translation of chunk: 7 / 33
21-Sep-2023 08:24:42 Translation of chunk: 8 / 33
21-Sep-2023 08:24:51 Translation of chunk: 9 / 33
21-Sep-2023 08:25:06 Translation of chunk: 10 / 33
21-Sep-2023 08:25:11 Translation of chunk: 11 / 33
21-Sep-2023 08:25:21 Translation of chunk: 12 / 33
21-Sep-2023 08:25:31 Translation of chunk: 13 / 33
21-Sep-2023 08:25:42 Translation of chunk: 14 / 33
21-Sep-2023 08:25:52 Translation of chunk: 15 / 33
21-Sep-2023 08:26:01 Translation of chunk: 16 / 33
21-Sep-2023 08:26:09 Translation of chunk: 17 / 33
21-Sep-2023 08:26:41 Translation of chunk: 18 / 33
21-Sep-2023 08:26:50 Translation of chunk

### Saving the translated text into a .docx file

In [29]:
docx_file = f"translated_document_{target_language}.docx"

start = time.time()
print("Saving the translated document into a .docx file...")

results = "\n".join(translated_document)

document = Document()
document.add_heading("Translated document made with Azure Open AI", level=1)
document.add_paragraph(results)
document.save(docx_file)

print("\nDone")
elapsed = time.time() - start
elapsed_time_str = time.strftime(
    "%H:%M:%S.{}".format(str(elapsed % 1)[2:])[:15], time.gmtime(elapsed)
)
print(f"Elapsed time: {elapsed_time_str}")

Saving the translated document into a .docx file...

Done
Elapsed time: 00:00:00.131001


In [30]:
!ls $docx_file -lh

-rwxrwxrwx 1 root root 55K Sep 21 08:29 translated_document_English.docx


In [31]:
doclink = FileLink(path=docx_file)
doclink

## Translation from French to Spanish

In [32]:
target_language = "Spanish"

In [33]:
start = time.time()
translated_document = []

print("Starting to translate the document into", target_language, "\n")

for nb_chunk in range(number_chunks):
    dt = datetime.datetime.today().strftime("%d-%b-%Y %H:%M:%S")
    print(f"{dt} Translation of chunk: {nb_chunk + 1} / {number_chunks}")
    chunk_text_to_translate = chunks[nb_chunk].page_content
    trans_chunk = azure_openai_translation(chunk_text_to_translate, target_language)
    translated_document.append(trans_chunk)

print("\nDone")
elapsed = time.time() - start
elapsed_time_str = time.strftime(
    "%H:%M:%S.{}".format(str(elapsed % 1)[2:])[:15], time.gmtime(elapsed)
)
print(f"Elapsed time: {elapsed_time_str}")

Starting to translate the document into Spanish 

21-Sep-2023 08:29:27 Translation of chunk: 1 / 33
21-Sep-2023 08:29:29 Translation of chunk: 2 / 33
21-Sep-2023 08:29:43 Translation of chunk: 3 / 33
21-Sep-2023 08:30:03 Translation of chunk: 4 / 33
21-Sep-2023 08:30:08 Translation of chunk: 5 / 33
21-Sep-2023 08:30:48 Translation of chunk: 6 / 33
21-Sep-2023 08:31:05 Translation of chunk: 7 / 33
21-Sep-2023 08:31:20 Translation of chunk: 8 / 33
21-Sep-2023 08:31:35 Translation of chunk: 9 / 33
21-Sep-2023 08:31:56 Translation of chunk: 10 / 33
21-Sep-2023 08:32:07 Translation of chunk: 11 / 33
21-Sep-2023 08:32:24 Translation of chunk: 12 / 33
21-Sep-2023 08:32:44 Translation of chunk: 13 / 33
21-Sep-2023 08:33:01 Translation of chunk: 14 / 33
21-Sep-2023 08:33:19 Translation of chunk: 15 / 33
21-Sep-2023 08:33:36 Translation of chunk: 16 / 33
21-Sep-2023 08:34:10 Translation of chunk: 17 / 33
21-Sep-2023 08:34:30 Translation of chunk: 18 / 33
21-Sep-2023 08:34:48 Translation of chunk

In [34]:
docx_file = f"translated_document_{target_language}.docx"

start = time.time()
print("Saving the translated document into a .docx file...")

results = "\n".join(translated_document)

document = Document()
document.add_heading("Translated document made with Azure Open AI", level=1)
document.add_paragraph(results)
document.save(docx_file)

print("\nDone")
elapsed = time.time() - start
elapsed_time_str = time.strftime(
    "%H:%M:%S.{}".format(str(elapsed % 1)[2:])[:15], time.gmtime(elapsed)
)
print(f"Elapsed time: {elapsed_time_str}")

Saving the translated document into a .docx file...

Done
Elapsed time: 00:00:00.159475


In [35]:
!ls $docx_file -lh

-rwxrwxrwx 1 root root 57K Sep 21 08:40 translated_document_Spanish.docx


In [36]:
doclink = FileLink(path=docx_file)
doclink