In [1]:
import pandas as pd
import oci

In [3]:
def batch_generator(df, text_column_name, max_char_per_batch=20000, max_docs_per_batch=100):

    batch  = []
    total_len = 0
    for index, row in df.iterrows():
        if len(batch) < max_docs_per_batch and total_len < max_char_per_batch:
            batch.append(index)
            total_len += len(row[text_column_name])
        else:
            yield batch
            batch = []
            total_len = 0

In [4]:
def translate_batch(source_docs, ai_language_client, target, source="auto", compartment_id=None):
    
    documents = []
    
    for item in source_docs:
        documents.append(oci.ai_language.models.TextDocument(
                key=str(item[0]),
                text=item[1],
                language_code=source))

    batch_language_translation_details=oci.ai_language.models.BatchLanguageTranslationDetails(
        documents=documents,
        compartment_id=compartment_id,
        target_language_code=target)
    
    batch_language_translation_response = ai_language_client.batch_language_translation(batch_language_translation_details)
    
    return batch_language_translation_response.data

In [15]:
def process_translation_response(response, target_df, result_column):
    idx = []
    output = []
    for d in response.documents:
        idx.append(int(d.key))
        output.append(d.translated_text)
    target_df.loc[idx, result_column] = output

In [16]:

#OCI LANGUAGE HARD LIMITS
MAX_CHAR_PER_BATCH = 20000
MAX_DOC_PER_BATCH=100

def translate_dataframe(df_source, source_column, df_target, target_column, source_lang, target_lang, ai_language_client, compartment_id=None):
    
    batch_gen = batch_generator(df_source, source_column, max_char_per_batch=MAX_CHAR_PER_BATCH, max_docs_per_batch=MAX_DOC_PER_BATCH)
    
    for batch_idx in batch_gen:
        print(f'processing row {batch_idx[0]}:{batch_idx[-1]}')
        source_text = df.iloc[batch_idx][source_column]
        batch_docs = zip(batch_idx, source_text)
        
        results = translate_batch(batch_docs, ai_language_client, target_lang, source_lang, compartment_id)
        process_translation_response(results, df_target, target_column)

# Initialize OCI Langauge Client
* Ensure you follow pre-requisites to install OCI SDK and setup API key as described in below link *
<br>
OCI Language Live Lab below, LAB-1, TASK-2

https://apexapps.oracle.com/pls/apex/r/dbpm/livelabs/run-workshop?p210_wid=887&p210_wec=&session=17376605285617

In [21]:
config = oci.config.from_file()
ai_language_client = oci.ai_language.AIServiceLanguageClient(config)
ai_language_client.base_client.timeout = 300 #required to process for large batch sizes

# Reading and translating a csv file
Ensure you modify the csv file name, source column where the text resides

In [25]:
df = pd.read_csv("mydata.csv")
source_column = 'source'
result_column = 'result'

df_result = df.copy()
df_result['result'] = None

translate_dataframe(df, source_column, df_result, result_column, 'en', 'es', ai_language_client, None)


processing row 0:99
processing row 101:200
processing row 202:301
processing row 303:402
processing row 404:503
processing row 505:604
processing row 606:705
processing row 707:806
processing row 808:907
processing row 909:1008
processing row 1010:1109
processing row 1111:1210
processing row 1212:1311
processing row 1313:1412
processing row 1414:1513
processing row 1515:1614
processing row 1616:1715
processing row 1717:1816
processing row 1818:1917


In [None]:
df_result.head()