# Translate Kaizen Data using Google Cloud Platform

In [157]:
# import packages
import pandas as pd
import numpy as np
import os
import json
from google.cloud import translate

### free but extremely slow method to translate
# from deep_translator import GoogleTranslator
# to_translate = "Bonjour"
# translated = GoogleTranslator(source="auto", target="en").translate(to_translate)
# translated

In [None]:
# GCP client credentials
# Read the service-account key file; only its project id is used below.
# The context manager closes the file handle as soon as it has been parsed.
with open(
    "/Users/purushottam/Dropbox (Good Business Lab)/Purushottam/translate-seagate-348b9248f1ec.json",
    encoding="utf-8",
) as credentials_file:
    client_credentials = json.load(credentials_file)

# set project id and build client
project_id = client_credentials['project_id']
# explicit check instead of `assert`, which is stripped when running with -O
if not project_id:
    raise ValueError("project_id is empty in the GCP credentials file")
# resource name passed as `parent` to every API call in this notebook
parent = f"projects/{project_id}"
# NOTE(review): the client is created without the key file loaded above, so it
# presumably relies on ambient default credentials — confirm that is intended.
client = translate.TranslationServiceClient()

In [None]:
# Get all languages
# Request the supported-language list with display names in English.
response = client.get_supported_languages(
    display_language_code="en",
    parent=parent,
)
languages = response.languages

# banner line: language count centred in a 60-character row of dashes
header = f" Languages: {len(languages)} "
print(header.center(60, "-"))
# one "code<TAB>name" line per language
for entry in languages:
    print(f"{entry.language_code}\t{entry.display_name}")

In [None]:
# GCP translate
# Quick check of the API: send two sample strings and print what comes back.
target_language_code = "en"
sample_text = ["Bonjour", "Oui"]

response = client.translate_text(
    parent=parent,
    contents=sample_text,
    target_language_code=target_language_code,
)

# print each translated string on its own line, in response order
for item in response.translations:
    print(item.translated_text)

In [None]:
# Load Raw Data
# Read the Stata file from the mounted volume into a DataFrame.
df = pd.read_stata(
    filepath_or_buffer="/Volumes/PII_PRECL/pii_kaizen_updated.dta",
)

In [None]:
# set empty translated output list
# The translation cells below append one translated string per API result to
# this list; re-running this cell discards anything collected so far.
translated_output_list = []

In [None]:
# start offset and an end offset ten positions after it
# (presumably slice bounds; the batch cell further down assigns its own values)
x1, x2 = 0, 10

In [195]:
# set subset of raw file
# Take an independent copy of the slice 140000:145000 of the raw frame and
# clean it in one chained expression.
input_df = (
    df[140000:145000]
    .copy(deep=True)
    # remove line separators from data: carriage returns first, then newlines
    .replace(r'\r', '', regex=True)
    .replace(r'\n', '', regex=True)
)

In [None]:
# translate text
# Translate All Thai Columns
#varnames_th = ['project_name', 'project_need', 'project_idea', 'project_countermeasure', 'project_result', 'project_expansion', 'employee_benefit', 'customer_benefit', 'org_benefit', 'social_benefit']
# column-name stem, plus the names of its Thai source and English output columns
var = 'project_name'
var_th = var + '_thai'
var_en = var + '_en'

# send the whole Thai column of the current subset in a single request
target_language_code = "en"
input_text = input_df[var_th]

response = client.translate_text(
    parent=parent,
    contents=input_text,
    target_language_code=target_language_code,
)

# collect the translated strings, in response order, onto the running list
translated_output_list.extend(
    item.translated_text for item in response.translations
)

In [196]:
# Batch Translation
# Translate one Thai column of input_df in fixed-size batches.
# Rows that are not sent (empty strings, and strings of MAX_TEXT_LEN characters
# or more) get an empty translation, so translated_output_list stays aligned
# row for row with input_df. The Thai source column is left untouched instead
# of being overwritten with placeholder words that would then be "translated".
BATCH_SIZE = 200    # rows per translate_text request
MAX_TEXT_LEN = 200  # strings this long or longer are skipped

var = 'project_name'
var_th = var + '_thai'
var_en = var + '_en'
target_language_code = "en"

source_text = input_df[var_th]
# GCP translate API doesn't support empty strings (to avoid errors) and the
# request doesn't support strings which are too long: mark the rows that can
# be sent. Missing values fail the length comparison and are skipped as well.
sendable = (
    (source_text != "") & (source_text.str.len() < MAX_TEXT_LEN)
).to_numpy()

translated_output_list = []

for x1 in range(0, len(input_df), BATCH_SIZE):
    x2 = x1 + BATCH_SIZE
    batch_mask = sendable[x1:x2]
    # positional slice of the column, reduced to the rows that can be sent
    input_text = source_text.iloc[x1:x2][batch_mask].tolist()
    # skipped rows keep "" so positions still line up with input_df
    batch_output = np.full(len(batch_mask), "", dtype=object)
    if input_text:
        # translate
        response = client.translate_text(
            contents=input_text,
            target_language_code=target_language_code,
            parent=parent,
        )
        # numpy raises ValueError here if the number of translations returned
        # differs from the number of strings sent
        batch_output[batch_mask] = [
            item.translated_text for item in response.translations
        ]
    # append to list
    translated_output_list.extend(batch_output.tolist())

In [197]:
# set new translated variable
# attach the collected translations as the English column of the subset
input_df[var_en] = translated_output_list
# export data
# write the subset to a CSV in Downloads whose name ends with the variable stem
input_df.to_csv(
    "/Users/purushottam/Downloads/pii_kaizen_updated_translated_" + var + ".csv",
    mode='w+',
    header=True,
    sep=",",
)

In [None]:
# `sys` is not imported in the notebook's import cell, so import it here;
# without this the call below raises NameError.
import sys

# size in bytes, as reported by sys.getsizeof, of the slice 1:1000 of the most
# recent translation payload
sys.getsizeof(input_text[1:1000])

In [None]:
# set subset of raw file
input_df = df[0:10].copy(deep=True)
# remove line separators from data
input_df = input_df.replace(r'\r', '', regex=True)
input_df = input_df.replace(r'\n', '', regex=True)

# Translate All Thai Columns
#varnames_th = ['project_name', 'project_need', 'project_idea', 'project_countermeasure', 'project_result', 'project_expansion', 'employee_benefit', 'customer_benefit', 'org_benefit', 'social_benefit']
varnames_th = ['project_name']

# NOTE(review): GoogleTranslator is only imported in a commented-out line of
# the first cell (`from deep_translator import GoogleTranslator`); that import
# has to be enabled before this cell can run.
# Build the translator once: source and target are the same for every row.
translator = GoogleTranslator(source="auto", target="en")

for var in varnames_th:
    # thai var names
    var_th = var + "_thai"
    # variable name english
    var_en = var + "_en"
    # translate using GoogleTranslator package
    # Iterate over the column values themselves: looking rows up by label
    # (input_df[var_th][n]) only works while the index labels are exactly
    # 0..len-1, and raises KeyError for any subset that starts elsewhere.
    output_text_list = [
        translator.translate(to_translate) for to_translate in input_df[var_th]
    ]
    # add translated text to data
    input_df[var_en] = output_text_list


In [None]:
# display the English column produced by the translation step, for inspection
input_df["project_name_en"]

In [None]:
# export data
# write the current subset to a CSV in Downloads whose name ends with the
# variable stem held in `var`
input_df.to_csv(
    "/Users/purushottam/Downloads/pii_kaizen_updated_translated_" + var + ".csv",
    mode='w+',
    header=True,
    sep=",",
)

In [None]:
# number of translations collected by the GoogleTranslator loop; one entry is
# appended per row of input_df, so this should equal len(input_df)
len(output_text_list)