# GCP Demo 3: Text translation using AutoML

### Import Modules

In [None]:
import os
import subprocess
import shutil
import time
import pandas as pd


from gcpdemo3 import etl
from gcpdemo3 import predict

from google.cloud import automl_v1beta1 as automl
from google.oauth2 import service_account

### Get translation and AutoML credentials

In [None]:
# Service-account key files, one per API used by this notebook.
translation_key_path = '../credentials/translation/ml-sandbox-1-191918-b473cb40490b.json'
prediction_key_path = '../credentials/prediction/ml-sandbox-1-191918-58d6a5f3d5c2.json'

# Shorthand for the credential loader used for both key files.
load_credentials = service_account.Credentials.from_service_account_file

# Credentials passed to etl.translate_book.
trans_credentials = load_credentials(translation_key_path)

# Credentials passed to the AutoML client and the predictors.
automl_credentials = load_credentials(prediction_key_path)

### Run ETL

In [None]:
# Point gcloud at the demo project. check=True raises CalledProcessError
# on a non-zero exit so the cell stops instead of continuing silently.
subprocess.run(
    ['gcloud', 'config', 'set', 'project', 'ml-sandbox-1-191918'],
    check=True
)

# Create the scratch directory; exist_ok=True lets this cell be re-run
# without raising FileExistsError when ./temp is already there.
os.makedirs('./temp', exist_ok=True)

# Copy the raw book files from GCS into the scratch directory.
subprocess.run(
    ['gsutil', '-m', 'cp', '-r', 'gs://gcp-cert-demo-3/opus', './temp'],
    check=True
)

# Native English books as (raw input path, processed output path) pairs.
native_books = [
    (
        './temp/opus/Books/raw/en/Austen_Jane-Pride_and_Prejudice.xml',
        './temp/Austen_Jane-Pride_and_Prejudice_en_processed.xml',
    ),
    (
        './temp/opus/Books/raw/en/Twain_Mark-Tom_Sawyer.xml',
        './temp/Twain_Mark-Tom_Sawyer_en_processed.xml',
    ),
    (
        './temp/opus/Books/raw/en/Doyle_Arthur_Conan-Adventures_of_Sherlock_Holmes.xml',
        './temp/Doyle_Arthur_Conan-Adventures_of_Sherlock_Holmes_en_processed.xml',
    ),
]

# Run the ETL processing step on each book, in listing order.
for raw_path, processed_path in native_books:
    etl.process_book(in_path=raw_path, out_path=processed_path)

# Concatenate the processed books into one file labelled 'native'.
etl.concat_label_files(
    in_paths=[processed_path for _raw_path, processed_path in native_books],
    out_path='./temp/native.csv',
    label='native'
)

# Professionally translated English editions as
# (raw input path, processed output path) pairs.
translated_books = [
    (
        './temp/opus/Books/raw/en/Cervantes_Miguel-Don_Quijote.xml',
        './temp/Cervantes_Miguel-Don_Quijote_en_processed.xml',
    ),
    (
        './temp/opus/Books/raw/en/Hugo_Victor-Notre_Dame_de_Paris.xml',
        './temp/Hugo_Victor-Notre_Dame_de_Paris_en_processed.xml',
    ),
    (
        './temp/opus/Books/raw/en/Flaubert_Gustave-Madame_Bovary.xml',
        './temp/Flaubert_Gustave-Madame_Bovary_en_processed.xml',
    ),
]

# Run the ETL processing step on each book, in listing order.
for raw_path, processed_path in translated_books:
    etl.process_book(in_path=raw_path, out_path=processed_path)

# Concatenate the processed books into one file labelled 'translated'.
etl.concat_label_files(
    in_paths=[processed_path for _raw_path, processed_path in translated_books],
    out_path='./temp/translated.csv',
    label='translated'
)

# Original-language books to machine-translate into English, as
# (source language, raw input, processed output, translated output).
machine_jobs = [
    (
        'es',
        './temp/opus/Books/raw/es/Cervantes_Miguel-Don_Quijote.xml',
        './temp/Cervantes_Miguel-Don_Quijote_es_processed.xml',
        './temp/cervantes_translated.txt',
    ),
    (
        'fr',
        './temp/opus/Books/raw/fr/Hugo_Victor-Notre_Dame_de_Paris.xml',
        './temp/Hugo_Victor-Notre_Dame_de_Paris_fr_processed.xml',
        './temp/victorhugo_translated.txt',
    ),
]

# Process each book, then translate the processed file (Spanish book first,
# then the French one).
for source_lang, raw_path, processed_path, translated_path in machine_jobs:
    etl.process_book(in_path=raw_path, out_path=processed_path)
    etl.translate_book(
        credentials=trans_credentials,
        in_path=processed_path,
        out_path=translated_path,
        source=source_lang,
        target='en',
        chunk_size=300
    )

# Concatenate the machine translations into one file labelled 'machine'.
etl.concat_label_files(
    in_paths=[job[3] for job in machine_jobs],
    out_path='./temp/machine_translated.csv',
    label='machine'
)

# Split each labelled file: its final 50 lines go to predict.csv and all
# preceding lines go to train.csv. A file with 50 lines or fewer therefore
# contributes nothing to train.csv.
labelled_paths = [
    './temp/translated.csv',
    './temp/native.csv',
    './temp/machine_translated.csv'
]

with open('./temp/train.csv', 'w+', encoding='utf8') as train_out, \
        open('./temp/predict.csv', 'w+', encoding='utf8') as predict_out:
    for labelled_path in labelled_paths:
        # Read the whole file up front so it can be sliced from the end.
        with open(labelled_path, 'r', encoding='utf8') as labelled_file:
            rows = labelled_file.readlines()

        train_out.writelines(rows[:-50])
        predict_out.writelines(rows[-50:])
                
# Upload the training file to GCS under a name suffixed with the current
# epoch time in whole seconds. `timestamp` and `gs_train_path` are reused by
# later cells.
timestamp = str(int(time.time()))
gs_train_path = f'gs://gcp-cert-demo-3/train_{timestamp}.csv'

# check=True raises CalledProcessError if the upload fails, rather than
# letting the later dataset import fail on a missing object.
subprocess.run(
    ['gsutil', '-m', 'cp', '-r', './temp/train.csv', gs_train_path],
    check=True
)

### Create AutoML Client

In [None]:
# Build the AutoML client, authenticating with the prediction credentials.
client = automl.AutoMlClient(
    credentials=automl_credentials,
)

### Create Dataset

In [None]:
# Resource path of the GCP project/region in which the dataset is created.
project_location = client.location_path('ml-sandbox-1-191918', 'us-central1')

# Dataset spec: a display name suffixed with `timestamp`, configured for
# multiclass text classification.
classification_metadata = {"classification_type": "MULTICLASS"}
my_dataset = {
    "display_name": f'demo3_dataset_{timestamp}',
    "text_classification_dataset_metadata": classification_metadata,
}

# Create the dataset, then keep only the last '/'-separated segment of its
# resource name as the dataset ID.
dataset = client.create_dataset(project_location, my_dataset)
dataset_id = dataset.name.rsplit('/', 1)[-1]

### Import Dataset (this process may take several minutes; check the UI for completion)

In [None]:
# Fully-qualified resource path of the dataset identified by `dataset_id`.
dataset_full_id = client.dataset_path(
    'ml-sandbox-1-191918', 'us-central1', dataset_id
)

# Import source: the single GCS URI held in `gs_train_path`.
gcs_source = {"input_uris": [gs_train_path]}
input_config = {"gcs_source": gcs_source}

# Kick off the import from the input URI.
response = client.import_data(dataset_full_id, input_config)

# Waiting on the result makes this cell synchronous with the import.
print("Processing import...")
import_result = response.result()
print(f"Data imported. {import_result}")

### Train and deploy model once the import is complete (this process may take anywhere from several minutes to hours)

In [None]:
# Resource path of the GCP project/region in which the model is created.
project_location = client.location_path('ml-sandbox-1-191918', 'us-central1')

# Take a fresh timestamp (whole epoch seconds) for the model's display name.
timestamp = str(time.time()).partition('.')[0]
model_name = f'demo3_model_{timestamp}'

# Model spec: text classification, tied to the dataset created earlier via
# `dataset_id`.
my_model = {
    "display_name": model_name,
    "dataset_id": dataset_id,
    "text_classification_model_metadata": {},
}

# Submit the training request. The result is not awaited here, so the cell
# finishes as soon as the operation has been created.
response = client.create_model(project_location, my_model)
print(f"Training operation name: {response.operation.name}")
print("Training started...")

### Load test data

In [None]:
# Load the held-out rows. Because column names are passed explicitly,
# pandas treats the first row of the file as data rather than as a header.
prediction_columns = ['line', 'label']
text_df = pd.read_csv('./temp/predict.csv', names=prediction_columns)

### Make predictions on new model (you will need to get the model ID from the AutoML UI)

In [None]:
# set model ID (copy it from the AutoML UI, as the cell heading explains)
model_id = ''

# Fail fast with a clear message while the placeholder is still empty;
# otherwise the resource name below would end in 'models/' with no ID.
if not model_id:
    raise ValueError(
        'model_id is empty: paste the model ID from the AutoML UI '
        'before running this cell'
    )

# make predictions using new deployed model
predictor = predict.Predictor(
    credentials=automl_credentials,
    model_name=f'projects/261855689705/locations/us-central1/models/{model_id}'
)
predictions = predictor.predict(text_df)

# Bare expression so the notebook displays the result as the cell output.
predictions

### Make predictions on our model

In [None]:
# Full resource name of the model this demo already has deployed.
deployed_model_name = 'projects/261855689705/locations/us-central1/models/TCN7361261134285897728'

# Build a predictor for that model and run it over the held-out rows.
predictor = predict.Predictor(
    credentials=automl_credentials,
    model_name=deployed_model_name
)
predictions = predictor.predict(text_df)

# Bare expression so the notebook displays the result as the cell output.
predictions

### Remove Temporary Directory

In [None]:
# Remove the temporary directory. The existence check makes this cell safe
# to re-run (or to run when the ETL cell never created ./temp) instead of
# raising FileNotFoundError.
if os.path.isdir('./temp'):
    shutil.rmtree('./temp')