<a href="https://colab.research.google.com/github/poojashreeNS/Vertex_AI/blob/main/Text_classification_gcp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os

# The Google Cloud Notebook product has specific requirements
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_GOOGLE_CLOUD_NOTEBOOK:
    USER_FLAG = "--user"

! pip install {USER_FLAG} --upgrade google-cloud-aiplatform google-cloud-storage jsonlines

In [5]:
import os

PROJECT_ID = ""

if not os.getenv("IS_TESTING"):
    # Get your Google Cloud project ID from gcloud
    shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

Project ID:  


In [6]:
# Fall back to a fixed project ID when the gcloud lookup produced nothing.
if PROJECT_ID in ("", None):
    PROJECT_ID = "vertexaiprediction"  # @param {type:"string"}

In [7]:
import json
import sys
from datetime import datetime

import jsonlines
from google.cloud import aiplatform, storage
from google.protobuf import json_format

REGION = "us-central1"

# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# If on Google Cloud Notebooks, then don't execute this code
if not IS_GOOGLE_CLOUD_NOTEBOOK:
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

aiplatform.init(project=PROJECT_ID, location=REGION)

**Create a dataset and import your data**

In [8]:
# A per-run timestamp (YYYYmmddHHMMSS) keeps resource display names unique.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

# Display name for the new dataset and the Cloud Storage CSV it is imported from.
display_name = "e2e-text-dataset-" + TIMESTAMP
src_uris = "gs://cloud-ml-data/NL-classification/happiness.csv"

In [11]:
# Create the text dataset and import the CSV using the single-label
# classification import schema (sync=True).
text_single_label_schema = (
    aiplatform.schema.dataset.ioformat.text.single_label_classification
)
ds = aiplatform.TextDataset.create(
    display_name=display_name,
    gcs_source=src_uris,
    import_schema_uri=text_single_label_schema,
    sync=True,
)

INFO:google.cloud.aiplatform.datasets.dataset:Creating TextDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TextDataset backing LRO: projects/63364166818/locations/us-central1/datasets/7344517771218124800/operations/2276863249959878656
INFO:google.cloud.aiplatform.datasets.dataset:TextDataset created. Resource name: projects/63364166818/locations/us-central1/datasets/7344517771218124800
INFO:google.cloud.aiplatform.datasets.dataset:To use this TextDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TextDataset('projects/63364166818/locations/us-central1/datasets/7344517771218124800')
INFO:google.cloud.aiplatform.datasets.dataset:Importing TextDataset data: projects/63364166818/locations/us-central1/datasets/7344517771218124800
INFO:google.cloud.aiplatform.datasets.dataset:Import TextDataset data backing LRO: projects/63364166818/locations/us-central1/datasets/7344517771218124800/operations/4024259905379631104
INFO:google.cloud.aiplatfor

**Train your text classification model**

In [12]:
# Look the dataset up again by its display name.
dataset_filter = f'display_name="{display_name}"'
datasets = aiplatform.TextDataset.list(filter=dataset_filter)
print(datasets)

[<google.cloud.aiplatform.datasets.text_dataset.TextDataset object at 0x7f89161383d0> 
resource name: projects/63364166818/locations/us-central1/datasets/7344517771218124800]


In [13]:
# Placeholder value; while it is left unchanged, the ID is taken from the
# dataset created earlier in this notebook.
dataset_id = "[your-dataset-id]"

if dataset_id == "[your-dataset-id]":
    # The ID is the last path segment of the dataset's resource name
    dataset_id = ds.resource_name.rpartition("/")[2]
    print(f"Dataset ID: {dataset_id}")

text_dataset = aiplatform.TextDataset(dataset_id)

Dataset ID: 7344517771218124800


In [14]:
# Define a single-label AutoML text classification training job
training_job_display_name = "e2e-text-training-job-" + TIMESTAMP
job = aiplatform.AutoMLTextTrainingJob(
    display_name=training_job_display_name,
    multi_label=False,
    prediction_type="classification",
)

In [15]:
model_display_name = "e2e-text-classification-model-" + TIMESTAMP

# Fractions of the dataset used for training / validation / test
fraction_splits = {
    "training_fraction_split": 0.7,
    "validation_fraction_split": 0.2,
    "test_fraction_split": 0.1,
}

# Run the training job (sync=True)
model = job.run(
    dataset=text_dataset,
    model_display_name=model_display_name,
    sync=True,
    **fraction_splits,
)

INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/9210759538972557312?project=63364166818
INFO:google.cloud.aiplatform.training_jobs:AutoMLTextTrainingJob projects/63364166818/locations/us-central1/trainingPipelines/9210759538972557312 current state:
PipelineState.PIPELINE_STATE_PENDING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTextTrainingJob projects/63364166818/locations/us-central1/trainingPipelines/9210759538972557312 current state:
PipelineState.PIPELINE_STATE_PENDING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTextTrainingJob projects/63364166818/locations/us-central1/trainingPipelines/9210759538972557312 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTextTrainingJob projects/63364166818/locations/us-central1/trainingPipelines/9210759538972557312 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.train

KeyboardInterrupt: ignored

**Get and review model evaluation scores**

In [None]:
# Look the trained model up by its display name.
model_filter = f'display_name="{model_display_name}"'
models = aiplatform.Model.list(filter=model_filter)
print(models)

In [None]:
# Get the resource name of the model. While the placeholder is left unchanged,
# the name is taken from the Model instance created by the training cell.
model_name = "[your-model-resource-name]"
if model_name == "[your-model-resource-name]":
    # Use the `resource_name` of the Model instance you created previously
    model_name = model.resource_name
    print(f"Model name: {model_name}")


# Get a reference to the Model Service client. The API endpoint is derived from
# REGION so it stays consistent with the region passed to aiplatform.init()
# instead of being hard-coded to us-central1.
client_options = {"api_endpoint": f"{REGION}-aiplatform.googleapis.com"}
model_service_client = aiplatform.gapic.ModelServiceClient(
    client_options=client_options
)

In [None]:
# List the evaluations recorded for the model and keep the first one.
model_evaluations = model_service_client.list_model_evaluations(parent=model_name)
evaluation_list = list(model_evaluations)
model_evaluation = evaluation_list[0]

In [None]:
# Convert the evaluation protobuf into a plain dict and pull out its metrics.
model_eval_dict = json_format.MessageToDict(model_evaluation._pb)
metrics = model_eval_dict["metrics"]
confidence_metrics = metrics["confidenceMetrics"]

print(f'Area under precision-recall curve (AuPRC): {metrics["auPrc"]}')

# Print every metric of each confidence-metrics entry. The inner loop uses its
# own names so that `metrics` keeps referring to the metrics dict afterwards
# (previously it was rebound to a dict keys view inside the loop).
for confidence_scores in confidence_metrics:
    print("\n")
    for metric_name, metric_value in confidence_scores.items():
        print(f"\t{metric_name}: {metric_value}")

**Deploy your text classification model**

In [None]:
deployed_model_display_name = (
    "e2e-deployed-text-classification-model-" + TIMESTAMP
)

# Deploy the trained model; the call returns the endpoint it was deployed to.
endpoint = model.deploy(
    sync=True,
    deployed_model_display_name=deployed_model_display_name,
)

In [None]:
# List the endpoints and keep those that serve a deployed model whose display
# name starts with `deployed_model_display_name`.
endpoints = aiplatform.Endpoint.list()

# The loop variable is deliberately NOT called `model`: at notebook scope that
# would overwrite the trained `model` object that other cells rely on.
# `any()` also ensures an endpoint is listed once even if several of its
# deployed models match.
endpoint_with_deployed_model = [
    endpoint_
    for endpoint_ in endpoints
    if any(
        deployed_model.display_name.startswith(deployed_model_display_name)
        for deployed_model in endpoint_.list_models()
    )
]

print(endpoint_with_deployed_model)

**Get online predictions from your model**

In [None]:
# Resource name of the endpoint to query. While the placeholder is left
# unchanged, the endpoint returned by the deploy cell is used.
endpoint_name = "[your-endpoint-name]"
if endpoint_name == "[your-endpoint-name]":
    endpoint_name = endpoint.resource_name

print(f"Endpoint name: {endpoint_name}")

endpoint = aiplatform.Endpoint(endpoint_name)
content = "I got a high score on my math final!"

# Send a single text instance for online prediction.
response = endpoint.predict(instances=[{"content": content}])

# Each prediction holds parallel lists of IDs, display names and confidences.
# Iterate them together with zip() and avoid naming a variable `id`, which
# would shadow the builtin for the rest of the notebook session.
for prediction_ in response.predictions:
    for prediction_id, predicted_label, confidence in zip(
        prediction_["ids"],
        prediction_["displayNames"],
        prediction_["confidences"],
    ):
        print(f"Prediction ID: {prediction_id}")
        print(f"Prediction display name: {predicted_label}")
        print(f"Prediction confidence score: {confidence}")

**Get batch predictions from your model**

In [None]:
# Name of the JSONL file that will list the batch prediction inputs
input_file_name = "batch-prediction-input.jsonl"

# Texts to classify in the batch prediction job
instances = [
    "We hiked through the woods and up the hill to the ice caves",
    "My kitten is so cute",
]

In [None]:
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_NAME = "[your-bucket-name]"

if BUCKET_NAME == "" or BUCKET_NAME is None or BUCKET_NAME == "[your-bucket-name]":
    BUCKET_NAME = f"automl-text-notebook-{TIMESTAMP}"

BUCKET_URI = f"gs://{BUCKET_NAME}"

! gsutil mb -l $REGION $BUCKET_URI

In [None]:
# Instantiate the Storage client and get a reference to the bucket (the bucket
# itself is created by the `gsutil mb` command in an earlier cell).
# The client is bound to its own name: assigning it to `storage` would
# overwrite the imported module and make this cell fail when re-run.
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

# Iterate over the prediction instances, uploading a new TXT file for each
# and recording its URI for the JSONL input file.
input_file_data = []
for count, instance_text in enumerate(instances):
    instance_name = f"input_{count}.txt"
    instance_file_uri = f"{BUCKET_URI}/{instance_name}"

    # Add the data to store in the JSONL input file.
    input_file_data.append({"content": instance_file_uri, "mimeType": "text/plain"})

    # Create the new instance file
    bucket.blob(instance_name).upload_from_string(instance_text)

# Write one JSON object per line. json.dumps() is required here: str(dict)
# yields a Python repr with single quotes, which is not valid JSON.
input_str = "\n".join(json.dumps(record) for record in input_file_data)
file_blob = bucket.blob(input_file_name)
file_blob.upload_from_string(input_str)

In [None]:
job_display_name = "e2e-text-classification-batch-prediction-job"

# Re-create the Model handle from its resource name.
model = aiplatform.Model(model_name=model_name)

# The input JSONL file and the output prefix both live in the same bucket.
batch_input_uri = f"{BUCKET_URI}/{input_file_name}"
batch_output_prefix = f"{BUCKET_URI}/output"

batch_prediction_job = model.batch_predict(
    job_display_name=job_display_name,
    gcs_source=batch_input_uri,
    gcs_destination_prefix=batch_output_prefix,
    sync=True,
)

batch_prediction_job_name = batch_prediction_job.resource_name

In [None]:
from google.cloud.aiplatform import jobs

# Look the batch prediction job up by resource name and report its state.
# `!s` forces str() formatting, exactly as the explicit str() call did.
batch_job = jobs.BatchPredictionJob(batch_prediction_job_name)
print(f"Batch prediction job state: {batch_job.state!s}")

In [None]:
BUCKET_OUTPUT = f"{BUCKET_URI}/output"

! gsutil ls -a $BUCKET_OUTPUT

In [None]:
RESULTS_DIRECTORY = "prediction_results"
RESULTS_DIRECTORY_FULL = f"{RESULTS_DIRECTORY}/output"

# Create missing directories
os.makedirs(RESULTS_DIRECTORY, exist_ok=True)

# Get the Cloud Storage paths for each result
! gsutil -m cp -r $BUCKET_OUTPUT $RESULTS_DIRECTORY

# Get most recently modified directory
latest_directory = max(
    [
        os.path.join(RESULTS_DIRECTORY_FULL, d)
        for d in os.listdir(RESULTS_DIRECTORY_FULL)
    ],
    key=os.path.getmtime,
)

print(f"Local results folder: {latest_directory}")

In [None]:
# Collect every downloaded file whose name contains "predictions"
results_files = [
    os.path.join(dirpath, file_name)
    for dirpath, _, file_names in os.walk(latest_directory)
    for file_name in file_names
    if "predictions" in file_name
]


# Consolidate all the results into a list, printing each one along the way.
# (Previously `results` was created but never filled.)
results = []
for results_file in results_files:
    # Open each result file; lines that are not valid JSON objects are skipped
    with jsonlines.open(results_file) as reader:
        for result in reader.iter(type=dict, skip_invalid=True):
            results.append(result)
            instance = result["instance"]
            prediction = result["prediction"]
            print(f"\ninstance: {instance['content']}")
            for key, output in prediction.items():
                print(f"\n{key}: {output}")

**Clean-up**

In [None]:
# Remove the Cloud Storage bucket and its contents, but only when the
# IS_TESTING environment variable is set; otherwise the bucket is left in place.
if os.getenv("IS_TESTING"):
    ! gsutil rm -r $BUCKET_URI

# Batch prediction job
batch_job.delete()

# `force` parameter ensures that models are undeployed before deletion
endpoint.delete(force=True)

# Model. NOTE(review): deleted only after the endpoint above; presumably a
# model cannot be deleted while it is still deployed -- confirm before reordering.
model.delete()

# Dataset
text_dataset.delete()

# Training job
job.delete()