In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Protecting Sensitive Data in Gen AI model responses

## Overview

Your team already has a Python function that identifies and redacts or blocks sensitive data types in Gen AI model responses. You have been asked to expand the function to block Gen AI model responses that contain [US Vehicle Identification Numbers](https://cloud.google.com/sensitive-data-protection/docs/infotypes-reference#united_states), which are sensitive data consisting of a unique 17-character alphanumeric code assigned to every on-road motor vehicle in North America.

To help you achieve this goal, complete the following subtasks by following the instructions in the cells below:

1. Run all cells in the section titled **Getting started with this notebook**.

2. Expand an existing Python function in the section titled **Update an existing Python function to block Gemini 2.0 Flash model responses when a US VIN has been included**.

3. Generate an example text response with the following prompt to test your updated function: `Is 4Y1SL65848Z411439 an example of a US Vehicle Identification Number (VIN)?`

## Getting started with this notebook

Below are a few steps to get your environment ready, including installing key Python packages and setting your environment variables (project ID and region).

Be sure to run each cell in consecutive order using the `Run` button (play arrow) at the top of this notebook. 

### Install necessary packages 

In [None]:
# Install Vertex AI
!pip install google-cloud-aiplatform --upgrade --user

# Install Cloud Data Loss Prevention
! pip install google-cloud-dlp --upgrade --user

### Restart current runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel.

In [None]:
# Restart the kernel so the freshly installed packages can be imported
import IPython

# Passing True asks the kernel to restart rather than just shut down
IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b><p>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</p> When prompted, click OK to continue. </b>
</div>

### Set your project ID and region

In [None]:
# Get the Project ID
PROJECT_ID = !gcloud config get project  # Example: qwiklabs-gcp-04-b75c09c1eb74
PROJECT_ID = PROJECT_ID[0]
print(PROJECT_ID)  # Print the Project ID

# Get the default region
LOCATION = !gcloud compute project-info describe --format="value(commonInstanceMetadata.items[google-compute-default-region])"
print(LOCATION[0])  # Print the region (e.g., us-central1)

### Import Gemini 2.0 Flash model

In [None]:
# Load the Gemini model used for text generation
from vertexai.generative_models import GenerativeModel

MODEL_NAME = "gemini-2.0-flash-001"
model = GenerativeModel(MODEL_NAME)

## Update an existing Python function to block Gemini 2.0 Flash model responses when a US VIN has been included

In this section, you revise an existing Python function to block output for [US Vehicle Identification Numbers (last entry for United States infoTypes)](https://cloud.google.com/sensitive-data-protection/docs/infotypes-reference#united_states).

In the code block below for the function, __modify the code lines after `# Add conditional return to block responses containing US Vehicle Identification Numbers (VIN)`__ to block model responses containing this infoType.

Be sure to run the cell with your final Python function code before you move on to the next cells to test the updated function.

In [None]:
# Redefine original function to inspect and deidentify output with Sensitive Data Protection
import google.cloud.dlp  
from typing import List 

def deidentify_with_replace_infotype(
    project: str, item: str, info_types: List[str]
) -> None:
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by replacing it with the info type, then prints the result.

    After de-identification the original string is inspected a second time
    for a blocked info type; when a finding of that type is present, a
    "[Blocked ...]" notice is printed instead of the de-identified text.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
    Returns:
        None; the de-identified text or the block notice is printed.
    """

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id. The `project` argument
    # is used (not a notebook global) so the function honours its caller.
    parent = f"projects/{project}"

    # Construct inspect configuration dictionary
    inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]}

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {"primitive_transformation": {"replace_with_info_type_config": {}}}
            ]
        }
    }

    # Call the API for deidentify
    deidentify_response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": {"value": item},
        }
    )

    return_payload = deidentify_response.item.value

    # Add conditional return to block responses containing US Vehicle Identification Numbers (VIN)
    # Separate names are used below so the `info_types` argument and the
    # de-identify request objects are not silently overwritten.
    blocked_info_type = "DOCUMENT_TYPE/R&D/SOURCE_CODE"
    block_inspect_config = {"info_types": [{"name": blocked_info_type}]}

    inspect_response = dlp.inspect_content(
        request={
            "parent": parent,
            "inspect_config": block_inspect_config,
            "item": {"value": item},
        }
    )

    # A single matching finding is enough to block the whole payload.
    if any(
        finding.info_type.name == blocked_info_type
        for finding in inspect_response.result.findings
    ):
        return_payload = '[Blocked due to category: Source Code]'

    # Print results
    print(return_payload)

## Generate an example with VIN using Gemini 2.0 Flash model and block results

In the code blocks below, generate an example text response containing a US Vehicle Identification Number (VIN) using the following prompt:

`Is 4Y1SL65848Z411439 an example of a US Vehicle Identification Number (VIN)?`

Then, write and execute the appropriate code lines to block responses containing US Vehicle Identification Numbers (VIN). 

In [None]:
# Prompt that asks the model about a US Vehicle Identification Number (VIN)
prompt = "Is 4Y1SL65848Z411439 an example of a US Vehicle Identification Number (VIN)?"

# Send the prompt to the model; the output is kept as response_vin
response_vin = model.generate_content(prompt)

# Show the raw model response first (nothing is blocked at this point)
print(response_vin.text)

# Run the response through the DLP helper to block output that includes a VIN
deidentify_with_replace_infotype(
    PROJECT_ID,
    response_vin.text,
    ["US_VEHICLE_IDENTIFICATION_NUMBER"],
)

In [22]:
def deidentify_with_replace_infotype(
    project: str, item: str, info_types: List[str]
) -> None:
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by replacing it with the info type, then prints the result.

    After de-identification the original string is inspected a second time
    for US Vehicle Identification Numbers; when one is found, a
    "[Blocked ...]" notice is printed instead of the de-identified text.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
    Returns:
        None; the de-identified text or the block notice is printed.
    """

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id. The `project` argument
    # is used (not a notebook global) so the function honours its caller.
    parent = f"projects/{project}"

    # Construct inspect configuration dictionary
    inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]}

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {"primitive_transformation": {"replace_with_info_type_config": {}}}
            ]
        }
    }

    # Call the API for deidentify
    deidentify_response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": {"value": item},
        }
    )

    return_payload = deidentify_response.item.value

    # Add conditional return to block responses containing US Vehicle Identification Numbers (VIN)
    # The VIN check uses its own inspect configuration, independent of the
    # caller-supplied `info_types`.
    vin_info_type = "US_VEHICLE_IDENTIFICATION_NUMBER"
    vin_inspect_config = {"info_types": [{"name": vin_info_type}]}

    inspect_response = dlp.inspect_content(
        request={
            "parent": parent,
            "inspect_config": vin_inspect_config,
            "item": {"value": item},
        }
    )

    # A single VIN finding is enough to block the whole payload.
    if any(
        finding.info_type.name == vin_info_type
        for finding in inspect_response.result.findings
    ):
        return_payload = '[Blocked due to category: US Vehicle Identification Number (VIN)]'

    # Print results
    print(return_payload)