This notebook is a step-by-step guide to creating a taxonomy, policy tags, and data masking rules, and to managing access to those masking rules. As a prerequisite, please create a table customer_data with 3 fields (id, name and driving license) and a few rows of test data.


In [None]:
#Lets import packages and define variables
from google.cloud import datacatalog_v1
from google.cloud import bigquery
from google.cloud import bigquery_datapolicies_v1
from google.cloud.bigquery_datapolicies_v1 import DataPolicyServiceClient
from google.cloud.bigquery_datapolicies_v1.types import DataPolicy, DataMaskingPolicy
from google.iam.v1 import iam_policy_pb2, policy_pb2


# Placeholder: replace with the ID of the Google Cloud project to work in.
project_id: str = "your project"
# Placeholder: replace with the location to work in; together with project_id
# it is used below to build the parent path under which the taxonomy is created.
location_id: str = "your location"

In [8]:
# One PolicyTagManagerClient is created here and reused by the taxonomy and
# policy-tag cells that follow.
client = datacatalog_v1.PolicyTagManagerClient()

# Full location path (built from project_id and location_id) that serves as
# the parent of the taxonomy.
parent = client.common_location_path(project_id, location_id)

Let's create the taxonomy pii-sensitive-taxonomy.

In [None]:
# Describe the taxonomy to create: only its display name and description are set.
taxonomy_spec = datacatalog_v1.Taxonomy()
taxonomy_spec.display_name = 'pii-sensitive-taxonomy'
taxonomy_spec.description = "This Taxonomy represents ..."

# Ask the API to create it under `parent`; `taxonomy` is rebound to the object
# returned by the service, whose display name and resource name are printed.
taxonomy = client.create_taxonomy(parent=parent, taxonomy=taxonomy_spec)
print(f"Created taxonomy {taxonomy.display_name} : {taxonomy.name}")

Next, we will create the first policy tag under pii-sensitive-taxonomy.

In [None]:
# Build the top-level policy tag "high", which represents the high severity category.
# `name` is intentionally not set: it is the tag's full resource name, which the
# service generates when the tag is created. The human-readable label belongs in
# `display_name`.
policy_tag = datacatalog_v1.PolicyTag(
    display_name="high",
    description="Severity Category high",
)

# The tag is created inside the taxonomy created above.
request = datacatalog_v1.CreatePolicyTagRequest(
    parent=taxonomy.name,
    policy_tag=policy_tag,
)

# Create the policy tag; `response` is the created tag and is reused by the
# next cell as the parent of the sub-tag.
response = client.create_policy_tag(request=request)

print(f"Tag Created {response.display_name}: {response.name}")

Now let's create a sub-tag under the "high" category.

In [None]:

# The "high" tag returned by the previous create call becomes the parent of the sub-tag.
parent_tag_name = response.name

# Build the child tag. `name` is intentionally not set: it is the tag's full
# resource name, which the service generates on creation. The label goes in
# `display_name`.
child_tag = datacatalog_v1.PolicyTag(
    display_name="driving_license",
    parent_policy_tag=parent_tag_name,
)

# The request's parent is still the taxonomy; nesting under "high" is expressed
# through `parent_policy_tag` on the tag itself.
request = datacatalog_v1.CreatePolicyTagRequest(
    parent=taxonomy.name,
    policy_tag=child_tag,
)

# `response` now refers to the newly created sub-tag.
response = client.create_policy_tag(request=request)

print(f"Subtag created :{response.name}")

Next, let's grant the FineGrainedReader role to users who need access to the raw data.

In [None]:
# Placeholders: replace with the IDs of the policy tag to grant access on (the
# trailing path segments of the resource name printed when the tag was created)
# and with the e-mail address of the user who needs to read the raw data.
taxonomy_id = "your taxonomy id"
policy_tag_id = "your policy tag id"
user_email = "your user email"

# Full resource name of the policy tag, built from the project/location defined
# at the top of the notebook.
resource = (
    f"projects/{project_id}/locations/{location_id}"
    f"/taxonomies/{taxonomy_id}/policyTags/{policy_tag_id}"
)

# Read-modify-write: fetch the current IAM policy so existing bindings are kept,
# add the new binding, then write the policy back. The IAM methods are called
# with explicit request objects, as in the data-policy IAM cell further below.
policy = client.get_iam_policy(
    request=iam_policy_pb2.GetIamPolicyRequest(resource=resource)
)
binding = policy_pb2.Binding(
    role="roles/datacatalog.categoryFineGrainedReader",
    members=[f"user:{user_email}"],
)
policy.bindings.append(binding)

client.set_iam_policy(
    request=iam_policy_pb2.SetIamPolicyRequest(resource=resource, policy=policy)
)

Next, we will create data masking policies (rules) for different types of users.

In [None]:
# Create the Data Policy service client. From here on `client` refers to this
# client and replaces the PolicyTagManagerClient used in the cells above.
client = bigquery_datapolicies_v1.DataPolicyServiceClient()

# Parent of the data policies, in the form projects/<project>/locations/<location>.
# It uses the same location as the taxonomy created above.
parent = f"projects/{project_id}/locations/{location_id}"

In [None]:
# Data policy for the Sales team, who only need to see the last 4 characters of
# the driving license.

# Placeholders: replace with the IDs of the "driving_license" sub-tag created
# above (the trailing path segments of the resource name printed at creation).
taxonomy_id = "your taxonomy id"
policy_tag_id = "your policy tag id"

data_policy = bigquery_datapolicies_v1.DataPolicy()
# Only the ID is supplied; the full resource `name` is output-only and is
# assigned by the service, so it is not set here.
data_policy.data_policy_id = "pii_last_four"
data_policy.data_policy_type = "DATA_MASKING_POLICY"

# Full resource name of the policy tag this masking rule applies to. The tag
# lives in the same project/location as the taxonomy it belongs to.
data_policy.policy_tag = (
    f"projects/{project_id}/locations/{location_id}"
    f"/taxonomies/{taxonomy_id}/policyTags/{policy_tag_id}"
)
data_policy.data_masking_policy.predefined_expression = "LAST_FOUR_CHARACTERS"

request = bigquery_datapolicies_v1.CreateDataPolicyRequest(
    parent=parent,
    data_policy=data_policy,
)

# Make the request
response = client.create_data_policy(request=request)

# Handle the response
print(response)

In [None]:
# Data policy for the Analytics team, who do not need the original value, only
# its hash.

# Placeholders: replace with the IDs of the "driving_license" sub-tag created
# above (the trailing path segments of the resource name printed at creation).
taxonomy_id = "your taxonomy id"
policy_tag_id = "your policy tag id"

data_policy = bigquery_datapolicies_v1.DataPolicy()
# Only the ID is supplied; the full resource `name` is output-only and is
# assigned by the service, so it is not set here.
data_policy.data_policy_id = "pii_hash"
data_policy.data_policy_type = "DATA_MASKING_POLICY"

# Full resource name of the policy tag this masking rule applies to.
data_policy.policy_tag = (
    f"projects/{project_id}/locations/{location_id}"
    f"/taxonomies/{taxonomy_id}/policyTags/{policy_tag_id}"
)
data_policy.data_masking_policy.predefined_expression = "SHA256"

request = bigquery_datapolicies_v1.CreateDataPolicyRequest(
    parent=parent,
    data_policy=data_policy,
)

# Make the request
response = client.create_data_policy(request=request)

# Handle the response
print(response)

Now let's grant the bigquerydatapolicy.maskedReader role to different users.

In [None]:
# Data policy to grant masked-read access on.
resource = f"projects/{project_id}/locations/{location_id}/dataPolicies/pii_last_four"

# Placeholder: replace with the principal that should see the masked values.
member = "serviceAccount:your service account"

# Read-modify-write: SetIamPolicy replaces the whole policy, so start from the
# current policy to keep any bindings that already exist on this data policy.
policy = client.get_iam_policy(
    request=iam_policy_pb2.GetIamPolicyRequest(resource=resource)
)
policy.bindings.append(
    policy_pb2.Binding(
        role="roles/bigquerydatapolicy.maskedReader",
        members=[member],
    )
)

# Initialize request argument(s)
request = iam_policy_pb2.SetIamPolicyRequest(
    resource=resource,
    policy=policy,
)

response = client.set_iam_policy(request=request)

print(f"Updated IAM policy for resource {resource}: {response}")

# Repeat the above for other data policies and other service accounts as needed.

Now let's run queries to test the permissions.

In [None]:
from google.cloud import bigquery
import pandas as pd
import os

# To run as a specific identity, authenticate with a service account key (or
# switch to a different user) before creating the client, e.g.:
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'your_json_file.json'

# BigQuery client used to run the test query.
client = bigquery.Client(project="your_project")

# Read the name and driving-license columns of the test table.
query = "SELECT customer_name, customer_driving_license FROM your_dataset.customer_data"

# The query is run with legacy SQL disabled; set use_legacy_sql to True to use
# legacy SQL syntax instead.
job_config = bigquery.QueryJobConfig()
job_config.use_legacy_sql = False

# Submit the query with the configuration above and wait for it to complete.
results = client.query_and_wait(query, job_config=job_config)

# Load the result rows into a pandas DataFrame and print it as a table.
df = results.to_dataframe()
print(df)