In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV loaded later in this notebook is read from ./sample_data,
# not from Drive — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Run this Notebook in Google Colab
<a target="_blank" href="https://colab.research.google.com/github/privateai/deid-examples/blob/main/python/LLM%20Examples/Removing%20Confidential%20Financial%20Information%20via%20Redaction%20for%20LLMs.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
!pip install transformers



In [None]:
import pandas as pd
import transformers

# The Starbucks Reviews Dataset

This dataset is from a Kaggle contest on Starbucks reviews. The goal of the contest is to analyse customer feedback for sentiment. However, as you can see, the dataset contains lots of PII, including comments about individual workers in the reviews. That is not great... let's see how we can fix that with Private AI.

In [None]:
# Load the Starbucks reviews CSV; header=0 takes the column names from the first row.
# NOTE(review): the filename is spelled "starbrucks" — confirm it matches the file on disk.
data_frame = pd.read_csv('./sample_data/starbrucks_reviews_data.csv',header=0)

## Original Dataframe

The original dataframe has comments that talk about specific workers, like Amber and LaDonna in the first record.

In [None]:
# Preview the first four rows of the frame as loaded from the CSV (before any redaction).
data_frame.head(4)

Unnamed: 0,name,location,Date,Rating,Review,Image_Links
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...,['No Images']
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...,['No Images']
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...,['https://media.consumeraffairs.com/files/cach...
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...,['No Images']


# Now let's redact the frame

In [None]:
!pip install privateai_client



In [None]:
from privateai_client import PAIClient
from privateai_client import request_objects
import os
import json
import requests
from google.colab import userdata
# Read the Private AI key from Colab's userdata store instead of hardcoding it in the notebook.
api_key = userdata.get('PAI_API_KEY') #if you store your API keys in the userdata object in colab

# Client pointed at the hosted Private AI de-identification endpoint; used below to process text.
client = PAIClient(url="https://api.private-ai.com/deid/", api_key=api_key)

In [None]:
# Dates are important in many ML tasks so we will leave it in for this notebook
# A "DISABLE" selector lists entity types that should NOT be redacted.
# NOTE(review): the redaction cell further down builds its own `entity_type_selector`
# (which also disables DATE), so this sample selector only feeds the draft request below.
sample_entity_type_selector = request_objects.entity_type_selector_obj(
        type="DISABLE", value= ['DATE'] 
        )

In [None]:
# Output format for redacted text: each detected entity is replaced by a marker.
# Judging by the redacted output later in the notebook, the marker is the entity
# type name (e.g. NAME_GIVEN, URL).
proc_obj = request_objects.processed_text_obj(type="MARKER", pattern="BEST_ENTITY_TYPE")

In [None]:
# Draft request with no text yet. `entity_detection` must be an entity-detection
# object that wraps the selector(s) in `entity_types` — passing the bare selector
# (as before) is inconsistent with how the request is built in the redaction cell.
text_req = request_objects.process_text_obj(
    text=[],
    processed_text=proc_obj,
    entity_detection=request_objects.entity_detection_obj(
        entity_types=[sample_entity_type_selector]
    ),
)

In [None]:
# NOTE(review): exploratory leftover — constructs a processed_text_obj with no
# arguments and discards it; only its repr is displayed. Candidate for removal.
request_objects.processed_text_obj()

<privateai_client.components.request_objects.ProcessedText at 0x79c1af923fd0>

In [None]:
# Here we are going to disable some redaction.
# Since the Starbucks management surely wants to know if there are any specific locations that have worse scores than others
# we will not redact organizations, locations, or addresses

proc_obj = request_objects.processed_text_obj(type="MARKER", pattern="BEST_ENTITY_TYPE")
entity_type_selector = request_objects.entity_type_selector_obj(
    type="DISABLE",
    value=['ORGANIZATION', 'LOCATION', 'LOCATION_ADDRESS', 'LOCATION_STATE', 'LOCATION_CITY', 'DATE'],
)
entity_detection = request_objects.entity_detection_obj(entity_types=[entity_type_selector])

# Each column is sent as ONE string of the form "<column>:<cell> | <cell> | ...",
# so the whole frame is redacted with a single request (one text entry per column).
CELL_SEPARATOR = ' | '
text_req = request_objects.process_text_obj(
    text=[
        f"{column}:{CELL_SEPARATOR.join(str(cell) for cell in data_frame[column])}"
        for column in data_frame.columns
    ],
    processed_text=proc_obj,
    entity_detection=entity_detection,
)

resp = client.process_text(text_req)

# zip() would silently drop columns if the service returned fewer entries than we sent.
if len(resp.processed_text) != len(data_frame.columns):
    raise ValueError(
        f"Expected {len(data_frame.columns)} redacted columns, "
        f"got {len(resp.processed_text)}"
    )

redacted_data = dict()
for column, redacted_text in zip(data_frame.columns, resp.processed_text):
    # Key the result by the ORIGINAL column name rather than by whatever comes back
    # in the redacted text: the header could itself be altered by redaction, and a
    # column name containing ':' would be cut short by a plain split(':', 1).
    prefix = f"{column}:"
    if redacted_text.startswith(prefix):
        payload = redacted_text[len(prefix):]
    else:
        payload = redacted_text.partition(':')[2]

    cells = payload.split(CELL_SEPARATOR)
    # A cell that contains the separator, or a redaction that swallows one, would
    # shift every following row; fail loudly instead of building a misaligned frame.
    if len(cells) != len(data_frame):
        raise ValueError(
            f"Column {column!r}: expected {len(data_frame)} cells after redaction, "
            f"got {len(cells)}"
        )
    redacted_data[column] = cells

# NOTE: every value in the redacted frame is a string (cells were stringified before sending).
redacted_data_frame = pd.DataFrame(redacted_data)

## Redacted Dataframe

After redaction, you can clearly see that the PII in the comments has been removed! Now this dataset can be safely used for training custom ML models without the risk of memorizing PII :)

In [None]:
# Preview the first five rows of the redacted frame for comparison with the original.
redacted_data_frame.head(5)

Unnamed: 0,name,location,Date,Rating,Review,Image_Links
0,NAME_GIVEN,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,NAME_GIVEN and NAME_GIVEN at the Starbucks on ...,['No Images']
1,NAME_GIVEN,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...,['No Images']
2,NAME_GIVEN,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...,['URL']
3,NAME_GIVEN,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...,['No Images']
4,NAME_GIVEN,"Gresham, OR","Reviewed Jan. 22, 2023",5.0,I’m on this kick of drinking 5 cups of warm wa...,"['URL', 'URL']"


# PII Safe Sentiment Analysis

Now that we have a redacted dataframe, we can run a sentiment analysis without including the names of the people involved.

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

In [None]:
# Review text of the first record (row 0, column position 4).
# Index row and column in a single .iloc call: the chained form .iloc[0][4] relies on
# Series.__getitem__ treating an integer key as a position, which pandas has deprecated.
data_frame.iloc[0, 4]

'Amber and LaDonna at the Starbucks on Southwest Parkway are always so warm and welcoming. There is always a smile in their voice when they greet you at the drive-thru. And their customer service is always spot-on, they always get my order right and with a smile. I would actually give them more than 5 stars if they were available.'

In [None]:
# Redacted review text of the first record (row 0, column position 4).
# Index row and column in a single .iloc call: the chained form .iloc[0][4] relies on
# Series.__getitem__ treating an integer key as a position, which pandas has deprecated.
redacted_data_frame.iloc[0, 4]

'NAME_GIVEN and NAME_GIVEN at the Starbucks on Southwest Parkway are always so warm and welcoming. There is always a smile in their voice when they greet you at the drive-thru. And their customer service is always spot-on, they always get my order right and with a smile. I would actually give them more than 5 stars if they were available.'

## Call Hugging Face for a Quick sentiment analysis

In [None]:
import requests
from google.colab import userdata
# Hugging Face access token, read from Colab's userdata store.
hf_token = userdata.get('HF_TOKEN')

def get_sentiment(text, timeout=30):
    """Classify the sentiment of `text` via the Hugging Face hosted Inference API.

    Sends `text` to the cardiffnlp/twitter-roberta-base-sentiment-latest model.

    Args:
        text: The text to score.
        timeout: Seconds to wait for the API before `requests` raises a timeout
            error. Without a timeout the call could block the notebook forever.

    Returns:
        The decoded JSON body of the response. The HTTP status is not checked, so
        whatever JSON the API sends back is returned unchanged.
    """
    API_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment-latest"
    headers = {"Authorization": f'Bearer {hf_token}'}

    response = requests.post(
        API_URL,
        headers=headers,
        json={"inputs": text},
        timeout=timeout,
    )
    return response.json()

In [None]:
# Sentiment of the ORIGINAL first review. Uses .iloc[0, 4] instead of the chained
# .iloc[0][4], whose integer-as-position Series lookup is deprecated in pandas.
print(get_sentiment(data_frame.iloc[0, 4]))

[[{'label': 'positive', 'score': 0.9743512868881226}, {'label': 'neutral', 'score': 0.02006211131811142}, {'label': 'negative', 'score': 0.005586681421846151}]]


In [None]:
# Sentiment of the REDACTED first review. Uses .iloc[0, 4] instead of the chained
# .iloc[0][4], whose integer-as-position Series lookup is deprecated in pandas.
print(get_sentiment(redacted_data_frame.iloc[0, 4]))

[[{'label': 'positive', 'score': 0.9751478433609009}, {'label': 'neutral', 'score': 0.019699547439813614}, {'label': 'negative', 'score': 0.005152557976543903}]]


# Wrapping Up

As you can see above, the sentiment scores for both redacted and unredacted reviews are basically the same!