In [56]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read the dataset and a config file from paths under it
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Run this Notebook in Google Colab
<a target="_blank" href="https://colab.research.google.com/github/privateai/deid-examples/blob/clean-up-structured-examples/python/structured_data/PII%20Safe%20Sentiment%20Analysis.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [57]:
# Install the `transformers` package, which the next cell imports
!pip install transformers



In [58]:
import pandas as pd
import transformers

# The Starbucks Reviews Dataset

This dataset is from a Kaggle contest on Starbucks reviews. The goal of the contest is to analyse customer feedback for sentiment. However, as you can see, the dataset contains lots of PII, including comments about workers in the individual reviews. That is not great... let's see how we can fix that with Private AI.

In [59]:
# Location of the Starbucks reviews CSV on the mounted shared drive
REVIEWS_CSV_PATH = '/content/drive/Shareddrives/Public Demo Notebooks/Structured Data/sample_structured/reviews_data.csv'
# header=0: the first line of the file supplies the column names
data_frame = pd.read_csv(REVIEWS_CSV_PATH, header=0)

## Original Dataframe

The original dataframe has comments that talk about specific workers, like Amber and LaDonna in the first record.

In [60]:
# Keep only the first four rows for this example. `.copy()` makes the sample an
# independent frame, so adding the redacted column to it later does not write
# into a slice of the full dataset.
data_frame = data_frame.head(4).copy()
data_frame.head()

Unnamed: 0,name,location,Date,Rating,Review,Image_Links
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...,['No Images']
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...,['No Images']
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...,['https://media.consumeraffairs.com/files/cach...
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...,['No Images']


# Now let's redact the frame

In [61]:
# Install the `privateai_client` package, which the next cell imports
!pip install privateai_client



In [62]:
from privateai_client import PAIClient
from privateai_client import request_objects
import os
import json
from google.colab import userdata
# Private AI API key, read from Colab user data under the name 'PAI_API_KEY'
api_key = userdata.get('PAI_API_KEY')
# NOTE(review): KEY_DATA is loaded from a personal Drive path but is not used by
# any cell visible in this notebook — confirm it is still needed.
with open('/content/drive/MyDrive/demodata/demo_config.json') as jsonfile:
  KEY_DATA = json.load(jsonfile)

# Client for the Private AI endpoint at the URL below; redact_text sends its requests through it
client = PAIClient(url="https://api.private-ai.com/deid/", api_key=api_key)

In [63]:
# Here we are going to disable some redaction.
# Since the Starbucks management surely wants to know if there are any specific locations that have worse scores than others
# we will not redact organizations, locations, addresses, or dates

def redact_text(text):
  """Return `text` with detected PII replaced by entity-type markers.

  Builds a Private AI process-text request for the single string `text` and
  sends it through the module-level `client`. Organization, location, address
  and date entity types are disabled in the request so they pass through
  untouched.

  Args:
    text: The review text to redact. Values that are not strings (for example
      the NaN pandas uses for a missing cell) and empty or whitespace-only
      strings are returned unchanged without calling the API.

  Returns:
    The first element of the response's `processed_text`, or `text` itself
    when there was nothing to redact.
  """
  # Nothing to redact: skip the API call instead of sending NaN/None/"" to the service
  if not isinstance(text, str) or not text.strip():
    return text

  # since we don't care about distinct individuals in these reviews we set the marker pattern to "BEST_ENTITY_TYPE"
  proc_obj = request_objects.processed_text_obj(type="MARKER", pattern="BEST_ENTITY_TYPE")
  # to allow locations, organizations, addresses and dates to pass through untouched we create a "disable" filter
  entity_type_selector = request_objects.entity_type_selector_obj(
      type="DISABLE", value=['ORGANIZATION','LOCATION','LOCATION_ADDRESS', 'LOCATION_STATE','LOCATION_CITY','DATE']
      )
  # create the redaction request object with the entity_detection set to include the filter created above
  entity_detection = request_objects.entity_detection_obj(entity_types=[entity_type_selector])
  text_req = request_objects.process_text_obj(
    text=[text], processed_text=proc_obj, entity_detection=entity_detection
    )
  # one string was sent, so the redacted result is the single entry of processed_text
  return client.process_text(text_req).processed_text[0]

In [64]:
# Redact every review into a new column; redact_text calls client.process_text once per row
data_frame["redacted_review"] = data_frame["Review"].apply(redact_text)

In [65]:
# Show the original Review column next to the new redacted_review column
data_frame.head()

Unnamed: 0,name,location,Date,Rating,Review,Image_Links,redacted_review
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...,['No Images'],NAME_GIVEN and NAME_GIVEN at the Starbucks on ...
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...,['No Images'],** at the Starbucks by the fire station on 436...
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...,['https://media.consumeraffairs.com/files/cach...,I just wanted to go out of my way to recognize...
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...,['No Images'],Me and my friend were at Starbucks and my card...


In [66]:
# NOTE(review): same display as the preceding cell; the frame has not changed in between
data_frame.head()

Unnamed: 0,name,location,Date,Rating,Review,Image_Links,redacted_review
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...,['No Images'],NAME_GIVEN and NAME_GIVEN at the Starbucks on ...
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...,['No Images'],** at the Starbucks by the fire station on 436...
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...,['https://media.consumeraffairs.com/files/cach...,I just wanted to go out of my way to recognize...
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...,['No Images'],Me and my friend were at Starbucks and my card...


## Redacted Dataframe

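After redaction, you can clearly see that the PII in the comments has been removed! The `redacted_review` column can now be safely used for training custom ML models without the risk of memorizing PII from the review text :) Note that only the `Review` text was redacted; other columns, such as `name`, are unchanged.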

# PII Safe Sentiment Analysis

Now that we have a redacted column in the dataframe, we can run a sentiment analysis without including the names of the people involved.

In [68]:
# First review, original text. Select the column by name rather than by
# position: an integer key on the label-indexed row Series is deprecated.
data_frame["Review"].iloc[0]

'Amber and LaDonna at the Starbucks on Southwest Parkway are always so warm and welcoming. There is always a smile in their voice when they greet you at the drive-thru. And their customer service is always spot-on, they always get my order right and with a smile. I would actually give them more than 5 stars if they were available.'

In [69]:
# First review, redacted text. Select the column by name rather than by
# position: an integer key on the label-indexed row Series is deprecated.
data_frame["redacted_review"].iloc[0]

'NAME_GIVEN and NAME_GIVEN at the Starbucks on Southwest Parkway are always so warm and welcoming. There is always a smile in their voice when they greet you at the drive-thru. And their customer service is always spot-on, they always get my order right and with a smile. I would actually give them more than 5 stars if they were available.'

In [70]:
import requests
# Hugging Face API token, read from Colab user data under the name 'HF_TOKEN'; get_sentiment sends it as a Bearer token
hf_token = userdata.get('HF_TOKEN')

def get_sentiment(text, timeout=30):
    """Score `text` with the hosted cardiffnlp/twitter-roberta-base-sentiment-latest model.

    Posts ``{"inputs": text}`` to the Hugging Face Inference API URL below,
    authenticating with the module-level ``hf_token``, and returns the decoded
    JSON body.

    Args:
        text: The string to classify.
        timeout: Seconds to wait for the API before ``requests`` raises a
            timeout error. Without a timeout a stalled request would block
            the notebook indefinitely.

    Returns:
        The parsed JSON response. No HTTP status check is made, so a JSON
        error body is returned to the caller rather than raised.
    """
    API_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment-latest"
    headers = {"Authorization": f'Bearer {hf_token}'}

    response = requests.post(API_URL, headers=headers, json={"inputs": text}, timeout=timeout)
    return response.json()

In [71]:
# Sentiment of the original first review (column selected by name, not by position)
print(get_sentiment(data_frame["Review"].iloc[0]))

[[{'label': 'positive', 'score': 0.9743512868881226}, {'label': 'neutral', 'score': 0.02006211131811142}, {'label': 'negative', 'score': 0.005586681421846151}]]


In [72]:
# Sentiment of the redacted first review (column selected by name, not by position)
print(get_sentiment(data_frame["redacted_review"].iloc[0]))

[[{'label': 'positive', 'score': 0.9751478433609009}, {'label': 'neutral', 'score': 0.019699547439813614}, {'label': 'negative', 'score': 0.005152557976543903}]]
