# Detect PII in Node Text Data

In [None]:
!pip install llama-index==0.5.26

In [7]:
import logging
import sys

# Send INFO-level (and above) log records to stdout so they show up in cell output.
# basicConfig attaches a stdout StreamHandler to the root logger by itself; adding
# a second StreamHandler by hand would print every record twice (and once more for
# each re-run of this cell), so only basicConfig is used.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

from llama_index.indices.postprocessor import PIINodePostprocessor
from llama_index import ServiceContext, Document
from llama_index.data_structs import Node
from llama_index import GPTSimpleVectorIndex

In [8]:
# Sample input containing a personal name, a credit card number and a mailing address.
# The value begins and ends with a newline; the sentence itself is one unbroken line.
text = (
    "\n"
    "Hello Paulo Santos. The latest statement for your credit card account "
    "1111-0000-1111-0000 was mailed to 123 Any Street, Seattle, WA 98109."
    "\n"
)
# Wrap the raw text in a Node so it can be handed to the postprocessor.
node = Node(text)

In [9]:
# Build a service context using from_defaults() with no overrides.
service_context = ServiceContext.from_defaults()
# Create the PII postprocessor and hand it that service context.
# NOTE(review): presumably the LLM configured in the service context performs
# the redaction — confirm against the PIINodePostprocessor documentation.
processor = PIINodePostprocessor(service_context=service_context)

In [10]:
# Run the PII postprocessor over a one-element list holding the sample node.
# The result is kept in new_nodes and is indexed by position in later cells.
new_nodes = processor.postprocess_nodes([node])

In [17]:
# View the text of the first returned node; the PII values appear as
# bracketed placeholders such as [NAME] instead of the original strings.
new_nodes[0].get_text()

'Hello [NAME]. The latest statement for your credit card account [CREDIT_CARD_NUMBER] was mailed to [ADDRESS].'

In [18]:
# Look up the placeholder -> original value mapping that is stored in the
# node's node_info under the "__pii_node_info__" key.
# NOTE: this mapping is not sent to the LLM!
new_nodes[0].node_info["__pii_node_info__"]

{'NAME': 'Paulo Santos',
 'CREDIT_CARD_NUMBER': '1111-0000-1111-0000',
 'ADDRESS': '123 Any Street, Seattle, WA 98109'}

In [19]:
# Build a vector index from the redacted nodes, so only placeholder text is indexed.
# NOTE(review): from_documents is given node objects here rather than Document
# objects; it runs as shown, but confirm whether node_info survives this path.
index = GPTSimpleVectorIndex.from_documents(new_nodes)

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 30 tokens
> [build_index_from_nodes] Total embedding token usage: 30 tokens
> [build_index_from_nodes] Total embedding token usage: 30 tokens


In [20]:
# Ask a question whose answer is one of the redacted fields.
response = index.query("What address was the statement mailed to?")
# print() applies str() to its argument, so the response text is shown as-is.
print(response)

INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 72 tokens
> [query] Total LLM token usage: 72 tokens
> [query] Total LLM token usage: 72 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 8 tokens
> [query] Total embedding token usage: 8 tokens
> [query] Total embedding token usage: 8 tokens

[ADDRESS]
