Skip to content

Commit

Permalink
[text analytics] Add redacted_text (Azure#13449)
Browse files Browse the repository at this point in the history
  • Loading branch information
iscai-msft authored and rakshith91 committed Sep 4, 2020
1 parent 829d7c6 commit 1c80ef6
Show file tree
Hide file tree
Showing 10 changed files with 217 additions and 11 deletions.
1 change: 1 addition & 0 deletions sdk/textanalytics/azure-ai-textanalytics/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- We are now targeting the service's v3.1-preview.1 API as the default. If you would still like to use version v3.0 of the service,
pass in `v3.0` to the kwarg `api_version` when creating your TextAnalyticsClient
- We have added an API `recognize_pii_entities` which returns entities containing personal information for a batch of documents. Only available for API version v3.1-preview.1 and up.
- In API version v3.1-preview.2 and up, the redacted text of the document is returned on the top-level result object `RecognizePiiEntitiesResult` through property `redacted_text`.
- Added `offset` and `length` properties for `CategorizedEntity`, `SentenceSentiment`, and `LinkedEntityMatch`. These properties are only available for API versions v3.1-preview.1 and up.
- `length` is the number of characters in the text of these models
- `offset` is the offset of the text from the start of the document
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ class RecognizePiiEntitiesResult(DictMixin):
:ivar entities: Recognized PII entities in the document.
:vartype entities:
list[~azure.ai.textanalytics.PiiEntity]
:ivar str redacted_text: Returns the text of the input document with all of the PII information
redacted out. Only returned for API versions v3.1-preview.2 and up.
:ivar warnings: Warnings encountered while processing document. Results will still be returned
if there are warnings, but they may not be fully accurate.
:vartype warnings: list[~azure.ai.textanalytics.TextAnalyticsWarning]
Expand All @@ -155,18 +157,28 @@ class RecognizePiiEntitiesResult(DictMixin):
~azure.ai.textanalytics.TextDocumentStatistics
:ivar bool is_error: Boolean check for error item when iterating over list of
results. Always False for an instance of a RecognizePiiEntitiesResult.
.. versionadded:: v3.1-preview.2
The *redacted_text* property.
"""

def __init__(self, **kwargs):
# Populate the result from keyword arguments; every field is optional.
self.id = kwargs.get("id", None)
self.entities = kwargs.get("entities", None)
# Stays None when the caller does not supply a redacted text.
self.redacted_text = kwargs.get("redacted_text", None)
# Unlike the other fields, warnings falls back to an empty list rather than None.
self.warnings = kwargs.get("warnings", [])
self.statistics = kwargs.get("statistics", None)
# Always False for this class; an `is_error` keyword argument, if passed, is ignored.
self.is_error = False

def __repr__(self):
    """Return a debug representation of this result, capped at 1024 characters.

    The string lists ``id``, ``entities``, ``redacted_text``, ``warnings``,
    ``statistics`` and ``is_error`` in that order. ``entities``, ``warnings``
    and ``statistics`` are rendered with ``repr()``; ``id``, ``redacted_text``
    and ``is_error`` are interpolated as-is.

    :return: The (possibly truncated) representation.
    :rtype: str
    """
    # A single return: the earlier variant that omitted redacted_text came
    # first in the body and made this one unreachable.
    return (
        "RecognizePiiEntitiesResult(id={}, entities={}, redacted_text={}, warnings={}, "
        "statistics={}, is_error={})".format(
            self.id,
            repr(self.entities),
            self.redacted_text,
            repr(self.warnings),
            repr(self.statistics),
            self.is_error,
        )[:1024]  # keep the repr bounded for very large documents
    )


class DetectLanguageResult(DictMixin):
Expand Down Expand Up @@ -214,9 +226,9 @@ class CategorizedEntity(DictMixin):
:ivar subcategory: Entity subcategory, such as Age/Year/TimeRange etc
:vartype subcategory: str
:ivar int offset: The entity text offset from the start of the document.
Returned in unicode code points. Only returned for api versions v3.1-preview.1 and up.
Returned in unicode code points. Only returned for API versions v3.1-preview.1 and up.
:ivar int length: The length of the entity text. Returned
in unicode code points. Only returned for api versions v3.1-preview.1 and up.
in unicode code points. Only returned for API versions v3.1-preview.1 and up.
:ivar confidence_score: Confidence score between 0 and 1 of the extracted
entity.
:vartype confidence_score: float
Expand Down Expand Up @@ -671,9 +683,9 @@ class LinkedEntityMatch(DictMixin):
:vartype confidence_score: float
:ivar text: Entity text as appears in the request.
:ivar int offset: The linked entity match text offset from the start of the document.
Returned in unicode code points. Only returned for api versions v3.1-preview.1 and up.
Returned in unicode code points. Only returned for API versions v3.1-preview.1 and up.
:ivar int length: The length of the linked entity match text. Returned
in unicode code points. Only returned for api versions v3.1-preview.1 and up.
in unicode code points. Only returned for API versions v3.1-preview.1 and up.
:vartype text: str
.. versionadded:: v3.1-preview.1
The *offset* and *length* properties.
Expand Down Expand Up @@ -785,9 +797,9 @@ class SentenceSentiment(DictMixin):
:vartype confidence_scores:
~azure.ai.textanalytics.SentimentConfidenceScores
:ivar int offset: The sentence offset from the start of the document. Returned
in unicode code points. Only returned for api versions v3.1-preview.1 and up.
in unicode code points. Only returned for API versions v3.1-preview.1 and up.
:ivar int length: The length of the sentence. Returned
in unicode code points. Only returned for api versions v3.1-preview.1 and up.
in unicode code points. Only returned for API versions v3.1-preview.1 and up.
:ivar mined_opinions: The list of opinions mined from this sentence.
For example in "The food is good, but the service is bad", we would
mine these two opinions "food is good", "service is bad". Only returned
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ def pii_entities_result(entity, results): # pylint: disable=unused-argument
return RecognizePiiEntitiesResult(
id=entity.id,
entities=[PiiEntity._from_generated(e) for e in entity.entities], # pylint: disable=protected-access
redacted_text=entity.redacted_text if hasattr(entity, "redacted_text") else None,
warnings=[TextAnalyticsWarning._from_generated(w) for w in entity.warnings], # pylint: disable=protected-access
statistics=TextDocumentStatistics._from_generated(entity.statistics), # pylint: disable=protected-access
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "My SSN is 859-98-0987.", "language":
"en"}]}'
headers:
Accept:
- application/json, text/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '80'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://cognitiveusw2dev.azure-api.net/text/analytics/v3.1-preview.2/entities/recognition/pii?showStats=false&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"redactedText":"My SSN is ***********.","id":"0","entities":[{"text":"859-98-0987","category":"U.S.
Social Security Number (SSN)","offset":10,"length":11,"confidenceScore":0.65}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id:
- c5ba8c84-0e46-471a-b4c8-f02c411c20ec
content-type:
- application/json; charset=utf-8
csp-billing-usage:
- CognitiveServices.TextAnalytics.BatchScoring=1
date:
- Mon, 31 Aug 2020 20:15:43 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '78'
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "My SSN is 859-98-0987.", "language":
"en"}]}'
headers:
Accept:
- application/json, text/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '80'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"id":"0","entities":[{"text":"859-98-0987","category":"U.S.
Social Security Number (SSN)","offset":10,"length":11,"confidenceScore":0.65}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id:
- 4ae026d1-15d1-4d77-8913-46922e72d7cb
content-type:
- application/json; charset=utf-8
csp-billing-usage:
- CognitiveServices.TextAnalytics.BatchScoring=1
date:
- Mon, 31 Aug 2020 19:58:17 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '68'
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "My SSN is 859-98-0987.", "language":
"en"}]}'
headers:
Accept:
- application/json, text/json
Content-Length:
- '80'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://cognitiveusw2dev.azure-api.net/text/analytics/v3.1-preview.2/entities/recognition/pii?showStats=false&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"redactedText":"My SSN is ***********.","id":"0","entities":[{"text":"859-98-0987","category":"U.S.
Social Security Number (SSN)","offset":10,"length":11,"confidenceScore":0.65}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id: dc638432-dc71-4f52-aadb-829c2dfd1935
content-type: application/json; charset=utf-8
csp-billing-usage: CognitiveServices.TextAnalytics.BatchScoring=1
date: Mon, 31 Aug 2020 20:15:43 GMT
strict-transport-security: max-age=31536000; includeSubDomains; preload
transfer-encoding: chunked
x-content-type-options: nosniff
x-envoy-upstream-service-time: '80'
status:
code: 200
message: OK
url: https://cognitiveusw2dev.azure-api.net//text/analytics/v3.1-preview.2/entities/recognition/pii?showStats=false&stringIndexType=UnicodeCodePoint
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "My SSN is 859-98-0987.", "language":
"en"}]}'
headers:
Accept:
- application/json, text/json
Content-Length:
- '80'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"id":"0","entities":[{"text":"859-98-0987","category":"U.S.
Social Security Number (SSN)","offset":10,"length":11,"confidenceScore":0.65}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id: eeda4dd4-74dd-4e54-88cb-5a0352f065cf
content-type: application/json; charset=utf-8
csp-billing-usage: CognitiveServices.TextAnalytics.BatchScoring=1
date: Mon, 31 Aug 2020 19:58:17 GMT
strict-transport-security: max-age=31536000; includeSubDomains; preload
transfer-encoding: chunked
x-content-type-options: nosniff
x-envoy-upstream-service-time: '106'
status:
code: 200
message: OK
url: https://westus2.api.cognitive.microsoft.com//text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&stringIndexType=UnicodeCodePoint
version: 1
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# ------------------------------------

import os
import pytest
import platform
import functools
Expand Down Expand Up @@ -576,6 +576,24 @@ def test_recognize_pii_entities_v3(self, client):

assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)

# Playback-only for now: the dev endpoint this test targets is unreliable.
@pytest.mark.playback_test_only
@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer(client_kwargs={
# redacted_text is only returned for API versions v3.1-preview.2 and up.
"api_version": TextAnalyticsApiVersion.V3_1_PREVIEW_2,
# NOTE(review): os.environ.get returns None when the variable is unset -- confirm the
# preparer tolerates a None key during playback.
"text_analytics_account_key": os.environ.get('AZURE_TEXT_ANALYTICS_KEY'),
"text_analytics_account": "https://cognitiveusw2dev.azure-api.net/"
})
def test_redacted_text(self, client):
# Send a single document containing an SSN through PII recognition.
result = client.recognize_pii_entities(["My SSN is 859-98-0987."])
# The SSN must be masked with asterisks and the rest of the text left unchanged.
self.assertEqual("My SSN is ***********.", result[0].redacted_text)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
def test_redacted_text_v3_1_preview_1(self, client):
# Uses the preparer's default client; per the test name this exercises v3.1-preview.1
# (presumably the default API version -- confirm against the preparer).
result = client.recognize_pii_entities(["My SSN is 859-98-0987."])
# redacted_text is only returned for v3.1-preview.2 and up, so it should be None here.
self.assertIsNone(result[0].redacted_text)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
def test_phi_domain_filter(self, client):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# ------------------------------------

import os
import pytest
import platform
import functools
Expand Down Expand Up @@ -574,6 +574,24 @@ async def test_recognize_pii_entities_v3(self, client):

assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)

# Playback-only for now: the dev endpoint this test targets is unreliable.
@pytest.mark.playback_test_only
@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer(client_kwargs={
# redacted_text is only returned for API versions v3.1-preview.2 and up.
"api_version": TextAnalyticsApiVersion.V3_1_PREVIEW_2,
# NOTE(review): os.environ.get returns None when the variable is unset -- confirm the
# preparer tolerates a None key during playback.
"text_analytics_account_key": os.environ.get('AZURE_TEXT_ANALYTICS_KEY'),
"text_analytics_account": "https://cognitiveusw2dev.azure-api.net/"
})
async def test_redacted_text(self, client):
# Async variant: send a single document containing an SSN through PII recognition.
result = await client.recognize_pii_entities(["My SSN is 859-98-0987."])
# The SSN must be masked with asterisks and the rest of the text left unchanged.
self.assertEqual("My SSN is ***********.", result[0].redacted_text)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
async def test_redacted_text_v3_1_preview_1(self, client):
# Async variant using the preparer's default client; per the test name this exercises
# v3.1-preview.1 (presumably the default API version -- confirm against the preparer).
result = await client.recognize_pii_entities(["My SSN is 859-98-0987."])
# redacted_text is only returned for v3.1-preview.2 and up, so it should be None here.
self.assertIsNone(result[0].redacted_text)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
async def test_phi_domain_filter(self, client):
Expand Down
4 changes: 3 additions & 1 deletion sdk/textanalytics/azure-ai-textanalytics/tests/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,11 +290,13 @@ def test_recognize_pii_entities_result(self, pii_entity, text_analytics_warning,
model = _models.RecognizePiiEntitiesResult(
id="1",
entities=[pii_entity[0]],
redacted_text="***********",
warnings=[text_analytics_warning[0]],
statistics=text_document_statistics[0],
is_error=False
)
model_repr = "RecognizePiiEntitiesResult(id=1, entities=[{}], warnings=[{}], statistics={}, is_error=False)".format(
model_repr = "RecognizePiiEntitiesResult(id=1, entities=[{}], redacted_text=***********, warnings=[{}], " \
"statistics={}, is_error=False)".format(
pii_entity[1], text_analytics_warning[1], text_document_statistics[1]
)

Expand Down

0 comments on commit 1c80ef6

Please sign in to comment.