In [1]:
!python3 -m spacy download en_core_web_sm


Looking in indexes: https://dtn.jfrog.io/artifactory/api/pypi/pypi/simple
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Grammar Consistency Check
This section shows the validation of how the grammatical structure is retained when redacted with categories of the Personal Health Information(PHI).
In comparison with different sentence structure of the original and redacted sentence below is the result:

In [2]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jayragaileortiz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jayragaileortiz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
text = """Mr. Diego De Guzman, a 65 year old male, patient of Dr. Cuadra visited for the first time for several issues. Met with PCP in Feb for multiple issues, Acute Bronchitis changed from Robitussin to NyQuil, referral to Pulmonology; Streptococcal Pharyngitis, (on meds, GI consult) HTN (changed from nitrates to Lisinopril) cholesterol, GERD/H-pylori,  (on meds, GI consult)"""
tokens = nltk.word_tokenize(text)
tag = nltk.pos_tag(tokens)
print(tag)

[('Mr.', 'NNP'), ('Diego', 'NNP'), ('De', 'NNP'), ('Guzman', 'NNP'), (',', ','), ('a', 'DT'), ('65', 'CD'), ('year', 'NN'), ('old', 'JJ'), ('male', 'NN'), (',', ','), ('patient', 'NN'), ('of', 'IN'), ('Dr.', 'NNP'), ('Cuadra', 'NNP'), ('visited', 'VBD'), ('for', 'IN'), ('the', 'DT'), ('first', 'JJ'), ('time', 'NN'), ('for', 'IN'), ('several', 'JJ'), ('issues', 'NNS'), ('.', '.'), ('Met', 'NNP'), ('with', 'IN'), ('PCP', 'NNP'), ('in', 'IN'), ('Feb', 'NNP'), ('for', 'IN'), ('multiple', 'JJ'), ('issues', 'NNS'), (',', ','), ('Acute', 'NNP'), ('Bronchitis', 'NNP'), ('changed', 'VBD'), ('from', 'IN'), ('Robitussin', 'NNP'), ('to', 'TO'), ('NyQuil', 'NNP'), (',', ','), ('referral', 'JJ'), ('to', 'TO'), ('Pulmonology', 'NNP'), (';', ':'), ('Streptococcal', 'NNP'), ('Pharyngitis', 'NNP'), (',', ','), ('(', '('), ('on', 'IN'), ('meds', 'NNS'), (',', ','), ('GI', 'NNP'), ('consult', 'NN'), (')', ')'), ('HTN', 'NNP'), ('(', '('), ('changed', 'VBN'), ('from', 'IN'), ('nitrates', 'NNS'), ('to', 'TO

In [5]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
displacy.render(doc, style = "dep")

In [6]:
text_redacted = """NAME, a AGE year old male, patient of NAME-DOCTOR visited for the first time for several issues. Met with PCP in Feb for multiple issues, Acute Bronchitis changed from Robitussin to NyQuil, referral to Pulmonology; Streptococcal Pharyngitis, (on meds, GI consult) HTN (changed from nitrates to Lisinopril) cholesterol, GERD/H-pylori,  (on meds, GI consult)"""
tokens_redacted = nltk.word_tokenize(text)
tags_redacted = nltk.pos_tag(tokens)
print(tags_redacted)

[('Mr.', 'NNP'), ('Diego', 'NNP'), ('De', 'NNP'), ('Guzman', 'NNP'), (',', ','), ('a', 'DT'), ('65', 'CD'), ('year', 'NN'), ('old', 'JJ'), ('male', 'NN'), (',', ','), ('patient', 'NN'), ('of', 'IN'), ('Dr.', 'NNP'), ('Cuadra', 'NNP'), ('visited', 'VBD'), ('for', 'IN'), ('the', 'DT'), ('first', 'JJ'), ('time', 'NN'), ('for', 'IN'), ('several', 'JJ'), ('issues', 'NNS'), ('.', '.'), ('Met', 'NNP'), ('with', 'IN'), ('PCP', 'NNP'), ('in', 'IN'), ('Feb', 'NNP'), ('for', 'IN'), ('multiple', 'JJ'), ('issues', 'NNS'), (',', ','), ('Acute', 'NNP'), ('Bronchitis', 'NNP'), ('changed', 'VBD'), ('from', 'IN'), ('Robitussin', 'NNP'), ('to', 'TO'), ('NyQuil', 'NNP'), (',', ','), ('referral', 'JJ'), ('to', 'TO'), ('Pulmonology', 'NNP'), (';', ':'), ('Streptococcal', 'NNP'), ('Pharyngitis', 'NNP'), (',', ','), ('(', '('), ('on', 'IN'), ('meds', 'NNS'), (',', ','), ('GI', 'NNP'), ('consult', 'NN'), (')', ')'), ('HTN', 'NNP'), ('(', '('), ('changed', 'VBN'), ('from', 'IN'), ('nitrates', 'NNS'), ('to', 'TO

In [13]:
doc = nlp(text_redacted)
jpeg = displacy.render(doc, style = "dep", jupyter=False)

In [15]:
with open("dependency.svg", "w", encoding="utf-8") as f:
    f.write(jpeg)

In [7]:
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'a' |'the' | 'my'
N -> 'Mr.' | 'Diego' | 'De' |'Guzman,' | '65' | 'year' | 'male,' | 'patient' | 'Dr.' | 'Cuadra' | 'time' | 'issues.' | 'issues,' |'Met' | 'PCP' | 'Feb' | 'issues' | 'Acute' | 'Bronchitis' | 'Robitussin' | 'NyQuil,' | 'referral' | 'Streptococcal' | 'Pharyngitis' | 'Pulmonology;'| 'Pharyngitis,'| '(on'| 'meds,'| 'GI'| 'consult)'| 'HTN'| '(changed'| 'nitrates'| 'Lisinopril)'| 'cholesterol,'| 'GERD/H-pylori,'| '(on'| 'meds,'| 'GI'| 'consult)'
V -> 'visited' | 'changed'
P -> 'in' | 'of' | 'for' | 'with' | 'from' | 'to' | 'on'
ADJ -> 'old' | 'first' | 'several' | 'multiple'
""")

In [8]:
sent = text.split()
parser = nltk.ChartParser(groucho_grammar)
print(parser.parse(sent))
for tree in parser.parse(sent):
    print(tree)

<generator object Chart.parses at 0x134cfec00>
