In [1]:
import pandas as pd
import spacy
import pickle
from pathlib import Path
import hashlib

nlp = spacy.load("en_core_web_sm")

# Load notes
notes_path = Path("../interim/patient_notes.csv")
df_notes = pd.read_csv(notes_path)
print(df_notes.shape)

# Load trained CRF model
model_path = Path("../interim/crf_model.pkl")
print("CRF model exists:", model_path.exists())

with model_path.open("rb") as f:
    crf = pickle.load(f)

crf


(24714, 26)
CRF model exists: True


0,1,2
,algorithm,'lbfgs'
,min_freq,
,all_possible_states,
,all_possible_transitions,True
,c1,0.1
,c2,0.1
,max_iterations,100
,num_memories,
,epsilon,
,period,


In [2]:
import re

def word_shape(token):
    shape = []
    for ch in token:
        if ch.isupper():
            shape.append("X")
        elif ch.islower():
            shape.append("x")
        elif ch.isdigit():
            shape.append("d")
        else:
            shape.append(ch)
    return "".join(shape)

def token_features(tokens, i):
    token = tokens[i]
    feats = {
        "bias": 1.0,
        "word.lower()": token.lower(),
        "word.isupper()": token.isupper(),
        "word.istitle()": token.istitle(),
        "word.isdigit()": token.isdigit(),
        "word.shape": word_shape(token),
        "suffix3": token[-3:],
        "prefix1": token[:1],
    }

    if i > 0:
        prev = tokens[i-1]
        feats.update({
            "-1:word.lower()": prev.lower(),
            "-1:word.istitle()": prev.istitle(),
            "-1:word.isupper()": prev.isupper(),
        })
    else:
        feats["BOS"] = True

    if i < len(tokens) - 1:
        nxt = tokens[i+1]
        feats.update({
            "+1:word.lower()": nxt.lower(),
            "+1:word.istitle()": nxt.istitle(),
            "+1:word.isupper()": nxt.isupper(),
        })
    else:
        feats["EOS"] = True

    return feats

def sent2features(tokens):
    return [token_features(tokens, i) for i in range(len(tokens))]


In [3]:
def labels_to_spans(doc, labels):
    """
    doc: spaCy Doc
    labels: list of BIO tags, len = len(doc)
    Returns list of dicts: {type, start_char, end_char, text}
    """
    spans = []
    current_type = None
    start_idx = None

    for i, (token, tag) in enumerate(zip(doc, labels)):
        if tag == "O":
            if current_type is not None:
                end_token = doc[i-1]
                spans.append({
                    "type": current_type,
                    "start_char": doc[start_idx].idx,
                    "end_char": end_token.idx + len(end_token),
                    "text": doc.text[doc[start_idx].idx : end_token.idx + len(end_token)]
                })
                current_type = None
                start_idx = None
            continue

        # tag like B-NAME or I-NAME
        bio, ent_type = tag.split("-", 1)

        if bio == "B":
            # close previous entity if open
            if current_type is not None:
                end_token = doc[i-1]
                spans.append({
                    "type": current_type,
                    "start_char": doc[start_idx].idx,
                    "end_char": end_token.idx + len(end_token),
                    "text": doc.text[doc[start_idx].idx : end_token.idx + len(end_token)]
                })
            current_type = ent_type
            start_idx = i
        elif bio == "I":
            # continue only if same type
            if current_type is None:
                current_type = ent_type
                start_idx = i
            elif ent_type != current_type:
                # different type: close old, start new
                end_token = doc[i-1]
                spans.append({
                    "type": current_type,
                    "start_char": doc[start_idx].idx,
                    "end_char": end_token.idx + len(end_token),
                    "text": doc.text[doc[start_idx].idx : end_token.idx + len(end_token)]
                })
                current_type = ent_type
                start_idx = i

    # close last entity
    if current_type is not None:
        end_token = doc[len(doc)-1]
        spans.append({
            "type": current_type,
            "start_char": doc[start_idx].idx,
            "end_char": end_token.idx + len(end_token),
            "text": doc.text[doc[start_idx].idx : end_token.idx + len(end_token)]
        })

    return spans


In [4]:
DEID_POLICY = {
    "NAME": {"action": "mask", "replacement": "<NAME>"},
    "SSN": {"action": "hash"},
    "ID": {"action": "hash"},
    "PHONE": {"action": "mask", "replacement": "<PHONE>"},
    "ADDRESS": {"action": "mask", "replacement": "<ADDRESS>"},
    "DATE": {"action": "generalize_date"},
    # if you later add PAYER_NAME, ORG_NAME, etc. you can put them here too
}

SALT = "rithi_master_project_salt"  # fixed salt for hashing (document this in report)

def hash_value(text, label_type):
    h = hashlib.sha256((SALT + text).encode("utf-8")).hexdigest()[:10]
    return f"<{label_type}_HASH_{h}>"

def generalize_date(text):
    # for now, keep only the year
    try:
        dt = pd.to_datetime(text, errors="coerce")
        if pd.isna(dt):
            return "<DATE>"
        return f"<YEAR_{dt.year}>"
    except Exception:
        return "<DATE>"

def apply_policy(label_type, text):
    base_type = label_type  # e.g. NAME, SSN, DATE
    rule = DEID_POLICY.get(base_type, {"action": "none"})

    action = rule.get("action", "none")
    if action == "mask":
        return rule.get("replacement", f"<{base_type}>")
    elif action == "hash":
        return hash_value(text, base_type)
    elif action == "generalize_date":
        return generalize_date(text)
    else:
        # no rule: keep as-is
        return text


In [5]:
def deidentify_note(text):
    # 1) tokenize with spaCy
    doc = nlp(text)
    tokens = [t.text for t in doc]

    # 2) build features and predict labels with CRF
    feats = sent2features(tokens)
    y_pred = crf.predict_single(feats)  # list of BIO tags

    # 3) convert BIO tags to spans with char offsets
    spans = labels_to_spans(doc, y_pred)

    if not spans:
        return text, []  # nothing redacted

    # 4) apply policy span by span
    new_text_parts = []
    cursor = 0
    redacted_spans = []

    # sort spans by start_char to be safe
    spans = sorted(spans, key=lambda s: s["start_char"])

    for span in spans:
        start = span["start_char"]
        end = span["end_char"]
        orig = span["text"]
        label_type = span["type"]  # e.g. NAME, SSN, DATE

        # add text before span
        if cursor < start:
            new_text_parts.append(text[cursor:start])

        # transformed replacement
        replacement = apply_policy(label_type, orig)
        new_text_parts.append(replacement)

        redacted_spans.append({
            "type": label_type,
            "original": orig,
            "replacement": replacement,
            "start_char": start,
            "end_char": end,
        })

        cursor = end

    # add trailing text
    if cursor < len(text):
        new_text_parts.append(text[cursor:])

    new_text = "".join(new_text_parts)
    return new_text, redacted_spans


In [6]:
sample_text = df_notes["note"].iloc[0]
print("ORIGINAL:")
print(sample_text)

redacted, spans = deidentify_note(sample_text)
print("\nREDACTED:")
print(redacted)

spans[:5]


ORIGINAL:
Patient Jacinto644 Kris249. They were born on 2017-08-24. They live at 888 Hickle Ferry Suite 38, Springfield, Massachusetts 1106.0. Their social security number is 999-68-6630.

REDACTED:
Patient Jacinto644 Kris249. They were born on <YEAR_2017><DATE><DATE><DATE><DATE>. They live at 888 <NAME> <NAME> Suite 38, Springfield, Massachusetts 1106.0. Their social security number is <SSN_HASH_13c4fb31f7><SSN_HASH_7d5c7e1bf5><SSN_HASH_df1569f2a2><SSN_HASH_7d5c7e1bf5><SSN_HASH_ec3f069908>.


[{'type': 'DATE',
  'original': '2017',
  'replacement': '<YEAR_2017>',
  'start_char': 46,
  'end_char': 50},
 {'type': 'DATE',
  'original': '-',
  'replacement': '<DATE>',
  'start_char': 50,
  'end_char': 51},
 {'type': 'DATE',
  'original': '08',
  'replacement': '<DATE>',
  'start_char': 51,
  'end_char': 53},
 {'type': 'DATE',
  'original': '-',
  'replacement': '<DATE>',
  'start_char': 53,
  'end_char': 54},
 {'type': 'DATE',
  'original': '24',
  'replacement': '<DATE>',
  'start_char': 54,
  'end_char': 56}]

In [7]:
redacted_notes = []
num_spans = []

for text in df_notes["note"].tolist():
    red, spans = deidentify_note(text)
    redacted_notes.append(red)
    num_spans.append(len(spans))

df_notes["note_redacted"] = redacted_notes
df_notes["num_redacted_spans"] = num_spans

df_notes[["note", "note_redacted", "num_redacted_spans"]].head()


Unnamed: 0,note,note_redacted,num_redacted_spans
0,Patient Jacinto644 Kris249. They were born on ...,Patient Jacinto644 Kris249. They were born on ...,12
1,Patient Alva958 Krajcik437. They were born on ...,Patient Alva958 Krajcik437. They were born on ...,12
2,Patient Jayson808 Fadel536. They were born on ...,Patient Jayson808 Fadel536. They were born on ...,13
3,Patient Jimmie93 Harris789. They were born on ...,Patient Jimmie93 Harris789. They were born on ...,13
4,Patient Gregorio366 Auer97. They were born on ...,Patient Gregorio366 Auer97. They were born on ...,14


In [8]:
out_path = Path("../processed/patient_notes_deidentified.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
df_notes.to_csv(out_path, index=False)
out_path


WindowsPath('../processed/patient_notes_deidentified.csv')

In [9]:
df_notes["num_redacted_spans"].describe()

count    24714.000000
mean        18.670066
std          5.746938
min          2.000000
25%         14.000000
50%         16.000000
75%         23.000000
max         39.000000
Name: num_redacted_spans, dtype: float64

In [10]:
print("Total PHI spans redacted:", df_notes["num_redacted_spans"].sum())
print("Percentage of notes with at least 1 redaction:",
      (df_notes["num_redacted_spans"] > 0).mean() * 100)

Total PHI spans redacted: 461412
Percentage of notes with at least 1 redaction: 100.0
