In [1]:
import pandas as pd
import re
import spacy
from pathlib import Path

# Load the `en_core_web_sm` spaCy pipeline once for the whole notebook;
# label_text() calls this object to split each note into tokens.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Interim CSV holding one row per patient, including the free-text `note` column.
# NOTE(review): the preview shows an "Unnamed: 0" column and ZIP values rendered
# as floats (e.g. 1106.0) -- presumably a saved index and lost leading zeros;
# consider index_col=0 / dtype={"ZIP": str} if those columns are ever used.
notes_csv = Path("D:/RITHI_AIML_PROJECT/interim/patient_notes.csv")
df = pd.read_csv(notes_csv)

# Preview the first rows to confirm the load.
df.head()

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,...,ADDRESS,CITY,STATE,COUNTY,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,note
0,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,2017-08-24,,999-68-6630,,,,Jacinto644,Kris249,,...,888 Hickle Ferry Suite 38,Springfield,Massachusetts,Hampden County,1106.0,42.151961,-72.598959,8446.49,1499.08,Patient Jacinto644 Kris249. They were born on ...
1,067318a4-db8f-447f-8b6e-f2f61e9baaa5,2016-08-01,,999-15-5895,,,,Alva958,Krajcik437,,...,1048 Skiles Trailer,Walpole,Massachusetts,Norfolk County,2081.0,42.17737,-71.281353,89893.4,1845.72,Patient Alva958 Krajcik437. They were born on ...
2,ae9efba3-ddc4-43f9-a781-f72019388548,1992-06-30,,999-27-3385,S99971451,X53218815X,Mr.,Jayson808,Fadel536,,...,1056 Harris Lane Suite 70,Chicopee,Massachusetts,Hampden County,1020.0,42.181642,-72.608842,577445.86,3528.84,Patient Jayson808 Fadel536. They were born on ...
3,199c586f-af16-4091-9998-ee4cfc02ee7a,2004-01-09,,999-73-2461,S99956432,,,Jimmie93,Harris789,,...,201 Mitchell Lodge Unit 67,Pembroke,Massachusetts,Plymouth County,,42.075292,-70.757035,336701.72,2705.64,Patient Jimmie93 Harris789. They were born on ...
4,353016ea-a0ff-4154-85bb-1cf8b6cedf20,1996-11-15,,999-60-7372,S99917327,X58903159X,Mr.,Gregorio366,Auer97,,...,1050 Lindgren Extension Apt 38,Boston,Massachusetts,Suffolk County,2135.0,42.352434,-71.02861,484076.34,3043.04,Patient Gregorio366 Auer97. They were born on ...


In [3]:
# Heuristic regexes used to produce silver-standard entity labels.
# Insertion order is preserved and is the order in which label_text() iterates.
# NOTE(review): NAME only accepts purely alphabetic capitalised words, so the
# digit-suffixed names in the sample notes ("Jacinto644 Kris249") do not match.
patterns = dict(
    # Two adjacent capitalised alphabetic words separated by one whitespace character.
    NAME=r"\b([A-Z][a-z]+)\s([A-Z][a-z]+)\b",
    # Digits laid out as YYYY-MM-DD.
    DATE=r"\b\d{4}-\d{2}-\d{2}\b",
    # Digit groups of 3-2-4 separated by hyphens.
    SSN=r"\b\d{3}-\d{2}-\d{4}\b",
    # Digit groups of 3-3-4 separated by hyphens.
    PHONE=r"\b\d{3}-\d{3}-\d{4}\b",
    # Any run of six or more uppercase letters and/or digits.
    ID=r"\b[A-Z0-9]{6,}\b",
    # A number, a single word, then one of a fixed set of street suffixes.
    ADDRESS=r"\b\d+\s[A-Za-z]+\s(?:St|Street|Ave|Road|Rd|Lane|Ln|Blvd|Way)\b",
)

In [4]:
def label_text(text):
    """Tokenise ``text`` and derive silver-standard BIO labels from ``patterns``.

    Every regex in the module-level ``patterns`` dict is run over the raw text.
    A token belongs to a match when its start offset lies inside the match's
    character span. The first token of a match is tagged ``B-<LABEL>`` and the
    remaining tokens ``I-<LABEL>``, so each entity forms one well-formed BIO span.

    When matches from different patterns overlap, the longest match wins (ties:
    earlier start, then earlier position in ``patterns``) and any match that
    touches an already-labelled token is dropped whole. This prevents a later
    pattern from overwriting part of an earlier span and leaving a dangling
    ``I-`` tag with no preceding ``B-``.

    Args:
        text: The note to label.

    Returns:
        A ``(tokens, labels)`` pair of equal-length lists: the token strings and
        their BIO tags (``"O"`` for tokens outside every match).
    """
    # Only token text and character offsets are used, so run the tokenizer
    # alone instead of the full tagger/parser/NER pipeline.
    doc = nlp.make_doc(text)
    tokens = [t.text for t in doc]
    labels = ["O"] * len(tokens)
    token_starts = [t.idx for t in doc]

    # Collect every regex hit together with the indices of the tokens it covers.
    candidates = []
    for priority, (label, pattern) in enumerate(patterns.items()):
        for match in re.finditer(pattern, text):
            start, end = match.span()
            covered = [i for i, idx in enumerate(token_starts) if start <= idx < end]
            if covered:
                candidates.append((end - start, start, priority, label, covered))

    # Longest match first; then leftmost; then declaration order of the pattern.
    candidates.sort(key=lambda c: (-c[0], c[1], c[2]))

    for _, _, _, label, covered in candidates:
        # Skip matches that collide with a span that has already been assigned.
        if any(labels[i] != "O" for i in covered):
            continue
        labels[covered[0]] = "B-" + label
        for i in covered[1:]:
            labels[i] = "I-" + label
    return tokens, labels

In [5]:
# Label only the first 2000 notes to keep the run fast.
notes_subset = df["note"].tolist()[:2000]

# label_text() returns a (tokens, labels) pair per note; split the pairs into
# two parallel lists so that index i in each refers to the same note.
labelled_notes = [label_text(note) for note in notes_subset]
token_sequences = [note_tokens for note_tokens, _ in labelled_notes]
label_sequences = [note_labels for _, note_labels in labelled_notes]

# Both lists should have one entry per processed note.
len(token_sequences), len(label_sequences)


(2000, 2000)

In [6]:
import json

# Destination for the silver-labelled examples: one JSON object per line,
# each holding the parallel "tokens" and "labels" lists for a single note.
output_path = Path("D:/Rithi_AIML_Project/interim/silver_ner_training.jsonl")

with output_path.open("w") as f:
    f.writelines(
        json.dumps({"tokens": note_tokens, "labels": note_labels}) + "\n"
        for note_tokens, note_labels in zip(token_sequences, label_sequences)
    )

# Echo the path so the cell output shows where the file was written.
output_path

WindowsPath('D:/Rithi_AIML_Project/interim/silver_ner_training.jsonl')