In [None]:
import spacy
import scispacy
from scispacy.linking import EntityLinker
from negspacy.negation import Negex
from negspacy.termsets import termset
# Start from negspacy's sensitive clinical termset and extend it with
# project-specific negation cues.
ts = termset("en_clinical_sensitive")
# Every category is listed exactly once: repeating a key inside a dict literal
# silently keeps only the last value, which would drop all but one of the
# preceding-negation phrases.
ts.add_patterns({
    "preceding_negations": [
        "whether this is primarily",
        "risk for developing",
        "-no",
        "practice guidelines on the management of nonproliferative and",
    ],
    "following_negations": ["presence unspecified"],
})
import json
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import string
from nltk.tokenize import RegexpTokenizer
from dateutil import parser
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix
)

## NLP

In [2]:
# Biomedical NER model; the classification cell below filters its entities on
# the DISEASE label.
nlp = spacy.load("en_ner_bc5cdr_md")

# Pass the customised termset to NegEx explicitly. Without `neg_termset` the
# component falls back to its built-in default and the patterns added to `ts`
# are never consulted.
nlp.add_pipe(
    "negex",
    last=True,
    config={"neg_termset": ts.get_patterns()},
)

<negspacy.negation.Negex at 0x10692dd30>

In [3]:
# Exact-phrase DISEASE patterns, registered ahead of the statistical "ner"
# component in the pipeline.
patterns = [
    {"label": "DISEASE", "pattern": phrase}
    for phrase in (
        "proliferative diabetic retinopathy",
        "proliferative retinopathy",
    )
]
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)

In [None]:
# Load the raw notes table. The rest of this cell reads its `note_text` and
# `PatientDurableKey` columns.
# NOTE(review): the path is hard-coded to a home-directory location; consider a
# configurable data directory.
df = pd.read_csv("~/thesis/uncleaned_notes.csv")  
def extract_ap_section(note):
    """Return the Assessment/Plan section of a clinical note, or '' if unusable.

    The section starts at the first "Assessment and Plan", "A/P" or "Plan"
    header (case-insensitive, optionally followed by ':' or '-') and runs until
    the next line that looks like a "Header:" line, or the end of the note.

    Parameters
    ----------
    note : object
        Raw note text. Anything that is not a ``str`` (e.g. NaN) yields ''.

    Returns
    -------
    str
        The stripped section text, or '' when no header is found, the section
        is shorter than 15 characters, or it merely repeats a header name.
    """
    if not isinstance(note, str):
        return ''

    # The header must be a standalone token: without the word boundaries,
    # "plan" would also match inside words such as "Explanation" or "planning"
    # and the text following that word would be returned as the section.
    match = re.search(
        r'\b(?:Assessment and Plan|A/P|Plan)\b[:\-]?\s*(?P<body>.+?)(?=\n[A-Z][^\n]*?:|\Z)',
        note, re.IGNORECASE | re.DOTALL)
    if not match:
        return ''

    ap_text = match.group('body').strip()
    # Reject sections too short to be informative or that only echo a header.
    if len(ap_text) < 15 or ap_text.lower() in ['assessment and plan', 'a/p', 'plan']:
        return ''
    return ap_text


# Extract the A/P text for every note, then keep only the notes where a
# non-blank section was found.
df['ap_section'] = df['note_text'].map(extract_ap_section)
has_ap_text = df['ap_section'].str.strip().ne('')
valid_ap_df = df[has_ap_text]

# Report how many distinct patients still have at least one usable note.
num_patients = valid_ap_df['PatientDurableKey'].nunique()
print(f"Patients with valid A/P sections: {num_patients}")

# Persist the retained notes alongside their extracted section.
export_columns = ['PatientDurableKey', 'note_text', 'ap_section']
valid_ap_df[export_columns].to_csv("valid_ap_sections1.csv", index=False)

In [42]:
### xl is the notes table used by every later cell: they read the clinical
### note text from xl['assessment_plan_text'] and the patient identifier from
### xl['PatientDurableKey'].
### NOTE(review): absolute, user-specific path; the notebook will not run on
### another machine without editing it.
xl = pd.read_csv("/Users/ritikabatte02/Downloads/pdr_AP_notes.csv")

In [None]:
# Distinct patient identifiers across every loaded note.
all_patients = pd.unique(xl["PatientDurableKey"])
print("Total unique patients in full dataset:", len(all_patients))

In [47]:
# Entity text that counts as a PDR mention. The spelled-out phrases are matched
# case-insensitively so a capitalised mention (e.g. at the start of a line) is
# not missed; the acronym stays case-sensitive.
PDR_MENTION_RE = re.compile(
    r'\b(?i:proliferative retinopathy|proliferative diabetic retinopathy)\b|\bPDR\b'
)
# Qualifiers that mean the entity is NOT proliferative disease.
NON_PDR_QUALIFIER_RE = re.compile(
    r'\b(?:mild|moderate|non[\s-]?proliferative)\b', re.IGNORECASE
)


def _note_has_affirmed_pdr(note_text):
    """Return True when the note has at least one non-negated PDR disease entity.

    An entity counts when its label contains 'DISEASE', its text matches
    PDR_MENTION_RE, it carries none of the NON_PDR_QUALIFIER_RE qualifiers, and
    NegEx did not mark it as negated.

    Non-string input (e.g. NaN from a missing note) is treated as "no PDR"
    instead of raising inside the spaCy pipeline.
    """
    if not isinstance(note_text, str):
        return False
    for ent in nlp(note_text).ents:
        if 'DISEASE' not in ent.label_:
            continue
        if not PDR_MENTION_RE.search(ent.text):
            continue
        if NON_PDR_QUALIFIER_RE.search(ent.text):
            continue
        if not ent._.negex:
            return True
    return False


# Classify every note once and assign whole columns. This avoids per-row
# `.loc` writes and does not overwrite the notebook-level `df` (the notes
# table from an earlier cell) with a scratch frame.
note_has_pdr = [_note_has_affirmed_pdr(text) for text in xl['assessment_plan_text']]

# `pdr` is kept as an empty placeholder so the exported column layout is
# unchanged; nothing in this notebook fills it.
xl['pdr'] = None
xl['pdr_binary'] = [int(flag) for flag in note_has_pdr]
xl['pdr_name'] = ['PDR' if flag else 'No PDR' for flag in note_has_pdr]

In [None]:
# Functions

def extract_date_from_text(text):
    """Parse every date-like substring of `text` into a datetime.

    Recognised shapes are ISO dates (YYYY-MM-DD), slash dates (M/D/YY or
    M/D/YYYY) and month-name dates such as "Jan 5, 2020".

    Parameters
    ----------
    text : object
        Text to scan. Non-string input (e.g. NaN) yields an empty list.

    Returns
    -------
    list
        Parsed datetimes in order of appearance; substrings that dateutil
        cannot parse are skipped.
    """
    if not isinstance(text, str):
        return []

    date_pattern = r'(\d{4}-\d{2}-\d{2})|(\d{1,2}/\d{1,2}/\d{2,4})|(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*[\s\-]?\d{1,2},?[\s\-]?\d{4})'

    extracted = []
    # findall returns one tuple per match with a slot per alternative; only the
    # alternative that matched is non-empty.
    for group in re.findall(date_pattern, text):
        for date_str in group:
            if not date_str:
                continue
            try:
                extracted.append(parser.parse(date_str, fuzzy=True))
            except (ValueError, OverflowError):
                # dateutil raises ParserError (a ValueError) for invalid dates
                # and OverflowError for out-of-range values; skip both.
                continue

    return extracted

def get_pdr_date_by_proximity(note_text):
    """Return the date written closest to a PDR mention in the note.

    Distance is measured in characters between the start of a PDR mention and
    the start of a date string; the date with the smallest distance to any
    mention wins.

    Parameters
    ----------
    note_text : object
        Note text. Non-string input (e.g. NaN) yields None.

    Returns
    -------
    datetime.datetime or None
        The closest parsed date, or None when the note has no PDR mention or no
        parseable date.
    """
    if not isinstance(note_text, str):
        return None
    # Only the raw text is needed here, so the spaCy pipeline is not run.
    text = note_text.lower()

    # Whole-word matches only, and not preceded by "non-" / "non ": a plain
    # substring search would treat "npdr" and "non-proliferative retinopathy"
    # as PDR. All occurrences are collected, not just the first of each term.
    pdr_positions = [
        m.start()
        for m in re.finditer(
            r'(?<!non[\s-])\b(?:proliferative diabetic retinopathy|proliferative retinopathy|pdr)\b',
            text)
    ]
    if not pdr_positions:
        return None

    date_positions = []
    # IGNORECASE is required: `text` is lowercased, so the capitalised month
    # names in the pattern would otherwise never match.
    date_regex = re.finditer(
        r'\d{4}-\d{2}-\d{2}|\d{1,2}/\d{1,2}/\d{2,4}|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*[\s\-]\d{1,2},?[\s\-]?\d{4}',
        text, re.IGNORECASE)
    for match in date_regex:
        try:
            parsed_date = parser.parse(match.group(), fuzzy=True)
        except (ValueError, OverflowError):
            # Skip strings that look like dates but are not valid ones.
            continue
        date_positions.append((parsed_date, match.start()))

    if not date_positions:
        return None

    closest_date, _ = min(
        date_positions,
        key=lambda item: min(abs(pdx - item[1]) for pdx in pdr_positions),
    )
    return closest_date

In [None]:
# One record per patient: the earliest date found near a PDR mention in any of
# that patient's notes, or NaT when the patient is not PDR-positive or no date
# could be extracted.
# NOTE(review): dates are taken from all notes of a PDR-positive patient, not
# only the notes flagged pdr_binary == 1 — confirm this is intended.
pdr_dates = []

for patient_id, group in xl.groupby('PatientDurableKey'):
    earliest = pd.NaT

    if group['pdr_binary'].max() == 1:
        found = (get_pdr_date_by_proximity(note) for note in group['assessment_plan_text'])
        patient_dates = [date for date in found if date]
        if patient_dates:
            earliest = min(patient_dates)

    pdr_dates.append({'PatientDurableKey': patient_id, 'pdr_dates': earliest})


In [58]:
pdr_date_df = pd.DataFrame(pdr_dates)

# Remove any date columns left over from a previous run of this cell, then
# attach the per-patient date to every note row of that patient.
stale_date_columns = [col for col in xl.columns if 'pdr_date' in col]
xl = (
    xl.drop(columns=stale_date_columns, errors='ignore')
      .merge(pdr_date_df, on="PatientDurableKey", how="left")
)

In [59]:
# Write the note-level table (including the columns added above) to disk.
xl.to_csv(path_or_buf='pdr_nlp_notes.csv', index=False)

In [60]:
# Collapse to one row per patient: the maximum pdr_binary across the patient's
# notes and the first non-null pdr_dates value.
summary = (
    xl.groupby('PatientDurableKey')
      .agg(
          pdr_binary=('pdr_binary', 'max'),
          pdr_dates=('pdr_dates', 'first'),
      )
      .reset_index()
)

summary.to_csv('pdr_summary.csv', index=False)