In [160]:
import pandas as pd

In [162]:
# Medication names used as detection keywords further down.
# detect_meds() looks for each entry as a substring of the lowercased line,
# so every entry here is written in lowercase.
# NOTE(review): 'patch' directly follows 'fentanyl' and looks like a split
# "fentanyl patch" entry -- confirm whether it should be a keyword on its own.
med_names = [
    'altretamine', 'bendamustine', 'busulfan', 'carboplatin', 'chlorambucil',
    'cisplatin', 'cyclophosphamide', 'dacarbazine', 'ifosfamide',
    'mechlorethamine', 'melphalan', 'oxaliplatin', 'procarbazine',
    'temozolomide', 'thiotepa', 'trabectedin',
    'acetaminophen', 'adderall', 'amitriptyline', 'amlodipine', 'amoxicillin',
    'ativan', 'atorvastatin', 'azithromycin', 'benzonatate', 'brilinta',
    'bunavail', 'buprenorphine', 'cephalexin', 'ciprofloxacin', 'citalopram',
    'clindamycin', 'clonazepam', 'cyclobenzaprine', 'cymbalta', 'doxycycline',
    'dupixent', 'entresto', 'entyvio', 'farxiga', 'fentanyl', 'patch',
    'gabapentin', 'gilenya', 'humira', 'hydrochlorothiazide',
    'hydroxychloroquine', 'ibuprofen', 'imbruvica', 'invokana', 'januvia',
    'jardiance', 'kevzara', 'lexapro', 'lisinopril', 'lofexidine',
    'loratadine', 'lyrica', 'melatonin', 'meloxicam', 'metformin',
    'methadone', 'methotrexate', 'metoprolol', 'naloxone', 'naltrexone',
    'naproxen', 'narcan', 'nurtec', 'omeprazole', 'onpattro', 'otezla',
    'ozempic', 'pantoprazole', 'prednisone', 'probuphine', 'rybelsus',
    'secukinumab', 'sublocade', 'tramadol', 'trazodone', 'viagra', 'wegovy',
    'wellbutrin', 'xanax',
]

In [161]:
# Load the notes. `nrows` makes pandas stop parsing after the first 3000 data
# rows instead of reading the entire CSV and slicing it afterwards.
df = pd.read_csv('../detailed_withhistory_augmented_formatted.csv', nrows=3000)
# Keep only the note text. Missing responses are dropped because a NaN is a
# float and would break the `text.split(...)` calls made when labeling.
texts = df['response'].dropna().values

## Use the presence of certain keywords to guess each line's label

In [163]:
def detect_appt(line):
    """Guess whether a line describes an appointment, follow-up or referral.

    Keywords are matched as case-insensitive substrings, so 'ology' also
    hits words such as 'cardiology'.

    Args:
        line: One sentence-like line of a note.

    Returns:
        True if the line has more than three whitespace-separated words, does
        not contain 'as needed' (in any casing) and contains at least one
        appointment keyword; False otherwise.
    """
    lowered = line.lower()
    # Neither exclusion depends on the keyword, so evaluate them once up front.
    # 'as needed' is checked against the lowercased text so that "As needed"
    # at the start of a sentence is excluded as well.
    if 'as needed' in lowered or len(line.split()) <= 3:
        return False
    appt_terms = ('follow-up', 'follow up', 'followup', 'appointment', 'appt',
                  'ology', 'ologist', 'nutritionist', 'dietician', 'refer')
    return any(term in lowered for term in appt_terms)
def detect_pt(line):
    """Guess whether a line contains a patient-instruction keyword.

    Args:
        line: One sentence-like line of a note.

    Returns:
        True if any instruction keyword occurs as a substring of the
        lowercased line; False otherwise.
    """
    lowered = line.lower()
    instruction_terms = ('advise', 'monitor', 'instruct', 'refrain',
                         'lifestyle', 'diet', 'exercise')
    return any(keyword in lowered for keyword in instruction_terms)
def detect_other(line):
    """Guess whether a line belongs to the catch-all 'Other' category.

    Args:
        line: One sentence-like line of a note.

    Returns:
        True if the lowercased line contains 'will' or 'monitor' as a
        substring; False otherwise.
    """
    lowered = line.lower()
    return 'will' in lowered or 'monitor' in lowered
def detect_meds(line):
    """Guess whether a line describes a medication change.

    Args:
        line: One sentence-like line of a note.

    Returns:
        True if the line has at least four whitespace-separated words, does
        not mention 'current' or 'continue' (case-insensitive) and contains a
        name from ``med_names`` or one of the generic medication-change
        keywords as a substring; False otherwise.
    """
    # TODO maybe specify line length > 4
    med_terms = med_names + ['adjust', 'supplement', 'taper', 'wean']
    lowered = line.lower()
    if len(line.split()) < 4:
        return False
    if 'current' in lowered or 'continue' in lowered:
        return False
    return any(keyword in lowered for keyword in med_terms)
def detect_procedure(line):
    """Guess whether a line describes a planned procedure.

    Both the keywords and the exclusion words are matched as case-insensitive
    substrings.

    Args:
        line: One sentence-like line of a note.

    Returns:
        True if the line contains a procedure keyword and none of the
        exclusion words ('undergone', 'underwent', 'unremarkable', 'reveals');
        False otherwise.
    """
    lowered = line.lower()
    # Exclusions are checked on the lowercased text so that a sentence-initial
    # "Underwent ..." or "Unremarkable ..." is filtered out too.
    excluded = ('undergone', 'underwent', 'unremarkable', 'reveals')
    if any(word in lowered for word in excluded):
        return False
    procedure_terms = ("exam", "physical therapy", "scopy", 'determine')
    return any(term in lowered for term in procedure_terms)
def detect_labs(line):
    """Guess whether a line describes lab work or testing to be done.

    Both the keywords and the exclusion words are matched as case-insensitive
    substrings.

    Args:
        line: One sentence-like line of a note.

    Returns:
        True if the line contains a lab keyword and none of the exclusion
        words ('undergone', 'underwent', 'experienc', 'assessment'); False
        otherwise.
    """
    lowered = line.lower()
    # All exclusions use the lowercased text; previously only 'assessment'
    # did, so "Underwent ..." or "Experiencing ..." slipped through.
    excluded = ('undergone', 'underwent', 'experienc', 'assessment')
    if any(word in lowered for word in excluded):
        return False
    lab_terms = ('lab', 'test', 'assess')
    return any(term in lowered for term in lab_terms)
def detect_imaging(line):
    """Guess whether a line describes imaging to be done.

    Both the keywords and the exclusion words are matched as case-insensitive
    substrings.

    Args:
        line: One sentence-like line of a note.

    Returns:
        True if the line contains an imaging keyword and neither 'undergone'
        nor 'underwent'; False otherwise.
    """
    lowered = line.lower()
    # Exclusions are checked on the lowercased text so that a sentence-initial
    # "Underwent ..." is treated as past history as well.
    if 'undergone' in lowered or 'underwent' in lowered:
        return False
    imaging_terms = ("imaging", 'graphy', 'gram', 'determine', 'scan')
    return any(term in lowered for term in imaging_terms)

### Apply the detectors in priority order: the first one that matches decides the label

In [164]:
def label_lines(lines):
    """Assign exactly one label to every line of a note.

    The detectors are tried in a fixed priority order and the first rule that
    matches decides the label. Several rules also depend on where the line
    sits in the note (last ten lines, second half, last five lines).

    Args:
        lines: List of sentence-like strings from a single note, in order.

    Returns:
        A list of label strings with the same length as ``lines``.
    """
    total = len(lines)
    tail_start = total - 10  # first index counted as "near the end" of the note
    line_labels = []
    for position, sentence in enumerate(lines):
        in_tail = position >= tail_start
        if detect_procedure(sentence) and in_tail:
            tag = 'Procedure'
        elif detect_imaging(sentence):
            tag = 'Imaging'
        elif detect_appt(sentence):
            tag = 'Appointment'
        elif detect_pt(sentence) and position >= total / 2 and not sentence.startswith('We will'):
            tag = 'Patient instructions'
        elif detect_labs(sentence) and in_tail:
            tag = 'Labs'
        elif detect_other(sentence):
            tag = 'Other'
        elif detect_meds(sentence) and in_tail:
            tag = 'Medications'
        elif (position > total - 6
              and len(sentence.split()) > 3
              and not sentence.endswith(('Mr', 'Mrs', 'Ms'))):
            # Fallback for the last five lines: any line longer than three
            # words that does not end in a bare title is labeled 'Other'.
            tag = 'Other'
        else:
            tag = 'No label'
        line_labels.append(tag)
    return line_labels

In [165]:
# Empty result table; the labeled lines of every note are added to it below.
result_columns = ['text', 'labels']
out = pd.DataFrame(columns=result_columns)

## Get lines from notes and send them off to be labeled

In [166]:
# Split every note into sentence-like lines, label them, and collect one
# DataFrame per note. The frames are concatenated once after the loop, because
# calling pd.concat inside the loop re-copies all earlier rows on every pass.
labeled_frames = []
for text in texts:
    # Treat newlines as sentence boundaries too, then split on periods.
    text = '. '.join(text.split('\n'))
    lines = [line.strip() for line in text.split('.') if line.strip()]  # remove extra spaces
    if not lines:
        continue  # a note without any non-blank line contributes no rows
    labels = label_lines(lines)
    text_df = pd.DataFrame({'text': lines, 'labels': labels})
    labeled_frames.append(text_df)

# Each frame keeps its own 0..n-1 index, exactly as with the per-iteration
# concat. When nothing was labeled, `out` stays the empty frame created earlier.
if labeled_frames:
    out = pd.concat(labeled_frames)

## Save to CSV

In [168]:
# Write the labeled lines to disk. The DataFrame index is written as the
# first, unnamed column because `index` is left at its default.
output_csv = 'labeled_lines.csv'
out.to_csv(output_csv)