In [5]:
#mount to drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd ./drive/MyDrive/

/content/drive/MyDrive


In [3]:
import os
import json

def count_json_entries(filepath):
    with open(filepath, 'r') as f:
        data = json.load(f)
    return len(data)

def count_all_json_entries(folder_path):
    entries = []
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".json"):
            filepath = os.path.join(folder_path, filename)
            try:
                entries.append(count_json_entries(filepath))
            except Exception as e:
                print(f"❌ {filename}: error reading file ({e})")


In [4]:
pwd

'/content/drive/MyDrive'

In [None]:
# read in the cleaned files
import pandas as pd

gemini_and_ag = pd.read_csv('./cleaned_extracted_medical_info_all_drug_categories_RG.csv')
med7 = pd.read_csv('./Med7_Extracted_Results_RG.csv')
med7.rename(columns={'Unnamed: 0': 'Patient Index'}, inplace=True)
gemini = gemini_and_ag.drop(columns=['Age', 'Gender'])
print(gemini.head(1))
print(med7.head(1))

   Patient Index                                          Drug Name  \
0              0  ['gemcitabine', 'Abraxane', 'oxycodone', 'anti...   

                                         Drug Dosage  \
0  ['Not mentioned', 'Not mentioned', 'Not mentio...   

                        Drug Route of Administration  \
0  ['Not mentioned', 'Not mentioned', 'Oral', 'IV...   

                                       Drug Strength  \
0  ['Not mentioned', 'Not mentioned', 'Not mentio...   

                                           Drug Form  \
0  ['Not mentioned', 'Not mentioned', 'Not mentio...   

                            Frequency of Drug Intake  \
0  ['6 full cycles', '6 full cycles', 'Intermitte...   

                             Duration of Drug Intake  
0  ['Not mentioned', 'Not mentioned', 'Not mentio...  
   Patient Index                                          Drug Name  \
0              0  ['gemcitabine', 'Abraxane', 'antibiotics', 'ox...   

                                       

In [None]:
print(med7.columns)
print(gemini.columns)

Index(['Patient Index', 'Drug Name', 'Drug Dosage',
       'Drug Route of Administration', 'Drug Strength', 'Drug Form',
       'Frequency of Drug Intake', 'Duration of Drug Intake'],
      dtype='object')
Index(['Patient Index', 'Drug Name', 'Drug Dosage',
       'Drug Route of Administration', 'Drug Strength', 'Drug Form',
       'Frequency of Drug Intake', 'Duration of Drug Intake'],
      dtype='object')


In [None]:
import ast
from collections import defaultdict

def parse_if_list(value):
    try:
        parsed = ast.literal_eval(value)
        if isinstance(parsed, list):
            return parsed
    except (ValueError, SyntaxError):
        pass
    return [value]  # wrap single value in list if not a list string

def build_nested_dict(df):
    result = {}
    for patient_id, group in df.groupby('Patient Index'):
        drug_dict = defaultdict(list)
        for _, row in group.iterrows():
            drug_names = parse_if_list(row['Drug Name'])
            dosages = parse_if_list(row['Drug Dosage'])
            routes = parse_if_list(row['Drug Route of Administration'])
            strengths = parse_if_list(row['Drug Strength'])
            forms = parse_if_list(row['Drug Form'])
            frequencies = parse_if_list(row['Frequency of Drug Intake'])
            durations = parse_if_list(row['Duration of Drug Intake'])

            for i, drug in enumerate(drug_names):
                drug_dict[drug].append({
                    'Drug Dosage': dosages[i] if i < len(dosages) else 'Not mentioned',
                    'Drug Route of Administration': routes[i] if i < len(routes) else 'Not mentioned',
                    'Drug Strength': strengths[i] if i < len(strengths) else 'Not mentioned',
                    'Drug Form': forms[i] if i < len(forms) else 'Not mentioned',
                    'Frequency of Drug Intake': frequencies[i] if i < len(frequencies) else 'Not mentioned',
                    'Duration of Drug Intake': durations[i] if i < len(durations) else 'Not mentioned',
                })

        result[patient_id] = dict(drug_dict)
    return result

med7_dict = build_nested_dict(med7)
gemini_dict = build_nested_dict(gemini)
med7_dict[0]
gemini_dict[0]


{'gemcitabine': [{'Drug Dosage': 'Not mentioned',
   'Drug Route of Administration': 'Not mentioned',
   'Drug Strength': 'Not mentioned',
   'Drug Form': 'Not mentioned',
   'Frequency of Drug Intake': '6 full cycles',
   'Duration of Drug Intake': 'Not mentioned'}],
 'Abraxane': [{'Drug Dosage': 'Not mentioned',
   'Drug Route of Administration': 'Not mentioned',
   'Drug Strength': 'Not mentioned',
   'Drug Form': 'Not mentioned',
   'Frequency of Drug Intake': '6 full cycles',
   'Duration of Drug Intake': 'Not mentioned'}],
 'oxycodone': [{'Drug Dosage': 'Not mentioned',
   'Drug Route of Administration': 'Oral',
   'Drug Strength': 'Not mentioned',
   'Drug Form': 'Not mentioned',
   'Frequency of Drug Intake': 'Intermittent',
   'Duration of Drug Intake': 'Not mentioned'}],
 'antibiotics': [{'Drug Dosage': 'IV and oral',
   'Drug Route of Administration': 'IV and oral',
   'Drug Strength': 'Not mentioned',
   'Drug Form': 'Not mentioned',
   'Frequency of Drug Intake': 'Not ment

In [None]:
gemini['Drug Name'][0]
med7['Drug Name'][0]

"['gemcitabine', 'Abraxane', 'antibiotics', 'oxycodone', 'antihypertensive', 'Penicillins', 'CONTRAST', 'Omnipaque', 'contrast', 'chemotherapy']"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful


# spaCy v3 with RoBERTa-base Med7

In [6]:
!pip install -U wheel pip setuptools spacy spacy-transformers

Collecting pip
  Downloading pip-25.1-py3-none-any.whl.metadata (3.6 kB)
Collecting setuptools
  Downloading setuptools-80.0.0-py3-none-any.whl.metadata (6.5 kB)
Collecting spacy-transformers
  Downloading spacy_transformers-1.3.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.50.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->spacy-tra

In [1]:
pip install "en-core-med7-trf @ https://huggingface.co/kormilitzin/en_core_med7_trf/resolve/main/en_core_med7_trf-any-py3-none-any.whl"

[33mDEPRECATION: Wheel filename 'en_core_med7_trf-any-py3-none-any.whl' is not correctly normalised. Future versions of pip will raise the following error:
Invalid wheel filename (invalid version): 'en_core_med7_trf-any-py3-none-any'

 pip 25.3 will enforce this behaviour change. A possible replacement is to rename the wheel to use a correctly normalised name (this may require updating the version in the project metadata). Discussion can be found at https://github.com/pypa/pip/issues/12938[0m[33m
[0mCollecting en-core-med7-trf@ https://huggingface.co/kormilitzin/en_core_med7_trf/resolve/main/en_core_med7_trf-any-py3-none-any.whl
  Downloading https://huggingface.co/kormilitzin/en_core_med7_trf/resolve/main/en_core_med7_trf-any-py3-none-any.whl (1018.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 GB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy<3.5.0,>=3.4.2 (from en-core-med7-trf@ https://huggingface.co/kormilitzin/en_core_med7_tr

In [2]:
pip install --force-reinstall --no-cache-dir numpy

Collecting numpy
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m288.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.5 which is incompatible.
sentence-transformers 3.4.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.25.1 which is incompatible.
albumentations 2.0.5 requires pydantic>=2.9.2, but you have pydantic 1.10.22 w

In [3]:
pip install --force-reinstall --no-cache-dir thinc

Collecting thinc
  Downloading thinc-9.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting blis<1.1.0,>=1.0.0 (from thinc)
  Downloading blis-1.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting murmurhash<1.1.0,>=1.0.2 (from thinc)
  Downloading murmurhash-1.0.12-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting cymem<2.1.0,>=2.0.2 (from thinc)
  Downloading cymem-2.0.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from thinc)
  Downloading preshed-3.0.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting wasabi<1.2.0,>=0.8.1 (from thinc)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.0 (from thinc)
  Downloading srsly-2.5.1-cp311-cp311-manylinux_2_1

In [2]:
# Play an audio beep. Any audio URL will do.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [3]:
!pip install --force-reinstall numpy

Collecting numpy
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m104.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.5
    Uninstalling numpy-2.2.5:
      Successfully uninstalled numpy-2.2.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spacy 3.4.4 requires pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4, but you have pydantic 2.11.3 which is incompatible.
spacy 3.4.4 requires thinc<8.2.0,>=8.1.0, but you have thinc 9.1.1 which is incompatible.
spacy 3.4.4 requires wasabi<1.1.0,>=0.9.1, but you have wasabi 1.1.3 which is incompatible.
ten

In [4]:
!pip install --force-reinstall spacy spacy-transformers

Collecting spacy
  Downloading spacy-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting spacy-transformers
  Using cached spacy_transformers-1.3.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.12-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86

In [3]:
import spacy
import en_core_med7_trf



In [2]:
!pip install --force-reinstall --no-cache-dir torch torchvision torchaudio
!pip install --upgrade transformers
!pip install --force-reinstall --no-cache-dir timm

Collecting torch
  Downloading torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision
  Downloading torchvision-0.22.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting filelock (from torch)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading nvidi

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m108.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.49.0
    Uninstalling transformers-4.49.0:
      Successfully uninstalled transformers-4.49.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spacy-transformers 1.3.8 requires transformers<4.50.0,>=3.4.0, but you have transformers 4.51.3 which is incompatible.[0m[31m
[0mSuccessfully installed transformers-4.51.3
Collecting timm
  Downloading timm-1.0.15-py3-none-any.whl.metadata (52 kB)
Collecting torch (from timm)
  Downloading torch-2.7.0-cp311-cp311-ma

In [4]:
med7 = en_core_med7_trf.load()


If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current 'transformers' and 'spacy-transformers' versions. For more details and available updates, run: python -m spacy validate


In [None]:
# create distinct colours for labels

col_dict = {}
seven_colours = ['#e6194B', '#3cb44b', '#ffe119', '#ffd8b1', '#f58231', '#f032e6', '#42d4f4']
for label, colour in zip(med7.pipe_labels['ner'], seven_colours):
    col_dict[label] = colour

options = {'ents': med7.pipe_labels['ner'], 'colors':col_dict}

In [None]:
# some text which contains medical concepts
text = 'A patient was prescribed Magnesium hydroxide 400mg/5ml suspension PO of total 30ml  bid for the next 5 days.'
doc = med7(text)
[(ent.text, ent.label_, ent.start_char, ent.end_char, ent.start) for ent in doc.ents]

Testing on a patient note

In [None]:
note = """ID: ***** ***** is a 81 y.o. postmenopausal patient with a recent diagnosis of breast cancer, who presents in consultation to discuss treatment options and to establish care.    HPI: The patient last had a screening mammogram on 11/06/11 (at 76 years) which was notable for heterogeneously dense breast tissue, but no features of malignancy.  The patient felt a painful mass in her right breast in August 2016.  She presented to her PCP (Dr. *****) of 30+ years on 09/11/16 who noted a firm mobile mass (6 cm) in the right upper outer quadrant of her breast with dimpling of overlying skin.      A diagnostic mammogram on 09/17/16 demonstrated heterogeneously dense breast tissue and a round, indistinct, high density mass (4.1 x 3.3 cm) at the site of the palpable lump, which was in the upper outer right breast, posterior depth.  A targeted ultrasound at that time was notable for a hypoechoic irregular solid mass (2.7 x 2.9 x 3.8 cm) in the upper outer right breast, at the 10:00 position, 9 cm from the nipple.  She underwent a right breast fine needle aspiration at the 10:00 position on 09/17/16.  The pathology was consistent with adenocarcinoma with ER negative, PR negative, and HER2 negative (IHC 0; FISH ratio 1.7).    She established care with Dr. ***** ***** on 09/25/16.  She was offered breast conserving surgery, but preferred mastectomy in an effort to avoid radiation.  ***** underwent a right breast simple mastectomy and sentinel lymph node biopsy on 10/25/16 with Dr. *****.  The pathology was notable for 3.6 cm of grade 3 IDC.  The mitotic count was quite elevated at 74/10 hpf.  The final margins were negative.  Features suggestive of lymphovascular invasion were present.  No DCIS was present.  One of the 2 sentinel lymph nodes was positive for adenocarcinoma (1.2 cm, but no extranodal extension).  Biomarkers were notable for estrogen receptor negative, progesterone receptor negative, HER2 positive (IHC 1; FISH ratio 2.1,
 but with HER2 sig/nuc 3.0 and Cen17 sig/nuc 1.5), and variable Ki67 expression (<5% to 25-30%; average ~15%).      ***** developed dyspnea on the evening of 11/20/16 and presented to ***** ***** ***** in the context of her known congestive hart failure with left venrticular dysfunction (LVEF 25%).  She was managed with Lasix in the CDU and discharged earlier today.  She presents to clinic with her daughter (*****) and granddaughter (*****).  She feels much better today, back to her baseline with regards to energy and dyspnea.  She has no pain at the site of her breast surgery.      Past Medical History:   Diagnosis Date    Cardiac pacemaker in situ     Cataract     CHF (congestive heart failure)     CKD (chronic kidney disease)     Chronic kidney disease (CKD) stage G3b/A1, moderately decreased glomerular filtration rate (GFR) between 30-44 mL/min/1.73 square meter and albuminuria creatinine ratio less than 30 mg/g    Coronary atherosclerosis of unspecified type of vessel, native or graft     Diabetes mellitus     115-180s    GERD (gastroesophageal reflux disease)     Glaucoma     suspect    HTN (hypertension)     Hypertension     Other and unspecified hyperlipidemia     Pacemaker     SBO (small bowel obstruction)     Partial sbo without history of previous abdominal surgeries. Unclear etiology. DDx includes small occult neoplasm vs focal enteritis. Adhesion less likely with lack of abdominal surgeries. EGD/Colo neg in October 2012    Sick sinus syndrome        Past Surgical History:   Procedure Laterality Date    BIOPSY / EXCISION BREAST      CHOLECYSTECTOMY      MASTECTOMY Right 10/25/2016    PACEMAKER INSERTION  1998    SENTINEL LYMPH NODE BIOPSY Right 10/25/2016        Family History   Problem Relation Age of Onset    Stroke Mother     Uterine cancer Mother 40     died at 43    Stroke Father     Diabetes Other     Arthritis Other     Stroke Sister 68     March 2013    Stomach cancer Maternal
 Grandfather     Uterine cancer Maternal Aunt 37     died at 39    Blindness Neg Hx     Amblyopia Neg Hx     Cataracts Neg Hx     Glaucoma Neg Hx     Macular degen Neg Hx     Retinal detachment Neg Hx     Strabismus Neg Hx        Social History     Social History    Marital status: Married     Spouse name: N/A    Number of children: N/A    Years of education: N/A     Occupational History    Not on file.     Social History Main Topics    Smoking status: Never Smoker    Smokeless tobacco: Never Used    Alcohol use No    Drug use: No    Sexual activity: Not on file     Other Topics Concern    Not on file     Social History Narrative    Married; husband still alive. Lives with her daughter.  Originally from *****.     The patient underwent menarche at 11 years of age.  She never took OCP.  She is G7P5.  She underwent menopause at 55.  She has no history of HRT.        Outpatient Encounter Prescriptions as of 11/21/2016   Medication Sig Dispense Refill    acetaminophen (TYLENOL) 500 mg tablet Take 1 tablet (500 mg total) by mouth every 6 (six) hours as needed (MILD PAIN).      amLODIPine (NORVASC) 5 mg tablet Take 0.5 tablets (2.5 mg total) by mouth Daily. 90 tablet 3    aspirin 81 mg EC tablet TAKE 1 TABLET BY MOUTH DAILY 90 tablet 3    atorvastatin (LIPITOR) 40 mg tablet TAKE 1 TABLET BY MOUTH EVERY DAY 90 tablet 3    bisoprolol (ZEBETA) 5 mg tablet TAKE 1/2 TABLET BY MOUTH DAILY 50 tablet 3    blood glucose (BLOOD GLUCOSE) test strip Check blood sugar twice daily as directed. 180 each 3    blood glucose monitoring kit Check blood sugar as directed 1 each 0    brimonidine (ALPHAGAN) 0.1 % ophthalmic solution Place 1 drop into both eyes 2 (two) times daily. 10 mL 11    calcium carbonate-vitamin D (OYSCO 500/D) 1,250 mg (500 mg elemental)-200 unit tablet TAKE 1 TABLET BY MOUTH EVERY DAY 90 tablet 3    glipiZIDE (GLUCOTROL) 10 mg tablet TAKE 1 TABLET BY MOUTH TWICE DAILY BEFORE MEALS 180
 tablet 1    insulin glargine (LANTUS) 100 unit/mL injection INJECT 5 UNITS INTO THE SKIN EVERY NIGHT AT BEDTIME 10 mL 5    insulin syringe-needle U-100 0.3 mL 29 SYRINGE USE DAILY AS DIRECTED 100 Syringe 3    lancets lancets Use twice daily as directed. 180 each 3    latanoprost (XALATAN) 0.005 % ophthalmic solution Place 1 drop into both eyes nightly at bedtime. 2.5 mL 11    metFORMIN (GLUCOPHAGE) 1,000 mg tablet TAKE 1 TABLET BY MOUTH TWICE DAILY TAKE WITH MEALS 200 tablet 3    valsartan (DIOVAN) 80 mg tablet TAKE 1 TABLET BY MOUTH DAILY 90 tablet 2    ALPHAGAN P 0.1 % ophthalmic solution INSTILL 1 DROP IN BOTH EYES TWICE DAILY 5 mL 0    docusate sodium (COLACE) 100 mg capsule Take 1 capsule (100 mg total) by mouth Twice a day. 60 capsule 0    loratadine (CLARITIN) 10 mg tablet Take 1 tablet (10 mg total) by mouth Daily. 90 tablet 1    mometasone (NASONEX) 50 mcg/actuation spray 2 sprays by Nasal route Daily. 17 g 6    nitroGLYCERIN (NITROSTAT) 0.4 mg SL tablet Place 1 tablet (0.4 mg total) under the tongue every 5 (five) minutes as needed for Chest pain. 100 tablet 3    olopatadine (PATANOL) 0.1 % ophthalmic solution Place 1 drop into both eyes daily as needed (1 drop as needed). 5 mL 6    [DISCONTINUED] doxycycline (MONODOX) 100 mg capsule       [DISCONTINUED] furosemide (LASIX) 40 mg tablet Take 0.5 tablets (20 mg total) by mouth Daily. 3 tablet 0    [DISCONTINUED] oxyCODONE (ROXICODONE) 5 mg tablet Take 1 tablet (5 mg total) by mouth every 4 (four) hours as needed for Pain. 30 tablet 0     Facility-Administered Encounter Medications as of 11/21/2016   Medication Dose Route Frequency Provider Last Rate Last Dose    [COMPLETED] furosemide (LASIX) injection 20 mg  20 mg Intravenous Once ***** *****, MD   20 mg at 11/20/16 1632    [DISCONTINUED] 0.9 % sodium chloride flush injection syringe  3 mL Intravenous Q8H SCH ***** ***** *****, *****-C   3 mL at 11/20/16 2138    [DISCONTINUED] 0.9 % sodium
 chloride flush injection syringe  3 mL Intravenous PRN ***** ***** *****, *****-C        [DISCONTINUED] acetaminophen (TYLENOL) tablet 500 mg  500 mg Oral Q6H PRN ***** ***** *****, *****-C        [DISCONTINUED] acetaminophen (TYLENOL) tablet 500 mg  500 mg Oral Q6H PRN ***** ***** *****, *****-C        [DISCONTINUED] amLODIPine (NORVASC) tablet 2.5 mg  2.5 mg Oral Daily (AM) ***** ***** *****, *****-C        [DISCONTINUED] aspirin EC tablet 81 mg  81 mg Oral Daily (AM) ***** ***** *****, *****-C        [DISCONTINUED] atorvastatin (LIPITOR) tablet 40 mg  40 mg Oral Q PM ***** ***** *****, *****-C   40 mg at 11/20/16 2128    [DISCONTINUED] brimonidine (ALPHAGAN) 0.1 % ophthalmic solution 1 drop  1 drop Both Eyes BID ***** ***** *****, *****-C   1 drop at 11/20/16 2128    [DISCONTINUED] dextrose 50% injection syringe 12.5 g  25 mL Intravenous Q15 Min PRN ***** ***** *****, *****-C        [DISCONTINUED] docusate sodium (COLACE) capsule 100 mg  100 mg Oral BID ***** ***** *****, *****-C   100 mg at 11/20/16 2128    [DISCONTINUED] glipiZIDE (GLUCOTROL) tablet 10 mg  10 mg Oral Daily with Breakfast ***** ***** *****, *****-C        [DISCONTINUED] glucose chewable tablet 20 g  20 g Oral Q15 Min PRN ***** ***** *****, *****-C        [DISCONTINUED] insulin aspart (NovoLOG) injection 100 units/mL pen  0-20 Units Subcutaneous TID ***** ***** ***** *****, *****-C   0 Units at 11/20/16 1843    [DISCONTINUED] insulin aspart (NovoLOG) injection 100 units/mL pen  0-3 Units Subcutaneous Bedtime and early am ***** ***** *****, *****-C   0 Units at 11/20/16 2134    [DISCONTINUED] insulin glargine (LANTUS, BASAGLAR) injection 100 units/mL pen  5 Units Subcutaneous Bedtime ***** ***** *****, *****-C   5 Units at 11/20/16 2136    [DISCONTINUED] latanoprost (XALATAN) 0.005 % ophthalmic solution 1 drop  1 drop Both Eyes Bedtime ***** ***** *****, *****-C   1 drop at 11/20/16 2128    [DISCONTINUED] loratadine (CLARITIN) tablet 10 mg  10 mg Oral Daily (AM) ***** *****
 *****, *****-C        [DISCONTINUED] metFORMIN (GLUCOPHAGE) tablet 1,000 mg  1,000 mg Oral BID ***** ***** ***** *****, *****-C   1,000 mg at 11/20/16 1845    [DISCONTINUED] nitroGLYCERIN (NITROSTAT) SL tablet 0.4 mg  0.4 mg Sublingual Q5 Min PRN ***** ***** *****, *****-C        [DISCONTINUED] ondansetron (ZOFRAN) injection 4 mg  4 mg Intravenous Q8H PRN ***** ***** *****, *****-C        [DISCONTINUED] ondansetron (ZOFRAN) tablet 4 mg  4 mg Oral Q8H PRN ***** ***** *****, *****-C        [DISCONTINUED] valsartan (DIOVAN) tablet 80 mg  80 mg Oral Daily (AM) ***** ***** *****, *****-C           No Known Allergies       Review of Systems:   General - some fatigue at baseline; stable weight   Eyes - no vision changes   HENT - stable hearing, no nasal discharge or sinus tenderness, no difficulty swallowing, no mouth sores, no sore throat   Breast - no breast pain  Respiratory - dyspnea at baseline; no cough, no wheezing   Cardiovascular - no chest pain, no palpitations, no lower extremity edema   GI - constipation; no heartburn, no nausea, no emesis, no abdominal pain, no diarrhea, no melena, no hematochezia   GU - no suprapubic pain, no dysuria, no vaginal bleeding   Musculoskeletal - joint stiffness; no muscle pain, no bone pain   Endocrine - no heat/cold intolerance, no hot flashes  Heme/Lymph - no easy bruising/bleeding, no lymphedema   Neurological - no headaches, no dizziness, no numbness/tingling, no falls   Psychological - anxious; no depression   Skin - no rashes/lesions, no diaphoresis       Physical Exam:   ECOG 2   Vital Signs - BP 118/67 | Pulse 87 | Temp 36.3 C (97.3 F) (Oral)  | Resp 16 | Ht 150 cm (4' 11.06") Comment: 11/21/2016 ***** | Wt 53.5 kg (118 lb) | SpO2 98% | BMI 23.79 kg/m2   Constitutional - WDWN, NAD  Eyes - sclera anicteric, PERRL, EOMI  HENT - sinuses nontender, nasal mucosa intact, pharynx without erythema, stomatitis, or thrush   Lymph Nodes - no cervical, supraclavicular, or axillary lymphadenopathy.
   Respiratory - resonant to percussion throughout, CTA bilaterally; no wheezes, rhonchi, or crackles  Cardiovascular - Normal heart rate, normal rhythm, no murmurs, no edema  Breast - right simple mastectomy; no mass in left breast  GI - Bowel sounds normal, soft; no tenderness, no distention, no HSM  Musculoskeletal - No tenderness over bones or joints.   Neurologic - Alert & oriented x 3, ambulates w/o difficulty, good strength throughout  Psychiatric - Mood stable; no HI, SI, hallucinations, paranoia, or delusions  Skin - warm without rashes or lesions       Studies:  Available labs, pathology, and imaging were reviewed and independently interpreted, as described above in the HPI.  Lab Results   Component Value Date    WBC Count 9.9 11/20/2016    Hemoglobin 9.6 (L) 11/20/2016    Hematocrit 28.9 (L) 11/20/2016    Platelet Count 222 11/20/2016     Lab Results   Component Value Date    Sodium, Serum / Plasma 133 (L) 11/20/2016    Potassium, Serum / Plasma 4.2 11/20/2016    Chloride, Serum / Plasma 102 11/20/2016    Carbon Dioxide, Total 22 11/20/2016    Urea Nitrogen, Serum / Plasma 18 11/20/2016    Creatinine 1.01 (H) 11/20/2016    Glucose, non-fasting 201 (H) 11/20/2016     Lab Results   Component Value Date    Calcium, total, Serum / Plasma 9.4 11/20/2016     Lab Results   Component Value Date    Aspartate transaminase 27 11/20/2016    Alanine transaminase 19 11/20/2016    Alkaline Phosphatase 60 11/20/2016    Bilirubin, Total 0.9 11/20/2016    Albumin, Serum / Plasma 3.0 (L) 06/01/2015     10/25/16 SURGICAL PATHOLOGY REPORT    ***** WITH ADDENDUM *****    Patient Name: *****, *****  *****. Rec.#: *****  DOB: 04/22/1935 (Age: 81)  Sex: Female  Accession #: *****-*****  Visit #: *****  Service Date: 10/25/2016  Received: 10/25/2016  Location: PPE  Client:*****   Physician(s): ***** *****. ***** ((*****) *****-*****)    FINAL PATHOLOGIC DIAGNOSIS    A. Right breast, simple mastectomy:   1. Invasive ductal carcinoma, SBR   grade
 3, 3.6 cm, negative margins;  see comment.  2. Fibroadenomas.  3. Cystic dilatation of ducts and apocrine metaplasia.  4. Calcifications associated with apocrine metaplasia and benign ducts.    5. Unremarkable skin and nipple.    B. Right axillary sentinel lymph node, #1- ex vivo count 5000, biopsy:   Metastatic carcinoma in one lymph node, 1.2 cm, no extranodal extension  (May 19); see comment.    C. Right axillary palpable non-sentinel lymph node #1, biopsy: No tumor  in one lymph node (0/1).    D. Right new anterior lateral margin, excision: Benign fibroadipose  tissue, no carcinoma.    E. Right breast skin, excision: Benign skin, no carcinoma.    COMMENT:  Breast Tumor Synoptic Comment    - Laterality: Right.  - Tumor site: Upper outer quadrant.    - Position: 10 o'clock.  - Invasive tumor type: Invasive ductal carcinoma.  - Invasive tumor size: 3.6 cm.  - Tumor size determined based on tumor present in 4 consecutive slides  (slice *****-*****, slice thickness 0.9 cm).  - Invasive tumor size after neoadjuvant therapy: N/A.  - Invasive tumor grade (modified Scarff-Bloom-Richardson): Grade 3.    - Nuclear grade: 3 points.    - Mitotic count: 74/10HPF: 3 points.    - Glandular/tubular differentiation: 3 points.    - Total points: 9 points = grade 3.  - EIC (extensive intraductal component): Negative.  - Lymphatic/vascular invasion: Features suggestive of lymphovascular  invasion present (slide A3).  - Skin/nipple: No significant pathologic abnormality.  - Skeletal muscle: No significant pathologic abnormality.  - Margins for invasive tumor: Negative.    - Posterior margin: Negative (tumor is 1.6 cm away, on slide A3).    - Medial margin: Negative (tumor is 12 cm away).    - Lateral margin: Negative (tumor is 2.4 cm away).    - Anterior/superior margin: Negative (tumor is <0.1 cm away, on  slide A4-A5).    - Anterior/inferior margin: Negative (tumor is 0.3 cm away,
 on  slide A6).  - Ductal carcinoma in situ (DCIS): None.  - Microcalcifications: Present associated with apocrine metaplasia and  benign ducts.  - Lobular carcinoma in situ: None.  - Non-neoplastic breast: Cystic dilatation of ducts, apocrine  metaplasia, fibroadenomas.  - Lymph node status: Positive.      - Total number of nodes examined: 2.      - Total number of nodes with micrometastases: 0.      - Total number of nodes with macrometastases: 1.      - Size of largest metastasis in node: 1.2 cm.      - Extranodal extension: None.    - AJCC/UICC stage: pT2N1a.    - Tumor biomarker (ER/PR/HER2) status: Will be reported by addendum.        GROSS ABNORMALITIES: A firm, palpable, pink-white mass with irregular  borders (2.6 cm medial-to-lateral x 3 cm superior-to-inferior x 2.8 cm  anterior-to-posterior; spanning slices 5 through 8) is present at 10  o'clock, approximately 7 cm from the nipple. The inferior tip of slice  9 has a pink, dark-yellow area (1 x 0.7 x 0.8 cm) resembling a prior  biopsy site.    A small (0.5 cm greatest dimension) orange-yellow translucent bead is  identified in slice 10 within white, firm fibrous-appearing tissue. The  bead is wrapped in tissue paper and gauze and placed with the remainder  of slice 10 in the specimen container.    Margins:  -Anterior/superior: Mass is <0.1 cm from margin (slice 6 and 7).  -Anterior/inferior: Mass is 0.1 cm from margin (***** 7).  -Medial: Mass is 12 cm from margin.  -Lateral: Mass is 2.4 cm from margin  -Deep: Mass is 1.5 cm from margin (slice 8).  -Nipple base: Mass is 8.4 cm from nipple base.  -Skin: Mass is 2.5 cm from lateral skin ellipse tip.    There is no nipple retraction or crust. The skin has a blue discolored  area (0.7 cm) immediately adjacent to the areola grossly consistent with  a bruise. No scars are seen. The deep surface is smooth and appears  complete. The anterior surfaces have lobular
 yellow adipose tissue.   The parenchyma uninvolved by the mass consist of mixed yellow adipose  tissue (70%) and white, firm tissue (30%). The specimen is radiographed  in the Pathology Department and a clip is not identified.    ***** *****/Pathology Resident  ***** *****/Pathologist    Electronically signed out on *****/*****/***** *****:*****      Addendum   Date Ordered:   10/31/2016   Status:  Signed Out    Date Complete:   10/31/2016   By: ***** *****    Date Reported:   10/31/2016     Addendum Diagnosis    Addendum Comment    Immunohistochemical tests for estrogen and progesterone receptors, HER2  and Ki-67 were performed by manual morphometry on block A4.    The test for estrogen receptors is negative. There is no nuclear  staining in tumor cells. Internal positive control is present, and  external positive control is appropriate.    The test for progesterone receptors is negative. There is no nuclear  staining in tumor cells. Internal positive control is present, and  external positive control is appropriate.    Result of HER2 test: This carcinoma is negative for HER2 oncoprotein  over-expression. The staining intensity of this carcinoma was 1 on a  scale of 0-3.    Ki-67 proliferation index: Variable expression ranging from <5% to  25-30%. Overall, it is estimated to be ~15%.       Assessment and Recommendations:  81 y.o. postmenopausal patient with multiple medical comorbidities, including heart failure with LVEF 25% and DM II, who has recently been diagnosed with a Stage II (T2N1) triple negative breast cancer who requires additional testing before final recommendations can be made.    I discussed with the patient and her family the natural history of triple negative breast cancer.  We reviewed the available pathology and imaging reports.  I explained that chemotherapy is the only FDA approved treatment for triple negative breast cancer and that we generally provide chemotherapy to
 patients with Stage II TNBC.  We then reviewed her multiple medical comorbidities and I stated great concern that our chemotherapy regimens would likely cause her great harm and that she would have a very difficult time tolerating them.  We discussed in brief the regimens of AC/T (which would not be safe in the context of her heart failure), TC (which would not be safe in the context of her heart failure due to fluid shifts as well as her diabetes due to use of dexamethasone), and finally CMF (although less toxic, still introducing great risk).    We discussed the likelihood of recurrence which is quite difficult to assess, but certainly notable in the context of a high grade, node positive, triple negative breast cancer.  We discussed recurrence as both a local and distant concept.      We discussed the role of staging imaging to assess for metastasis at this time.  She is interested in knowing whether or not her disease has already spread and believes she might want to consider a lower risk chemotherapy were she found to have metastasis.  I think that with regards to her other medical issues, it would be of prognostic value at this time to know whether or not she does have metastases. We will obtain a PET/CT and resume our conversation with regards to treatment at that time.     Recommendations in Brief:  - obtain PET/CT    I spent a total of 90 minutes face-to-face with the patient and 85 minutes of that time was spent counseling regarding the diagnosis, the treatment plan, the prognosis, medication risks, symptoms and therapeutic options.
"""
test = med7(note)
[(ent.text, ent.label_) for ent in test.ents]


In [None]:
from collections import defaultdict

# example Med7 output
#entities = [('Lasix', 'DRUG'), ('500 mg', 'STRENGTH'), ('tablet', 'FORM'), ('1', 'DOSAGE'), ('tablet', 'FORM'),
            #('by mouth', 'ROUTE'), ('every 6 (six) hours as needed', 'FREQUENCY'),
            #('acetaminophen', 'DRUG'), ('500 mg', 'STRENGTH'), ('tablet', 'FORM'), ('Lasix', 'DRUG'), ('10', 'DOSAGE')]

# Step 1: Group by DRUG
entities = []
for ent in test.ents:
    entity = {
        'text': ent.text,
        'label': ent.label_,
        'start': ent.start_char,
        'end': ent.end_char
    }
    entities.append(entity)

# View result
for e in entities:
    print(e)


In [None]:
ls drive/MyDrive/

In [5]:
# read in the dataset as a pandas dataframe
import pandas as pd

# Load the Excel files
file1 = "./drive/MyDrive/merged_random50_discharge_prescriptions.csv"
#file2 = "./drive/My Drive/patient_data_cleaned.xlsx"
df1 = pd.read_csv(file1)

df1.head()

Unnamed: 0,subject_id,hadm_id,text,charttime,drug,dose_val_rx,dose_unit_rx,route,form_unit_disp
0,13954263,20218399,\nName: ___ Unit No: ___\n...,2188-12-02 00:00:00,"['CloniDINE', 'Nicotine Polacrilex', 'CloniDIN...","['0.1', '1', '0.1', '150', '25', '50', '1', '2...","['mg', 'STCK', 'mg', 'mg', 'mg', 'mg', 'PUFF',...","['PO', 'PO', 'PO', 'PO', 'PO/NG', 'PO/NG', 'IH...","['TAB', 'STCK', 'TAB', 'TAB', 'CAP', 'CAP', 'I..."
1,16651008,29326316,\nName: ___ Unit No: ___...,2144-07-31 00:00:00,"['Aspirin', 'Glucose Gel', 'Sodium Chloride 0....","['81', '15', '1000', '0', '1', '1334', '1', '1...","['mg', 'g', 'mL', 'UNIT', 'CAP', 'mg', 'mg', '...","['PO/NG', 'PO', 'IV', 'SC', 'PO/NG', 'PO/NG', ...","['TAB', 'TUBE', 'mL', 'VIAL', 'CAP', 'CAP', 'V..."
2,16421439,29826046,\nName: ___ Unit No: ___\n \...,2151-02-09 00:00:00,"['Heparin', 'OxyCODONE (Immediate Release)', '...","['5000', '5', '1', '3-10', '100', '500', '3-10...","['UNIT', 'mg', 'mg', 'mL', 'mL', 'mg', 'mL', '...","['SC', 'PO/NG', 'IV', 'IV', 'IV', 'IV', 'IV', ...","['mL', 'TAB', 'SYR', 'SYR', 'mL', 'BAG', 'SYR'..."
3,11369634,29132156,\nName: ___. Unit No: ___\n...,2136-12-11 00:00:00,"['Ondansetron', 'Docusate Sodium', 'OxycoDONE ...","['8', '200', '5', '5', '600', nan, '50', '500'...","['mg', 'mg', 'mg', 'mg', 'mg', nan, 'mcg', 'mg...","['IV', 'PO', 'PO/NG', 'PO/NG', 'PO/NG', 'SC', ...","['VIAL', 'CAP', 'TAB', 'TAB', 'TAB', nan, 'TAB..."
4,14082490,23218874,\nName: ___ Unit No: ___\n...,2187-08-13 00:00:00,"['LORazepam', 'Sodium Chloride 0.9% Flush', '...","['1', '3-10', '100', '125', '1', '50', '10', '...","['mg', 'mL', 'mg', 'mg', 'TAB', 'mL', 'mg', 'm...","['PO/NG', 'IV', 'PO/NG', 'PO/NG', 'PO/NG', 'IV...","['TAB', 'SYR', 'TAB', 'mL', 'TAB', 'mL', 'mL',..."


In [6]:
import re

def parse_dosage(dosage_string):
    # Remove spaces around hyphens
    dosage_string = re.sub(r'\s*-\s*', '-', dosage_string.strip())

    # Match dosage and unit
    match = re.match(r'([\d\.]+(?:-\d+\.?)?)\s*([a-zA-Z]+)', dosage_string)

    if match:
        dosage_value = match.group(1)
        unit = match.group(2)
        return dosage_value, unit
    else:
        return None, None

# Examples
examples = ["30 mg", "30mg", "3-5mg", "3mg-5mg", "3.5mg", "3.5-5.0 mg"]
for ex in examples:
    dosage, unit = parse_dosage(ex)
    print(f"Input: '{ex}' --> Dosage: '{dosage}', Unit: '{unit}'")

Input: '30 mg' --> Dosage: '30', Unit: 'mg'
Input: '30mg' --> Dosage: '30', Unit: 'mg'
Input: '3-5mg' --> Dosage: '3-5', Unit: 'mg'
Input: '3mg-5mg' --> Dosage: '3', Unit: 'mg'
Input: '3.5mg' --> Dosage: '3.5', Unit: 'mg'
Input: '3.5-5.0 mg' --> Dosage: 'None', Unit: 'None'


In [7]:
def group_med7_entities(med7_output):
    """Group Med7 entity outputs into structured medication blocks."""
    entities = []
    for ent in med7_output.ents:
      entity = {
          'text': ent.text,
          'label': ent.label_,
          'start': ent.start_char,
          'end': ent.end_char,
          'start word': ent.start,
          'end word': ent.end
      }
      entities.append(entity)

    return entities

def group_med7_entities_only_label_text(med7_output):
    """Group Med7 entity outputs into structured medication blocks."""
    entities = []
    for ent in med7_output.ents:
      entities.append({ent.label_: ent.text})

    return entities

def group_heuristic(med7_out):
    """Group Med7 entity outputs into structured medication blocks."""
    #entities = [dose_val, dose_unit, route, form]
    drug_list = []
    counter = 0
    for ent in med7_out.ents:
      # if it is a drug, make it a new entry
      if ent.label_ == 'DRUG':
        drug_list.append(counter)
      counter += 1

    drug_dict = {}
    j = 0
    # itterate through all of the indecies which are not drugs
    for j in range(len(drug_list)-1):
      start = drug_list[j] + 1
      end = drug_list[j+1]

      entities = ['Not Mentioned', 'Not Mentioned', 'Not Mentioned', 'Not Mentioned']
      while start != end:
        if med7_out.ents[start].label_ == 'ROUTE':
          entities[2] = (med7_out.ents[start].text)
        elif med7_out.ents[start].label_ == 'STRENGTH':
          dos, unit = parse_dosage(med7_out.ents[start].text)
          entities[0] = dos
          entities[1] = unit
        elif med7_out.ents[start].label_ == 'FORM':
          entities[3] = (med7_out.ents[start].text)
        start +=1

      # we need to add the tuple of terms to the dictionary
      drug = med7_out.ents[drug_list[j]].text.lower()
      if drug in drug_dict.keys():
        drug_dict[drug].append(tuple(entities))
      else:
        drug_dict[drug] = [tuple(entities)]

    # for key in drug_dict.keys():
    #   print(f"{key.lower()}")

    return drug_dict


In [None]:
group_heuristic(test)

In [None]:
print(df1['text'][0])

In [None]:
ls drive/MyDrive

In [None]:
import json

In [None]:
with open(f'./drive/My Drive/Med7_mimic/med7_test_{df1["hadm_id"][0]}.json', 'w') as f:
        json.dump(group_heuristic(test), f, indent=2)

In [None]:
test = med7(df1['text'][0])
group_med7_entities(test)

In [10]:
test1 = med7("23. Digoxin 125 mcg Tablet Sig: 0.5 Tablet PO DAILY (Daily). 24. Torsemide 20 mg Tablet Sig: Three (3) Tablet PO once a day.  25. Warfarin 1 mg Tablet Sig: Three (3) Tablet PO Once Daily at 4 ___.  ...Please note the following changes to your medications: please stop taking lasix, and ativan (as it may over sedate you); please begin taking torsemide 60mg once a day, metoprolol 100mg three times a day, digoxin 0.0625mg daily, coumadin 3mg daily, levofloxacin 500mg once a day for 2 more days. ")

In [11]:
group_med7_entities_only_label_text(test1)

[{'DRUG': 'Digoxin'},
 {'STRENGTH': '125 mcg'},
 {'FORM': 'Tablet'},
 {'DOSAGE': '0.5'},
 {'FORM': 'Tablet'},
 {'ROUTE': 'PO'},
 {'FREQUENCY': 'DAILY (Daily)'},
 {'DRUG': 'Torsemide'},
 {'STRENGTH': '20 mg'},
 {'FORM': 'Tablet'},
 {'DOSAGE': 'Three (3)'},
 {'FORM': 'Tablet'},
 {'ROUTE': 'PO'},
 {'FREQUENCY': 'once a day'},
 {'DRUG': 'Warfarin'},
 {'STRENGTH': '1 mg'},
 {'FORM': 'Tablet'},
 {'DOSAGE': 'Three (3)'},
 {'FORM': 'Tablet'},
 {'ROUTE': 'PO'},
 {'FREQUENCY': 'Once Daily at 4 ___.  ...Please note the following changes to your medications: please stop taking lasix,'},
 {'DRUG': 'ativan'},
 {'DRUG': 'torsemide'},
 {'STRENGTH': '60mg'},
 {'FREQUENCY': 'once a day'},
 {'DRUG': 'metoprolol'},
 {'STRENGTH': '100mg'},
 {'FREQUENCY': 'three times a day'},
 {'DRUG': 'digoxin'},
 {'STRENGTH': '0.0625mg'},
 {'FREQUENCY': 'daily'},
 {'DRUG': 'coumadin'},
 {'STRENGTH': '3mg'},
 {'FREQUENCY': 'daily'},
 {'DRUG': 'levofloxacin'},
 {'STRENGTH': '500mg'},
 {'FREQUENCY': 'once a day'},
 {'DURATI

In [None]:
note3 = combined_df['Entire Patient Note'][3]
# parse note3 by words
words = note3.split()
print(words[37])

In [None]:
test = med7(combined_df['Entire Patient Note'][3])
group_med7_entities(test)

In [None]:
ls ./drive/My\ Drive/Colab\ Notebooks/med7_char_out

In [None]:
for i, note in enumerate(df1['text']):
  print(note[:20])

In [8]:
def extract_medications(text):
    text_lower = text.lower()

    # Find position of "medications on admission"
    # idx = text_lower.find("medications on admission")
    # if idx != -1:
    #     return text[idx:]  # Return original casing starting from that point

    # Otherwise, find "discharge medications"
    idx = text_lower.find("discharge medications")
    if idx != -1:
        return text[idx:]  # Return original casing starting from that point

    # If neither phrase is found
    return None  # or `return text` if you prefer to keep the whole text

# Apply to create new column
df1['meds_section'] = df1['text'].apply(extract_medications)
df1.head()

Unnamed: 0,subject_id,hadm_id,text,charttime,drug,dose_val_rx,dose_unit_rx,route,form_unit_disp,meds_section
0,13954263,20218399,\nName: ___ Unit No: ___\n...,2188-12-02 00:00:00,"['CloniDINE', 'Nicotine Polacrilex', 'CloniDIN...","['0.1', '1', '0.1', '150', '25', '50', '1', '2...","['mg', 'STCK', 'mg', 'mg', 'mg', 'mg', 'PUFF',...","['PO', 'PO', 'PO', 'PO', 'PO/NG', 'PO/NG', 'IH...","['TAB', 'STCK', 'TAB', 'TAB', 'CAP', 'CAP', 'I...",Discharge Medications:\n1. Multivitamin Ta...
1,16651008,29326316,\nName: ___ Unit No: ___...,2144-07-31 00:00:00,"['Aspirin', 'Glucose Gel', 'Sodium Chloride 0....","['81', '15', '1000', '0', '1', '1334', '1', '1...","['mg', 'g', 'mL', 'UNIT', 'CAP', 'mg', 'mg', '...","['PO/NG', 'PO', 'IV', 'SC', 'PO/NG', 'PO/NG', ...","['TAB', 'TUBE', 'mL', 'VIAL', 'CAP', 'CAP', 'V...",Discharge Medications:\n1. Acetaminophen 650 m...
2,16421439,29826046,\nName: ___ Unit No: ___\n \...,2151-02-09 00:00:00,"['Heparin', 'OxyCODONE (Immediate Release)', '...","['5000', '5', '1', '3-10', '100', '500', '3-10...","['UNIT', 'mg', 'mg', 'mL', 'mL', 'mg', 'mL', '...","['SC', 'PO/NG', 'IV', 'IV', 'IV', 'IV', 'IV', ...","['mL', 'TAB', 'SYR', 'SYR', 'mL', 'BAG', 'SYR'...",Discharge Medications:\n1. Acetaminophen 1000...
3,11369634,29132156,\nName: ___. Unit No: ___\n...,2136-12-11 00:00:00,"['Ondansetron', 'Docusate Sodium', 'OxycoDONE ...","['8', '200', '5', '5', '600', nan, '50', '500'...","['mg', 'mg', 'mg', 'mg', 'mg', nan, 'mcg', 'mg...","['IV', 'PO', 'PO/NG', 'PO/NG', 'PO/NG', 'SC', ...","['VIAL', 'CAP', 'TAB', 'TAB', 'TAB', nan, 'TAB...",Discharge Medications:\n1. omeprazole 20 mg Ca...
4,14082490,23218874,\nName: ___ Unit No: ___\n...,2187-08-13 00:00:00,"['LORazepam', 'Sodium Chloride 0.9% Flush', '...","['1', '3-10', '100', '125', '1', '50', '10', '...","['mg', 'mL', 'mg', 'mg', 'TAB', 'mL', 'mg', 'm...","['PO/NG', 'IV', 'PO/NG', 'PO/NG', 'PO/NG', 'IV...","['TAB', 'SYR', 'TAB', 'mL', 'TAB', 'mL', 'mL',...",Discharge Medications:\n1. Vancomycin Oral Li...


In [9]:
import json
for i, note in enumerate(df1['meds_section']):
  # get med7 data
  hadm_ids = [26590365, 25987676, 28337573, 27119045, 23058381]
  if df1['hadm_id'][i] in hadm_ids:
    print(f"Original note length: {len(df1['text'][i])}")
    print(f"Subset note length: {len(note)}")
    print(f"Starting med7 for group {df1['hadm_id'][i]}")
    med7_out = med7(note)
    print(f"Completed med7 for group {df1['hadm_id'][i]}")
    med_list = group_heuristic(med7_out)

    # Save to JSON
    with open(f'./drive/My Drive/Med7_mimic_val/med7_val_{df1["hadm_id"][i]}.json', 'w') as f:
        json.dump(med_list, f, indent=2)
    print(f"Saved output as json for group {df1['hadm_id'][i]}")



Original note length: 16827
Subset note length: 3805
Starting med7 for group 26590365
Completed med7 for group 26590365
Saved output as json for group 26590365
Original note length: 7034
Subset note length: 2932
Starting med7 for group 23058381
Completed med7 for group 23058381
Saved output as json for group 23058381
Original note length: 10879
Subset note length: 2611
Starting med7 for group 28337573
Completed med7 for group 28337573
Saved output as json for group 28337573
Original note length: 12571
Subset note length: 3528
Starting med7 for group 25987676
Completed med7 for group 25987676
Saved output as json for group 25987676
Original note length: 10074
Subset note length: 1812
Starting med7 for group 27119045
Completed med7 for group 27119045
Saved output as json for group 27119045
