In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm
import collections
import random
import re

In [3]:
# Input/output locations (edit these to match your local layout)
MIMIC_DIR = '../data/mimic3'
NOTEEVENTS_FNAME = 'NOTEEVENTS.csv'
NOTE_PER_SPLIT = 100000  # rows written to each split CSV
NOTE_SPLIT_DIR = os.path.join(MIMIC_DIR, 'split')

ENGLISH_WORDS_FPATH = '../data/english_words/words.txt'
UMLS_DIR = '../data/umls'
LEXICON_OUT_FNAME = 'lexicon.json'
LEXICON_OUT_DIR = '../data/lexicon'

# Character classes used when filtering candidate words
ALPHA_SET = set("abcdefghijklmnopqrstuvwxyz")  # lowercase ASCII letters
ALLOWED_CHAR_SET = ALPHA_SET | set("'-&")      # letters plus apostrophe, hyphen, ampersand
CHAR_SET = (
    set("0123456789")
    | ALPHA_SET
    | set("+-*/^.,;:=!?'()[]{} ")
)

In [5]:
# Split NOTEEVENTS.csv into smaller CSV files of at most NOTE_PER_SPLIT rows each.

orig_note_fpath = os.path.join(MIMIC_DIR, NOTEEVENTS_FNAME)

# Load the original MIMIC-III note file; the whole table is held in memory.
df_note = pd.read_csv(orig_note_fpath, low_memory=False)
print(f"{orig_note_fpath} contains {len(df_note)} rows")

# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, so there is no window between "check" and "create".
os.makedirs(NOTE_SPLIT_DIR, exist_ok=True)

# Write consecutive row ranges; the file suffix is the zero-based split number.
for i in range(0, len(df_note), NOTE_PER_SPLIT):
    # .iloc makes the slice explicitly positional, independent of the index labels.
    split_chunk = df_note.iloc[i:i + NOTE_PER_SPLIT]
    split_fpath = os.path.join(NOTE_SPLIT_DIR, f'NOTEEVENTS_{i//NOTE_PER_SPLIT}.csv')
    split_chunk.to_csv(split_fpath, index=False)
    print(f"Created {split_fpath} with {len(split_chunk)} rows")


../data/mimic3\NOTEEVENTS.csv contains 2083180 rows
Created ../data/mimic3\split\NOTEEVENTS_0.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_1.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_2.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_3.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_4.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_5.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_6.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_7.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_8.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_9.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_10.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_11.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_12.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_13.csv with 100000 rows
Created ../data/mimic3\split\NOTEEVENTS_14.csv with 100000 row

In [7]:

# 1) Load the English word list.
#    The file is decoded as UTF-8; errors='replace' substitutes undecodable
#    bytes instead of raising.
with open(ENGLISH_WORDS_FPATH, 'r', encoding='utf-8', errors='replace') as words_file:
    normalized_words = (raw_line.strip().lower() for raw_line in words_file)
    # Keep only non-empty entries that start with a letter and consist
    # entirely of allowed characters.
    english_vocab = [
        word for word in normalized_words
        if word
        and word[0] in ALPHA_SET
        and ALLOWED_CHAR_SET.issuperset(word)
    ]

english_vocab_set = set(english_vocab)
print(f"Loaded {len(english_vocab_set)} English words")

# 2) Load UMLS LRWD
lrwd_vocab_set = set()
lrwd_path = os.path.join(UMLS_DIR, 'LRWD')

with open(lrwd_path, 'r', encoding='utf-8', errors='replace') as fd:
    lines = fd.read().splitlines()  # materialised so tqdm can show a total

for line in tqdm(lines, desc="Loading LRWD"):
    # Records are pipe-separated and only the first field (the word) is used.
    # Taking just that field means a blank line or a line with an unexpected
    # number of pipes no longer aborts the whole load with ValueError.
    word = line.split('|', 1)[0]
    # An empty first field would pass the all(...) check vacuously and then
    # raise IndexError on word[0]; skip it explicitly.
    if not word:
        continue
    # The filter runs on the word as written. ALLOWED_CHAR_SET holds no
    # whitespace, so an accepted word is always a single token and needs no
    # further splitting.
    # NOTE(review): ALLOWED_CHAR_SET is lowercase-only, so words containing
    # capitals are rejected before .lower() is applied - confirm this is intended.
    if word[0] in ALPHA_SET and all(c in ALLOWED_CHAR_SET for c in word):
        lrwd_vocab_set.add(word.lower())

print(f"LRWD has {len(lrwd_vocab_set)} words")

# 3) Load UMLS Prevariants
prevariants_vocab_set = set()
prevars_path = os.path.join(UMLS_DIR, 'prevariants')

with open(prevars_path, 'r', encoding='utf-8', errors='replace') as fd:
    lines = fd.read().splitlines()  # materialised so tqdm can show a total

for line in tqdm(lines, desc="Loading prevariants"):
    # Records are pipe-separated and only the first field (the phrase) is used.
    # Taking just that field means a blank line or a line with an unexpected
    # number of pipes no longer aborts the whole load with ValueError.
    phrase = line.split('|', 1)[0]
    # str.split() never yields empty tokens, so w[0] is always safe here;
    # an empty phrase simply contributes nothing.
    for w in phrase.split():
        # NOTE(review): the filter runs before lowercasing and the allowed set
        # is lowercase-only, so tokens with capitals are dropped - confirm intended.
        if w[0] in ALPHA_SET and all(c in ALLOWED_CHAR_SET for c in w):
            prevariants_vocab_set.add(w.lower())

print(f"Prevariants has {len(prevariants_vocab_set)} words")

# Combine the two UMLS-derived vocabularies into one set
umls_vocab_set = lrwd_vocab_set.union(prevariants_vocab_set)
print(f"UMLS total = {len(umls_vocab_set)} words")

# 4) Final vocabulary: English words plus UMLS words, as a sorted list
total_vocab_set = english_vocab_set.union(umls_vocab_set)
total_vocab_list = sorted(total_vocab_set)
print(f"Total vocab = {len(total_vocab_list)} words")

# 5) Write out the lexicon (JSON array of words)
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, so there is no window between "check" and "create".
os.makedirs(LEXICON_OUT_DIR, exist_ok=True)

lexicon_out_fpath = os.path.join(LEXICON_OUT_DIR, LEXICON_OUT_FNAME)
with open(lexicon_out_fpath, 'w', encoding='utf-8') as fd:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(total_vocab_list, fd, ensure_ascii=False)

print(f"Saved final lexicon to {lexicon_out_fpath}")



Loaded 465091 English words


Loading LRWD: 100%|██████████| 1304243/1304243 [00:01<00:00, 723352.02it/s]


LRWD has 401471 words


Loading prevariants: 100%|██████████| 923076/923076 [00:02<00:00, 431610.30it/s]


Prevariants has 509533 words
UMLS total = 514305 words
Total vocab = 822919 words
Saved final lexicon to ../data/lexicon\lexicon.json


In [13]:
# Reload the lexicon from disk as a set for fast membership tests
lexicon_fpath = os.path.join(LEXICON_OUT_DIR, LEXICON_OUT_FNAME)
with open(lexicon_fpath, 'r', encoding='utf-8') as f:
    final_lexicon = set(json.load(f))

# Sample: the first 1000 rows of the first split file (adjust filename/column if needed)
sample_fpath = os.path.join(NOTE_SPLIT_DIR, 'NOTEEVENTS_0.csv')
df_sample = pd.read_csv(sample_fpath, nrows=1000)
text_lines = df_sample['TEXT'].dropna().tolist()

# Naive tokenization: lowercase each note and split on whitespace
tokens = [tok for note in text_lines for tok in note.lower().split()]

if not tokens:
    print("No tokens found (column might be empty).")
else:
    # Fraction of token occurrences (not distinct types) found in the lexicon
    in_lex = sum(1 for tok in tokens if tok in final_lexicon)
    coverage = in_lex / len(tokens)
    print(f"Checked {len(tokens)} tokens. Coverage in lexicon: {coverage:.2%}")



Checked 1580871 tokens. Coverage in lexicon: 69.15%


In [19]:
from collections import Counter

# Frequency table over the whitespace tokens collected from the sampled notes
freq_counter = Counter(tokens)
print(f"Unique token types: {len(freq_counter):,}")

# Ten most frequent token types
most_common_tokens = freq_counter.most_common(10)
print("Top 10 tokens by frequency:")
for token, count in most_common_tokens:
    print(f"{token}: {count}")

# Token types seen exactly once
rare_count = sum(occurrences == 1 for occurrences in freq_counter.values())
print(f"\nTokens that appear only once: {rare_count:,}")

# Token types that are absent from the lexicon
oov_list = [tok for tok in freq_counter.keys() if tok not in final_lexicon]
print(f"\nOut‑of‑lexicon token types: {len(oov_list):,}")

# Twenty most frequent out-of-lexicon types, highest count first
oov_sorted = sorted(oov_list, key=freq_counter.__getitem__, reverse=True)[:20]
print("Top 20 out‑of‑lexicon tokens:")
for token in oov_sorted:
    print(f"{token}: {freq_counter[token]}")




Unique token types: 84,206
Top 10 tokens by frequency:
the: 38562
and: 32645
to: 28448
of: 27545
was: 26744
with: 19844
on: 18046
a: 17907
in: 15520
for: 14022

Tokens that appear only once: 44,730

Out‑of‑lexicon token types: 68,443
Top 20 out‑of‑lexicon tokens:
(1): 6775
sig:: 6649
.: 5967
-: 5908
**]: 5444
[**last: 3887
history:: 2812
1.: 2710
2.: 2656
[**hospital1: 2541
3.: 2350
2: 2235
1: 2207
#: 2178
date:: 2011
4.: 1937
(stitle): 1906
[**first: 1736
s/p: 1732
(daily).: 1695


In [21]:
# Corrected words to look up in the clinspell lexicon; repeats are kept on purpose
# so the list has the same contents and order as before.
corrected = (
    ["cardiology"]
    + ["lungs"] * 5
    + ["echinocytes"]
    + ["procedure"] * 2
    + ["evening"] * 3
    + ["etanercept"] * 3
    + ["hepatology"]
    # ... more if needed
)

# Load the clinspell English lexicon (a JSON array) and index it as a set
with open('../data/mimic_clinspell/lexicon_en.json', 'r', encoding='utf-8') as f:
    clinspell_list = json.load(f)
clinspell_set = set(clinspell_list)
print(f"Clinspell vocabulary size: {len(clinspell_set)}")

# Every entry of `corrected` (duplicates included) that the lexicon lacks
missing = [word for word in corrected if word not in clinspell_set]
print(f"Clinspell did not cover {len(missing)} corrected words:")
print(missing[:20])



Clinspell vocabulary size: 293463
Clinspell did not cover 1 corrected words:
['echinocytes']
