In [None]:
import pandas as pd
import re
from collections import Counter
from tqdm import tqdm
# Register `progress_apply` on pandas objects so the row-wise passes below show a progress bar.
tqdm.pandas()

# Raw patient notes; the free-text `pn_history` column is what the rest of the notebook reads.
PATIENT_NOTES_CSV = "../input/nbme-score-clinical-patient-notes/patient_notes.csv"
pnotes = pd.read_csv(PATIENT_NOTES_CSV)

In [None]:
import json

# Pseudonames used in the notes; their parts are flagged as valid words further down.
# See https://www.kaggle.com/jdoesv/spellcheck-pseudonames-removal
pseudonames = ["Dillon Cleveland", "Suzanne Powelton", "Dolores Montgomery", 
               "Chad Hamilton", "Karin Moore", "Edie Whelan", 
               "Kane Smith", "Angela Tompkins", "Loraine Wicks", "Stephanie Madden"]
# Split each full name into lower-cased parts and de-duplicate them.
# `sorted(...)` is used instead of `list(set(...))` because the iteration order of a set of
# strings changes between interpreter runs, which made pseudonames.json differ on every run.
pseudonames = sorted({part for name in pseudonames for part in name.lower().split()})
with open("pseudonames.json", "w") as f:
    # These needn't be updated if there's a partial match.
    json.dump(pseudonames, f, indent=2)

```python
# Acronym-like tokens: three or more slash-separated groups of 1-2 letters, e.g. "n/v/d".
# Lookarounds are used instead of a leading/trailing `\W` so that
#   * the surrounding delimiter characters are not part of the match,
#   * acronyms at the very start or end of a note are found, and
#   * two acronyms separated by a single character are both matched
#     (a delimiter consumed by one match cannot start the next one).
acronym_regex = re.compile(r"(?<!\w)(?:[a-zA-Z]{1,2}/){2,}[a-zA-Z]{1,2}(?!\w)")
# Every acronym-like match in every note (one list of matches per note).
acronyms_per_note = pnotes["pn_history"].progress_apply(
    lambda txt: [m.group() for m in acronym_regex.finditer(txt)]
)
# Flatten in a single pass; `sum(list_of_lists, [])` re-copies the accumulator for every note.
all_acronyms = [acr for note_acronyms in acronyms_per_note for acr in note_acronyms]

# One row per distinct match together with its occurrence count.
med_acronyms = pd.Series(all_acronyms).value_counts().reset_index()
med_acronyms.columns = ["acr", "ct"]

# Order-insensitive key: the characters of the match, slashes removed, sorted.
med_acronyms["ma_hash"] = med_acronyms["acr"].apply(lambda x: "".join(sorted(x.replace("/", ""))))

# Total count and list of spellings per key; keys that occur only once are dropped.
ma_hashes = med_acronyms.groupby("ma_hash").agg({"ct": "sum", "acr": list}).reset_index()
ma_hashes = ma_hashes[ma_hashes["ct"]>1]
ma_hashes.sort_values("ct")
```

In [None]:
#f/c/n/v, fcnv = fever, cough, nausea, and vomiting
#n/v/d/c, n/v/d, nvdc, nvd = nausea, vomiting, diarrhea, constipation
#w/c/o, wco = with complaints of. OK to ignore.
#f/u = follow-up
#h/a, ha = headache


#n/v/d/c/f = nausea, vomiting, diarrhea, constipation, fever; this family has the most variants. cp = chest pain, but let the model learn it.
#w/t/n = weakness, tingling, numbness. Ignored.
# Disabled experiment, stored in a string literal so that it is never executed.
# The snippet builds every permutation of the letters in "nvdcf" and collects the
# distinct sub-selections of 2 to len(acrs)-1 letters taken from those permutations.
_ = """
import itertools
acrs = "nvdcf"
all_acrs = ["".join(x) for x in itertools.permutations(acrs)]
all_acrs = list(sorted(set(["".join(x) for combo in all_acrs for i in range(2, len(acrs)) for x in itertools.combinations(combo, i)])))
"""
#TODO: Revisit.

In [None]:
from nltk.corpus import words, wordnet, brown, gutenberg
from nltk.stem import WordNetLemmatizer
# Reference vocabulary: the union of the word lists of the four NLTK corpora imported above.
manywords = set().union(words.words(), wordnet.words(), brown.words(), gutenberg.words())
# Terms are lemmatized with this before they are looked up in `manywords`.
lemmatizer = WordNetLemmatizer()

In [None]:
def words(text):
    """Return every maximal run of the letters a-z in `text`, lower-cased, in order of appearance.

    Single-letter results are kept on purpose: they are the pieces of acronyms.

    Note: this definition rebinds the name `words`, which an earlier cell
    imported from `nltk.corpus`.
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'[a-z]+', lowered)]

# Frequency of every token across all patient-note histories.
all_notes_text = "\n".join(pnotes["pn_history"].tolist())
WORDS = Counter(words(all_notes_text))

# One row per distinct token: the token itself ("term") and its count ("ct").
terms = pd.Series(WORDS).rename_axis("term").reset_index(name="ct")


def _is_english(term):
    """True when the lemmatized form of `term` is in the reference vocabulary."""
    return lemmatizer.lemmatize(term) in manywords


terms["is_eng_word"] = terms["term"].progress_apply(_is_english)

# Pseudoname parts are treated as valid words so they are never "corrected".
is_pseudoname = terms["term"].isin(pseudonames)
terms.loc[is_pseudoname, "is_eng_word"] = True

In [None]:
import json
# Medical lexicon: the "med_terms" entry of the JSON file, held as a set for membership tests.
with open("../input/med-terms-from-uml/med_terms_unique.json") as f:
    med_lex = set(json.load(f)["med_terms"])

terms["is_med_word"] = terms["term"].progress_apply(lambda term: term in med_lex)

# "adoral" is un-flagged in both dictionaries so that it is sent through spelling correction.
is_adoral = terms["term"] == "adoral"
terms.loc[is_adoral, ["is_med_word", "is_eng_word"]] = False

In [None]:
#terms[(terms["term"].str.fullmatch("a+d+[aeiou]*r+[aeiou]*l+"))|(terms["term"].isin(["etoh", "episdoe"]))]

In [None]:
# How many distinct terms are recognised by at least one of the two dictionaries?
recognised = terms["is_med_word"] | terms["is_eng_word"]
recognised.value_counts()

Fix everything at the word level; this makes it easier to map corrections back to the original indices.

In [None]:
# Hand-picked clinical shorthand that is accepted as correctly spelled.
# Filtered from terms[~(terms["is_med_word"])&~(terms["is_eng_word"])&(terms["ct"]>=50)].sort_values("ct")["term"].tolist()
med_terms = [
    "medhx", "obhx", "sexhx", "socialhx", "sigecaps", "surghx", "aggrav", "allev",
    'fmhx', 'ocps', 'pshx', 'nkda',
    "nvdc",  # nvdc = nausea, vomiting, diarrhea, constipation
    "dyschezia", "holocranial", "blurring", "yom", "yof",
]
# This single assignment creates the flag column, so no separate False-initialisation is needed.
terms["is_word_manual"] = terms["term"].isin(med_terms)

In [None]:
# Drop single-character terms (we can't make anything meaningful of them) and renumber the rows.
has_other_length = terms["term"].map(len) != 1
terms = terms.loc[has_other_length].copy().reset_index(drop=True)

In [None]:
# A term counts as correctly spelled when any of the three flags is set.
is_correct_spelling = terms[["is_eng_word", "is_med_word", "is_word_manual"]].any(axis=1)

# Accepted vocabulary: the term list, and a term -> count mapping that replaces the raw
# token counter as the dictionary used by the spelling corrector.
accepted = terms.loc[is_correct_spelling, ["term", "ct"]]
clinical_corpus = accepted["term"].tolist()
WORDS = accepted.set_index("term").to_dict()["ct"]

In [None]:
import re
from collections import Counter

def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word` in WORDS.

    `N` defaults to the total of all counts in WORDS, evaluated once when this
    function is defined. A word that is not a key of WORDS has probability 0.
    """
    if word not in WORDS:
        return 0.
    return WORDS[word] / N

def correction(word):
    """Return the most probable spelling correction for `word`.

    Candidates come from `candidates(word)` and are ranked by `P`.

    The candidates are sorted before ranking. `candidates` usually returns a
    set, and the iteration order of a set of strings varies between
    interpreter runs; `max` keeps the first of several equally probable
    items, so without sorting a tie could be resolved differently on every
    run. With sorting, ties always go to the alphabetically smallest
    candidate, which makes the corrections reproducible.
    """
    return max(sorted(candidates(word)), key=P)

def candidates(word):
    """Return the possible spelling corrections for `word`.

    Tiers are tried in order and the first non-empty one wins: the word itself
    if known, then known words one edit away, then known words two edits away.
    When nothing is known, the word is returned unchanged as a one-element list.
    """
    tiers = (
        lambda: [word],
        lambda: edits1(word),
        lambda: edits2(word),
    )
    for generate in tiers:
        # Each tier is only generated if every earlier tier came up empty.
        found = known(generate())
        if found:
            return found
    return [word]

def known(words):
    """Return the set of items of `words` that are present in WORDS."""
    recognised = set()
    for candidate in words:
        if candidate in WORDS:
            recognised.add(candidate)
    return recognised

def edits1(word):
    """Return the set of strings exactly one edit away from `word`.

    An edit is a deletion of one character, a swap of two adjacent characters,
    a replacement of one character by a letter a-z, or an insertion of a
    letter a-z at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion is possible at every cut point, including the very end.
        results.update(head + ch + tail for ch in alphabet)
        if not tail:
            continue
        # Delete or replace the first character after the cut.
        results.add(head + tail[1:])
        results.update(head + ch + tail[1:] for ch in alphabet)
        if len(tail) > 1:
            # Swap the two characters that follow the cut.
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

def edits2(word):
    """Lazily yield the strings reachable by applying `edits1` twice to `word`.

    The result is a generator and may contain duplicates.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))


In [None]:
# Start from the term itself: accepted terms keep their own spelling ...
needs_fix = ~is_correct_spelling
terms["corrected_spelling"] = terms["term"]
# ... and every other term is replaced by its most probable correction.
terms.loc[needs_fix, "corrected_spelling"] = terms.loc[needs_fix, "term"].progress_apply(correction)

In [None]:
# Spot-check how a handful of terms were classified and corrected.
spot_check_terms = ["yof", "yom", "etoh", "caffeine", "episdoe", "menses", "cramp", "tampon", "metrorrhagias"]
terms.loc[terms["term"].isin(spot_check_terms)]

In [None]:
# Persist the per-term flags and corrections; the positional index is not written.
CORRECTED_SPELLINGS_CSV = "corrected_spellings.csv"
terms.to_csv(CORRECTED_SPELLINGS_CSV, index=False)

In [None]:
# Inspect a sample of the terms whose spelling was changed.
changed = terms[terms["corrected_spelling"]!=terms["term"]]
# `sample` raises when asked for more rows than exist, so the size is capped at the number
# available; the fixed seed keeps the displayed rows the same on every rerun.
# Row-count check left by the author: changed.shape[0]+terms[is_correct_spelling].shape[0], terms.shape
changed.sample(min(20, len(changed)), random_state=0)