This notebook mainly lists the different abbreviations used in the patient notes, with examples, and then shows the word counts for each case number.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from random import randint
from nltk.corpus import stopwords
import string, re
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

In [None]:
# Read the patient notes CSV into a DataFrame and preview its first rows.
NOTES_CSV = '../input/nbme-score-clinical-patient-notes/patient_notes.csv'
df = pd.read_csv(NOTES_CSV)
df.head()

We can see that the patient_notes csv file doesn't contain any nulls and contains 42k notes.

This should be good to explore the data a bit.

In [None]:
# Summarise the frame: row count plus each column's dtype and non-null count.
df.info()

# Simple Histogram for case numbers

From the histogram, we can see that case number 1 doesn't have a lot of examples

In [None]:
# Histogram of how many notes belong to each case number, with the exact
# count printed above every bar.
df_label = df['case_num']

# NOTE(review): assumes case numbers are consecutive integers starting at 0
# (the rest of the notebook indexes a 10-element list with them) - confirm.
n_cases = int(df_label.max()) + 1

# One unit-wide bin centred on each integer case number. The default of 10
# equal-width bins over [0, 9] yields 0.9-wide bins, so bars drift away from
# the integer ticks even though the counts happen to be right.
bin_edges = np.arange(n_cases + 1) - 0.5
arr = plt.hist(df_label, bins=bin_edges)  # arr = (counts, edges, patches)

plt.xticks(range(n_cases))
plt.xlabel('case_num')
plt.ylabel('number of notes')

# Label each bar with its count, centred on the bar and sitting just above it.
for case, count in enumerate(arr[0]):
    plt.text(case, count, str(int(count)), ha='center', va='bottom')

# Random notes and examples on abbreviations

Here, I'll show some sentences so you can see the note structure and see some of the abbreviations in context

**N.B.** Some abbreviations might mean different things.

sh could mean either social history or surgical history.

ED could be emergency department, erectile dysfunction or eating disorder.

So a direct table substitution might not perform as well as you think; a better approach is to predict which expansion applies from something like a context window.

In [None]:
# Lookup table of unambiguous clinical shorthand -> plain-English expansion.
# Keys are written in lower case.
abbreviations = dict([
    # history / review headings
    ("ros", "review of systems"),
    ("fh", "family history"),
    ("fhx", "family history"),
    ("pmh", "past medical history"),
    # allergies and medication
    ("nka", "no known allergies"),
    ("nkda", "no known drug allergies"),
    ("rx", "medical prescription"),  # "rx" is actually short for "recipe"
    # miscellaneous findings and phrases
    ("lmp", "last menstrual period"),
    ("etoh", "alcohol"),
    ("sob", "shortness of breath"),
    ("c/o", "complains of"),
    ("alls", "allergies"),
    ("hpi", "history of present illness"),
    ("f/u", "follow up"),
    ("htn", "hypertension"),
    # abdominal quadrants
    ("rlq", "right lower quadrant"),
    ("llq", "left lower quadrant"),
    ("ruq", "right upper quadrant"),
    ("luq", "left upper quadrant"),
])

# Left out of the table because one key would need several expansions:
#   "psh" -> past surgical history / past social history
#   "sh"  -> social history / surgical history / sexual history

In [None]:
# Draw a fixed-size random subset of the note texts to browse for examples.
len_sample = 300
SAMPLE_SEED = 42  # any fixed integer works; it only has to stay constant
pn_notes = df['pn_history']
# Seeded so that the fixed positions printed further down (sample.iloc[12],
# sample.iloc[14], sample.iloc[99]) show the same notes on every run;
# without a seed each kernel restart would pick different notes.
sample = pn_notes.sample(len_sample, random_state=SAMPLE_SEED)

### Sentence 1

In [None]:
# First example: the note at position 12 of the random sample.
print(sample.iloc[12])

****
### Sentence 1 Notes

You can see the use of MHx -> medical history, SH -> social history, ETOH -> alcohol, SHx -> surgical history

****

In [None]:
# Second example: the note at position 14 of the random sample.
print(sample.iloc[14])

****
### Sentence 2 Notes

c/o -> complains of , HPI -> history of present illness

nkda -> no known drug allergies, ROS -> review of systems
****

In [None]:
# Third example: the note at position 99 of the random sample.
print(sample.iloc[99])

****
### Sentence 3 Notes

f/u -> follow up, sob -> shortness of breath

****

# Word count for each case number

In [None]:
# Per-case word frequencies: word_count[k] maps a lower-cased,
# whitespace-delimited token to the number of times it occurs in the notes
# whose case_num is k.
word_count = [dict() for _ in range(10)]  # 10 empty dicts, one per case number

# we are going to exclude stop words and punctuations
# (plus a few time-related filler words)
exluded_words = [*stopwords.words('english'), *[x for x in string.punctuation], 'ago', 'per', 'day', 'days', 'month' , 'months']
# Frozen-set copy for O(1) membership tests inside the hot loop; the list
# above is kept unchanged under its original name.
excluded_lookup = frozenset(exluded_words)

# patterns
# Tokens that begin with digits joined by '-' or '/' (e.g. "2-3", "1/2").
# Raw string: '\d' inside a normal string literal is an invalid escape
# sequence and triggers a warning on newer Python versions.
pattern = re.compile(r'\d{1,}[-/]\d{1,}')

# total= lets tqdm draw a real progress bar; zip() has no length of its own.
for case_num, sentence in tqdm(zip(df['case_num'], df['pn_history']), total=len(df)):
    case_counts = word_count[int(case_num)]
    for word in sentence.lower().split():
        # Skip stop words / bare punctuation, pure numbers and numeric ranges.
        if word in excluded_lookup or word.isdigit() or pattern.match(word):
            continue
        case_counts[word] = case_counts.get(word, 0) + 1


In [None]:
# One horizontal bar chart per case number showing its most frequent words,
# with the exact count written next to each bar.
TOP_N_WORDS = 15
n_cases = len(word_count)  # one subplot per entry instead of a hard-coded 10
fig, axes = plt.subplots(nrows=n_cases, ncols=1, figsize=(15, 6 * n_cases))

for case, ax in enumerate(axes):
    # Most frequent words first, then reversed because barh puts the first
    # item at the bottom and the most frequent word should end up on top.
    wc = sorted(word_count[case].items(), key=lambda x : -x[1])[:TOP_N_WORDS]
    wc = wc[::-1]
    counts = [x[1] for x in wc]
    words = [x[0] for x in wc]
    ax.barh(words, counts)
    ax.set_title(f'case number {case} word count')
    ax.tick_params(axis='both', which='major', labelsize=15)
    ax.tick_params(axis='both', which='minor', labelsize=12)
    # Distinct loop variable: the original reused `i` here, shadowing the
    # case index of the outer loop.
    for row, v in enumerate(counts):
        ax.text(v + 1, row - .1, str(v), color='red', fontweight='bold')
    