In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Log the pandas version in use, then relax the display limits so wide and
# long EDA tables are rendered without truncation.
print(pd.__version__)
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_colwidth", 200)
pd.set_option("display.min_rows", 200)
pd.set_option("display.max_rows", 200)

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import ast
import os
from difflib import get_close_matches


# Read data

In [None]:
# Load the four competition CSVs from a single, configurable input directory.
DATA_DIR = "/kaggle/input/nbme-score-clinical-patient-notes"

train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
notes = pd.read_csv(os.path.join(DATA_DIR, "patient_notes.csv"))
features = pd.read_csv(os.path.join(DATA_DIR, "features.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

print(len(train), len(notes))

# Attach the note text to every training row. `validate` makes pandas raise a
# MergeError if the right-hand keys are not unique, so a join can never
# silently multiply training rows.
# NOTE(review): assumes one note per (case_num, pn_num) and one row per
# feature_num in features.csv -- confirm against the data.
train_merged = train.merge(
    notes,
    on=["case_num", "pn_num"],
    how="inner",
    validate="many_to_one",
)

# Attach the human-readable feature description; the left join keeps every
# training row even if a feature_num has no entry in `features`.
train_merged = train_merged.merge(
    features[["feature_num", "feature_text"]],
    on=["feature_num"],
    how="left",
    validate="many_to_one",
)
print(len(train_merged))
train_merged.head(3)

# Key statistics

In [None]:
# Headline counts for the merged training frame. A `location` equal to the
# literal string '[]' marks a row with no annotated span.
n_rows_without_span = int(train_merged["location"].eq("[]").sum())
n_rows_with_span = len(train_merged) - n_rows_without_span

print(f"Unique case num: {train_merged['case_num'].nunique()}")
print(f"Unique pn num: {train_merged['pn_num'].nunique()}")
print(f"Unique feature num: {train_merged['feature_num'].nunique()}")
print(f"Cases with annotation: {n_rows_with_span}")
print(f"Cases without annotation: {n_rows_without_span}")


# Do notes with a missing annotation still mention the feature text?

In [None]:
def _feature_terms_in_note(row):
    """Return the alternatives of `feature_text` that occur verbatim in the note.

    The feature text is lower-cased, hyphens become spaces, and the result is
    split on " or " into alternatives; an alternative is kept when it is a
    substring of the lower-cased `pn_history`.
    """
    note_text = row["pn_history"].lower()
    alternatives = row["feature_text"].lower().replace("-", " ").split(" or ")
    return [term for term in alternatives if term in note_text]


# Adds the column to `train_merged` in place.
train_merged['find_f_txt_pn'] = train_merged.apply(_feature_terms_in_note, axis=1)

# Among rows WITHOUT an annotation, count how often each feature term still
# appears in the note text (rows with no hit drop out of the groupby).
(
    train_merged.loc[train_merged["location"] == '[]', ['feature_text', 'find_f_txt_pn']]
    .explode('find_f_txt_pn')
    .groupby(['feature_text', 'find_f_txt_pn'])
    .size()
    .reset_index(name='cnt')
    .sort_values(['cnt'], ascending=False)
)

# Number of cases for each feature_text in train data

- Male / Female and age-related features have high counts (> 200)
- Every other feature_text has a count of 100

In [None]:
# Horizontal bar chart of how many training rows each feature_text has,
# smallest count at the bottom of the sort order.
feature_text_counts = (
    train_merged.groupby('feature_text')['id']
    .count()
    .reset_index(name='count')
    .sort_values('count', ascending=True)
)
feature_text_counts.plot(
    kind='barh',
    figsize=(12, 24),
    x='feature_text',
    y='count',
    title='count of feature_text',
)

# Pct of missing annotation across feature_text

- More than 90% of rows have no annotation for certain feature_text values, e.g. Stress

In [None]:
# Percentage of rows per feature_text whose location is '[]' (no annotated
# span), drawn as a horizontal bar chart sorted by that percentage.
missing_annotation_pct = (
    train_merged.assign(no_ann=train_merged['location'] == '[]')
    .groupby('feature_text')['no_ann']
    .agg(count='count', no_ann_pct=lambda flags: 100 * np.mean(flags))
    .reset_index()
    .sort_values('no_ann_pct', ascending=True)
)
missing_annotation_pct.plot(
    kind='barh',
    figsize=(12, 24),
    x='feature_text',
    y='no_ann_pct',
    title='% missing annotations across feature_text',
    color='red',
)

# Common annotations for each feature text 

- Age-related and gender features look quite predictable: their annotations closely resemble the feature text

In [None]:
# Word cloud of annotations for each feature type.
#
# Step 1: build one lower-cased text blob per feature_text by parsing the
# stringified annotation lists, exploding them to one annotation per row and
# joining them with spaces. Rows with no annotation ('[]') contribute an
# empty string so that every feature_text stays in the result.
f_text_ann = (
    train_merged[['feature_text', 'annotation']]
    .assign(annotation=lambda df: [[''] if raw == '[]' else ast.literal_eval(raw)
                                   for raw in df['annotation']])
    .explode('annotation')
    .assign(annotation=lambda df: df['annotation'].str.lower())
    .groupby('feature_text')['annotation']
    .agg(lambda words: " ".join(words.astype('str')))
    .reset_index()
)

# Step 2: render one cloud per feature_text.
for row in f_text_ann.itertuples(index=False):
    print("===" * 30)
    print(row.feature_text.upper())

    # WordCloud.generate raises ValueError when it is given no words, so a
    # feature whose rows are all unannotated is reported instead of aborting
    # the whole cell.
    if not row.annotation.strip():
        print("(no annotation text available)")
        continue

    wordcloud = WordCloud(max_font_size=50,
                          max_words=30,
                          background_color="white").generate(row.annotation)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(wordcloud)
    ax.axis("off")
    plt.show()
    # Release the figure explicitly: one figure is created per feature_text.
    plt.close(fig)


# Top 5 annotations for each feature_text

In [None]:
# For each feature_text, list its most frequent lower-cased annotation strings
# and their counts. Ranking uses method='max', so tied counts share the worst
# rank of the tie; only ranks below 6 are kept.
(
    train_merged[['feature_text', 'annotation']]
    .query("annotation != '[]'")
    # '[]' rows are already filtered out, so every value parses to a non-empty list.
    .assign(annotation=lambda df: [ast.literal_eval(raw) for raw in df['annotation']])
    .explode('annotation')
    .assign(annotation=lambda df: df['annotation'].str.lower())
    .groupby(['feature_text', 'annotation'])
    .size()
    .reset_index(name='cnt')
    .assign(rnk=lambda df: df.groupby('feature_text')['cnt']
                             .rank(method='max', ascending=False))
    .sort_values(['feature_text', 'cnt'], ascending=[True, False])
    .query("rnk<6")
    .groupby("feature_text")
    .agg({"annotation": list, "cnt": list})
    .reset_index()
    .set_axis(['feature_text', 'top 5 annotation', 'top 5 annotation cnt'],
              axis='columns')
)


In [None]:
# Annotation length (in words) per feature_text: number of annotations, mean
# and median word count, longest median first.
(
    train_merged[['feature_text', 'annotation']]
    .query("annotation != '[]'")
    # '[]' rows are already filtered out, so every value parses to a non-empty list.
    .assign(annotation=lambda df: [ast.literal_eval(raw) for raw in df['annotation']])
    .explode('annotation')
    # Split on any run of whitespace (same tokenisation as the word-level
    # analysis further down); splitting on a single " " counts empty tokens
    # for repeated spaces and treats tabs/newlines as part of a word.
    .assign(annotation_len=lambda df: [len(span.split()) for span in df['annotation']])
    .groupby('feature_text')['annotation_len']
    .agg(cnt='count', mean_len_wrds='mean', med_len_wrds='median')
    .reset_index()
    .sort_values('med_len_wrds', ascending=False)
)


# Top 5 annotation words for each feature_text

In [None]:
# Word-level frequency table: one row per (feature_text, lower-cased word)
# with its count and its within-feature rank (method='max', descending).
train_merged_split_words = (
    train_merged[['feature_text', 'annotation']]
    .query("annotation != '[]'")
    # '[]' rows are already filtered out, so every value parses to a non-empty list.
    .assign(annotation=lambda df: [ast.literal_eval(raw) for raw in df['annotation']])
    .explode('annotation')
    # Break each annotation into whitespace-separated, lower-cased words.
    .assign(annotation=lambda df: df['annotation'].map(lambda span: str(span).lower().split()))
    .explode('annotation')
    .groupby(['feature_text', 'annotation'])
    .size()
    .reset_index(name='cnt')
    .assign(rnk=lambda df: df.groupby('feature_text')['cnt']
                             .rank(method='max', ascending=False))
    .sort_values(['feature_text', 'cnt'], ascending=[True, False])
)

# Collapse the best-ranked words (rank below 6) into one list per feature_text.
(
    train_merged_split_words
    .query("rnk<6")
    .groupby("feature_text")
    .agg({"annotation": list, "cnt": list})
    .reset_index()
    .set_axis(['feature_text', 'top 5 annotation', 'top 5 annotation cnt'],
              axis='columns')
)


# Similar words (probable spelling mistakes) across feature_text annotations

- <b>2-to-3-beers-a-week</b> has similar words like <u>[occasional, oocasional, occasioanl, occaional]</u>
- <b>Adderall-use</b> has similar words like <u>[aderral, adderal, aderrall, adderral]</u>
- <b>burning-OR-gnawing-OR-burning-and-gnawing</b> has similar words like <u>[nawing, ngawing, gnawing, knawing]</u>
- <b>Recent-upper-respiratory-symptoms</b> has similar words like <u>[rhinorrhea, rhinnorhea, rhinorrhea,, rhinorhea]</u>

In [None]:
# Function to extract similar words and group them
def get_close_matches_(lst, n=3, cutoff=0.75):
    """Greedily group words that look alike (probable spelling variants).

    The first remaining word is taken as the reference and compared with the
    rest using ``difflib.get_close_matches``. If anything matches, the matches
    plus the reference form one group and the matches are removed from the
    pool; the reference itself is always removed. This repeats until fewer
    than two words remain.

    Parameters
    ----------
    lst : sequence of str
        Candidate words. The input is not modified.
    n : int, default 3
        Maximum number of matches collected per reference word (forwarded to
        ``difflib.get_close_matches``; 3 is difflib's own default).
    cutoff : float, default 0.75
        Minimum similarity ratio in [0, 1] for two words to count as similar.

    Returns
    -------
    list of list of str
        One list per group, laid out as ``matches + [reference_word]`` with
        the matches ordered best first. Words without a close match are
        omitted.
    """
    remaining = list(lst)
    similar_lst = []
    while len(remaining) > 1:
        ref_word, remaining = remaining[0], remaining[1:]
        matches = get_close_matches(ref_word, remaining, n=n, cutoff=cutoff)
        if matches:
            similar_lst.append(matches + [ref_word])
            matched = set(matches)
            # Drop the matched words (and duplicates) while KEEPING the input
            # order. A plain set difference reorders the pool by string hash,
            # which changes which word becomes the next reference and makes
            # the grouping differ from one kernel run to the next.
            remaining = [word for word in dict.fromkeys(remaining)
                         if word not in matched]
    return similar_lst

#get_close_matches_(['ape', 'apple', 'peach', 'puppy','appl'])

In [None]:
# For every feature_text, collect its annotation words, group look-alike words
# with get_close_matches_, and show one group per row, largest groups first.
(
    train_merged_split_words
    .groupby("feature_text")['annotation']
    .agg(list)
    .reset_index(name='annotation words')
    .assign(similar_wrds=lambda df: [get_close_matches_(words) for words in df['annotation words']])
    [['feature_text', 'similar_wrds']]
    .explode('similar_wrds')
    # Features without any group explode to NaN; give those a length of 0.
    .assign(len_=lambda df: [len(group) if isinstance(group, list) else 0
                             for group in df['similar_wrds']])
    .sort_values('len_', ascending=False)
    [['feature_text', 'similar_wrds']]
)
