In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install textstat

import numpy as np
import pandas as pd 
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import textstat
import spacy
# Load the `en_core_web_sm` spaCy pipeline once at import time; patient_data()
# calls this `nlp` object to build the documents it renders with displaCy.
nlp = spacy.load('en_core_web_sm')

from termcolor import colored
from wordcloud import WordCloud,STOPWORDS
from spacy import displacy
from nltk.tokenize import sent_tokenize, word_tokenize 

import warnings
# Ignore every warning raised from here on (presumably to keep cell output
# uncluttered). NOTE(review): this also hides warnings that would flag real
# data problems, so consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Reference tables are read first because they are joined onto the training labels below.
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
pn = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')
test = pd.read_csv('../input/nbme-score-clinical-patient-notes/test.csv')
ss = pd.read_csv('../input/nbme-score-clinical-patient-notes/sample_submission.csv')

# Enrich each training row with its feature row and its patient-note row.
# validate='m:1' makes pandas raise if the right-hand side has duplicate keys.
# The second merge passes no `on`, so pandas joins on the columns both frames share.
train = (
    pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
    .merge(features, on=['case_num', 'feature_num'], validate='m:1')
    .merge(pn, validate='m:1')
)

In [None]:
# Preview the first rows of the features table.
features.head()

In [None]:
# Number of distinct values in each column of the patient-notes table.
pn.nunique()

In [None]:
def patient_data(pn_num):
    """Print and render an overview of one patient note.

    Output, in order:
      1. the note text with every annotated character span highlighted,
      2. spaCy's own named-entity rendering of the note,
      3. a dependency render of the note's longest sentence (by token count),
      4. the text of every feature listed for this note in ``train``.

    Reads the notebook-level ``train``, ``pn`` and ``features`` frames and the
    ``nlp`` spaCy pipeline.

    Args:
        pn_num: value of the ``pn_num`` column identifying the note.

    Raises:
        ValueError: if ``pn`` does not hold exactly one row for ``pn_num``
            (raised by ``Series.item()``), or if a location entry is not made
            of "start end" integer pairs.
    """
    import ast  # local import so this cell does not depend on an edit to the import cell

    subset = train[train['pn_num'] == pn_num]
    features_lst = subset['feature_num'].tolist()
    # The column stores list literals as text; literal_eval parses them
    # without executing arbitrary code the way eval would.
    locations = subset['location'].apply(ast.literal_eval)

    print("*"*80)
    print(colored("Patient Number: " + str(pn_num), 'green'))
    patient_history = pn[pn['pn_num']==pn_num]['pn_history'].item()

    print(colored("\nAnnotated Patient History", 'green'))

    # Collect each span exactly once. The set drops identical spans, which
    # displaCy's manual mode would otherwise render as repeated text.
    spans = set()
    for location in locations:
        for loc in location:
            # NOTE(review): a location entry may presumably hold several
            # ';'-separated "start end" segments; splitting on ';' is a no-op
            # for entries that hold a single segment.
            for segment in loc.split(';'):
                start, end = segment.split()
                spans.add((int(start), int(end)))
    ents = [
        {'start': start, 'end': end, 'label': "Annotation"}
        for start, end in sorted(spans)
    ]

    doc = {
        'text' : patient_history,
        'ents' : ents
    }
    colors = {"Annotation" :"linear-gradient(to right, #2980b9, #6dd5fa, #ffffff);" }
    options = {"colors": colors}
    displacy.render(doc, style='ent', options=options, manual=True, jupyter=True)

    print(colored("\nVisualizing NER", 'green'))
    displacy.render(nlp(patient_history), style='ent', jupyter=True)

    print(colored("\nVisualizing POS tagging", 'green'))
    sentences = sent_tokenize(patient_history)
    if sentences:  # max() raises on an empty sequence, e.g. for an empty note
        pos_text = max(sentences, key=lambda sentence: len(word_tokenize(sentence)))
        displacy.render(nlp(pos_text), style="dep", jupyter=True)

    print(colored("\nFeatures", 'green'))
    for feature_num in features_lst:
        # Select by value and take the first match positionally; indexing the
        # result by label only works when the row label equals feature_num.
        matches = features.loc[features['feature_num'] == feature_num, 'feature_text']
        print(colored(matches.iloc[0], 'blue'))
        
# Show the overview for three example notes.
for example_pn_num in (16, 46, 100):
    patient_data(example_pn_num)

In [None]:
def avg_word_len(df):
    """Mean length, in characters, of the whitespace-separated words of each text.

    Args:
        df: pandas Series of strings (the name is kept for backward
            compatibility; it is a Series, not a DataFrame).

    Returns:
        Series on the same index as the input holding the mean word length,
        or NaN where the entry is missing or contains no words.
    """
    def _mean_word_length(words):
        try:
            lengths = [len(word) for word in words]
        except TypeError:
            # str.split() leaves missing entries as a non-iterable scalar (None/NaN).
            return np.nan
        # np.mean([]) would emit a RuntimeWarning before returning NaN.
        return np.mean(lengths) if lengths else np.nan

    return df.str.split().map(_mean_word_length)

In [None]:
def plot_distribution(text_props, num_sub):
    """Draw KDE curves of the text statistics side by side in one figure.

    Args:
        text_props: frame with ``text_len`` and ``avg_text`` columns, plus
            ``lexicon_count`` when a third panel is drawn.
        num_sub: number of subplot columns to create. Any value other than 2
            adds a third panel for ``lexicon_count``.
    """
    # (column, line colour, panel title) for each panel, left to right.
    panels = [
        ("text_len", "#7209B7", "Character count distribution"),
        ("avg_text", "#FFBA08", "Average word length distribution"),
    ]
    if num_sub != 2:
        panels.append(("lexicon_count", "#F72585", "Word count distribution"))

    fig, ax = plt.subplots(1, num_sub, figsize=(20, 10))
    for idx, (column, colour, title) in enumerate(panels):
        sns.kdeplot(data=text_props, x=column, color=colour, ax=ax[idx])
        ax[idx].set_title(title, font="Serif")

    plt.tight_layout()
    fig.subplots_adjust(wspace=0.2, hspace=0.2, top=0.93)
    plt.show()

In [None]:
def text_properties(df, col, num_sub):
    """Compute simple size statistics for a text column and plot their distributions.

    Works on a copy of ``df`` and adds three columns to it: ``text_len``
    (character count), ``lexicon_count`` (word count from
    ``textstat.lexicon_count``) and ``avg_text`` (mean word length). It then
    prints the column name and hands the copy to ``plot_distribution``.

    Args:
        df: frame holding the text column; it is not modified.
        col: name of the text column to analyse.
        num_sub: number of panels, forwarded to ``plot_distribution``.
    """
    text_props = df.copy()
    text_props['text_len'] = df[col].str.len()
    # Series.map keeps the frame's index, so the statistics stay aligned for
    # any index. A positional `df[col][i]` loop assumes labels 0..n-1.
    text_props['lexicon_count'] = df[col].map(textstat.lexicon_count)
    text_props['avg_text'] = avg_word_len(df[col])

    print(colored(col, 'green'))
    plot_distribution(text_props, num_sub)
    
# (frame, text column, number of panels) for every text field that is profiled.
for frame, text_col, n_panels in (
    (train, 'annotation', 3),
    (features, 'feature_text', 2),
    (pn, 'pn_history', 3),
):
    text_properties(frame, text_col, n_panels)

In [None]:
# Flag every row whose annotation is anything other than the empty-list literal "[]".
train['correct'] = train['annotation'].ne("[]")

# Histogram of the per-note mean of that flag.
correct_rate_by_note = train.groupby('pn_num')['correct'].mean().sort_values()
correct_rate_by_note.plot(
    kind='hist',
    bins=25,
    figsize=(12, 5),
    title='% of Features Correctly Noted by Doctor',
)
plt.show()

In [None]:
# Mean of the `correct` flag per note, keeping the case number so notes can be grouped by it.
test_taker_results = (
    train.groupby(['pn_num', 'case_num'])['correct']
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=test_taker_results, x='case_num', y='correct')
ax.set(
    title='% of Features Captured by Case Number',
    xlabel='Case Number',
    ylabel='% of Features Captured',
)
plt.show()

In [None]:
# Histogram of the per-feature mean of the `correct` flag.
# Bind the Axes that .plot() returns so the label lands on this figure,
# instead of on whatever `ax` an earlier cell happened to leave behind.
ax = train.groupby('feature_num')['correct'].mean() \
    .plot(kind='hist', bins=50, color='#00BFC4', figsize=(12, 5),
          title='% of Correct Annotation for Features', edgecolor='black')
ax.set_xlabel('% of Correct Annotations')
plt.show()

In [None]:
# Only the last expression of a cell is rendered automatically, so the ranking
# has to be displayed explicitly or its result is silently discarded.
display(train.groupby('feature_num')['correct'].mean().sort_values())
train.query('feature_num == 807').head()

In [None]:
# Rows for feature 807 whose note text contains "hallucinations" (case-insensitive).
feature_807 = train.query('feature_num == 807')
mentions_hallucinations = feature_807['pn_history'].str.lower().str.contains('hallucinations')
feature_807.loc[mentions_hallucinations]

In [None]:
# First feature-807 note that mentions both "hallucinations" and "ambien" (case-insensitive).
feature_807 = train.query('feature_num == 807')
history_lower = feature_807['pn_history'].str.lower()
mentions_both = (
    history_lower.str.contains('hallucinations')
    & history_lower.str.contains('ambien')
)
print(feature_807.loc[mentions_both, 'pn_history'].values[0])

In [None]:
# Note texts for feature 209 that contain "stress" (case-insensitive).
feature_209 = train.query('feature_num == 209')
mentions_stress = feature_209['pn_history'].str.lower().str.contains('stress')
print(feature_209.loc[mentions_stress, 'pn_history'])

In [None]:
# Full text of note 21054 as it appears among the feature-209 rows.
note_21054 = (
    train.query('feature_num == 209')
    .sort_values('correct')
    .query('pn_num == 21054')
)
print(note_21054['pn_history'].values[0])