In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
!pip install stylecloud

 <h1 align='center'>Dataset Info</h1>



## Training data
**patient_notes.csv - A collection of about 40,000 Patient Note history portions.**

* ```pn_num``` - A unique identifier for each patient note.
* ```case_num``` - A unique identifier for the clinical case a patient note represents.
* ```pn_history``` - The text of the encounter as recorded by the test taker.

**features.csv - The rubric of features (or key concepts) for each clinical case.**

* ```feature_num``` - A unique identifier for each feature.
* ```case_num``` - A unique identifier for each case.
* ```feature_text``` - A description of the feature.

**train.csv - Feature annotations for 1000 of the patient notes, 100 for each of ten cases.**

* ```id``` - Unique identifier for each patient note / feature pair.
* ```pn_num``` - The patient note annotated in this row.
* ```feature_num``` - The feature annotated in this row.
* ```case_num``` - The case to which this patient note belongs.
* ```annotation``` - The text(s) within a patient note indicating a feature. A feature may be indicated multiple times within a single note.
* ```location``` - Character spans indicating the location of each annotation within the note. Multiple spans may be needed to represent an annotation, in which case the spans are delimited by a semicolon ;.


<h1 align='center'>TABLE OF CONTENTS</h1>

* Import libraries
* Reading the data
* Explore
    * Train data
    * Features data
    * Patient Notes data

## IMPORT LIBRARIES

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
import plotly.express as px
import plotly.figure_factory as ff
from IPython.display import Image
import sklearn
import stylecloud
import ast
from collections import Counter, defaultdict
import nltk
import spacy
from spacy import displacy

import warnings
warnings.simplefilter('ignore')

<h2>READING THE DATA</h2>

In [None]:
# Load train.csv (the feature annotations) and preview its first rows.
train = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/train.csv')
train.head()

In [None]:
# Summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Load features.csv (the rubric of features per clinical case) and preview its first rows.
feature = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/features.csv')
feature.head()

In [None]:
# Summary of the features frame (columns, dtypes, non-null counts).
feature.info()

In [None]:
# Load patient_notes.csv (the patient note history texts) and preview its first rows.
patient_note = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv')
patient_note.head()

In [None]:
# Summary of the patient-notes frame (columns, dtypes, non-null counts).
patient_note.info()

<h1 align='center'>EXPLORE</h1>

<h2>Train Data</h2>

<h4>We will start by looking at the distribution of case_num.</h4>

In [None]:
# Count of training rows per clinical case.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=train, x='case_num', palette='flare', ax=ax)
ax.set_title('Distribution of Case_Num in Training Data', fontsize=15)
plt.show()

<h4>The next step is to analyse the annotations. First, we will look at the number of annotations per row, and then at the most common words in the annotations.</h4>

In [None]:
# Left panel: number of annotations per row. Right panel: share of rows with none.
fig, ax = plt.subplots(1, 2, figsize=(12, 8))
fig.suptitle('Distribution of Number of Annotations', size=15)

# Each `annotation` cell is parsed as a Python literal and its length is stored.
train['annot_features'] = train['annotation'].map(lambda raw: len(ast.literal_eval(raw)))
sns.countplot(x=train['annot_features'], palette='crest', ax=ax[0])

# Rows without any annotation versus all remaining rows.
no_annotations = len(train[train['annot_features'] == 0])
annotated = len(train) - no_annotations
sizes = [no_annotations, annotated]

print('Number of Rows with no Annotations -', no_annotations)
print('Number of Rows with Annotations -', annotated)

labels = ['No Annotation', 'Annotation']
colors = ['#72CC50', '#54C2CC']
ax[1].pie(
    sizes,
    labels=labels,
    colors=colors,
    startangle=90,
    counterclock=False,
    autopct='%1.0f%%',
    pctdistance=0.7,
    textprops={'fontsize': 12},
)

plt.show()

In [None]:
def annot_list(annotation):
    """Return all whitespace-separated words of an annotation cell.

    `annotation` is the string form of a Python list of texts; it is parsed
    with ast.literal_eval and every text is split on whitespace. The words
    are returned as one flat list, in their original order.
    """
    text = []
    for phrase in ast.literal_eval(annotation):
        text.extend(phrase.split())
    return text


# Tokenise every annotation and count word frequencies over the whole training frame.
train['text'] = train['annotation'].apply(annot_list)
top = Counter(word for words in train['text'] for word in words)

# Tabulate the 25 most frequent words for plotting.
df_temp = pd.DataFrame(top.most_common(25))
df_temp.columns = ['Common_words', 'count']

fig = px.bar(
    df_temp,
    x='count',
    y='Common_words',
    color='Common_words',
    orientation='h',
    title='Most Common Words(including stopwords) in Annotations',
    width=800,
    height=600,
)
fig.show()


In [None]:
def stopwords_remove(annotation, stop_words=None):
    """Tokenise an annotation cell and drop stop words.

    Parameters
    ----------
    annotation : str
        String form of a Python list of annotation texts; parsed with
        ast.literal_eval, then each text is split on whitespace.
    stop_words : collection of str, optional
        Words to discard (matched case-sensitively). When omitted, NLTK's
        English stop-word list is used; it is loaded once and cached on the
        function so it is not rebuilt for every word.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    if stop_words is None:
        # Reading the corpus is expensive, so keep a single frozenset around.
        stop_words = getattr(stopwords_remove, '_english_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(nltk.corpus.stopwords.words('english'))
            stopwords_remove._english_stop_words = stop_words
    text = [
        word
        for words in ast.literal_eval(annotation)
        for word in words.split()
        if word not in stop_words
    ]
    return text

# Re-tokenise the annotations, this time dropping stop words, and recount.
train['text'] = train['annotation'].apply(stopwords_remove)

top = Counter(word for words in train['text'] for word in words)
df_temp = pd.DataFrame(top.most_common(25))
df_temp.columns = ['Common_words', 'count']

# The title now says "excluding": this chart is built from the stop-word-filtered tokens.
fig = px.bar(
    df_temp,
    x='count',
    y='Common_words',
    title='Most Common Words(excluding stopwords) in Annotations',
    orientation='h',
    width=800,
    height=600,
    color='Common_words',
)
fig.show()

# Feature Data

#### Now we will analyse the features data. We will start by looking at the distribution of case_num.

In [None]:
# Count of rubric features per clinical case.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=feature, x='case_num', palette='Purples_r', ax=ax)
ax.set_title('Distribution of Case_Num in Features Data', fontsize=15)
plt.show()

#### Then we will inspect some properties of feature_text: the distribution of the number of words (split on hyphens) and of the average word length.

In [None]:
# feature_text is split on '-' and the number of resulting pieces is used as the word count.
text_len = feature['feature_text'].str.split('-').map(len)

fig = ff.create_distplot([text_len], ['feature'], colors=['#2ca02c'])
fig.update_layout(title_text='Word Count Distribution')
fig.show()

In [None]:
# Mean length of the '-'-separated pieces of each feature_text.
avg_word_len = (
    feature['feature_text']
    .str.split('-')
    .map(lambda words: np.mean([len(word) for word in words]))
)

fig = ff.create_distplot([avg_word_len], ['feature'], colors=['#ffa408'])
fig.update_layout(title_text='Average Word Length Distribution')
fig.show()

# Patient Notes Data

#### Lastly, we will analyse the patient_notes data. We will start by looking at the distribution of case_num.

In [None]:
# Count of patient notes per clinical case.
fig, ax = plt.subplots(figsize=(15, 9))
sns.countplot(data=patient_note, x='case_num', palette='winter', ax=ax)
ax.set_title('Distribution of Case_Num in Patient Notes Data', fontsize=15)
plt.show()

#### Then, similarly, we will inspect some properties of the patient history notes: the distribution of the number of words and of the average word length.

In [None]:
# Two stacked histograms: words per note (top) and mean word length per note (bottom).
fig, ax = plt.subplots(2, 1, figsize=(20, 12))

# Number of whitespace-separated words in each note.
text_len = patient_note['pn_history'].str.split().map(len)
sns.histplot(text_len, element="step", kde=True, color='#2ca02c', ax=ax[0])
ax[0].set_title('Word Count Distribution', size=20)

# Mean character length of the words in each note.
avg_word_len = (
    patient_note['pn_history']
    .str.split()
    .map(lambda words: np.mean([len(word) for word in words]))
)
sns.histplot(avg_word_len, element="step", kde=True, color='#ffa408', ax=ax[1])
ax[1].set_title('Average Word Length Distribution', size=20)

plt.tight_layout()
plt.show()

# Annotations Visualization

In [None]:
# Reference - https://www.kaggle.com/vanguarde/nbme-eda
# Highlight the annotated character spans of one patient note with displaCy.
nlp = spacy.blank('en')
text = patient_note[patient_note.pn_num==224].pn_history.values[0]
doc = nlp.make_doc(text)

# Each `location` cell is the string form of a list of "start end" spans, where one
# entry may hold several spans separated by ';'. Parse it as a literal instead of
# stripping brackets, so multi-entry and multi-span cells do not break the unpacking
# below. Empty lists ('[]') simply contribute no spans.
loc = [
    span.strip()
    for raw in train.loc[train.pn_num==224, 'location']
    for entry in ast.literal_eval(raw)
    for span in entry.split(';')
]

ents = []
for l in loc:
    start, end = l.split()
    # 'expand' snaps offsets that fall inside a token to the token boundaries;
    # char_span may still return None, which doc.ents would not accept.
    ent = doc.char_span(int(start), int(end), label='annotation', alignment_mode='expand')
    if ent is not None:
        ents.append(ent)

# doc.ents does not accept overlapping spans, so drop overlaps before assigning.
doc.ents = spacy.util.filter_spans(ents)

# displaCy looks colours up by the upper-cased label, so the key must be upper case.
color = {"ANNOTATION": '#A32EFF'}
displacy.render(doc, style="ent", jupyter=True, options={'colors': color})