In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Understanding of the Problem

* In order to work in the United States as a doctor, one needs to pass the USMLE (United States Medical Licensing Examination). There is a part of the exam where the doctor speaks with standardized patients about their experiences in the past, lifestyle, common diseases in their families, etc. The information gathered verbally is then written on the exam sheet and submitted.

* The exam sheet will be checked by a licensed trained physician (examiner).

* An examiner will have a rubric (answer set) to which the exam sheet will be compared, and based on its similarity, a score will be assigned.

* Examiners can rate students by using the file `features.csv`, which contains the words or sentences that should be included in each patient's history.

# Dataset Loading

In [None]:
import numpy as np
import pandas as pd
import nltk
import re
import ast
import random


import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from wordcloud import WordCloud,STOPWORDS


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the four competition tables, in this order: patient notes, rubric
# features, training annotations, test rows.
pn_note, features, train, test = map(
    pd.read_csv,
    (
        '../input/nbme-score-clinical-patient-notes/patient_notes.csv',
        '../input/nbme-score-clinical-patient-notes/features.csv',
        '../input/nbme-score-clinical-patient-notes/train.csv',
        '../input/nbme-score-clinical-patient-notes/test.csv',
    ),
)

In [None]:
# Keep untouched copies of the loaded tables: the EDA cells below add a helper
# column to, and rewrite text in, the originals.
df_pn_note, df_features, df_train, df_test = (
    frame.copy() for frame in (pn_note, features, train, test)
)

# EDA

## Description
### <span style ='color:Orange'>**Patient Notes**</span>

In [None]:
# Number of distinct values in each column of the patient-notes table.
pn_note.nunique()

In [None]:
# Character count of every note; stored on the frame because the per-case box
# plot further down reads this column.
pn_note['text_len'] = pn_note['pn_history'].str.len()

# Shape and completeness figures describe the table as loaded from the CSV, so
# the derived text_len helper is excluded from them (otherwise it is counted as
# an extra column and an extra block of datapoints).
pn_note_as_loaded = pn_note.drop(columns='text_len')

summary1 = pd.DataFrame(
    {
        'Number of Rows': pn_note_as_loaded.shape[0],
        'Number of Columns': pn_note_as_loaded.shape[1],
        'Number of Total Datapoints': pn_note_as_loaded.count().sum(),
        'Number of Missing Entry': pn_note_as_loaded.isnull().sum().sum(),
        'Average Note Length': pn_note['text_len'].mean(),
        'Minimum Text Length': pn_note['text_len'].min(),
        'Maximum Text Length': pn_note['text_len'].max(),
    },
    index=['Patient Notes'],
)
summary1

In [None]:
# Preview the first rows of the patient-notes table.
pn_note.head()

**Distribution of the Patient Notes per Case**

In [None]:
# Non-null cell counts per clinical case; the pn_num column is the one plotted
# as "notes per case".
notes_counts = pn_note.groupby('case_num').count()

# Build the tick labels from the case numbers actually present in the data
# instead of a hand-typed list of ten, so labels and bars cannot drift apart.
case = ['Case {}'.format(num) for num in notes_counts.index]

plt.figure(figsize = (20,8))
# Pass plain arrays so seaborn pairs label i with count i by position rather
# than by the case_num index of the Series.
sns.barplot(x = case, y = notes_counts.pn_num.to_numpy(), palette = 'mako')
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('Case Number', fontsize = 15)
plt.ylabel('Patients Number', fontsize = 15)
plt.title('Distribution of the Patient Notes per Case', fontsize = 15)

plt.show()

**Distribution of Patient Note Length**

In [None]:
# Character length of every patient note, computed in one vectorised pass
# (no dependence on the frame's index labels). Kept as a plain list, as before.
length = pn_note['pn_history'].str.len().tolist()

plt.figure(figsize = (20,5))
# This histogram shows patient notes, not rubric features.
plt.title('Distribution of the Patient Note Length', fontsize = 15)
sns.histplot(length)
plt.xticks(np.arange(0, 1001, 200), fontsize = 14)
plt.yticks(fontsize = 14)
plt.xlabel('Text Length', fontsize = 15)
plt.ylabel('Count', fontsize = 15)

plt.show()

**Case wise Patient Note Length Distribution**

In [None]:
# Box plot of note length (text_len) for each case_num, one box per case.
fig, ax = plt.subplots(figsize=(20, 5))
sns.boxplot(data=pn_note, x='case_num', y='text_len', ax=ax)

# Ten categorical positions labelled with the names held in `case`.
ax.set_xticks(np.arange(10))
ax.set_xticklabels(case, fontsize=14)

ax.set_yticks(np.arange(0, 1001, 200))
ax.tick_params(axis='y', labelsize=14)

ax.set_xlabel('Case Number', fontsize=15)
ax.set_ylabel('Text Length', fontsize=15)
plt.show()

**Frequent Words in each Clinical Case**

In [None]:
# Normalise the note text in three steps:
#   1. lower-case everything,
#   2. drop every character that is not an ASCII letter or whitespace,
#   3. remove English stop words (this also collapses runs of whitespace).
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would silently leave digits and punctuation in place.
stop_word_set = set(stop_words)  # set membership instead of a list scan per token

pn_note['pn_history'] = (
    pn_note['pn_history']
    .str.lower()
    .str.replace(r"[^a-zA-Z\s]", "", regex=True)
    .apply(lambda note: " ".join(tok for tok in note.split() if tok not in stop_word_set))
)

# One long string per case: all cleaned notes of that case joined by spaces.
words = pn_note.groupby('case_num')['pn_history'].apply(' '.join)

# One word cloud per case, in the group order of `words`. Iterating the values
# avoids using the case_num labels as if they were positions.
word_cloud_list = [
    WordCloud(stopwords=STOPWORDS,
              background_color='black',
              colormap='autumn',
              width=800,
              height=800
             ).generate(case_text)
    for case_text in words
]

In [None]:
# Show the per-case word clouds on a 2 x 5 grid, filled row by row, with the
# position in word_cloud_list used as the case number in each panel title.
fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(27, 12))
fig.subplots_adjust(hspace=0.05, wspace = 0.03)

for idx, panel in enumerate(ax.flat):
    panel.imshow(word_cloud_list[idx])
    panel.title.set_text('Case {}'.format(idx))
    panel.axis("off")

plt.tight_layout()
plt.show()

### <span style ='color:Orange'>**Features**</span>

In [None]:
# Number of distinct values in each column of the features (rubric) table.
features.nunique()

In [None]:
# Character count of every feature text; stored on the frame because the
# per-case box plot further down reads this column.
features['text_len'] = features['feature_text'].str.len()

# Shape and completeness figures describe the table as loaded from the CSV, so
# the derived text_len helper is excluded from them (otherwise it is counted as
# an extra column and an extra block of datapoints).
features_as_loaded = features.drop(columns='text_len')

# Column labels are kept identical to the patient-notes summary.
summary2 = pd.DataFrame(
    {
        'Number of Rows': features_as_loaded.shape[0],
        'Number of Columns': features_as_loaded.shape[1],
        'Number of Total Datapoints': features_as_loaded.count().sum(),
        'Number of Missing Entry': features_as_loaded.isnull().sum().sum(),
        'Average Note Length': features['text_len'].mean(),
        'Minimum Text Length': features['text_len'].min(),
        'Maximum Text Length': features['text_len'].max(),
    },
    index=['Features'],
)
summary2

In [None]:
# Preview the first rows of the features (rubric) table.
features.head()

**Distribution of the Feature Texts per Case**

In [None]:
# Non-null cell counts per clinical case; the feature_num column is the one
# plotted as "features per case".
feature_counts = features.groupby('case_num').count()

# Build the tick labels from the case numbers actually present in the data
# instead of a hand-typed list of ten, so labels and bars cannot drift apart.
case = ['Case {}'.format(num) for num in feature_counts.index]

plt.figure(figsize = (20,8))
# Pass plain arrays so seaborn pairs label i with count i by position rather
# than by the case_num index of the Series.
sns.barplot(x = case, y = feature_counts.feature_num.to_numpy(), palette = 'mako')
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('Case Number', fontsize = 15)
plt.ylabel('Features Number', fontsize = 15)
plt.title('Distribution of the Feature Texts per Case', fontsize = 15)

plt.show()

**Distribution of Feature Text Length**

In [None]:
# Character length of each feature text, in row order.
length = [len(text) for text in features.feature_text]

# Histogram of those lengths on a wide, short canvas.
fig, ax = plt.subplots(figsize=(20, 5))
sns.histplot(length, ax=ax)
ax.set_title('Distribution of the Feature Text Length', fontsize=15)

ax.set_xticks(np.arange(0, 101, 20))
ax.tick_params(axis='both', labelsize=14)
ax.set_xlabel('Text Length', fontsize=15)
ax.set_ylabel('Count', fontsize=15)

plt.tight_layout()
plt.show()

**Case wise Feature Length Distribution**

In [None]:
# Box plot of feature-text length (text_len) for each case_num, one box per case.
fig, ax = plt.subplots(figsize=(20, 5))
sns.boxplot(data=features, x='case_num', y='text_len', ax=ax)

# Ten categorical positions labelled with the names held in `case`.
ax.set_xticks(np.arange(10))
ax.set_xticklabels(case, fontsize=14)

ax.set_yticks(np.arange(0, 101, 20))
ax.tick_params(axis='y', labelsize=14)

ax.set_xlabel('Case Number', fontsize=15)
ax.set_ylabel('Text Length', fontsize=15)
plt.show()

### <span style ='color:Orange'>**Training Dataset**</span>

In [None]:
# Number of distinct values in each column of the training table.
train.nunique()

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# One-row overview of the training table: its shape, how many cells hold a
# value, and how many are missing.
row_count, column_count = train.shape
train_overview = {
    'Number of Rows': row_count,
    'Number of Columns': column_count,
    'Number of Total Datapoints': train.count().sum(),
    'Number of Missing Entry': train.isnull().sum().sum(),
}
summary3 = pd.DataFrame(train_overview, index=['Training Dataset'])
summary3

## Patient Analysis: Joining the Features Dataset and Train Dataset for a Better Understanding of the Problem

In [None]:
# Enrich every training row with its rubric feature and its patient note.
# The two left-merges are chained so that the second one extends the result of
# the first; merging df_train again would throw the feature columns away.
train_data = (
    df_train
    .merge(df_features, on=['feature_num', 'case_num'], how='left')
    .merge(df_pn_note, on=['pn_num', 'case_num'], how='left')
)

# NOTE(review): `annotation` comes straight from read_csv, so this is presumably
# the character length of the stored text rather than the number of annotated
# spans; confirm which one is intended.
train_data['annotation_length'] = train_data['annotation'].apply(len)

train_data.head()

## Relate Datasets with Problem Statements

### **patient note** for **patient_number** **16** and **case number 0** as registered by the **Doctor**

In [None]:
# Select the note with one combined mask. Filtering twice in a row would apply
# a mask built on the full frame to an already-filtered frame, which pandas has
# to re-align by index (and warns about).
is_pn16_case0 = (df_pn_note['pn_num'] == 16) & (df_pn_note['case_num'] == 0)

# .values[0] takes the first matching note; it raises IndexError if none match.
pn16 = df_pn_note.loc[is_pn16_case0, 'pn_history'].values[0]
print (pn16)

### **features** for above patient checked by a **Licensed Trained Physician**

In [None]:
# Print every rubric feature text that belongs to case 0, one per line.
case0_feature_texts = features.loc[features['case_num'] == 0, 'feature_text']
for feature_text in case0_feature_texts:
    print (feature_text)

The contents of **feature_text** match the contents present in the **patient_notes**. This is the rubric, or answer sheet.