In [None]:
import numpy as np
import pandas as pd 
import os
import re
import ast
import matplotlib.pyplot as plt
from matplotlib.colors import cnames
import seaborn as sns
import spacy
nlp = spacy.load('en_core_web_sm')

from termcolor import colored
from IPython.display import Markdown

from spacy import displacy

import warnings
warnings.filterwarnings("ignore")

## Load dataset

This problem is composed of three main datasets - train, patient_notes and features - which are related as shown in the entity-relationship (DER) diagram below.

![DER](https://raw.githubusercontent.com/Guillin/nbme-score-clinical/main/images/der.png)

In [None]:
# Directory that holds every CSV shipped with the competition.
FILE_PATH = "../input/nbme-score-clinical-patient-notes/"

# Read the five CSV files in a single pass; the unpacking order matches the
# order of the file names below.
features_df, patient_notes_df, train_df, test_df, submission_df = (
    pd.read_csv(FILE_PATH + csv_name)
    for csv_name in (
        "features.csv",
        "patient_notes.csv",
        "train.csv",
        "test.csv",
        "sample_submission.csv",
    )
)


## Training Data


    


### train.csv

Feature annotations for 1000 of the patient notes, 100 for each of ten cases.
- id : Unique identifier for each patient note / feature pair.
- case_num : The case to which this patient note belongs.
- pn_num : The patient note annotated in this row.
- feature_num : The feature annotated in this row.
- annotation : The text(s) within a patient note indicating a feature. A feature may be indicated multiple times within a single note.
- location : Character spans indicating the location of each annotation within the note. Multiple spans may be needed to represent an annotation, in which case the spans are delimited by a semicolon ;.

In [None]:
train_df.head()

In [None]:
train_df.shape

In [None]:
train_df.nunique()

Within each case, `feature_num` ranges from 0 to at most 18, offset by the case number: the base value increases by one hundred each time `case_num` increases by one unit.

In [None]:
# Per-case sanity check: row count and distinct ids, distinct patient notes,
# and the min / max / distinct count of feature numbers.
train_case_summary_spec = {
    'id': ['count', 'nunique'],
    'pn_num': ['nunique'],
    'feature_num': ['min', 'max', 'nunique'],
}
train_df.groupby('case_num').agg(train_case_summary_spec)

In [None]:
train_df.dtypes

From the example below, we can see that each patient note has many annotations, which belong to different students, and their number ranges from 0 to 8.

In [None]:
train_df[train_df['pn_num']==16]

In [None]:
def _count_annotations(raw_location):
    """Return how many entries the stringified list in ``location`` holds."""
    return len(ast.literal_eval(raw_location))


# Number of annotations per row: `location` is parsed as a Python literal and
# the length of the resulting list is stored.
train_df['count_annotation'] = train_df['location'].apply(_count_annotations)
train_df['count_annotation'].describe()

####  Annotation by case_num

As the graph below shows, the mean number of annotations is below 1 for most clinical cases, except for cases 4 and 7. Case 5 has the most annotations, and case 7 has the fewest.

In [None]:
sns.set_theme()

# Aggregate once per case: total and mean of the per-row annotation counts.
annotation_by_case_df = (
    train_df.groupby('case_num')
    .agg(
        count_annotation=('count_annotation', 'sum'),
        mean_annotation=('count_annotation', 'mean'),
    )
    .reset_index()
)

# One sorted frame per panel so the bars are drawn in descending order.
count_annotation_case_df = (
    annotation_by_case_df[['case_num', 'count_annotation']]
    .sort_values('count_annotation', ascending=False)
)
mean_annotation_case_df = (
    annotation_by_case_df[['case_num', 'mean_annotation']]
    .sort_values('mean_annotation', ascending=False)
)

# Keep the axes array under its own name; the original rebound `ax` to the
# Text object returned by `set_title`, which made the variable unusable.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

sns.barplot(
    y="case_num",
    x="count_annotation",
    orient='h',
    order=count_annotation_case_df.case_num.tolist(),
    data=count_annotation_case_df,
    ax=axes[0],
)
axes[0].set_title("Total annotation by Case number")

sns.barplot(
    y="case_num",
    x="mean_annotation",
    orient='h',
    order=mean_annotation_case_df.case_num.tolist(),
    data=mean_annotation_case_df,
    ax=axes[1],
)
axes[1].set_title("Mean annotation by Case number")

# `fig.show()` targets GUI backends and only warns under the notebook's inline
# backend; `plt.show()` is the supported way to flush the figure here.
plt.show()

Repeating the same analysis by patient note number, we can see that most of the notes shown have a mean annotation count above 1.

In [None]:
sns.set_theme()

# Number of patient notes shown in each panel.
TOP_N_PATIENT_NOTES = 20

# Aggregate once per patient note: total and mean of the per-row annotation counts.
annotation_by_pn_df = (
    train_df.groupby('pn_num')
    .agg(
        count_annotation=('count_annotation', 'sum'),
        mean_annotation=('count_annotation', 'mean'),
    )
    .reset_index()
)

# Each panel keeps only the top notes for its own metric.
mean_annotation_pn_df = (
    annotation_by_pn_df[['pn_num', 'mean_annotation']]
    .sort_values('mean_annotation', ascending=False)
    .head(TOP_N_PATIENT_NOTES)
)
count_annotation_pn_df = (
    annotation_by_pn_df[['pn_num', 'count_annotation']]
    .sort_values('count_annotation', ascending=False)
    .head(TOP_N_PATIENT_NOTES)
)

fig, ax = plt.subplots(1, 2, figsize=(24, 8))

sns.barplot(
    x="count_annotation",
    y="pn_num",
    orient='h',
    order=count_annotation_pn_df.pn_num.tolist(),
    data=count_annotation_pn_df,
    ax=ax[0],
)
# Title typo fixed ("Pasient" -> "Patient").
ax[0].set_title("Top 20 - Number annotation by Patient number")

sns.barplot(
    x="mean_annotation",
    y="pn_num",
    orient='h',
    order=mean_annotation_pn_df.pn_num.tolist(),
    data=mean_annotation_pn_df,
    ax=ax[1],
)
ax[1].set_title("Top 20 - Mean annotation by Patient number")

# `fig.show()` targets GUI backends and only warns under the notebook's inline
# backend; `plt.show()` is the supported way to flush the figure here.
plt.show()


### patient_notes.csv

A collection of about 40,000 Patient Note history portions. Only a subset of these have features annotated. <span style="color:orange">*You may wish to apply unsupervised learning techniques on the notes without annotations*</span>. The patient notes in the test set are not included in the public version of this file.
- pn_num - A unique identifier for each patient note.
- case_num - A unique identifier for the clinical case a patient note represents.
- pn_history - The text of the encounter as recorded by the test taker.

In [None]:
patient_notes_df.head()

In [None]:
patient_notes_df.shape

In [None]:
patient_notes_df.nunique()

In [None]:
print(patient_notes_df[patient_notes_df['pn_num']==16]['pn_history'].values)


### features.csv 

The rubric of features (or key concepts) for each clinical case.
- feature_num - A unique identifier for each feature.
- case_num - A unique identifier for each case.
- feature_text - A description of the feature.

In [None]:
features_df.head()

In [None]:
features_df.shape

In [None]:
features_df.nunique()

In [None]:
# Per-case check that feature numbers and feature texts are not duplicated:
# compare the row count with the number of distinct values.
features_case_summary_spec = {
    'feature_num': ['count', 'nunique'],
    'feature_text': ['count', 'nunique'],
}
features_df.groupby('case_num').agg(features_case_summary_spec)

Let's look at a few examples.

In [None]:
features_df[features_df['case_num']==0]

In [None]:
features_df[features_df['case_num']==1]

In [None]:
features_df[features_df['case_num']==2]

### Using spaCy to color annotations and features as entities
Here we take the position of each annotation and its corresponding feature, and color them in order to visualize every annotation in a patient note.

In [None]:
def _parse_location_spans(location):
    """Return every ``(start, end)`` character span encoded in one ``location`` cell.

    ``location`` is the string form of a Python list and is parsed with
    ``ast.literal_eval``. Each list item describes one annotation; an item may
    contain several spans separated by ``;``, and each span is a ``start end``
    pair (``:`` is also accepted between start and end, as in the original
    parsing). Fragments that do not contain two numbers are skipped.
    """
    spans = []
    for annotation in ast.literal_eval(location):
        for fragment in annotation.split(';'):
            bounds = fragment.replace(':', ' ').split()
            if len(bounds) >= 2:
                spans.append((int(bounds[0]), int(bounds[1])))
    return spans


def colored_patient_notes(pn_num):
    """Display one patient note with its annotated spans colored per feature.

    Prints the features annotated in the note (each in its own color) and then
    renders the note text with displaCy in manual entity mode, highlighting
    every annotated character span.

    Reads the notebook-level frames ``train_df``, ``features_df`` and
    ``patient_notes_df``.

    Parameters
    ----------
    pn_num : int
        Patient note number to display.

    Raises
    ------
    IndexError
        If ``pn_num`` has no row in ``patient_notes_df``.
    """
    # Color per feature position within its case (feature_num modulo 100).
    colors = {
        '0': '#0000FF',
        '1': '#BC8F8F',
        '2': '#3d3d3d',
        '3': '#CD5C5C',
        '4': '#FA8072',
        '5': '#CD853F',
        '6': '#FFD700',
        '7': '#FF8C00',
        '8': '#F0E68C',
        '9': '#BDB76B',
        '10': '#32CD32',
        '11': '#00FFFF',
        '12': '#B0E0E6',
        '13': '#87CEFA',
        '14': '#B0C4DE',
        '15': '#DDA0DD',
        '16': '#EE82EE',
        '17': '#FFC0CB',
    }

    # Fail early, with a readable message, when the note does not exist.
    note_texts = patient_notes_df.loc[patient_notes_df['pn_num'] == pn_num, 'pn_history']
    if note_texts.empty:
        raise IndexError(f"pn_num {pn_num} not found in patient_notes_df")
    text = note_texts.values[0]

    ents = []
    annotated_features = set()
    annotated_rows = train_df.loc[train_df['pn_num'] == pn_num, ['location', 'feature_num']]
    for location, feature_num in annotated_rows.itertuples(index=False, name=None):
        # Same label as taking the last two digits of feature_num.
        label = str(int(feature_num) % 100)
        # Emit one entity per span so that multi-span annotations
        # ("a b;c d") are highlighted in full, not only their first span.
        for start, end in _parse_location_spans(location):
            ents.append({'start': start, 'end': end, 'label': label})
            annotated_features.add(feature_num)

    print("\nFeatures")
    print('='*80)

    # Sorted so the feature list is printed in a stable order.
    for feature_num in sorted(annotated_features):
        label = str(int(feature_num) % 100)
        feature_texts = features_df.loc[features_df['feature_num'] == feature_num, 'feature_text']
        # Only the two expected failure modes are reported (unknown feature,
        # or no color for its label); anything else is allowed to surface.
        if feature_texts.empty or label not in colors:
            print(colored("Upps Something is wrong! No features found. ", 'red'))
            continue
        feature = feature_texts.values[0]
        display(Markdown(f'<span style="color: {colors[label]}">{feature_num} : {feature}</span>'))

    print("\nPatient Notes")
    print('='*80)

    # Keep the entities in reading order for the manual renderer.
    ents.sort(key=lambda ent: (ent['start'], ent['end']))
    doc2 = {
        "text": text,
        "ents": ents,
    }

    options = {"ents": sorted({ent['label'] for ent in ents}), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)




In [None]:
colored_patient_notes(pn_num=16)


In [None]:
colored_patient_notes(pn_num=34986)


### Joining datasets

In [None]:
# Left-join the annotated rows first with the note text, then with the feature
# description, so every row of train_df is kept.
merge_df = (
    train_df
    .merge(patient_notes_df, how='left', on=['pn_num', 'case_num'])
    .merge(features_df, how='left', on=['feature_num', 'case_num'])
)

In [None]:
merge_df.shape

In [None]:
merge_df[merge_df['pn_num']==16]


### test.csv 

Example instances selected from the training set.
- id : Unique identifier for each patient note / feature pair.
- case_num : The case to which this patient note belongs.
- pn_num : The patient note annotated in this row.
- feature_num : The feature annotated in this row.

In [None]:
test_df.head()

In [None]:
test_df.shape

In [None]:
test_df.nunique()

### sample_submission.csv
A sample submission file in the correct format.


In [None]:
submission_df.head()

In [None]:
submission_df.shape

In [None]:
submission_df.nunique()