In [None]:
import ast
import json
import os
import warnings

import numpy as np
import pandas as pd
import plotly.express as px # visualisations and graphs
import seaborn as sns
import spacy.displacy
from IPython.core.display import display, HTML

In [None]:
ROOT = "../input/nbme-score-clinical-patient-notes"

In [None]:
# Directory that receives generated artefacts; created up front when missing.
OUTPUT_DIR = './'
# exist_ok=True keeps the cell idempotent and removes the window between a
# separate os.path.exists() check and the directory creation.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
train = pd.read_csv(f"{ROOT}/train.csv")
test = pd.read_csv(f"{ROOT}/test.csv")
features = pd.read_csv(f"{ROOT}/features.csv")
patient_notes = pd.read_csv(f"{ROOT}/patient_notes.csv")
submission = pd.read_csv(f"{ROOT}/sample_submission.csv")


**Features**


In [None]:
features.head()

In [None]:
features.tail()

In [None]:
# Sanity summary of the features table: its shape, the number of distinct
# feature_num values, and the total count of NaN cells across all columns.
print(f'Shape of features:{features.shape}, \n'
      f'With unique num of features: {features.feature_num.nunique()} \n'
      f'num of any missed cell in the df: {features.isna().sum().sum()}')

**Notes**

In [None]:
patient_notes.head()

In [None]:
patient_notes.tail()

In [None]:
# Count how many patient notes use each spelling of the "years old" notation.
#  * 'yo' is matched only when it is not embedded in a longer word; a plain
#    substring search also counts notes that merely contain e.g. "you".
#  * 'y.o.' is matched literally; interpreted as a regular expression each
#    '.' would match any character.
#  * The long spellings are folded into one pattern so a note that uses two
#    variants is counted once instead of once per variant.
# na=False makes a missing note count as "no match" instead of yielding NaN.
history_text = patient_notes['pn_history']
yo = int(history_text.str.contains(r'(?<![A-Za-z])yo(?![A-Za-z])', regex=True, na=False).sum())
yo_dots = int(history_text.str.contains('y.o.', regex=False, na=False).sum())
# Covers 'year old', 'year-old', 'years old' and 'years-old'.
yo_fulls = int(history_text.str.contains(r'years?[- ]old', regex=True, na=False).sum())

print(f'yo: {yo} \ny.o.: {yo_dots} \nyear old: {yo_fulls}')

In [None]:
# Sanity summary of the patient notes table: its shape, the number of distinct
# pn_num values, and the total count of NaN cells across all columns.
print(f'Shape of patient_notes:{patient_notes.shape}, \n'
      f'With unique num of patient notes: {patient_notes.pn_num.nunique()} \n'
      f'num of any missed cell in the df: {patient_notes.isna().sum().sum()}')

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# Per-case row counts of the patient notes, drawn as a static bar chart.
df_count = patient_notes.groupby('case_num').count()

bar_palette = sns.color_palette("deep", 10)
f, ax = plt.subplots(figsize=(12, 8))
ax = sns.barplot(
    data=df_count,
    x=df_count.index,
    y=df_count['pn_num'],
    palette=bar_palette,
    ax=ax,
)
ax.set_xlabel('Case num', fontsize=16)
ax.set_ylabel('Case count', fontsize=16)
plt.show()

In [None]:
submission.head(4)

In [None]:
train.head(4)

In [None]:
print("Checking Null's",train.isnull().sum())

In [None]:
train.dtypes

In [None]:
test.head(4)

In [None]:
print("Checking Null's",test.isnull().sum())

In [None]:
test.dtypes

 **Merge the datasets**

## So (annotation, location) is the target we need to predict


In [None]:
# Attach the feature columns and the note columns to every test row.
# Both joins are left joins, so each test row is kept even without a match.
test = (
    test
    .merge(features, how='left', on=['feature_num', 'case_num'])
    .merge(patient_notes, how='left', on=['pn_num', 'case_num'])
)
display(test.head())

In [None]:
# Stack the rows of `features` underneath `train`, keeping each frame's own
# index labels. DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported equivalent and produces the same frame.
# NOTE(review): a row-wise stack leaves NaN wherever the two frames do not
# share a column - confirm a key-based merge was not intended here.
df = pd.concat([train, features])
df

In [None]:
display(train.duplicated().sum())
train.tail()

In [None]:
# Interactive bar chart of the per-case row counts of the patient notes.
counts_of_notes = patient_notes.groupby("case_num").count()
ind = counts_of_notes.index

# One human-readable tick label per case number 0..9.
case_tick_labels = [
    'Case Zero', 'Case One', 'Case Two', 'Case Three', 'Case Four',
    'Case Five', 'Case Six', 'Case Seven', 'Case Eight', 'Case Nine',
]

fig = px.bar(
    data_frame=counts_of_notes,
    x=ind,
    y='pn_num',
    text_auto='.2s',
    title="Count Distribution of Different Cases",
    labels={'case_num': 'Case Number', 'pn_num': 'Number of Patients'},
    width=1100,
    height=700,
    color='pn_num',
    color_continuous_scale='aggrnyl',
)
fig.update_layout(
    title_x=0.5,
    xaxis={
        'tickmode': 'array',
        'tickvals': list(range(10)),
        'ticktext': case_tick_labels,
    },
)

fig.show()

In [None]:
PATIENT_IDX = 74087
patient_df = train[train["pn_num"] == PATIENT_IDX]
patient_df


In [None]:
features[features['feature_num'] == 708]


In [None]:
def annotate_sample(note_num):
    """Render one patient note with its annotated feature spans highlighted.

    Collects every ``train`` row of the note, attaches the ``feature_text`` of
    each row from ``features``, turns the ``location`` strings into character
    spans and renders the note text with displaCy in manual entity mode. Each
    distinct feature text gets its own gradient colour.

    Args:
        note_num: ``pn_num`` of the note to show; anything ``int()`` accepts
            (the dropdown widget supplies a string).

    Raises:
        IndexError: If ``patient_notes`` has no row for ``note_num``.
    """
    note_num = int(note_num)

    note_rows = patient_notes[patient_notes["pn_num"] == note_num]
    if note_rows.empty:
        # Same exception type a bare ``.iloc[0]`` would raise, but the message
        # names the note that is missing.
        raise IndexError(f"No patient note found for pn_num={note_num}")
    note_text = note_rows["pn_history"].iloc[0]

    # Silence warnings only while this function runs; a global
    # warnings.filterwarnings('ignore') would hide warnings in every cell
    # executed afterwards.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        patient_df = train[train["pn_num"] == note_num].merge(
            features[['feature_num', 'feature_text']], on='feature_num')

        # `location` holds the text of a list whose items are "start end"
        # pairs, several pairs possibly joined by ';'. Swapping the quote
        # style makes it valid JSON and turning ';' into '","' makes every
        # pair its own list item.
        locations = (
            patient_df["location"]
            .str.replace("'", '"', regex=False)
            .str.replace(';', '","', regex=False)
            .apply(json.loads)
        )
        annotation = patient_df["feature_text"]

        ents = []
        for spans, label in zip(locations, annotation):
            for span in spans:
                start_loc, end_loc = span.split()[:2]
                ents.append({
                    'start': int(start_loc),
                    'end': int(end_loc),
                    'label': label
                })
        # Sorted defensively: the manual entity renderer consumes the spans
        # left to right, so they are handed over in ascending start order.
        ents.sort(key=lambda ent: (ent['start'], ent['end']))

        doc = {
            'text': note_text,
            "ents": ents
        }

        # Two palettes of equal size (full and half saturation) provide the
        # two ends of each label's gradient.
        n_labels = annotation.nunique()
        p1 = sns.color_palette('hls', n_labels, desat=1).as_hex()
        p2 = sns.color_palette('hls', n_labels, desat=0.5).as_hex()
        colors = {k: f"linear-gradient(90deg, {c1}, {c2})" for k, c1, c2 in zip(annotation.unique(), p1, p2)}
        options = {"colors": colors}
        spacy.displacy.render(doc, style="ent", options=options, manual=True, jupyter=True)

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [None]:
def _note_numbers_for_case(case_num):
    """Return the distinct ``pn_num`` values of ``train`` for one case, as strings."""
    case_mask = train['case_num'] == int(case_num)
    return [str(pn_num) for pn_num in train.loc[case_mask, 'pn_num'].unique()]


# Dropdown listing every case number that occurs in the patient notes.
case_numbers = list(map(str, patient_notes['case_num'].unique()))
case_num_selector = widgets.Dropdown(
    options=case_numbers,
    value=case_numbers[0],
    description='Case No:',
)


def update_note_selector():
    """Build a note dropdown holding the notes of the currently selected case.

    Returns:
        A ``widgets.Dropdown`` whose options are the note numbers of the
        selected case; its value is the first note, or ``None`` when the case
        has no notes in ``train``.
    """
    note_numbers = _note_numbers_for_case(case_num_selector.value)
    return widgets.Dropdown(
        options=note_numbers,
        value=note_numbers[0] if note_numbers else None,
        description='Note No:',
    )


note_num_selector = update_note_selector()


def on_case_no_change(change):
    """Refill the note dropdown after a different case has been selected."""
    if change['type'] == 'change' and change['name'] == 'value':
        print("changed to %s" % change['new'])
        note_numbers = _note_numbers_for_case(change['new'])
        note_num_selector.options = note_numbers
        # Only pre-select a note when the case actually has one.
        if note_numbers:
            note_num_selector.value = note_numbers[0]


def on_change(change):
    """Log the newly selected note number."""
    if change['type'] == 'change' and change['name'] == 'value':
        print("changed to %s" % change['new'])


# names='value' restricts the callbacks to value changes instead of invoking
# them for every trait of the widget.
case_num_selector.observe(on_case_no_change, names='value')
note_num_selector.observe(on_change, names='value')

display(case_num_selector)
display(note_num_selector)

In [None]:
# Change the widget value above and re-run this cell to explore labels in the notes
annotate_sample(note_num_selector.value)

Thanks: https://www.kaggle.com/code/weicongkong/eda-handy-patient-notes-label-visualiser

In [None]:
data = test.copy()


In [None]:
# Baseline predictor: for every test row, look for phrases that were annotated
# for the same feature in `train` and report the character span of the first
# occurrence of each phrase in the patient note.

# Characters rewritten before matching. Every replacement is exactly one
# character long, so an offset found in the normalised text is also a valid
# offset into the original note. The same table is applied to the note and to
# the phrases; normalising only one side makes phrases containing these
# characters impossible to match.
_MATCH_TABLE = str.maketrans({'.': ' ', '+': 'P', '(': ' ', ')': ' '})


def _annotation_phrases(raw_annotation):
    """Split one stored ``annotation`` cell into its individual phrases.

    The cell appears to hold the text of a Python list of strings; each list
    item becomes its own phrase. Text that is not a valid literal falls back
    to stripping the list punctuation. Empty phrases are dropped.
    """
    try:
        parsed = ast.literal_eval(raw_annotation)
    except (ValueError, SyntaxError):
        parsed = [raw_annotation.replace("[", '').replace("]", '').replace("'", '')]
    if not isinstance(parsed, (list, tuple)):
        parsed = [str(parsed)]
    return [phrase for phrase in parsed if isinstance(phrase, str) and phrase.strip()]


# Phrase lists depend only on the feature, so build each one once.
pattern_cache = {}

for row in data.index:
    case_num = data.loc[row, 'case_num']
    pn_num = data.loc[row, 'pn_num']
    feature_num = data.loc[row, 'feature_num']

    # Create Pattern List For The Feature (most frequent annotation first,
    # duplicates removed while keeping that order).
    if feature_num not in pattern_cache:
        raw_annotations = train.loc[train['feature_num'] == feature_num, 'annotation'].value_counts().keys()
        phrases = [
            phrase.translate(_MATCH_TABLE)
            for raw_annotation in raw_annotations
            for phrase in _annotation_phrases(raw_annotation)
        ]
        pattern_cache[feature_num] = list(dict.fromkeys(phrases))
    pattern_list = pattern_cache[feature_num]

    # Extract Patient Note
    note_mask = (patient_notes['pn_num'] == pn_num) & (patient_notes['case_num'] == case_num)
    text = patient_notes.loc[note_mask, 'pn_history'].values[0].translate(_MATCH_TABLE)

    spans = []
    for pattern in pattern_list:
        start = text.find(pattern)
        # str.find returns -1 when absent; 0 is a valid match at the very
        # beginning of the note and must not be discarded.
        if start != -1:
            spans.append(f'{start} {start + len(pattern)}')
    # "start end" pairs joined by ';' - an empty string when nothing matched.
    pred = ';'.join(dict.fromkeys(spans))

    data.loc[row, 'location'] = pred
    submission.loc[row, 'id'] = data.loc[row, 'id']
    submission.loc[row, 'location'] = pred

In [None]:
submission

In [None]:
# Write the frame that carries the predicted `id` / `location` values.
# `test` is the merged feature/note table and never receives a `location`
# column, so saving it would produce a file without any predictions.
submission.to_csv("submission.csv", index=False)