# Visualizing relative positions of annotations within patient notes in the training set

This notebook simply shows the distribution of the relative positions of annotations in the training set.

We can observe that the majority of features have their own characteristic range of positions within patient notes. This suggests that there are some typical *rules* or *note structures* shared by all the notes.

Can this information be used to help model training or post-processing? What do you think? :)

# Startup

In [None]:
# Notebook setup for the kaggle/python Docker image
# (https://github.com/kaggle/docker-python), which ships with the analytics
# libraries imported below.

import ast
import os
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Competition files are read from the read-only "../input/" directory.
data_dir = Path('../input/nbme-score-clinical-patient-notes')

# Print the path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Plot defaults: high-resolution rendering and a small default figure size.
mpl.rcParams['figure.dpi'] = 300
plt.rc('figure', figsize=(1.5, 1.5))

# DataFrame display defaults: show every column, at most 100 rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

# Up to 20GB written to the current directory (/kaggle/working/) is preserved
# as output when a version is created with "Save & Run All".
# Temporary files can go to /kaggle/temp/, but they are not kept after the
# current session.

train = pd.read_csv(data_dir / 'train.csv')
features = pd.read_csv(data_dir / 'features.csv')
test = pd.read_csv(data_dir / 'test.csv')
patient_notes = pd.read_csv(data_dir / 'patient_notes.csv')

# Attach the note columns and the feature columns to every training row, then
# store the length of each note (len() of the pn_history value).
train = (
    train
    .merge(patient_notes, on=['case_num', 'pn_num'])
    .merge(features, on=['feature_num', 'case_num'])
)
train['pn_history_length'] = train['pn_history'].map(len)

def _relative_annotation_midpoints(location_strings, note_lengths):
    """Collect the relative midpoint of every annotation in a set of records.

    Args:
        location_strings: iterable of strings, each the Python-literal form of
            a list of locations. A location is a string of one or more
            ';'-separated "start end" sub-spans.
        note_lengths: iterable of note lengths, aligned with
            ``location_strings``.

    Returns:
        A ``(midpoints, n_annotated)`` tuple: ``midpoints`` is a list with one
        value per location (its midpoint divided by the note length), and
        ``n_annotated`` is the number of records that have at least one
        location.
    """
    midpoints, n_annotated = [], 0
    for location_string, note_length in zip(location_strings, note_lengths):
        locations = ast.literal_eval(location_string)
        if locations:
            n_annotated += 1

        for location in locations:
            # A location may be split into several sub-spans; its extent runs
            # from the start of the first one to the end of the last one.
            sub_spans = location.split(';')
            start = int(sub_spans[0].split()[0])
            end = int(sub_spans[-1].split()[1])
            midpoints.append((start + end) / 2 / note_length)

    return midpoints, n_annotated


def plot_annotation_position_distribution(case_num):
    """Plot, for each feature of a case, where its annotations fall in the notes.

    Reads the module-level ``train`` DataFrame (columns ``case_num``,
    ``feature_num``, ``feature_text``, ``location`` and ``pn_history_length``)
    and draws a two-column grid with one histogram per feature. Each histogram
    shows the annotation midpoints divided by the note length, on a 0-1 axis.

    Args:
        case_num: value matched against ``train.case_num``.

    Raises:
        ValueError: if ``train`` has no rows for ``case_num``.
    """
    case_rows = train[train.case_num == case_num]
    if case_rows.empty:
        raise ValueError(f'No training records found for case_num={case_num!r}')

    # Keep number and text together as pairs: calling unique() on each column
    # separately would misalign them if two features shared the same text.
    feature_pairs = (
        case_rows[['feature_num', 'feature_text']]
        .drop_duplicates(subset='feature_num')
    )

    n_cols = 2
    n_rows = int(np.ceil(len(feature_pairs) / n_cols))

    fig = plt.figure(figsize=(5 * n_cols, 1 * n_rows))
    for i, (feature, feature_text) in enumerate(
            feature_pairs.itertuples(index=False, name=None)):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)

        # Select within the case so rows of another case can never leak in.
        feature_rows = case_rows[case_rows.feature_num == feature]
        midpoints, n_record_with_annotations = _relative_annotation_midpoints(
            feature_rows.location, feature_rows.pn_history_length)

        sns.histplot(midpoints, bins=50, binrange=(0, 1), ax=ax)
        ax.set_xlim([0, 1])
        ax.set_title(
            f'{feature} {feature_text}\n'
            f'({n_record_with_annotations}/{len(feature_rows)} records with annotations)',
            loc='right', fontsize=7)
        for side in ['top', 'right']:
            ax.spines[side].set_visible(False)

    # Large vertical spacing leaves room for the two-line titles.
    fig.subplots_adjust(hspace=2.8)
    fig.supxlabel('Relative position in a patient note', y=0.05)

# Case 0

In [None]:
# Draw one relative-position histogram per feature of case 0.
plot_annotation_position_distribution(case_num=0)

# Case 1

In [None]:
# Draw one relative-position histogram per feature of case 1.
plot_annotation_position_distribution(case_num=1)

# Case 2

In [None]:
# Draw one relative-position histogram per feature of case 2.
plot_annotation_position_distribution(case_num=2)

# Case 3

In [None]:
# Draw one relative-position histogram per feature of case 3.
plot_annotation_position_distribution(case_num=3)

# Case 4

In [None]:
# Draw one relative-position histogram per feature of case 4.
plot_annotation_position_distribution(case_num=4)

# Case 5

In [None]:
# Draw one relative-position histogram per feature of case 5.
plot_annotation_position_distribution(case_num=5)

# Case 6

In [None]:
# Draw one relative-position histogram per feature of case 6.
plot_annotation_position_distribution(case_num=6)

# Case 7

In [None]:
# Draw one relative-position histogram per feature of case 7.
plot_annotation_position_distribution(case_num=7)

# Case 8

In [None]:
# Draw one relative-position histogram per feature of case 8.
plot_annotation_position_distribution(case_num=8)

# Case 9

In [None]:
# Draw one relative-position histogram per feature of case 9.
plot_annotation_position_distribution(case_num=9)