# Import Required Libraries 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Report the pandas version in use, then raise the notebook display limits so
# wide frames and long text cells are shown with less truncation.
print(pd.__version__)
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_colwidth", 200)
pd.set_option("display.min_rows", 200)
pd.set_option("display.max_rows", 500)

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import ast
import os
from difflib import get_close_matches

import os
import spacy.displacy
import pandas as pd
import seaborn as sns
import json
import warnings
from IPython.core.display import display, HTML

# Competition data directory, relative to the notebook's working directory.
DATA_ROOT = os.path.join('..', 'input', 'nbme-score-clinical-patient-notes')

# One CSV per table; every path is derived from DATA_ROOT.
TRAIN_PATH, TEST_PATH, FEATURE_PATH, PATIENT_NOTES_PATH = (
    os.path.join(DATA_ROOT, file_name)
    for file_name in ('train.csv', 'test.csv', 'features.csv', 'patient_notes.csv')
)

# Load the four tables into module-level frames used by the cells below.
train, test, features, patient_notes = map(
    pd.read_csv,
    (TRAIN_PATH, TEST_PATH, FEATURE_PATH, PATIENT_NOTES_PATH),
)

In [None]:
# Load the competition tables from the absolute Kaggle input directory.
# NOTE: this rebinds `train`, `features` and `test`.
train = pd.read_csv("/kaggle/input/nbme-score-clinical-patient-notes/train.csv")
notes = pd.read_csv("/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv")
features = pd.read_csv("/kaggle/input/nbme-score-clinical-patient-notes/features.csv")
test = pd.read_csv("/kaggle/input/nbme-score-clinical-patient-notes/test.csv")

print("Length of Train and Notes ", len(train), len(notes))

# Attach the note text to each training row (inner join on case and note
# number), then look up the feature wording (left join on feature number).
train_merged = (
    train
    .merge(notes, on=["case_num", "pn_num"], how="inner")
    .merge(features[["feature_num", "feature_text"]], on=["feature_num"], how="left")
)
print(len(train_merged))
train_merged.head(3)

In [None]:
# Preview the first ten rows of the feature lookup table.
features.head(10)

### Count unique cases, patient-note numbers and feature numbers, plus rows with and without annotations

In [None]:
# Headline counts for the merged training frame. A row counts as
# un-annotated when its `location` cell is the literal string '[]'.
print(f"Unique case num: {train_merged['case_num'].nunique()}")
print(f"Unique pn num: {train_merged['pn_num'].nunique()}")
print(f"Unique feature num: {train_merged['feature_num'].nunique()}")
print(f"Cases with annotation: {int((train_merged['location'] != '[]').sum())}")
print(f"Cases without annotation: {int((train_merged['location'] == '[]').sum())}")

In [None]:
def _feature_terms_in_note(row):
    """Return the " or "-separated feature terms that occur verbatim in the note.

    The feature text is lower-cased and its hyphens become spaces before it is
    split; each piece is then searched for in the lower-cased note text.
    """
    note_text = row["pn_history"].lower()
    candidate_terms = row["feature_text"].lower().replace("-", " ").split(" or ")
    return [term for term in candidate_terms if term in note_text]


train_merged['find_f_txt_pn'] = train_merged.apply(_feature_terms_in_note, axis=1)

# Among rows WITHOUT an annotation, count how often each feature term still
# appears literally in the note, most frequent first.
(
    train_merged[train_merged.location == '[]'][['feature_text', 'find_f_txt_pn']]
    .explode('find_f_txt_pn')
    .groupby(['feature_text', 'find_f_txt_pn'])
    .size()
    .reset_index()
    .set_axis(['feature_text', 'find_f_txt_pn', 'cnt'], axis='columns')
    .sort_values(['cnt'], ascending=False)
)

In [None]:
def _parse_location(raw_location):
    """Turn one raw ``location`` cell into a list of ``"start end"`` strings.

    The cell is a stringified list. Single quotes are swapped for double
    quotes so it parses as JSON, and each ``;`` is replaced by ``","`` so that
    semicolon-joined segments become separate list items.
    """
    return json.loads(raw_location.replace("'", '"').replace(';', '","'))


def annotate_sample(note_num):
    """Render one patient note with its annotated feature spans highlighted.

    Reads the module-level ``train``, ``features`` and ``patient_notes``
    frames: the training rows of the note supply the character spans, the
    feature table supplies the label text, and the note text itself comes from
    ``patient_notes``. The result is drawn inline with displaCy's manual
    entity renderer; nothing is returned.

    Args:
        note_num: Patient-note number (``pn_num``); any value ``int()`` accepts.

    Raises:
        IndexError: If ``patient_notes`` has no row for ``note_num``.
    """
    note_num = int(note_num)

    # Silence warnings only while this function runs; a bare
    # ``warnings.filterwarnings('ignore')`` would mute the whole session and
    # add another filter entry on every call.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        patient_df = train[train["pn_num"] == note_num].merge(
            features[['feature_num', 'feature_text']], on='feature_num')
        annotation = patient_df["feature_text"]

        ents = []
        for raw_location, label in zip(patient_df["location"], annotation):
            for span in _parse_location(raw_location):
                bounds = span.split()
                ents.append({
                    'start': int(bounds[0]),
                    'end': int(bounds[1]),
                    'label': label,
                })

        note_rows = patient_notes[patient_notes["pn_num"] == note_num]
        doc = {
            'text': note_rows["pn_history"].iloc[0],
            "ents": ents,
        }

        # One hue per distinct feature text; each label is painted with a
        # gradient from the saturated to the desaturated variant of its hue.
        n_labels = annotation.nunique()
        p1 = sns.color_palette('hls', n_labels, desat=1).as_hex()
        p2 = sns.color_palette('hls', n_labels, desat=0.5).as_hex()
        colors = {
            k: f"linear-gradient(90deg, {c1}, {c2})"
            for k, c1, c2 in zip(annotation.unique(), p1, p2)
        }
        options = {"colors": colors}
        spacy.displacy.render(doc, style="ent", options=options, manual=True, jupyter=True)
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
# Dropdown listing every case number found in the patient notes (as strings),
# initially set to the first one.
case_numbers = [str(case_num) for case_num in patient_notes['case_num'].unique()]
case_num_selector = widgets.Dropdown(
    description='Case No:',
    options=case_numbers,
    value=case_numbers[0],
)

def update_note_selector():
    """Build a note-number dropdown for the case currently selected.

    Reads the module-level ``case_num_selector`` and ``train`` frame.

    Returns:
        A ``widgets.Dropdown`` whose options are the distinct ``pn_num`` values
        (as strings) of the selected case, with the first one pre-selected.
    """
    selected_case = int(case_num_selector.value)
    rows_for_case = train[train['case_num'] == selected_case]
    note_numbers = [str(pn_num) for pn_num in rows_for_case['pn_num'].unique()]

    return widgets.Dropdown(
        description='Note No:',
        options=note_numbers,
        value=note_numbers[0],
    )

# Note dropdown populated for the case selected at start-up.
note_num_selector = update_note_selector()

def on_case_no_change(change):
    """Refresh the note dropdown after the case dropdown's value changes.

    Events other than a ``'change'`` of the ``'value'`` trait are ignored.

    Args:
        change: Change dictionary passed by the widget observer; the keys
            ``'type'``, ``'name'`` and ``'new'`` are read.
    """
    if change['type'] != 'change' or change['name'] != 'value':
        return

    print("changed to %s" % change['new'])
    selected_case = int(case_num_selector.value)
    rows_for_case = train[train['case_num'] == selected_case]
    note_numbers = [str(pn_num) for pn_num in rows_for_case['pn_num'].unique()]
    # Replace the options first, then select the first note of the new case.
    note_num_selector.options = note_numbers
    note_num_selector.value = note_numbers[0]


        
def on_change(change):
    """Log the new value whenever the observed widget's ``value`` changes.

    Args:
        change: Change dictionary passed by the widget observer.
    """
    is_value_change = change['type'] == 'change' and change['name'] == 'value'
    if not is_value_change:
        return
    print("changed to %s" % change['new'])

        

        
# Register the callbacks. No trait names are passed to observe(), so each
# handler filters for 'value' changes itself.
case_num_selector.observe(on_case_no_change)
note_num_selector.observe(on_change)

# Show both dropdowns in the notebook output.
display(case_num_selector)
display(note_num_selector)


In [None]:
# Render the note currently chosen in the note dropdown.
annotate_sample(note_num_selector.value)

## Number of cases for each feature_text in train data

In [None]:
# Horizontal bar chart: number of training rows (non-null `id` values) per
# feature text, smallest first.
(
    train_merged
    .groupby('feature_text')
    .agg({'id': ['count']})
    .reset_index()
    .set_axis(['feature_text', 'count'], axis='columns')
    .sort_values('count', ascending=True)
    .plot(
        kind='barh',
        figsize=(12, 24),
        x='feature_text',
        y='count',
        title='count of feature_text',
    )
)

In [None]:
# For every feature text, list its most frequent lower-cased annotation
# strings together with their counts (rank < 6 within the feature).
(
    train_merged[['feature_text', 'annotation']]
    .query("annotation != '[]'")
    # `annotation` holds stringified lists; parse each into a real list.
    .assign(annotation=lambda x: [[''] if e == '[]' else ast.literal_eval(e) for e in x['annotation']])
    .explode('annotation')
    .assign(annotation=lambda x: x['annotation'].str.lower())
    .groupby(['feature_text', 'annotation'])
    .size()
    .reset_index()
    .set_axis(['feature_text', 'annotation', 'cnt'], axis='columns')
    # Rank by descending count inside each feature; ties share the largest
    # rank of their group (method='max').
    .assign(rnk=lambda x: x.groupby('feature_text').cnt.transform('rank', method='max', ascending=False))
    .sort_values(['feature_text', 'cnt'], ascending=[True, False])
    .query("rnk<6")
    .groupby("feature_text")
    .agg({
        "annotation": lambda x: list(x),
        "cnt": lambda x: list(x),
    })
    .reset_index()
    .set_axis(['feature_text', 'top 5 annotation', 'top 5 annotation cnt'], axis='columns')
)

In [None]:
# Horizontal bar chart: percentage of rows per feature text whose `location`
# is the empty list '[]' (i.e. no annotation), smallest percentage first.
(
    train_merged
    .assign(no_ann=lambda x: x['location'] == '[]')
    .groupby('feature_text')
    .agg({'no_ann': ['count', lambda z: 100 * np.mean(z)]})
    .reset_index()
    .set_axis(['feature_text', 'count', 'no_ann_pct'], axis='columns')
    .sort_values('no_ann_pct', ascending=True)
    .plot(
        kind='barh',
        figsize=(12, 24),
        x='feature_text',
        y='no_ann_pct',
        title='% missing annotations across feature_text',
        color='red',
    )
)

In [None]:
# Word-level frequency table: every annotation string is lower-cased and split
# on whitespace, then the words are counted and ranked within each feature.
train_merged_split_words = (
    train_merged[['feature_text', 'annotation']]
    .query("annotation != '[]'")
    # `annotation` holds stringified lists; parse each into a real list.
    .assign(annotation=lambda x: [[''] if e == '[]' else ast.literal_eval(e) for e in x['annotation']])
    .explode('annotation')
    .assign(annotation=lambda x: [str(y).lower().split() for y in x['annotation']])
    .explode('annotation')
    .groupby(['feature_text', 'annotation'])
    .size()
    .reset_index()
    .set_axis(['feature_text', 'annotation', 'cnt'], axis='columns')
    # Ties share the largest rank of their group (method='max').
    .assign(rnk=lambda x: x.groupby('feature_text').cnt.transform('rank', method='max', ascending=False))
    .sort_values(['feature_text', 'cnt'], ascending=[True, False])
)

# Most frequent words (rank < 6) and their counts, one row per feature text.
(
    train_merged_split_words
    .query("rnk<6")
    .groupby("feature_text")
    .agg({
        "annotation": lambda x: list(x),
        "cnt": lambda x: list(x),
    })
    .reset_index()
    .set_axis(['feature_text', 'top 5 annotation', 'top 5 annotation cnt'], axis='columns')
)

In [None]:
def get_close_matches_(lst, cutoff=0.75, n=3):
    """Group the words of ``lst`` that are close matches of one another.

    The list is consumed front to back. The first remaining word becomes the
    reference and is compared against the rest with
    ``difflib.get_close_matches``. When matches are found, a group
    ``matches + [reference]`` is recorded and the matched words are removed
    from further consideration. A reference with no match is simply dropped.

    The remaining words keep their original relative order, so the result is
    the same on every run. A set difference here would reorder them by string
    hash, which varies between interpreter sessions.

    Args:
        lst: Sequence of words (strings).
        cutoff: Minimum similarity ratio in ``[0, 1]`` for a word to match.
        n: Maximum number of matches collected per reference word.

    Returns:
        A list of groups; each group is a list holding the matches (best
        first) followed by the reference word. Empty when nothing matches or
        when ``lst`` has fewer than two words.
    """
    remaining = list(lst)
    similar_lst = []
    while len(remaining) > 1:
        ref_word, candidates = remaining[0], remaining[1:]
        matches = get_close_matches(ref_word, candidates, n=n, cutoff=cutoff)
        if matches:
            similar_lst.append(matches + [ref_word])
            matched = set(matches)
            # dict.fromkeys drops duplicates while keeping first-seen order.
            candidates = [word for word in dict.fromkeys(candidates) if word not in matched]
        remaining = candidates
    return similar_lst

#get_close_matches_(['ape', 'apple', 'peach', 'puppy','appl'])

In [None]:
# For each feature text, cluster its annotation words into groups of
# near-duplicates and list the groups, largest group first.
(
    train_merged_split_words
    .groupby("feature_text")
    .apply(lambda x: list(x["annotation"]))
    .reset_index()
    .set_axis(['feature_text', 'annotation words'], axis='columns')
    .assign(similar_wrds=lambda x: [get_close_matches_(x_) for x_ in x['annotation words']])
    [['feature_text', 'similar_wrds']]
    .explode('similar_wrds')
    # Features without any group explode to a non-list placeholder; give
    # those a length of 0 so they sort last.
    .assign(len_=lambda x: [len(x_) if isinstance(x_, list) else 0 for x_ in x['similar_wrds']])
    .sort_values('len_', ascending=False)
    [['feature_text', 'similar_wrds']]
)

In [None]:

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import os, re, ast, glob, itertools, spacy, transformers, torch
from transformers import AutoTokenizer, AutoConfig, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# Set TOKENIZERS_PARALLELISM to "false" for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Input directories, relative to the notebook working directory.
# DATA_PATH: competition CSV files.
DATA_PATH = "../input/nbme-score-clinical-patient-notes/"
# OUT_PATH and WEIGHTS_FOLDER currently point at the same directory.
OUT_PATH = "../input/nbme-roberta-large/"
WEIGHTS_FOLDER = "../input/nbme-roberta-large/"

# Preparation

In [None]:


def process_feature_text(text):
    """Normalise a raw feature description into plain words.

    Applies three literal substitutions in order: ``I-year`` becomes
    ``1-year``, ``-OR-`` becomes `` or ``, and every remaining hyphen becomes
    a space.

    Args:
        text: Raw feature text.

    Returns:
        The cleaned feature text.
    """
    # All three patterns are plain literals, so str.replace is sufficient.
    return (
        text
        .replace('I-year', '1-year')
        .replace('-OR-', " or ")
        .replace('-', ' ')
    )

def clean_spaces(text):
    """Replace every newline, tab and carriage return with a single space.

    Each character is replaced one-for-one, so the length of the text (and
    therefore every character offset into it) is unchanged.

    Args:
        text: Input string.

    Returns:
        The string with ``\\n``, ``\\t`` and ``\\r`` turned into spaces.
    """
    return re.sub(r'[\n\t\r]', ' ', text)

def load_and_prepare_test(root=""):
    """Load the test rows and join them with their feature and note text.

    Reads ``patient_notes.csv``, ``features.csv`` and ``test.csv`` from
    ``root``. Paths are built with ``os.path.join`` so ``root`` works with or
    without a trailing separator; the default ``""`` reads from the current
    directory.

    Args:
        root: Directory containing the three CSV files.

    Returns:
        The test frame left-joined with the features (on ``case_num`` and
        ``feature_num``) and the notes (on ``case_num`` and ``pn_num``), with:
        ``pn_history`` stripped of surrounding whitespace, ``feature_text``
        passed through ``process_feature_text`` then ``clean_spaces``, a new
        ``clean_text`` column (``clean_spaces`` applied to ``pn_history``),
        and an empty-string ``target`` column.
    """
    patient_notes = pd.read_csv(os.path.join(root, "patient_notes.csv"))
    features = pd.read_csv(os.path.join(root, "features.csv"))
    df = pd.read_csv(os.path.join(root, "test.csv"))

    df = df.merge(features, how="left", on=["case_num", "feature_num"])
    df = df.merge(patient_notes, how="left", on=["case_num", "pn_num"])

    df["pn_history"] = df["pn_history"].str.strip()
    df["feature_text"] = df["feature_text"].apply(process_feature_text).apply(clean_spaces)
    df["clean_text"] = df["pn_history"].apply(clean_spaces)
    # Placeholder only: test rows carry no labels.
    df["target"] = ""
    return df

# Preprocessing

In [None]:


def token_pred_to_char_pred(token_pred, offsets):
    """Spread per-token predictions over the characters each token covers.

    Args:
        token_pred: 2-D array with one row of class scores per token.
        offsets: Per-token ``(start, end)`` character offsets, indexable as
            ``offsets[i][0]`` / ``offsets[i][1]``.

    Returns:
        Array of shape ``(max(offsets), n_classes)``. Characters not covered
        by any token stay zero.
    """
    n_classes = token_pred.shape[1]
    char_pred = np.zeros((np.max(offsets), n_classes))

    for i, scores in enumerate(token_pred):
        start, stop = int(offsets[i][0]), int(offsets[i][1])
        char_pred[start:stop] = scores

        if n_classes == 3:
            # For every character after the token's first one, column 1 gets
            # the larger and column 2 the smaller of columns 1 and 2. Both are
            # computed before either column is overwritten.
            inner = slice(start + 1, stop)
            upper = np.max(char_pred[inner, 1:], 1)
            lower = np.min(char_pred[inner, 1:], 1)
            char_pred[inner, 1] = upper
            char_pred[inner, 2] = lower

    return char_pred

def labels_to_sub(labels):
    """Encode per-character label arrays as submission span strings.

    Args:
        labels: Iterable of arrays; positions with a value greater than zero
            count as labelled.

    Returns:
        One string per input array. Each run of consecutive labelled
        positions is written as ``"start end"`` (``end`` exclusive) and runs
        are joined with ``";"``. An array without labelled positions gives
        ``""``.
    """
    all_spans = []
    for label in labels:
        indices = np.where(label > 0)[0]
        # A new run starts wherever two neighbouring indices are not consecutive.
        run_starts = np.where(np.diff(indices) != 1)[0] + 1
        runs = np.split(indices, run_starts)
        spans = [f"{run[0]} {run[-1] + 1}" for run in runs if run.size]
        all_spans.append(";".join(spans))
    return all_spans

def char_target_to_span(char_target):
    """Convert a binary per-character mask into ``[start, end]`` spans.

    Two defects of the earlier loop are fixed here: a span that runs up to
    the last character is now emitted (it used to be lost because spans were
    only flushed when a following non-1 value was seen), and the first
    character is no longer compared with the last one through
    ``char_target[-1]``.

    Args:
        char_target: Sequence (list or 1-D array) in which positions equal to
            ``1`` (or ``True``) belong to a span.

    Returns:
        A list of ``[start, end]`` pairs of Python ints, ``end`` exclusive, in
        ascending order. Empty when no position equals ``1``.
    """
    spans = []
    start = None  # start index of the span currently open, if any
    for i, value in enumerate(char_target):
        if value == 1:
            if start is None:
                start = i
        elif start is not None:
            spans.append([start, i])
            start = None
    # Close a span that is still open at the end of the sequence.
    if start is not None:
        spans.append([start, len(char_target)])
    return spans

def post_process_spaces(target, text):
    """Tidy a per-character mask so that spans neither start nor end on a space.

    The mask is first aligned to ``len(text)`` (zero-padded or truncated).
    Then:

    * a space at the very start or end of the text is unmarked;
    * an interior space that is marked but whose left or right neighbour is
      not marked is unmarked;
    * an interior space whose two neighbours are both marked is marked, which
      joins the two neighbouring marked characters into one run.

    The scan runs left to right on the mask being modified, so earlier
    corrections are visible to later positions.

    Args:
        target: Per-character mask (array-like); it is copied, never mutated.
        text: The text the mask refers to.

    Returns:
        A new array of length ``len(text)``; empty when ``text`` is empty
        (indexing ``text[0]`` would otherwise raise ``IndexError``).
    """
    target = np.copy(target)
    n_chars = len(text)

    if n_chars > len(target):
        padding = np.zeros(n_chars - len(target))
        target = np.concatenate([target, padding])
    else:
        target = target[:n_chars]

    # Nothing to inspect for an empty text.
    if n_chars == 0:
        return target

    if text[0] == " ":
        target[0] = 0
    if text[-1] == " ":
        target[-1] = 0

    for i in range(1, n_chars - 1):
        if text[i] != " ":
            continue

        # The three checks below are order-dependent; keep them in sequence.
        if target[i] and not target[i - 1]:
            target[i] = 0

        if target[i] and not target[i + 1]:
            target[i] = 0

        if target[i - 1] and target[i + 1]:
            target[i] = 1
    return target


# Data Tokenization 

In [None]:


def get_tokenizer(name, precompute=False, df=None, folder=None):
    """Load a tokenizer and attach the extra attributes this pipeline reads.

    Args:
        name: Model name; stored on the tokenizer as ``name`` and used as the
            load source when ``folder`` is ``None``.
        precompute: When true, tokenize the texts of ``df`` up front via
            ``precompute_tokens``.
        df: Frame passed to ``precompute_tokens``; only used when
            ``precompute`` is true.
        folder: Optional directory to load the tokenizer from instead of
            ``name``.

    Returns:
        The tokenizer with three added attributes: ``name``,
        ``special_tokens`` (ids for ``sep``, ``cls`` and ``pad``) and
        ``precomputed`` (the pre-tokenized lookup, or ``None``).
    """
    source = name if folder is None else folder
    tokenizer = AutoTokenizer.from_pretrained(source)

    tokenizer.name = name
    tokenizer.special_tokens = {
        kind: getattr(tokenizer, f"{kind}_token_id")
        for kind in ("sep", "cls", "pad")
    }
    tokenizer.precomputed = precompute_tokens(df, tokenizer) if precompute else None

    return tokenizer

def precompute_tokens(df, tokenizer):
    """Tokenize every distinct feature text and note text once.

    Args:
        df: Frame with ``feature_text`` and ``clean_text`` columns.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``offset_mapping`` for a single text.

    Returns:
        ``{"ids": {...}, "offsets": {...}}``; both inner dictionaries are
        keyed by the raw text. Feature texts are processed first, then note
        texts. No special tokens are added at this stage.
    """
    ids, offsets = {}, {}

    unique_texts = itertools.chain(df["feature_text"].unique(), df["clean_text"].unique())
    for raw_text in unique_texts:
        encoding = tokenizer(
            raw_text,
            return_token_type_ids=True,
            return_offsets_mapping=True,
            return_attention_mask=False,
            add_special_tokens=False,
        )
        ids[raw_text] = encoding["input_ids"]
        offsets[raw_text] = encoding["offset_mapping"]

    return {"ids": ids, "offsets": offsets}

def encodings_from_precomputed(feature_text, text, precomputed, tokenizer, max_len=300):
    """Assemble a fixed-length (question, context) encoding from cached tokens.

    Layout: ``[cls] feature-tokens [sep] ([sep]) note-tokens [sep] [pad]...``.
    The separator between the two parts is doubled when the tokenizer name
    contains ``"roberta"``.

    Args:
        feature_text: Key of the feature text in ``precomputed``.
        text: Key of the note text in ``precomputed``.
        precomputed: Mapping with ``"ids"`` and ``"offsets"`` dictionaries.
        tokenizer: Object exposing ``special_tokens`` (``sep``/``cls``/``pad``
            ids) and ``name``.
        max_len: Length every returned sequence is truncated or padded to.

    Returns:
        Dictionary with ``input_ids``, ``token_type_ids`` and
        ``offset_mapping``. Offsets of question, special and padding
        positions are ``(0, 0)``.
    """
    special = tokenizer.special_tokens
    is_roberta = "roberta" in tokenizer.name
    separator = [special["sep"], special["sep"]] if is_roberta else [special["sep"]]

    # Question part: [cls] + feature tokens + separator(s).
    input_ids = [special["cls"]] + precomputed["ids"][feature_text] + separator
    n_question_tokens = len(input_ids)

    # Context part, truncated so the closing separator always fits.
    input_ids.extend(precomputed["ids"][text])
    input_ids = input_ids[: max_len - 1]
    input_ids.append(special["sep"])

    if is_roberta:
        token_type_ids = [0] * len(input_ids)
    else:
        # Segment 0 for the question, segment 1 for everything after it.
        segments = np.ones(len(input_ids))
        segments[:n_question_tokens] = 0
        token_type_ids = segments.tolist()

    offsets = [(0, 0)] * n_question_tokens + precomputed["offsets"][text]
    offsets = offsets[: max_len - 1]
    offsets.append((0, 0))

    n_pad = max_len - len(input_ids)
    if n_pad > 0:
        input_ids = input_ids + [special["pad"]] * n_pad
        token_type_ids = token_type_ids + [0] * n_pad
        offsets = offsets + [(0, 0)] * n_pad

    return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "offset_mapping": offsets,
    }

# torch Dataset

In [None]:


class PatientNoteDataset(Dataset):
    """Torch dataset yielding one encoded (feature text, note text) pair per row.

    Only tokenizers carrying a ``precomputed`` token cache are supported;
    encoding on the fly is not implemented.
    """

    def __init__(self, df, tokenizer, max_len):
        """Keep the columns needed for encoding.

        Args:
            df: Frame with ``clean_text``, ``feature_text`` and ``target``
                columns.
            tokenizer: Tokenizer exposing ``precomputed`` (plus ``name`` and
                ``special_tokens``, which ``encodings_from_precomputed`` reads).
            max_len: Length every encoded sequence is truncated or padded to.
        """
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.texts = df['clean_text'].values
        self.feature_text = df['feature_text'].values
        # Stored for completeness; __getitem__ always returns a dummy target.
        self.char_targets = df['target'].values.tolist()

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Returns:
            Dictionary with ``ids`` and ``token_type_ids`` (long tensors),
            ``target`` (float tensor holding a single 0), ``offsets`` (array
            of per-token character offsets) and ``text`` (the note text).

        Raises:
            NotImplementedError: If the tokenizer has no precomputed cache.
        """
        text = self.texts[idx]
        feature_text = self.feature_text[idx]

        if self.tokenizer.precomputed is None:
            # Fail immediately: tokenizing first would be wasted work, since
            # this path is not supported.
            raise NotImplementedError("Fix issues with question offsets.")

        encoding = encodings_from_precomputed(
            feature_text,
            text,
            self.tokenizer.precomputed,
            self.tokenizer,
            max_len=self.max_len,
        )

        return {
            "ids": torch.tensor(encoding["input_ids"], dtype=torch.long),
            "token_type_ids": torch.tensor(encoding["token_type_ids"], dtype=torch.long),
            "target": torch.tensor([0], dtype=torch.float),
            "offsets": np.array(encoding["offset_mapping"]),
            "text": text,
        }

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.texts)



# Plot Predictions

In [None]:


def plot_annotation(df, pn_num):
    """Prepare displaCy entity data for the predicted spans of one note.

    The final ``displacy.render`` call is currently disabled, so this function
    only builds the data and returns ``None``.

    Args:
        df: Frame with ``pn_num``, ``pn_history``, ``span`` (iterable of
            ``(start, end)`` pairs per row) and ``feature_text`` columns.
        pn_num: Note number to select.

    Raises:
        KeyError: If no row of ``df`` has this ``pn_num``.
    """
    options = {"colors": {}}

    df_text = df[df["pn_num"] == pn_num].reset_index(drop=True)

    text = df_text["pn_history"][0]
    ents = []

    for spans, feature_text in zip(df_text["span"], df_text["feature_text"]):
        for s in spans:
            ents.append({"start": int(s[0]), "end": int(s[1]), "label": feature_text})

        # .tolist() yields plain Python ints. Under NumPy >= 2 a tuple of NumPy
        # scalars would format as "rgb(np.int64(...), ...)", which is not
        # valid CSS.
        rgb = tuple(np.random.randint(100, 255, size=3).tolist())
        options["colors"][feature_text] = f"rgb{rgb}"

    doc = {"text": text, "ents": sorted(ents, key=lambda i: i["start"])}

    # Rendering is disabled; re-enable the next line to draw the note inline.
    # spacy.displacy.render(doc, style="ent", options=options, manual=True, jupyter=True)




# Model Development

In [None]:


class NERTransformer(nn.Module):
    """Transformer backbone followed by a per-token linear classifier."""

    def __init__(self, model, num_classes=1, config_file=None, pretrained=True):
        """Build the backbone and the classification head.

        Args:
            model: Model name handed to the ``transformers`` Auto* loaders.
            num_classes: Number of outputs per token.
            config_file: Optional path to a config object saved with
                ``torch.save``; when ``None`` the config is fetched by name
                with ``output_hidden_states=True``.
            pretrained: Load pretrained weights when true, otherwise
                instantiate the architecture from the config only.
        """
        super().__init__()
        self.name = model
        # Token id treated as padding when building the attention mask:
        # 1 for names containing "roberta", 0 otherwise.
        self.pad_idx = 1 if "roberta" in self.name else 0

        transformers.logging.set_verbosity_error()

        # NOTE(review): torch.load unpickles the file; only use trusted configs.
        config = (
            AutoConfig.from_pretrained(model, output_hidden_states=True)
            if config_file is None
            else torch.load(config_file)
        )

        self.transformer = (
            AutoModel.from_pretrained(model, config=config)
            if pretrained
            else AutoModel.from_config(config)
        )

        self.nb_features = config.hidden_size
        self.logits = nn.Linear(self.nb_features, num_classes)

    def forward(self, tokens, token_type_ids):
        """Return per-token logits.

        Args:
            tokens: Token id tensor; positions equal to ``pad_idx`` are masked
                out of attention.
            token_type_ids: Segment id tensor passed to the backbone.

        Returns:
            The linear head applied to the last entry of the backbone's final
            output element (presumably the last layer's hidden states, given
            ``output_hidden_states=True`` — confirm for custom configs).
        """
        attention_mask = (tokens != self.pad_idx).long()
        outputs = self.transformer(
            tokens,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        hidden_states = outputs[-1]
        last_layer = hidden_states[-1]
        return self.logits(last_layer)




# Load Weights

In [None]:


def load_model_weights(model, filename, verbose=1, cp_folder="", strict=True):
    """Load a checkpoint's state dict into ``model``.

    If loading fails with a ``RuntimeError`` (e.g. a shape mismatch in the
    classification head) and the model exposes ``logits`` / ``nb_features`` as
    ``NERTransformer`` does, the head is replaced by a single-output linear
    layer and loading is retried once. The earlier fallback referenced
    ``model.encoder.fc`` and ``model.nb_ft``, which ``NERTransformer`` does
    not define, so it hid the real error behind an ``AttributeError``.

    Args:
        model: Module to load the weights into.
        filename: Checkpoint file name, or a full path when ``cp_folder`` is
            empty.
        verbose: Print the checkpoint path when truthy.
        cp_folder: Folder joined in front of ``filename``.
        strict: Passed through to ``load_state_dict``.

    Returns:
        The same ``model`` with the weights loaded.

    Raises:
        RuntimeError: If the state dict cannot be loaded, including after the
            single-output retry.
    """
    path = os.path.join(cp_folder, filename)
    if verbose:
        print(f"\n -> Loading weights from {path}\n")

    # Read the checkpoint once and reuse it for the retry.
    state_dict = torch.load(path, map_location="cpu")
    try:
        model.load_state_dict(state_dict, strict=strict)
    except RuntimeError:
        if not (hasattr(model, "logits") and hasattr(model, "nb_features")):
            raise
        # Rebuild the head with one output, on the device the old head was on.
        device = model.logits.weight.device
        model.logits = torch.nn.Linear(model.nb_features, 1).to(device)
        model.load_state_dict(state_dict, strict=strict)
    return model




# Predict Function

In [None]:


def predict(model, dataset, data_config, activation="softmax"):
    """Run inference over ``dataset`` and return per-character predictions.

    Batches are moved to the device the model's parameters live on, so the
    function also works for a model kept on the CPU. A model without
    parameters falls back to CUDA, the previously hard-coded target.

    Args:
        model: Module called as ``model(ids, token_type_ids)``.
        dataset: Dataset whose items provide ``ids``, ``token_type_ids`` and
            ``offsets``.
        data_config: Mapping; ``data_config["val_bs"]`` is the batch size.
        activation: ``"sigmoid"`` or ``"softmax"`` (over the last axis); any
            other value leaves the raw model output untouched.

    Returns:
        A list with one character-level prediction array per dataset item, in
        dataset order, as produced by ``token_pred_to_char_pred``.
    """
    model.eval()

    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    loader = DataLoader(
        dataset,
        batch_size=data_config["val_bs"],
        shuffle=False,
        num_workers=2,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory=device.type == "cuda",
    )

    preds = []
    with torch.no_grad():
        for data in tqdm(loader):
            ids, token_type_ids = data["ids"], data["token_type_ids"]
            y_pred = model(ids.to(device), token_type_ids.to(device))

            if activation == "sigmoid":
                y_pred = y_pred.sigmoid()
            elif activation == "softmax":
                y_pred = y_pred.softmax(-1)

            preds.extend(
                token_pred_to_char_pred(y, offsets)
                for y, offsets in zip(y_pred.detach().cpu().numpy(), data["offsets"].numpy())
            )
    return preds

# Inference Test

In [None]:


def inference_test(df, exp_folder, config, cfg_folder=None):
    """Predict on ``df`` once per checkpoint found in ``exp_folder``.

    Args:
        df: Prepared test frame (needs the columns ``PatientNoteDataset`` reads).
        exp_folder: Prefix prepended to ``"*.pt"`` to locate the checkpoints;
            it is concatenated as-is, so it must end with a path separator.
        config: Object exposing ``name``, ``precompute_tokens``, ``max_len``,
            ``num_classes``, ``data_config`` and ``loss_config``.
        cfg_folder: Optional prefix under which ``<model>/config.pth`` and
            ``<model>/tokenizers/`` are looked up, where ``<model>`` is the
            last ``/``-separated part of ``config.name``.

    Returns:
        A list with one entry per checkpoint (sorted by path); each entry is
        the list of predictions returned by ``predict``.
    """
    model_config_file = tokenizer_folder = None
    if cfg_folder is not None:
        model_dir = cfg_folder + config.name.split('/')[-1]
        model_config_file = model_dir + "/config.pth"
        tokenizer_folder = model_dir + "/tokenizers/"

    tokenizer = get_tokenizer(
        config.name,
        precompute=config.precompute_tokens,
        df=df,
        folder=tokenizer_folder,
    )
    dataset = PatientNoteDataset(df, tokenizer, max_len=config.max_len)

    # Architecture only; the weights come from the checkpoints below.
    model = NERTransformer(
        config.name,
        num_classes=config.num_classes,
        config_file=model_config_file,
        pretrained=False,
    ).cuda()
    model.zero_grad()

    preds = []
    for checkpoint in sorted(glob.glob(exp_folder + "*.pt")):
        model = load_model_weights(model, checkpoint)
        preds.append(
            predict(
                model,
                dataset,
                data_config=config.data_config,
                activation=config.loss_config["activation"],
            )
        )
    return preds

# Main Code

In [None]:



# if __name__ == "__main__":
#     class Config:
#         # Architecture.
#         name = "roberta-large"
#         num_classes = 1
#         # Texts.
#         max_len = 310
#         precompute_tokens = True
#         # Training.
#         loss_config = {"activation": "sigmoid"}
#         data_config = {"val_bs": 16 if "large" in name else 32, "pad_token": 1 if "roberta" in name else 0}
#         verbose = 1

#     df_test = load_and_prepare_test(root=DATA_PATH)

#     preds = inference_test(df_test, WEIGHTS_FOLDER, Config, cfg_folder=OUT_PATH)[0]

#     df_test["preds"] = preds
#     df_test["preds"] = df_test.apply(lambda x: x["preds"][:len(x["clean_text"])], 1)
#     df_test["preds"] = df_test["preds"].apply(lambda x: (x > 0.5).flatten())

#     try:
#         df_test["span"] = df_test["preds"].apply(char_target_to_span)
#         plot_annotation(df_test, df_test["pn_num"][0])
#     except:
#         pass

#     df_test["preds_pp"] = df_test.apply(lambda x: post_process_spaces(x["preds"], x["clean_text"]), 1)

#     try:
#         df_test["span"] = df_test["preds_pp"].apply(char_target_to_span)
#         plot_annotation(df_test, df_test["pn_num"][0])
#     except:
#         pass

#     # Kaggle Submission.
#     df_test['location'] = labels_to_sub(df_test["preds_pp"].values)

#     sub = pd.read_csv(DATA_PATH + "sample_submission.csv")

#     sub = sub[["id"]].merge(df_test[["id", "location"]], how="left", on="id")

#     sub.to_csv("submission.csv", index=False)

#     sub.head()
