In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import scipy as sp

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Data Exploration**

In [None]:
!pip install sty

In [None]:
import shutil
from pathlib import Path
# Replace the installed transformers sources with the tokenizer files shipped
# in the input dataset (slow->fast converter plus the two DeBERTa-v2 tokenizers).
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")
input_dir = Path("../input/deberta-v2-3-fast-tokenizer")
deberta_v2_path = transformers_path / "models" / "deberta_v2"

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name

# (source, destination) pairs, processed in the same order as before.
replacements = [(convert_file, conversion_path)]
for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path / filename
    replacements.append((input_dir / filename, filepath))

for source_file, target_file in replacements:
    # Remove any existing copy first, then drop in the replacement.
    if target_file.exists():
        target_file.unlink()
    shutil.copy(source_file, target_file)

In [None]:
import os
import gc
import ast
import sys
import copy
import json
import math
import string
import pickle
import random
import itertools
import warnings
import wordcloud
from sty import bg, rs
import matplotlib.pyplot as plt
from collections import namedtuple
import plotly.express as px
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
%env TOKENIZERS_PARALLELISM=true

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the competition's features table (it has a `case_num` column, used in the next cell).
features = pd.read_csv("/kaggle/input/nbme-score-clinical-patient-notes/features.csv")

In [None]:
# Load the training annotations; `annotation` and `location` are stored as
# Python-literal strings in the CSV, so parse them into real lists.
train = pd.read_csv("../input/nbme-score-clinical-patient-notes/train.csv")
for list_column in ('annotation', 'location'):
    train[list_column] = train[list_column].apply(ast.literal_eval)
train

In [None]:
# Number of feature rows per case.
# Count once with value_counts() instead of rebuilding the column as a list
# and scanning it for every distinct case number.
feature_counts = features['case_num'].value_counts()
cases = list(set(features['case_num']))
# `num_features[i]` is the count for `cases[i]`.
num_features = [int(feature_counts[case]) for case in cases]

In [None]:
# Load every patient note and look at the distribution of note lengths (in characters).
patient_notes = pd.read_csv("../input/nbme-score-clinical-patient-notes/patient_notes.csv")
all_notes = patient_notes['pn_history'].tolist()
all_notes_len = [len(note) for note in all_notes]
print("Average length of Patient History - ",np.mean(all_notes_len))
fig = px.histogram(x = all_notes_len,  marginal="violin",nbins = 100)
fig.update_layout(template="plotly_white")
# Axis title typo fixed ("Lenght" -> "Length").
fig.update_xaxes(title = "Length of patient Notes")
fig.show()

In [None]:
# Word cloud over the concatenation of all patient notes.
wordcloud_notes = wordcloud.WordCloud(stopwords=wordcloud.STOPWORDS, max_font_size=120, max_words=5000,
                      width = 600, height = 400,
                      background_color='white').generate(" ".join(all_notes))
fig, ax = plt.subplots(figsize=(14,10))
ax.imshow(wordcloud_notes, interpolation='bilinear')
ax.set_axis_off()
# The image is drawn once on `ax`; a second plt.imshow() call would redraw it
# on the same axes without the bilinear interpolation.
plt.show()

In [None]:
# Sanity check: show the Python type of the first note.
for note in patient_notes['pn_history'][:1]:
    print(type(note))
note_lengths = [len(note) for note in patient_notes['pn_history']]
# Number of patient notes per case, in the iteration order of the set of case
# numbers. Counted once with value_counts(); this cell no longer appends to
# `cases`, which belongs to the features cell and was being filled with duplicates.
notes_per_case = patient_notes['case_num'].value_counts()
num_notes = [int(notes_per_case[case_num]) for case_num in set(patient_notes['case_num'])]
# Character length of every individual annotation string in the training set.
annotation_lengths = [len(x) for an in train['annotation'].tolist() for x in an]

In [None]:
# Number of distinct annotated patient notes per case.
cases = list(set(train['case_num']))
# One groupby replaces a full-frame filter per note, and the result is aligned
# with `cases` by position rather than by using the case number as a list index
# (which only works when the case numbers are exactly 0..n-1 in order).
annotated_notes_per_case = train.groupby('case_num')['pn_num'].nunique()
patient_notes_num = [int(annotated_notes_per_case[case]) for case in cases]

In [None]:
# One annotated span: the annotation text, its [start, end] character offsets
# in the note, and the feature number it was annotated for.
CaseData = namedtuple('CaseData', 'feature location num')
def getData(patient_note_num, case_num):
    """Return the note text and annotated spans for one patient note.

    Reads the module-level ``patient_notes`` and ``train`` DataFrames.

    Args:
        patient_note_num: value to match against the ``pn_num`` column.
        case_num: value to match against the ``case_num`` column.

    Returns:
        ``(patient_note, case_datas)``. ``patient_note`` is the ``pn_history``
        of the last matching row of ``patient_notes`` ("" when nothing
        matches). ``case_datas`` is a list of ``CaseData``; a location string
        holding several ';'-separated segments (e.g. "3 5;7 9") yields one
        ``CaseData`` per segment, all sharing the same annotation text.
    """
    note_mask = (patient_notes['pn_num'] == patient_note_num) & (patient_notes['case_num'] == case_num)
    matching_notes = patient_notes.loc[note_mask, 'pn_history']
    # Keep the "last match wins" behaviour of the original row-by-row scan.
    patient_note = matching_notes.iloc[-1] if len(matching_notes) else ""

    train_mask = (train['pn_num'] == patient_note_num) & (train['case_num'] == case_num)
    rows = train[train_mask]
    case_datas = []
    for ft_num, annotations, locations in zip(rows['feature_num'], rows['annotation'], rows['location']):
        for annotation, location in zip(annotations, locations):
            # Splitting on ';' first avoids int("5;7") failing on multi-segment locations.
            for segment in location.split(';'):
                case_datas.append(CaseData(annotation, list(map(int, segment.split())), ft_num))
    return patient_note, case_datas

In [None]:
# Print one patient note with every annotated span highlighted in blue.
patient_note, case_datas = getData(patient_note_num = 16, case_num = 0)
print(len(case_datas))
last_idx=0
for data in sorted(case_datas, key = lambda case_data : case_data.location[0]):
    span_start, span_end = data.location[0], data.location[1]
    if span_end <= last_idx:
        # Span is entirely inside text that has already been highlighted.
        continue
    # Clip overlapping spans so already-printed text is not repeated.
    span_start = max(span_start, last_idx)
    print(patient_note[last_idx : span_start], end="")
    print(bg.blue + patient_note[span_start : span_end] + bg.rs, end="")
    last_idx = span_end
# Print the remainder of the note after the final annotation.
print(patient_note[last_idx:])

In [None]:
def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and, if present, CUDA).

    Also exports the seed as ``PYTHONHASHSEED`` and, when CUDA is available,
    switches cuDNN to deterministic mode with benchmarking disabled.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN needs both flags set to behave deterministically.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything(seed=42)

In [None]:
def micro_f1(preds, truths):
    """Concatenate the per-sample arrays and score them with ``f1_score``.

    Args:
        preds: sequence of arrays of predicted labels.
        truths: sequence of arrays of true labels.

    Returns:
        ``f1_score(truths, preds)`` computed on the concatenated arrays.
    """
    flat_preds, flat_truths = (np.concatenate(arrays) for arrays in (preds, truths))
    return f1_score(flat_truths, flat_preds)
def spans_to_binary(spans, length=None):
    """Convert ``[start, end)`` spans into a 0/1 vector.

    Args:
        spans: iterable of ``(start, end)`` pairs.
        length: length of the output vector. When ``None`` it defaults to the
            largest value found in ``spans`` (0 when ``spans`` is empty, which
            previously raised inside ``np.max``).

    Returns:
        Float array of ``length`` zeros with ``[start:end]`` set to 1 for each span.
    """
    if length is None:
        length = np.max(spans) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary
def span_micro_f1(preds, truths):
    """Micro F1 over binary vectors built from span lists.

    Args:
        preds: per-sample lists of ``[start, end]`` spans.
        truths: per-sample lists of ``[start, end]`` spans.

    Returns:
        ``micro_f1`` of the binarised spans. Samples where both span lists
        are empty are left out.
    """
    bin_preds, bin_truths = [], []
    for pred, truth in zip(preds, truths):
        if len(pred) == 0 and len(truth) == 0:
            continue
        # Both vectors share one length: the largest offset seen in either list.
        pred_extent = np.max(pred) if len(pred) else 0
        truth_extent = np.max(truth) if len(truth) else 0
        length = max(pred_extent, truth_extent)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [None]:
def create_labels_for_scoring(df):
    """Build ground-truth span lists from the ``location`` column.

    As a side effect a ``location_for_create_labels`` column is written to
    ``df``: ``['0 1', '3 4']`` becomes ``['0 1;3 4']`` and an empty list stays
    ``[]``. Rows are processed positionally, so ``df`` may carry any index,
    and every row gets its own list object.

    Args:
        df: DataFrame whose ``location`` column holds lists of location
            strings ("start end", optionally several joined with ';').

    Returns:
        One list per row of ``[start, end]`` integer pairs.
    """
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    joined_locations = [[';'.join(spans)] if spans else [] for spans in df['location']]
    df['location_for_create_labels'] = pd.Series(joined_locations, index=df.index, dtype=object)
    # create labels
    truths = []
    for location_list in joined_locations:
        truth = []
        if len(location_list) > 0:
            for loc in (segment.split() for segment in location_list[0].split(';')):
                truth.append([int(loc[0]), int(loc[1])])
        truths.append(truth)
    return truths
def get_char_probs(texts, predictions, tokenizer):
    """Spread per-token predictions over the characters each token covers.

    Args:
        texts: sequence of strings.
        predictions: per-text sequences of token-level values.
        tokenizer: callable returning a mapping with an ``offset_mapping``
            entry when called with ``return_offsets_mapping=True``.

    Returns:
        List with one float array per text (length ``len(text)``); characters
        not covered by any token offset stay 0.
    """
    results = [np.zeros(len(text)) for text in texts]
    for char_prob, text, token_preds in zip(results, texts, predictions):
        encoded = tokenizer(text,
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        # zip() stops at the shorter of the two, so extra (padding) predictions are ignored.
        for span, token_pred in zip(encoded['offset_mapping'], token_preds):
            char_prob[span[0]:span[1]] = token_pred
    return results
def get_results(char_probs, th=0.5):
    """Turn character-level probabilities into location strings.

    For each array, the indices whose value is >= ``th`` are shifted by one,
    grouped into runs of consecutive integers, and each run is written as
    "first last". Runs are joined with ';'.

    Args:
        char_probs: iterable of 1-D NumPy arrays.
        th: threshold applied to each array.

    Returns:
        List of strings, one per input array ("" when nothing passes ``th``).
    """
    results = []
    for char_prob in char_probs:
        shifted = np.where(char_prob >= th)[0] + 1
        if len(shifted) == 0:
            results.append("")
            continue
        # A new run starts wherever two neighbouring indices are not consecutive.
        run_starts = np.where(np.diff(shifted) != 1)[0] + 1
        runs = np.split(shifted, run_starts)
        results.append(";".join(f"{run[0]} {run[-1]}" for run in runs))
    return results
def get_predictions(results):
    """Parse location strings into lists of ``[start, end]`` integer pairs.

    Args:
        results: iterable of strings such as "0 1;3 4" ("" for no spans).

    Returns:
        One list per input string; the empty string gives an empty list.
    """
    predictions = []
    for result in results:
        if result == "":
            predictions.append([])
            continue
        spans = []
        for chunk in result.split(';'):
            parts = chunk.split()
            spans.append([int(parts[0]), int(parts[1])])
        predictions.append(spans)
    return predictions
def get_score(y_true, y_pred):
    """Score span lists with ``span_micro_f1``.

    The arguments are forwarded positionally, so ``y_true`` arrives as
    ``span_micro_f1``'s first parameter and ``y_pred`` as its second.
    """
    score = span_micro_f1(y_true, y_pred)
    return score

In [None]:
# Directory prefix for the CSV files read in this cell.
main_dir="../input/nbme-score-clinical-patient-notes/"
def preprocess_features(features):
    """Overwrite the ``feature_text`` of the row labelled 27, in place.

    Args:
        features: DataFrame with a ``feature_text`` column.

    Returns:
        The same ``features`` object that was passed in.
    """
    row_label, column = 27, 'feature_text'
    features.loc[row_label, column] = "Last-Pap-smear-1-year-ago"
    return features
# Read the four inference-time tables (same order as before), apply the
# feature-text correction and report the table shapes.
test, submission, features, patient_notes = (
    pd.read_csv(main_dir+csv_name)
    for csv_name in ('test.csv', 'sample_submission.csv', 'features.csv', 'patient_notes.csv')
)
features = preprocess_features(features)
print(f"test.shape: {test.shape}")
print(f"features.shape: {features.shape}")
print(f"patient_notes.shape: {patient_notes.shape}")

In [None]:
# Attach the feature rows and then the note rows to each test row (left joins).
test = (
    test
    .merge(features, on=['feature_num', 'case_num'], how='left')
    .merge(patient_notes, on=['pn_num', 'case_num'], how='left')
)

In [None]:
class CFG:
    """Inference settings for the ``microsoft/deberta-v3-large`` checkpoints."""
    # checkpoint location and model identity
    model = "microsoft/deberta-v3-large"
    path = "../input/deberta-v3-large-5-folds-public/"
    config_path = path + 'config.pth'
    # tokenisation / batching
    max_len = 354
    batch_size = 32
    num_workers = 4
    # head and reproducibility
    fc_dropout = 0.2
    seed = 42
    # folds to run
    n_fold = 5
    trn_fold = list(range(5))

In [None]:
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
# Load the fast DeBERTa-v2 tokenizer from the local dataset and expose it both
# on the config class and as the module-level name `tokenizer`.
CFG.tokenizer = DebertaV2TokenizerFast.from_pretrained('../input/deberta-tokenizer')
tokenizer = CFG.tokenizer

In [None]:
def prepare_input(cfg, text, feature_text):
    """Tokenize a (note, feature) pair into fixed-length tensors.

    Args:
        cfg: object providing ``tokenizer`` and ``max_len``.
        text: the patient note.
        feature_text: the feature description paired with the note.

    Returns:
        The tokenizer output with every value converted to a ``torch.long``
        tensor.
    """
    # Use the `cfg` that was passed in rather than the global `CFG`, so the
    # padding length always matches the tokenizer/model this input is built for.
    # NOTE(review): no truncation is requested, so inputs longer than
    # `max_len` are presumably not shortened; confirm `max_len` covers the data.
    inputs = cfg.tokenizer(text, feature_text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs
class TestDataset(Dataset):
    """Dataset yielding tokenized (patient note, feature text) pairs.

    Args:
        cfg: configuration object handed through to ``prepare_input``.
        df: DataFrame with ``feature_text`` and ``pn_history`` columns.
    """
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.pn_historys = df['pn_history'].values
        self.feature_texts = df['feature_text'].values
    def __len__(self):
        return len(self.feature_texts)
    def __getitem__(self, item):
        note = self.pn_historys[item]
        feature_text = self.feature_texts[item]
        return prepare_input(self.cfg, note, feature_text)

In [None]:
class ScoringModel(nn.Module):
    """Transformer backbone followed by dropout and a one-unit linear head.

    Args:
        cfg: object providing ``model`` (model name) and ``fc_dropout``.
        config_path: optional path of a saved config loaded with
            ``torch.load``; when ``None`` the config is fetched with
            ``AutoConfig.from_pretrained``.
        pretrained: load pretrained backbone weights when True, otherwise
            build the backbone from the config only.
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # NOTE(review): torch.load unpickles the file; only use config files you trust.
        self.config = (
            torch.load(config_path)
            if config_path is not None
            else AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        )
        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module in place."""
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    def feature(self, inputs):
        """Return the first element of the backbone output for ``inputs``."""
        return self.model(**inputs)[0]
    def forward(self, inputs):
        """Apply backbone, dropout and the linear head; returns the head output."""
        hidden = self.feature(inputs)
        return self.fc(self.fc_dropout(hidden))

In [None]:
def inference_fn(test_loader, model, device):
    """Run ``model`` over ``test_loader`` and return sigmoid outputs.

    Args:
        test_loader: iterable (with ``len``) of mappings from name to tensor.
        model: module called with the whole batch mapping.
        device: device the model and every batch tensor are moved to.

    Returns:
        NumPy array: the per-batch sigmoid outputs concatenated along axis 0.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    for batch in progress:
        # Move tensors in place so the batch object keeps its original type.
        for name, tensor in list(batch.items()):
            batch[name] = tensor.to(device)
        with torch.no_grad():
            logits = model(batch)
        batch_outputs.append(torch.sigmoid(logits).to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [None]:
# Fold-wise inference with the DeBERTa-v3-large checkpoints.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = ScoringModel(CFG, config_path=CFG.config_path, pretrained=False)
    
    state = torch.load(CFG.path+f"{CFG.model.split('/')[1]}_fold{fold}_best.pth",
                           map_location=torch.device('cpu')) 
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    # One row of `max_len` token values per test row.
    prediction = prediction.reshape((len(test), CFG.max_len))
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    predictions.append(char_probs)
    # Free the fold's model before loading the next one.
    del model, state, prediction, char_probs
    gc.collect()
    torch.cuda.empty_cache()
# Average the folds note by note. Each note has its own character length, so
# `predictions` is ragged and np.mean(predictions, axis=0) cannot build an
# array from it on current NumPy; averaging per note works for any lengths.
predictions_v3_l = [np.mean(fold_char_probs, axis=0) for fold_char_probs in zip(*predictions)]

In [None]:
class CFG:
    """Inference settings for the ``microsoft/deberta-large`` checkpoints."""
    # checkpoint location and model identity
    model = "microsoft/deberta-large"
    path = "../input/debertalarge/"
    config_path = path + 'config.pth'
    # tokenisation / batching
    max_len = 466
    batch_size = 24
    num_workers = 4
    # head and reproducibility
    fc_dropout = 0.2
    seed = 42
    # folds to run
    n_fold = 4
    trn_fold = list(range(4))

In [None]:
# Load the tokenizer stored next to the checkpoints and attach it to the config.
tokenizer_dir = CFG.path + 'tokenizer/'
CFG.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

In [None]:
class CustomModel(ScoringModel):
    """Backbone + dropout + one-unit linear head, under the name ``CustomModel``.

    The previous body of this class was a line-for-line copy of
    ``ScoringModel`` (same constructor arguments, same ``model`` /
    ``fc_dropout`` / ``fc`` attributes, same ``_init_weights``, ``feature``
    and ``forward``). Inheriting keeps a single implementation while the
    ``CustomModel`` name, its constructor signature and its state-dict keys
    stay the same.
    """

In [None]:
def inference_fn(test_loader, model, device):
    """Collect sigmoid-activated model outputs for every batch of ``test_loader``.

    Args:
        test_loader: iterable (with ``len``) of mappings from name to tensor.
        model: module called with the whole batch mapping.
        device: device the model and every batch tensor are moved to.

    Returns:
        NumPy array: the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    model.to(device)
    collected = []
    batches = tqdm(test_loader, total=len(test_loader))
    for inputs in batches:
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].to(device)
        # Gradients are not needed at inference time.
        with torch.no_grad():
            raw_output = model(inputs)
        probabilities = raw_output.sigmoid()
        collected.append(probabilities.to('cpu').numpy())
    return np.concatenate(collected)

In [None]:
# Fold-wise inference with the DeBERTa-large checkpoints.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    # One row of `max_len` token values per test row.
    prediction = prediction.reshape((len(test), CFG.max_len))
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    predictions.append(char_probs)
    # Free the fold's model before loading the next one.
    del model, state, prediction, char_probs; gc.collect()
    torch.cuda.empty_cache()
# Average the folds note by note. Each note has its own character length, so
# `predictions` is ragged and np.mean(predictions, axis=0) cannot build an
# array from it on current NumPy; averaging per note works for any lengths.
predictions_v1_l = [np.mean(fold_char_probs, axis=0) for fold_char_probs in zip(*predictions)]

In [None]:
class CFG:
    """Inference settings for the ``microsoft/deberta-base`` checkpoints."""
    # checkpoint location and model identity
    model = "microsoft/deberta-base"
    path = "../input/nbme-deberta-base-baseline-train/"
    config_path = path + 'config.pth'
    # tokenisation / batching
    max_len = 466
    batch_size = 24
    num_workers = 4
    # head and reproducibility
    fc_dropout = 0.2
    seed = 42
    # folds to run
    n_fold = 5
    trn_fold = list(range(5))

In [None]:
# Load the tokenizer stored next to the checkpoints and attach it to the config.
tokenizer_dir = CFG.path + 'tokenizer/'
CFG.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

In [None]:
# Fold-wise inference with the DeBERTa-base checkpoints.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
predictions = []
for fold in CFG.trn_fold:

    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    # One row of `max_len` token values per test row.
    prediction = prediction.reshape((len(test), CFG.max_len))
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    predictions.append(char_probs)
    # Free the fold's model before loading the next one.
    del model, state, prediction, char_probs; gc.collect()
    torch.cuda.empty_cache()
# Average the folds note by note. Each note has its own character length, so
# `predictions` is ragged and np.mean(predictions, axis=0) cannot build an
# array from it on current NumPy; averaging per note works for any lengths.
predictions_v1_b = [np.mean(fold_char_probs, axis=0) for fold_char_probs in zip(*predictions)]

In [None]:
# Blend weights for the three sets of character probabilities, in the order
# they are combined below (v3-large, large, base). With (1, 0, 0) only the
# first set contributes to the blend.
w1, w2, w3 = 1, 0, 0

In [None]:
# Weighted sum of the three models' character probabilities, one array per test row.
predictions = [
    w1*p1 + w2*p2 + w3*p3
    for p1, p2, p3 in zip(predictions_v3_l, predictions_v1_l, predictions_v1_b)
]

In [None]:
# Convert the blended character probabilities to location strings, preview
# them, and write the submission file.
results = get_results(predictions)
submission['location'] = results
display(submission.head())
submission_columns = ['id', 'location']
submission[submission_columns].to_csv('submission.csv', index=False)