In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input mount,
# then echo them one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Shell escape: run the nvidia-smi command-line tool and show its output in the cell.
!nvidia-smi

## Library

In [None]:
import os
import json
import torch
import torch.nn as n
import torch.nn.functional as F
import argparse
import importlib
import pandas as pd
import numpy as np

from torch.utils.data import Dataset, DataLoader
from transformers import (AutoTokenizer, 
    AutoConfig, 
    AutoModelForTokenClassification, 
    Trainer, 
    DataCollatorWithPadding
)


## Pretrained Language Model (PLM)

In [None]:
# Path handed to AutoTokenizer.from_pretrained when the tokenizer is loaded.
TOKENIZER = '/kaggle/input/roberta-large/tokenizer'
# Path handed to AutoConfig / AutoModelForTokenClassification when the model is loaded.
PLM = '/kaggle/input/roberta-large/checkpoint/'
# Length limit passed to the padding collator.
MAX_LENGTH = 512

## Load Datasets

In [None]:
# Directory the competition CSVs (test.csv, patient_notes.csv, features.csv) are read from.
dir_path = '/kaggle/input/nbme-score-clinical-patient-notes/'

In [None]:
# Lookup tables that get joined onto the test rows.
features_df = pd.read_csv(os.path.join(dir_path, 'features.csv'))
patients_df = pd.read_csv(os.path.join(dir_path, 'patient_notes.csv'))

# Read the test rows and left-join the feature table, then the patient-note
# table, onto them. Left joins keep every test row even when a lookup finds
# no match.
test_df = (
    pd.read_csv(os.path.join(dir_path, 'test.csv'))
    .merge(features_df, on=['feature_num', 'case_num'], how='left')
    .merge(patients_df, on=['pn_num', 'case_num'], how='left')
)

In [None]:
# Preview the first rows of the merged frame.
test_df.head()

## Preprocessing Datasets

In [None]:
# Pull the two text columns out of the merged frame as plain Python lists;
# both stay row-aligned with test_df.
feature_text = test_df['feature_text'].tolist()
pn_history = test_df['pn_history'].tolist()

In [None]:
# Load the tokenizer stored at the TOKENIZER path.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

In [None]:
# One string per test row: patient note, the tokenizer's separator token, then
# the feature text. The note comes first, so character offsets of note tokens
# are positions within the note itself.
inputs = [
    note + tokenizer.sep_token + feature
    for note, feature in zip(pn_history, feature_text)
]

# Tokenize every row in one call. Offsets are requested so predicted tokens can
# later be mapped back to character positions. max_length is passed explicitly
# so truncation does not depend on whatever model_max_length the saved
# tokenizer happens to carry.
encoded = tokenizer(inputs,
    return_offsets_mapping=True,
    return_token_type_ids=False,
    truncation=True,
    max_length=MAX_LENGTH,
)

In [None]:
# Unpack the two per-row sequences the dataset is built from.
input_ids, attention_mask = (
    encoded[field] for field in ('input_ids', 'attention_mask')
)

## Datasets

In [None]:
class TestDataset(Dataset):
    """Map-style dataset over already-tokenized test rows.

    Stores two parallel sequences and returns them one row at a time as a
    dict with the keys 'input_ids' and 'attention_mask'.
    """

    def __init__(self, input_ids, attention_mask):
        """Keep references to the per-row token ids and attention masks."""
        super().__init__()
        self.input_ids = input_ids
        self.attention_mask = attention_mask

    def __len__(self):
        """Return the number of rows, taken from input_ids."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ids and mask stored at position idx."""
        row_ids = self.input_ids[idx]
        row_mask = self.attention_mask[idx]
        return {'input_ids': row_ids, 'attention_mask': row_mask}

In [None]:
# Wrap the tokenized ids and masks so they can be indexed row by row.
dataset = TestDataset(input_ids, attention_mask)

## Collator

In [None]:
# Collate function used by the DataLoader to assemble batches.
# NOTE(review): only max_length is set here; the padding strategy is left at the
# collator's default — confirm that max_length actually takes effect under it.
collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=MAX_LENGTH)

## Config & Model

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Load the configuration from the PLM path, build the token-classification
# model from the same path, and move it onto the chosen device.
config = AutoConfig.from_pretrained(PLM)
model = AutoModelForTokenClassification.from_pretrained(PLM, config=config)
model = model.to(device)

## Dataloader

In [None]:
# Iterate the dataset in its original order (shuffle=False) in batches of 8,
# assembled by the collator. The order matters: predictions are later matched
# to rows by position.
data_loader = DataLoader(dataset, batch_size=8, shuffle=False, collate_fn=collator)

## Inference

In [None]:
# Read the offsets without removing them from `encoded`: a destructive pop makes
# this cell raise KeyError when it is run a second time, and nothing in the
# notebook relies on the key being gone.
offset_mapping = encoded['offset_mapping']

In [None]:
# Put the model in evaluation mode before predicting.
model.eval()
predictions = []

# Inference needs no autograd graph. Without no_grad every forward pass records
# state for a backward pass that never runs, which wastes memory.
with torch.no_grad():
    for data in data_loader:
        # Move every tensor of the batch onto the model's device.
        data = {k: v.to(device) for k, v in data.items()}
        results = model(**data)

        # Pick the highest-scoring class along the last axis of the logits.
        logits = results.logits.cpu().numpy()
        prediction_ids = np.argmax(logits, axis=-1)

        # extend() iterates the first axis, appending one array per batch row.
        predictions.extend(prediction_ids)

## Postprocessing Datasets

In [None]:
def postprocess(pos_list):
    """Merge adjoining (start, end) pairs and format them as a span string.

    Two consecutive pairs are merged when the second one starts exactly at, or
    one position after, the end of the first. A merged span runs from the
    start of its first pair to the end of its last pair.

    Args:
        pos_list: sequence of (start, end) pairs, in the order they should be
            scanned.

    Returns:
        The merged spans as 'start end' items joined by ';'. An empty input
        gives an empty string.
    """
    merged = []
    for tok_start, tok_end in pos_list:
        if merged and tok_start in (merged[-1][1], merged[-1][1] + 1):
            # Touches (or is one position past) the open span: extend it.
            merged[-1][1] = tok_end
        else:
            # Gap, or very first pair: open a new span.
            merged.append([tok_start, tok_end])

    return ';'.join(' '.join(map(str, span)) for span in merged)

In [None]:
locations = []

for i, pred in enumerate(predictions):
    offset = offset_mapping[i]
    # Loop-local name on purpose: rebinding `input_ids` here would overwrite the
    # notebook-level list the dataset was built from and break re-running
    # earlier cells.
    token_ids = encoded['input_ids'][i]

    # Scan from position 1 (position 0 is skipped — presumably the leading
    # special token; confirm) up to the first separator id. The inputs were
    # built as note + separator + feature text, so this range should cover
    # the note tokens only.
    token_start_index = 1
    token_end_index = token_ids.index(tokenizer.sep_token_id)

    # Keep the character offsets of every token predicted as class 1.
    span_list = [
        offset[j]
        for j in range(token_start_index, token_end_index)
        if pred[j] == 1
    ]

    span = '' if len(span_list) == 0 else postprocess(span_list)
    locations.append(span)

In [None]:
# Attach the predicted span strings and drop the join keys and text columns
# that were only needed to build the model inputs.
test_df = (
    test_df
    .assign(location=locations)
    .drop(columns=['case_num', 'pn_num', 'feature_num', 'feature_text', 'pn_history'])
)
test_df.head()

In [None]:
# Write the frame to submission.csv (relative path), leaving out the index column.
test_df.to_csv('submission.csv', index=False)