In [None]:
import ast
import copy
import gc
import os
import pickle
import random
import re

import numpy as np
import pandas as pd
import tokenizers
import torch
import torch.nn as nn
import transformers
from IPython.display import display
from sklearn import model_selection
from torch.nn import Sigmoid
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, get_linear_schedule_with_warmup, AutoModelForSequenceClassification

In [None]:
class args:
    """Inference configuration shared by the cells below.

    All values are class attributes, so they are read as ``args.NAME``
    without instantiating the class.
    """

    # Fixed length (in tokens) every encoded example is padded to.
    MAX_LEN = 300
    # Directory holding the pretrained BERT files passed to ``from_pretrained``.
    MODEL_PATH = "../input/bertbaseuncased/"
    # Fine-tuned weights loaded with ``torch.load``.
    MODEL_SAVE_PATH = "../input/nbms-dataset/model.bin"
    # WordPiece tokenizer built from the vocabulary file, lower-casing its input.
    tokenizer = tokenizers.BertWordPieceTokenizer("../input/bertbaseuncased/vocab.txt",lowercase=True)
    seed = 42
    # Prefer the GPU, but fall back to the CPU so the notebook still runs on a
    # runtime without CUDA instead of failing when the model is moved.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    TEST_BATCH_SIZE = 1

In [None]:
def set_seed(seed=args.seed):
    """Seed every random number generator used by the notebook.

    Args:
        seed: Integer seed applied to ``random``, NumPy and PyTorch
            (CPU and all CUDA devices). Defaults to ``args.seed``.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here is expected to matter only for child processes.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune (and vary) its algorithm choice per
    # run, which defeats the deterministic flag above; keep it off.
    torch.backends.cudnn.benchmark = False
    

In [None]:
def process_pn_history(text):
    """Replace Windows line breaks in a patient note with spaces.

    Each two-character ``\\r\\n`` sequence is replaced by two spaces, so the
    returned string has exactly the same length as the input. The token
    offsets computed from this text are later emitted as character spans;
    keeping the length unchanged keeps those spans valid as indices into the
    raw note (replacing with a single space shifted every position after a
    line break by one).

    Args:
        text: Raw patient-note string.

    Returns:
        The note with every ``\\r\\n`` replaced by two spaces.
    """
    # NOTE(review): assumes the tokenizer treats runs of whitespace like a
    # single separator, so the tokens themselves are unaffected - confirm.
    text = re.sub(r"\r\n", "  ", text)
    return text


def process_feature_text(text):
    """Return ``text`` with every hyphen replaced by a single space."""
    hyphen = re.compile(r"-")
    return hyphen.sub(" ", text)


In [None]:
class NBMEDataset:
    """Map-style dataset that encodes (patient note, feature text) pairs.

    Each item is tokenized as a sequence pair and then truncated or padded to
    exactly ``max_len`` tokens, so every returned tensor has the same length.

    Args:
        df: DataFrame with ``pn_history`` and ``feature_text`` columns.
        tokenizer: Object whose ``encode(text, pair)`` returns an encoding
            exposing ``ids``, ``type_ids`` and ``offsets``.
        max_len: Optional fixed sequence length; defaults to ``args.MAX_LEN``.
    """

    def __init__(self, df, tokenizer, max_len=None):
        self.df = df
        self.pn_history = df.pn_history.values
        self.feature_text = df.feature_text.values
        self.max_len = args.MAX_LEN if max_len is None else max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, item):
        """Encode row ``item``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` as long tensors
            of length ``max_len``, and ``offsets`` as the ``str()`` of a list
            of ``(start, end)`` tuples of the same length (kept as a string so
            that batching passes it through unchanged; it is parsed back in
            ``eval_fn``).
        """
        encoding = self.tokenizer.encode(self.pn_history[item], self.feature_text[item])

        # Clip over-long encodings. Without this a sequence longer than
        # max_len produced a negative padding length (i.e. no padding) and a
        # tensor longer than every other example.
        # NOTE(review): plain truncation can cut off the trailing feature
        # text / separator for very long notes - confirm this matches how the
        # model was trained.
        ids = list(encoding.ids)[: self.max_len]
        token_type_ids = list(encoding.type_ids)[: self.max_len]
        offsets = list(encoding.offsets)[: self.max_len]
        mask = [1] * len(ids)

        # Right-pad everything to max_len: id 0, mask 0, type 0, offset (0, 0).
        padding_len = self.max_len - len(ids)
        ids = ids + [0] * padding_len
        mask = mask + [0] * padding_len
        token_type_ids = token_type_ids + [0] * padding_len
        offsets = offsets + [(0, 0)] * padding_len

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "offsets": str(offsets),
        }

In [None]:
class BERTBaseUncased(nn.Module):
    """BERT encoder followed by a linear head that emits one logit per token."""

    def __init__(self):
        super().__init__()
        # Encoder weights and config are read from the args.MODEL_PATH directory.
        self.bert = transformers.BertModel.from_pretrained(args.MODEL_PATH)
        # Registered on the module but never applied in forward().
        self.bert_drop = nn.Dropout(0.3)
        # Projects each 768-wide token vector to a single score.
        self.l0 = nn.Linear(768,1)

    def forward(self, ids, mask, token_type_ids):
        """Return per-token logits.

        Args:
            ids: Token id tensor passed to the encoder as its first argument.
            mask: Attention mask forwarded as ``attention_mask``.
            token_type_ids: Segment ids forwarded unchanged.

        Returns:
            The linear head's output with its trailing size-1 axis removed.
        """
        encoder_outputs = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output is fed to the head
        # (presumably the per-token hidden states - TODO confirm).
        token_states = encoder_outputs[0]
        return self.l0(token_states).squeeze(-1)

In [None]:
def _decode_spans(flags, offsets):
    """Merge runs of consecutive positive tokens into "start end" span strings."""
    spans = []
    start = None
    end = None
    for flag, (tok_start, tok_end) in zip(flags, offsets):
        if flag:
            if start is None:
                start = tok_start
            end = tok_end
        elif start is not None:
            spans.append(f"{start} {end}")
            start = None
    # Flush a span that is still open when the sequence ends on a positive
    # token; previously such a span was silently dropped.
    if start is not None:
        spans.append(f"{start} {end}")
    return spans


def eval_fn(dataloader, model, device):
    """Run inference and decode token predictions into character spans.

    Args:
        dataloader: Iterable of batches, each a dict with ``ids``,
            ``token_type_ids`` and ``mask`` tensors plus ``offsets``, a list
            of strings holding the ``str()`` of each example's list of
            ``(start, end)`` tuples.
        model: Callable as ``model(ids=..., token_type_ids=..., mask=...)``
            returning one logit per token; ``model.eval()`` is called first.
        device: Device the input tensors are moved to.

    Returns:
        1-D NumPy array of strings, one per example, where each string is a
        ``;``-joined list of ``"start end"`` spans (empty string if none).
        An empty dataloader yields an empty array.
    """
    threshold = 0.5  # sigmoid probability at or above which a token is positive

    model.eval()
    all_probs = []
    all_keep = []
    all_offsets = []

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for d in tqdm(dataloader, total=len(dataloader)):
            # Offsets arrive as strings; literal_eval parses them back without
            # executing arbitrary code the way eval() would.
            all_offsets.extend(ast.literal_eval(o) for o in d["offsets"])

            ids = d["ids"].to(device)
            token_type_ids = d["token_type_ids"].to(device)
            mask = d["mask"].to(device)

            logits = model(
                ids=ids,
                token_type_ids=token_type_ids,
                mask=mask
            )

            # Per-example rows are collected individually so that batches with
            # different sequence lengths do not have to stack.
            all_probs.extend(torch.sigmoid(logits).cpu().numpy())
            # Only real tokens of the first segment can carry a span: padding
            # has mask 0, and second-segment tokens have a non-zero type id.
            # NOTE(review): assumes second-segment offsets index the feature
            # text rather than the note - confirm against the tokenizer.
            all_keep.extend(((mask == 1) & (token_type_ids == 0)).cpu().numpy())

    if not all_probs:
        return np.array([], dtype=str)

    locations = []
    for probs, keep, offsets in zip(all_probs, all_keep, all_offsets):
        # Zero-width offsets (the (0, 0) padding entries and, presumably,
        # special tokens) do not correspond to any text and are never positive.
        flags = [
            bool(p >= threshold and k and tok_end > tok_start)
            for p, k, (tok_start, tok_end) in zip(probs, keep, offsets)
        ]
        locations.append(";".join(_decode_spans(flags, offsets)))

    return np.array(locations)

In [None]:
# Read the raw competition tables.
test_df = pd.read_csv("../input/nbme-score-clinical-patient-notes/test.csv")
feature = pd.read_csv("../input/nbme-score-clinical-patient-notes/features.csv")
pn_notes = pd.read_csv("../input/nbme-score-clinical-patient-notes/patient_notes.csv")
sample = pd.read_csv("../input/nbme-score-clinical-patient-notes/sample_submission.csv")

# Normalise the two text columns before they are attached to the test rows.
feature["feature_text"] = feature["feature_text"].apply(process_feature_text)
pn_notes["pn_history"] = pn_notes["pn_history"].apply(process_pn_history)

# Attach the feature description and the note text to every test row.
# validate="many_to_one" makes pandas raise if a key is duplicated on the
# right-hand side, which would otherwise silently duplicate test rows and
# misalign the predictions with the submission ids.
test_df = test_df.merge(feature,how="left",on=["feature_num","case_num"],validate="many_to_one")
test_df = test_df.merge(pn_notes,how="left",on=["pn_num","case_num"],validate="many_to_one")

# A left merge leaves NaN where no match exists; fail here with a clear
# message rather than later inside the tokenizer.
assert test_df[["feature_text", "pn_history"]].notna().all().all(), "test rows without matching feature text or patient note"

In [None]:
# Build the network and restore the fine-tuned weights.
# map_location="cpu" lets the checkpoint load regardless of the device it was
# saved from; the model is moved to the target device afterwards.
model = BERTBaseUncased()
model.load_state_dict(torch.load(args.MODEL_SAVE_PATH, map_location="cpu"))
model.to(args.device)

test_dataset = NBMEDataset(
    df=test_df,
    tokenizer=args.tokenizer
)
# Order must be preserved: predictions are paired with submission ids by
# position, so shuffling is disabled explicitly.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=args.TEST_BATCH_SIZE,
    shuffle=False)

pred_location = eval_fn(test_dataloader, model, args.device)

In [None]:
# Pair each id from the sample submission with the span string predicted for
# the row at the same position.
sample_sub = pd.DataFrame(
    data={"id": sample["id"].values, "location": pred_location},
)

In [None]:
# Write the submission file without the index column, then render the frame.
sample_sub.to_csv(path_or_buf="submission.csv", index=False)
display(sample_sub)