In [None]:
import pandas as pd
import tqdm.notebook as tqdm
import ast
import re
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import transformers
from transformers import RobertaTokenizerFast
import numpy as np
import os

In [None]:
# Set the TOKENIZERS_PARALLELISM switch to "false" before any tokenizer is used
# (the variable the HuggingFace tokenizers library reads to turn off its own parallelism).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Register tqdm's pandas integration so that `progress_apply`, used by the
# preprocessing helpers in this notebook, is available on Series/DataFrame.
tqdm.tqdm_notebook.pandas()

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (slowly) instead of failing when the model is moved to DEVICE.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer settings: MAX_LENGTH is the padded/truncated length of every encoded
# (feature, note) pair; DOC_STRIDE is forwarded to the tokenizer as `stride`.
MAX_LENGTH = 384
DOC_STRIDE = 128
# Number of examples per DataLoader batch.
BATCH_SIZE = 1

In [None]:
# Encoder hyper-parameters spelled out inline; this config object is what
# NBMEModel receives to build its RoBERTa backbone.
ROBERTA_CONFIG = transformers.RobertaConfig(
    attention_probs_dropout_prob=0.1,
    bos_token_id=0,
    classifier_dropout=None,
    eos_token_id=2,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    hidden_size=768,
    initializer_range=0.02,
    intermediate_size=3072,
    layer_norm_eps=1e-05,
    max_position_embeddings=514,
    model_type="roberta",
    num_attention_heads=12,
    num_hidden_layers=12,
    pad_token_id=1,
    position_embedding_type="absolute",
    transformers_version="4.17.0",
    type_vocab_size=1,
    use_cache=True,
    vocab_size=50265,
)

In [None]:
# Rows to score / rows with annotations.
train_data = pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
test_data = pd.read_csv('../input/nbme-score-clinical-patient-notes/test.csv')

# Lookup tables joined onto those rows later (feature descriptions and note texts).
features_data = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
patient_notes_data = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')

# Example of the expected submission layout.
submission_data = pd.read_csv('../input/nbme-score-clinical-patient-notes/sample_submission.csv')

In [None]:
class NBMETestTensorDataset:
    """Map-style dataset over pre-tokenized inference examples.

    Holds the ``input_ids`` and ``attention_mask`` sequences found in
    ``tokenized_data`` and serves example ``idx`` as a dict of tensors built
    on access.
    """

    def __init__(self, tokenized_data):
        # Only the two model inputs are read from the mapping.
        self.input_ids = tokenized_data['input_ids']
        self.attention_masks = tokenized_data['attention_mask']

    def __len__(self):
        """Number of examples, i.e. the number of ``input_ids`` sequences."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``{'input_ids': tensor, 'attention_mask': tensor}`` for example ``idx``."""
        return {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_masks[idx]),
        }

In [None]:
def get_start_end_of_second_sequence(sequence):
    """Locate the span of the second sequence in a list of per-token sequence ids.

    ``sequence`` is expected to look like the output of a fast tokenizer's
    ``sequence_ids()``: ``None`` for special/padding tokens, ``0`` for tokens of
    the first text and ``1`` for tokens of the second text.

    Returns:
        ``(sequence_start, sequence_end)`` where ``sequence_start`` is the index
        of the first element equal to ``1`` and ``sequence_end`` is the index of
        the first ``None`` after it (exclusive end). Either value is ``-1`` when
        the corresponding position does not exist.
    """
    sequence_start = -1
    sequence_end = -1

    for position, sequence_id in enumerate(sequence):
        if sequence_start == -1:
            # Still looking for the first token of the second sequence.
            if sequence_id == 1:
                sequence_start = position
        elif sequence_id is None:
            # First special/padding token after the second sequence began.
            sequence_end = position
            break

    return sequence_start, sequence_end

def tokenize_test_data(datas):
    """Encode every row of ``datas`` as a (feature_text, pn_history) pair.

    Each row is tokenized with the module-level ``TOKENIZER``: the feature text
    is the first sequence, the patient note the second, padded to
    ``MAX_LENGTH`` with only the note allowed to be truncated.

    Args:
        datas: DataFrame with ``feature_text`` and ``pn_history`` columns.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` lists, one entry per
        row, in row order.
    """
    # Re-index so rows can be addressed by position 0..n-1.
    rows = datas.reset_index().to_dict(orient='index')

    input_ids = []
    attention_masks = []
    for row_idx in tqdm.tqdm_notebook(range(len(rows))):
        row = rows[row_idx]
        encoding = TOKENIZER(
            text=row['feature_text'],
            text_pair=row['pn_history'],
            return_offsets_mapping=True,
            padding='max_length',
            max_length=MAX_LENGTH,
            truncation='only_second',
            stride=DOC_STRIDE,
        )
        input_ids.append(encoding["input_ids"])
        attention_masks.append(encoding["attention_mask"])

    return {'input_ids': input_ids, 'attention_mask': attention_masks}

In [None]:
def process_location(row):
    """Extract character spans from a location annotation.

    Every run of digits in ``str(row)`` is read as an integer; consecutive
    integers are paired as ``(start, end)``.

    Args:
        row: Any value whose string form contains the span numbers, for
            example ``"['696 724', '730 740']"``.

    Returns:
        ``(start_positions, end_positions)``: two lists of ints of equal length.
        Both are empty when no number is present. A trailing number without a
        partner is ignored instead of raising ``IndexError``.
    """
    numbers = [int(digits) for digits in re.findall(r'\d+', str(row))]

    # Only complete (start, end) pairs are kept.
    paired_count = len(numbers) - len(numbers) % 2
    start_positions = numbers[0:paired_count:2]
    end_positions = numbers[1:paired_count:2]

    return start_positions, end_positions

def preprocess_features_data(features_data : pd.DataFrame):
    """Return a copy of ``features_data`` with a cleaned ``feature_text`` column.

    The text has ``-OR-`` replaced by `` or ``, ``-I-year`` by `` 1 year``,
    every remaining ``-`` by a space, and surrounding whitespace stripped.
    The input frame is left untouched.
    """
    data = features_data.copy()

    print('Cleaning Feature_text...')

    # Order matters: the two specific rewrites must run before the generic
    # '-' -> ' ' pass, which would otherwise destroy their patterns.
    substitutions = (
        ('-OR-', ' or '),
        ('-I-year', ' 1 year'),
        ('-', ' '),
    )
    for pattern, replacement in substitutions:
        data['feature_text'] = data['feature_text'].progress_apply(
            lambda text: re.sub(pattern, replacement, text)
        )
    data['feature_text'] = data['feature_text'].progress_apply(lambda text: str.strip(text))

    return data

def preprocess_patient_notes_data(patient_notes_data : pd.DataFrame):
    """Return a copy of ``patient_notes_data`` with a cleaned ``pn_history`` column.

    A fixed list of literal substitutions is applied to every note, in the
    order listed below. The input frame is left untouched.
    """
    data = patient_notes_data.copy()
    print('Cleaning Patient_notes...')

    # (pattern, replacement) pairs, applied one full pass at a time, in order.
    replacements = (
        (' mM ', 'M'),
        ('YOF', 'YO F'),
        ('yof', 'yo f'),
        ('malepresents', 'male presents'),
        ('AAF', ' F '),
        ('YOM', 'YO M'),
        (' FM ', ' F '),
        ('17yoMotherwise', '17 yo M otherwise'),
    )
    for pattern, replacement in replacements:
        data['pn_history'] = data['pn_history'].progress_apply(
            lambda note: re.sub(pattern, replacement, note)
        )

    return data

def _first_match_lookup(frame, key_columns, value_column):
    """Map each distinct key tuple of ``frame`` to the value of its first matching row."""
    lookup = {}
    key_series = [frame[column] for column in key_columns]
    for *key, value in zip(*key_series, frame[value_column]):
        # setdefault keeps the first occurrence when a key is duplicated.
        lookup.setdefault(tuple(key), value)
    return lookup


def process_data(train_data: pd.DataFrame, features_data: pd.DataFrame, patient_notes_data: pd.DataFrame):
    """Attach the feature text and the patient note text to every row.

    Args:
        train_data: Rows to enrich; must contain ``case_num``, ``feature_num``
            and ``pn_num``. Not modified.
        features_data: Frame with ``case_num``, ``feature_num`` and
            ``feature_text`` columns.
        patient_notes_data: Frame with ``case_num``, ``pn_num`` and
            ``pn_history`` columns.

    Returns:
        A copy of ``train_data`` with two added columns, ``feature_text`` and
        ``pn_history``, taken from the first row matching the key in each
        lookup frame.

    Raises:
        KeyError: If a row references a (case_num, feature_num) or
            (case_num, pn_num) pair that is absent from the lookup frame
            (the previous positional implementation raised IndexError here).
    """
    data = train_data.copy()

    # Build each lookup once instead of scanning the whole lookup frame per row,
    # and read the text by column name rather than by column position.
    feature_lookup = _first_match_lookup(features_data, ['case_num', 'feature_num'], 'feature_text')
    note_lookup = _first_match_lookup(patient_notes_data, ['case_num', 'pn_num'], 'pn_history')

    print(f'Adding Feature_text to data...')
    data['feature_text'] = data[['case_num', 'feature_num']].progress_apply(
        lambda row: feature_lookup[(row['case_num'], row['feature_num'])],
        axis=1)

    print(f'Adding pn_history to data...')
    data['pn_history'] = data[['case_num', 'pn_num']].progress_apply(
        lambda row: note_lookup[(row['case_num'], row['pn_num'])],
        axis=1)

    return data

In [None]:
# Fast RoBERTa tokenizer built from local vocabulary / merges / tokenizer files.
# Both tokenize_test_data() and predict() use this module-level instance.
TOKENIZER = RobertaTokenizerFast(vocab_file='../input/robertatokenizer/vocab.json',
                                     merges_file='../input/robertatokenizer/merges.txt', 
                                     tokenizer_file='../input/robertatokenizer/tokenizer.json')

In [None]:
# Clean the feature descriptions and the patient notes, then attach the cleaned
# feature_text / pn_history columns to every test row.
p_features_data = preprocess_features_data(features_data)
p_patient_notes_data = preprocess_patient_notes_data(patient_notes_data)
p_test_data = process_data(test_data,p_features_data, p_patient_notes_data)
# Same step for the training rows, left disabled in this notebook:
# p_train_data = process_data(train_data,p_features_data, p_patient_notes_data)

In [None]:
# Encode every (feature_text, pn_history) pair of the test rows into model inputs.
t_test_data = tokenize_test_data(p_test_data)
# Same step for the training rows, left disabled in this notebook:
# t_train_data = tokenize_test_data(p_train_data)

In [None]:
# Wrap the tokenized inputs in a dataset that yields one dict of tensors per example.
test_nbme_dataset = NBMETestTensorDataset(t_test_data)
# train_nbme_dataset = NBMETestTensorDataset(t_train_data)

# No shuffle argument is passed: predict() walks the raw DataFrame with a running
# counter, so the batches must arrive in dataset order.
test_dl = DataLoader(test_nbme_dataset, BATCH_SIZE)
# train_dl = DataLoader(train_nbme_dataset, BATCH_SIZE)

In [None]:
class NBMEModel(nn.Module):
    """RoBERTa encoder with two per-token heads: span start and span end.

    Each head is ``Dropout -> Linear(hidden_size, 1)`` applied to the encoder's
    last hidden state, giving one logit per token.

    The attribute names and the ``nn.Sequential`` layout of the heads are kept
    exactly as before so previously saved state dicts still load.
    """

    def __init__(self,roberta_config):
        """Build the encoder from ``roberta_config`` and create both heads."""
        super().__init__()

        self.loss_fn = nn.functional.binary_cross_entropy_with_logits

        self.roberta = transformers.RobertaModel(roberta_config)
        # Fix: the previous code assigned ``True`` to ``gradient_checkpointing_enable``,
        # which replaced the method with a bool and never enabled checkpointing.
        # Calling the method is what actually switches it on.
        self.roberta.gradient_checkpointing_enable()

        # Head width follows the encoder config instead of a hard-coded 768.
        hidden_size = roberta_config.hidden_size

        self.start_pos = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(in_features=hidden_size, out_features=1)
        )

        self.end_pos = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(in_features=hidden_size, out_features=1)
        )

        self.sigmoid_layer = nn.Sigmoid()

    def _span_logits(self, data):
        """Run the encoder and both heads; returns ``(start_logits, end_logits)``."""
        output = self.roberta(input_ids=data['input_ids'],
                              attention_mask=data['attention_mask'])
        hidden_states = output.last_hidden_state
        return self.start_pos(hidden_states), self.end_pos(hidden_states)

    def forward(self, data):
        """Return the training loss for one batch.

        Args:
            data: Mapping with ``input_ids``, ``attention_mask``,
                ``start_positions`` and ``end_positions`` tensors. The two
                target tensors are reshaped to ``(batch, seq_len, -1)`` to line
                up with the logits; they must already have a dtype accepted by
                binary cross-entropy.

        Returns:
            Sum of the mean binary cross-entropy of the start head and of the
            end head.
        """
        start_logits, end_logits = self._span_logits(data)

        # Take the target shape from the logits rather than a global constant so
        # the model works for any sequence length.
        batch_size, seq_len = start_logits.shape[:2]
        start_targets = data['start_positions'].view(batch_size, seq_len, -1)
        end_targets = data['end_positions'].view(batch_size, seq_len, -1)

        loss_start = self.loss_fn(start_logits, start_targets, reduction='mean')
        loss_end = self.loss_fn(end_logits, end_targets, reduction='mean')

        return loss_start + loss_end

    def predict(self, data):
        """Return per-token ``(start_probabilities, end_probabilities)``.

        Both tensors are the sigmoid of the head logits, i.e. one value in
        ``[0, 1]`` per token with a trailing dimension of size 1.
        """
        start_logits, end_logits = self._span_logits(data)

        start_sig = self.sigmoid_layer(start_logits)
        end_sig = self.sigmoid_layer(end_logits)

        return start_sig, end_sig


In [None]:
# Build the network from the config only; the trained weights are loaded from a
# checkpoint in a later cell.
model = NBMEModel(ROBERTA_CONFIG)

In [None]:
# Move the model to DEVICE (as the cell's last expression, the returned module is displayed).
model.to(DEVICE)

In [None]:
# Load the saved state dict into the model; map_location places the loaded tensors on DEVICE.
model.load_state_dict(torch.load('../input/nbme-roberta-final/Model_15.pt', map_location=DEVICE))

In [None]:
def _token_spans_to_locations(token_spans, offset_mapping):
    """Convert (start_token, end_token) pairs into ``'char_start char_end'`` strings."""
    locations = []
    for start, end in token_spans:
        # Skip inverted spans and spans whose end token lies beyond the tokenized note.
        if start > end or end >= len(offset_mapping):
            continue

        char_start = offset_mapping[start][0]
        char_end = offset_mapping[end][1]
        if char_end == 0:
            # The end token carries no character extent (presumably a special
            # token); fall back to the end of the previous token.
            char_end = offset_mapping[end - 1][1]

        if char_start < char_end:
            locations.append(f'{char_start} {char_end}')
    return locations


def predict(raw_data, data_dl, model, num_pred, acc_req, device):
    """Run the model over ``data_dl`` and build the submission columns.

    For every example the ``num_pred`` highest-scoring start tokens and end
    tokens are taken; the i-th best start is paired with the i-th best end, and
    a pair is kept only when both scores are at least ``acc_req``. Surviving
    token pairs are converted to character spans of the patient note.

    Args:
        raw_data: DataFrame with ``id`` and ``pn_history`` columns whose rows
            are in the same order as the examples served by ``data_dl``.
        data_dl: Sized iterable of batches, each a dict of tensors.
        model: Object exposing ``eval()`` and ``predict(batch)``; the latter
            must return ``(start_probs, end_probs)`` indexable by sample, each
            sample being ``(seq_len, 1)`` as produced by ``NBMEModel.predict``.
        num_pred: Number of top candidates considered per example (clamped to
            the sequence length).
        acc_req: Minimum score required of both members of a candidate pair.
        device: Device every batch tensor is moved to.

    Returns:
        dict with two parallel lists: ``'id'`` and ``'location'``. A location is
        ``'start end'`` spans joined by ``';'``, or ``np.nan`` when no span
        qualifies.
    """
    submission = {'id': [], 'location': []}
    row_position = 0

    model.eval()

    with torch.no_grad():
        for data in tqdm.tqdm_notebook(data_dl, total=len(data_dl)):
            data = {key: value.to(device) for key, value in data.items()}
            preds_start, preds_end = model.predict(data)

            for sample_start, sample_end in zip(preds_start, preds_end):
                # Never ask topk for more candidates than there are tokens.
                top_k = min(num_pred, sample_start.shape[0])
                start_confidence, start_prediction = sample_start.topk(top_k, dim=0)
                end_confidence, end_prediction = sample_end.topk(top_k, dim=0)

                # Rank-wise pairing: keep rank i only if both its start and end clear
                # the threshold. The comparison stays on tensors so it uses the same
                # arithmetic as before; one transfer to Python per example.
                keep = ((start_confidence >= acc_req) & (end_confidence >= acc_req)).reshape(-1)
                possible_start = start_prediction.reshape(-1)[keep].tolist()
                possible_end = end_prediction.reshape(-1)[keep].tolist()

                # Positional lookup: the counter follows DataLoader order, so it must
                # not depend on the DataFrame's index labels.
                row = raw_data.iloc[row_position]

                # NOTE(review): the model inputs were tokenized as (feature_text,
                # pn_history) pairs, while these offsets come from tokenizing
                # pn_history alone, so predicted token indices may be shifted
                # relative to this mapping. Left as-is; confirm against how the
                # training labels were built.
                tokenized_data = TOKENIZER(row['pn_history'], return_offsets_mapping=True)
                offset_mapping = tokenized_data['offset_mapping']

                locations = _token_spans_to_locations(
                    zip(possible_start, possible_end), offset_mapping
                )

                submission['id'].append(row['id'])
                if len(locations) == 0:
                    submission['location'].append(np.nan)
                else:
                    submission['location'].append(";".join(locations))
                row_position += 1

    return submission

In [None]:
# Preview the first five training rows.
train_data.head()

In [None]:
# Run inference over the test loader and write the result in submission format.
submission = predict(
    raw_data=p_test_data,
    data_dl=test_dl,
    model=model,
    num_pred=6,     # top candidates considered per example
    acc_req=0.10,   # minimum score for a start/end pair to be kept
    device=DEVICE,
)
df_submission = pd.DataFrame(submission)
df_submission.to_csv('submission.csv', index=False)
df_submission