In [None]:
# IPython shell escape: print the NVIDIA driver's report of the GPUs visible to this session.
!nvidia-smi

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import pytorch_lightning as pl 
import torch 
import torch.nn as nn 
from torch.utils.data import DataLoader , Dataset 
import wandb
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import warnings
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from kaggle_secrets import UserSecretsClient
# Fetch the Weights & Biases API key from Kaggle's secret store so that it is
# never written into the notebook itself.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_key")
# Authenticate this session with W&B using that key.
wandb.login(key = secret_value_0)

In [None]:
# Open a W&B run; the model artifacts below are fetched through it.
run = wandb.init()
# Download version v4 of the model artifact.
artifact = run.use_artifact('som/NBME/model-26j6uz6e:v4', type='model')
artifact_dir = artifact.download()
# Download version v9 as well. `artifact` and `artifact_dir` are rebound here,
# so after this cell they refer to v9 only.
artifact = run.use_artifact('som/NBME/model-26j6uz6e:v9', type='model')
artifact_dir = artifact.download()

In [None]:
class CFG:
    """Notebook-wide settings, accessed as class attributes (the class is never instantiated)."""

    # Passed to the tokenizer as `max_length` and used as the per-row width
    # when the raw predictions are reshaped.
    max_len = 466

In [None]:
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

In [None]:
class BaseLineModel(pl.LightningModule):
    """Per-token binary classifier: a pretrained transformer followed by two linear layers.

    The backbone is loaded with ``AutoModel``; its first output is projected to
    256 features and then to a single logit per position. No activation is
    applied between the two linear layers.
    """

    def __init__(
        self,
        mod_name="microsoft/deberta-base",
        learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 0,
        weight_decay: float = 0.0,
    ):
        """Build the backbone and the classification head.

        Args:
            mod_name: Model name or path handed to ``AutoConfig`` / ``AutoModel``.
            learning_rate: Learning rate given to the optimizer.
            adam_epsilon: ``eps`` given to the optimizer.
            warmup_steps: Stored as a hyperparameter only; nothing in this
                class reads it.
            weight_decay: Weight decay for parameters that are not biases or
                LayerNorm weights.
        """
        super().__init__()
        # Must be called directly from __init__: it records the constructor
        # arguments in self.hparams.
        self.save_hyperparameters()
        self.config = AutoConfig.from_pretrained(mod_name, output_hidden_states=True)
        self.base = AutoModel.from_pretrained(mod_name, config=self.config)
        self.lin1 = nn.Linear(self.config.hidden_size, 256)
        self.lin2 = nn.Linear(256, 1)

    def forward(self, inputs):
        """Return one logit per input position.

        Args:
            inputs: Mapping of keyword arguments unpacked into the backbone call.

        Returns:
            Tensor whose last dimension has size 1 (output of ``lin2``).
        """
        base_outs = self.base(**inputs)
        # base_outs[0] is fed to a Linear(hidden_size, 256), so its last axis is
        # hidden_size -- presumably the last hidden state; TODO confirm.
        outs = self.lin1(base_outs[0])
        outs = self.lin2(outs)
        return outs

    @staticmethod
    def _masked_bce(logits, labels):
        """Mean BCE-with-logits over the positions whose label is not -1."""
        flat_labels = labels.view(-1, 1)
        per_position = nn.BCEWithLogitsLoss(reduction="none")(logits.view(-1, 1), flat_labels)
        # Positions labelled -1 are excluded from the average.
        return torch.masked_select(per_position, flat_labels != -1).mean()

    def training_step(self, batch, batch_idx):
        """Compute the masked loss for one ``(inputs, labels)`` training batch."""
        inputs, labels = batch
        y_preds = self(inputs)
        loss = self._masked_bce(y_preds, labels)

        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log ``val_loss`` for one ``(inputs, labels)`` validation batch.

        Returns:
            Dict with the loss, the raw logits (``preds``) and the labels.
        """
        inputs, labels = batch
        preds = self(inputs)
        val_loss = self._masked_bce(preds, labels)
        self.log("val_loss", val_loss)
        return {"loss": val_loss, "preds": preds, "labels": labels}

    def configure_optimizers(self):
        """Build the AdamW optimizer.

        Weight decay is applied to every parameter except those whose name
        contains ``bias`` or ``LayerNorm.weight``. No learning-rate scheduler is
        created here, so ``warmup_steps`` has no effect.
        """
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        # NOTE(review): `AdamW` is the name imported from transformers in this
        # notebook; upstream has deprecated it in favour of torch.optim.AdamW --
        # confirm before upgrading transformers.
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)

        return [optimizer]
        

In [None]:
# Restore the first ensemble member from the v9 checkpoint.
# NOTE(review): the path is hard-coded rather than taken from `artifact_dir`;
# it presumably matches the directory wandb downloaded into -- confirm.
model_1 = BaseLineModel.load_from_checkpoint(checkpoint_path="./artifacts/model-26j6uz6e:v9/model.ckpt")

In [None]:
# Restore the second ensemble member from the v4 checkpoint.
# NOTE(review): hard-coded path; presumably wandb's download directory -- confirm.
model_2 = BaseLineModel.load_from_checkpoint(checkpoint_path="./artifacts/model-26j6uz6e:v4/model.ckpt")

In [None]:
# Load the rows to predict and the feature table (supplies `feature_text`).
test = pd.read_csv("../input/nbme-score-clinical-patient-notes/test.csv")
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
def preprocess_features(features):
    """Return a copy of ``features`` with the text of row label 27 replaced.

    The input frame is left untouched, so re-running the calling cell or
    reusing the raw frame elsewhere is safe.

    Args:
        features: DataFrame with a ``feature_text`` column.

    Returns:
        A new DataFrame in which ``feature_text`` at index label 27 is
        ``"Last-Pap-smear-1-year-ago"``.
    """
    features = features.copy()
    # NOTE(review): presumably patches a typo in the raw feature text -- confirm
    # against features.csv.
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features
# Apply the feature-text patch, then load the patient notes (supplies `pn_history`).
features = preprocess_features(features)
patient_notes = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')

In [None]:
def create_labels_for_scoring(df):
    """Convert each row's ``location`` entry into a list of ``[start, end]`` integer pairs.

    Side effect: adds (or overwrites) the ``location_for_create_labels`` column
    on ``df``. Rows are addressed with ``df.loc[i, ...]`` for ``i`` in
    ``range(len(df))``, so ``df`` is expected to carry a 0..n-1 index.

    Args:
        df: DataFrame with a ``location`` column. Each entry is joined with
            ``';'``, so presumably a list of ``"start end"`` strings -- TODO
            confirm against the caller.

    Returns:
        A list with one element per row; each element is a list of
        ``[start, end]`` pairs, empty when the row has no locations.
    """
    # example: ['0 1', '3 4'] -> ['0 1;3 4']
    # Start every row with an empty list (the same list object repeated len(df) times).
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            # literal_eval builds the nested list [["<joined>"]].
            # NOTE(review): the loop below calls .split on element 0 of the stored
            # value, so this relies on pandas storing ["<joined>"] in the cell -- confirm.
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            # Each ';'-separated piece is split on whitespace; its first two fields are start and end.
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    """Spread per-token predictions over the characters each token covers.

    Args:
        texts: Sequence of strings.
        predictions: One sequence of per-token values per text.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=True,
            return_offsets_mapping=True)``; its result must expose
            ``'offset_mapping'`` as a sequence of ``(start, end)`` character spans.

    Returns:
        A list with one float array per text (length ``len(text)``). Characters
        not covered by any span keep the value 0.
    """
    results = [np.zeros(len(t)) for t in texts]
    for char_values, text, token_values in zip(results, texts, predictions):
        offsets = tokenizer(
            text,
            add_special_tokens=True,
            return_offsets_mapping=True,
        )['offset_mapping']
        # Pairing stops at the shorter of the two sequences, so surplus
        # predictions or surplus tokens are ignored.
        for span, value in zip(offsets, token_values):
            char_values[span[0]:span[1]] = value
    return results


def get_results(char_probs, th=0.5):
    """Turn per-character probabilities into ``"start end;start end"`` span strings.

    Runs of consecutive characters whose probability is ``>= th`` become one
    span each. Every index is shifted by +1 before formatting, so a run over
    characters ``i..j`` is reported as ``"i+1 j+1"``.

    Implemented with NumPy only, so the function does not depend on an import
    made in a later cell.

    Args:
        char_probs: Iterable of 1-D arrays of per-character probabilities.
        th: Inclusive threshold.

    Returns:
        A list with one string per input array; the empty string when no
        character reaches the threshold.
    """
    results = []
    for char_prob in char_probs:
        hits = np.flatnonzero(np.asarray(char_prob) >= th) + 1
        if hits.size == 0:
            results.append("")
            continue
        # A new run starts wherever two neighbouring hit indices are not adjacent.
        run_starts = np.flatnonzero(np.diff(hits) != 1) + 1
        runs = np.split(hits, run_starts)
        results.append(";".join(f"{int(run[0])} {int(run[-1])}" for run in runs))
    return results


def get_predictions(results):
    """Parse ``"start end;start end"`` strings into lists of ``[start, end]`` integer pairs.

    Args:
        results: Iterable of span strings; ``""`` means no spans.

    Returns:
        A list with one list of ``[start, end]`` pairs per input string.
    """
    def _parse(span_string):
        # An empty string maps to an empty list of spans.
        if span_string == "":
            return []
        pairs = []
        for chunk in span_string.split(';'):
            fields = chunk.split()
            pairs.append([int(fields[0]), int(fields[1])])
        return pairs

    return [_parse(span_string) for span_string in results]

In [None]:
# Left-join the feature table and the patient notes onto the test rows.
test = (
    test
    .merge(features, on=['feature_num', 'case_num'], how='left')
    .merge(patient_notes, on=['pn_num', 'case_num'], how='left')
)
display(test.head())

In [None]:
def prepare_input(cfg, text, feature_text):
    """Tokenize a ``(text, feature_text)`` pair and convert every field to a long tensor.

    All settings are read from ``cfg`` (the tokenizer and ``max_len``), so the
    function no longer depends on a module-level ``CFG`` being defined.

    Args:
        cfg: Object exposing ``tokenizer`` (callable) and ``max_len`` (int).
        text: First sequence passed to the tokenizer.
        feature_text: Second sequence passed to the tokenizer.

    Returns:
        The tokenizer's output mapping with each value replaced by a
        ``torch.long`` tensor.
    """
    inputs = cfg.tokenizer(text, feature_text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    # NOTE(review): no truncation is requested, so the output is only max_len
    # long if the pair already fits -- confirm max_len covers all inputs.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Dataset that yields tokenized ``(pn_history, feature_text)`` pairs, one per row of ``df``."""

    def __init__(self, cfg, df):
        """Keep ``cfg`` and the two text columns of ``df``.

        Args:
            cfg: Configuration object forwarded to ``prepare_input``.
            df: DataFrame with ``feature_text`` and ``pn_history`` columns.
        """
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values

    def __len__(self):
        """Number of rows (length of the feature-text column)."""
        return len(self.feature_texts)

    def __getitem__(self, item):
        """Return the model inputs for row ``item``."""
        note_text = self.pn_historys[item]
        feature_text = self.feature_texts[item]
        return prepare_input(self.cfg, note_text, feature_text)
    


In [None]:
# Remaining settings are attached to CFG as class attributes.
# num_folds is not referenced anywhere else in the visible notebook.
CFG.num_folds =10
# Tokenizer for the same model name that BaseLineModel uses by default.
CFG.tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
CFG.batch_size = 8
# 0 => batches are loaded in the main process (no worker subprocesses).
CFG.num_workers = 0

In [None]:
# Wrap the merged test frame in a dataset and iterate it in its original order
# (shuffle=False, drop_last=False) so that predictions line up with `test` rows.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

In [None]:
# Put both models in evaluation mode. inference_fn calls model.eval() again,
# so this cell is not required for correctness.
model_1.eval()
model_2.eval()
print("done")

In [None]:
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` and collect sigmoid probabilities.

    Args:
        test_loader: Sized iterable yielding mappings of tensors.
        model: Module called as ``model(batch)``; it is switched to eval mode
            and moved to ``device``.
        device: Target device for the model and for every batch tensor.

    Returns:
        NumPy array: the per-batch probability arrays concatenated along the
        first axis.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    with torch.no_grad():
        for batch in tqdm(test_loader, total=len(test_loader)):
            # Move each tensor of the batch to the target device (in place in the mapping).
            for name in list(batch.keys()):
                batch[name] = batch[name].to(device)
            probabilities = model(batch).sigmoid()
            batch_outputs.append(probabilities.to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, instead
# of always running the transformer on the CPU.
inference_device = "cuda" if torch.cuda.is_available() else "cpu"
prediction_1 = inference_fn(test_loader, model_1, device=inference_device)
prediction_2 = inference_fn(test_loader, model_2, device=inference_device)

In [None]:
# Reshape each prediction array to one row per test example and CFG.max_len
# columns. The model's last layer emits a single value per position, so this
# presumably just drops a trailing size-1 axis -- confirm.
prediction_1 = prediction_1.reshape((len(test), CFG.max_len))
prediction_2 = prediction_2.reshape((len(test), CFG.max_len))

In [None]:
# Ensemble: element-wise mean of the two models' probabilities. Computed in a
# single expression so that re-running the cell cannot halve the values again.
pred_fin = (prediction_1 + prediction_2) / 2

In [None]:
# Map token-level probabilities onto characters of each note: once per model
# and once for the averaged ensemble.
char_probs_1 = get_char_probs(test['pn_history'].values, prediction_1, CFG.tokenizer)
char_probs_2 = get_char_probs(test['pn_history'].values, prediction_2, CFG.tokenizer)
char_probs_fin = get_char_probs(test['pn_history'].values, pred_fin, CFG.tokenizer)

In [None]:
import itertools

In [None]:
# Threshold the character probabilities at 0.5 and format the resulting spans
# as "start end;start end" strings.
res_1 =  get_results(char_probs_1, th = 0.5)
res_2 =  get_results(char_probs_2, th = 0.5)
res_fin =  get_results(char_probs_fin, th = 0.5)

In [None]:
# Display the span strings predicted by model_1.
res_1

In [None]:
# Display the span strings predicted by model_2.
res_2

In [None]:
# Display the span strings obtained from the averaged probabilities.
res_fin

In [None]:
# Load the submission template whose `location` column is filled in below.
ss = pd.read_csv("../input/nbme-score-clinical-patient-notes/sample_submission.csv")

In [None]:
# Fill the template with model_1's spans; this assumes its rows are in the same order as `test`.
# NOTE(review): this submits res_1 only, although the ensemble result res_fin
# is also computed above -- confirm that this is intentional.
ss['location'] = res_1

In [None]:
# Write the submission file without the DataFrame index column.
ss.to_csv("submission.csv", index = False )