In [None]:
# Install the Hugging Face transformers package imported below.
# sentencepiece is presumably needed by the DeBERTa tokenizer — TODO confirm.
# NOTE(review): versions are not pinned, so a re-run may pick up different releases.
!pip install transformers
!pip install sentencepiece

In [2]:
# Base directory on the mounted Google Drive; dataset, tokenizer and checkpoint paths below are built from it.
root_dir = '/gdrive/My Drive/NBME/'

In [3]:
# Mount Google Drive at /gdrive so that paths under root_dir resolve.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [4]:
# Needed only for the fast tokenizer of DeBERTa v2/v3.
# Run this cell BEFORE `transformers` is imported anywhere in the kernel.
import shutil
from pathlib import Path

# Installed transformers package and its DeBERTa-v2 model sub-package.
transformers_path = Path("/usr/local/lib/python3.7/dist-packages/transformers")
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# 1) Replace the slow->fast conversion script with the copy from the working directory.
convert_file = "convert_slow_tokenizer.py"
conversion_path = transformers_path.joinpath(convert_file)

if conversion_path.exists():
    conversion_path.unlink()
shutil.copy(convert_file, transformers_path)

# 2) Replace both DeBERTa-v2 tokenizer modules the same way.
for filename in ('tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py'):
    filepath = deberta_v2_path.joinpath(filename)
    if filepath.exists():
        filepath.unlink()
    shutil.copy(filename, filepath)

In [5]:
import os
import gc
import ast
import sys
import copy
import json
import math
import string
import pickle
import random
import itertools
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
%env TOKENIZERS_PARALLELISM=true

# Use the GPU when torch can see one, otherwise fall back to the CPU; print the choice for the record.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

env: TOKENIZERS_PARALLELISM=true
cpu


In [6]:
class CFG:
    """Notebook-wide settings, read as class attributes (the class is never instantiated here)."""
    # Worker processes used by the DataLoader.
    num_workers = 4
    # Directory holding the fold checkpoints, and the pickled model config next to them.
    path = root_dir
    config_path = root_dir + 'config.pth'
    # Hugging Face model identifier of the backbone.
    model = "microsoft/deberta-v3-large"
    batch_size = 32
    # Dropout probability applied before the linear head.
    fc_dropout = 0.2
    # Every tokenized input is padded to this many tokens.
    max_len = 466
    seed = 42
    # Number of folds and the subset of folds the loops below iterate over.
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

# Utils

In [7]:
def seed_everything(seed=42):
    '''
    Seed every random number generator the notebook relies on.

    Covers Python's `random`, the PYTHONHASHSEED environment variable, NumPy
    and torch (CPU and, when available, CUDA) so that repeated runs produce
    the same results.

    Args:
        seed (int): Value used for all generators. Defaults to 42.
    '''
    # Fixed hash seed, exported as a string for the environment.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN needs both flags set to behave deterministically.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all generators once, using the notebook-wide seed from CFG.
seed_everything(seed=CFG.seed)

# Tokenizer

In [8]:
# Import the fast DeBERTa-v2 tokenizer class directly from its module.
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

# Load the tokenizer files stored under root_dir and expose the tokenizer as CFG.tokenizer,
# which prepare_input reads through its cfg argument.
tokenizer = DebertaV2TokenizerFast.from_pretrained(root_dir+'deberta_tokenizer')
CFG.tokenizer = tokenizer

# Helper functions for scoring

In [9]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline
def micro_f1(preds, truths):
    """
    Micro-averaged F1 computed over binary arrays.

    All per-sample arrays are flattened into one long vector each, so every
    element contributes equally regardless of which sample it came from.

    Args:
        preds (list of lists of ints): Binary predictions, one array per sample.
        truths (list of lists of ints): Binary ground truths, one array per sample.

    Returns:
        float: f1 score.
    """
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)
    return f1_score(flat_truths, flat_preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans as [start, end) pairs.
        length (int, optional): Size of the returned array. When omitted it is
            the largest offset found in `spans`, or 0 if `spans` is empty.

    Returns:
        np array [length]: Binarized spans (1.0 inside a span, 0.0 elsewhere).
    """
    if length is None:
        # np.max raises on an empty sequence, so an empty span list maps to length 0.
        length = np.max(spans) if len(spans) else 0

    binary = np.zeros(int(length))
    for start, end in spans:
        binary[start:end] = 1

    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Each (pred, truth) pair is binarized over a common length (the largest
    offset appearing in either of them) and the pooled arrays are scored with
    micro_f1. Pairs where both sides are empty are left out.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds, bin_truths = [], []

    for pred, truth in zip(preds, truths):
        if len(pred) == 0 and len(truth) == 0:
            continue

        pred_end = np.max(pred) if len(pred) else 0
        truth_end = np.max(truth) if len(truth) else 0
        length = max(pred_end, truth_end)

        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))

    return micro_f1(bin_preds, bin_truths)

In [29]:
def create_labels_for_scoring(df):
    """
    Turn the 'location' column into lists of [start, end] integer pairs.

    Each cell of df['location'] is a list of strings such as '0 1' or
    '10 20;30 40' (several spans separated by ';'). Every span found in a row
    becomes one [start, end] pair; a row with an empty list yields [].

    Rows are processed by position, so the frame does not need a 0..n-1 index.

    Side effect (kept for backward compatibility): the column
    'location_for_create_labels' is written to `df`, holding [] for empty rows
    and a one-element list with the ';'-joined string otherwise,
    e.g. ['0 1', '3 4'] -> ['0 1;3 4'].

    Args:
        df (pd.DataFrame): Frame with a 'location' column of lists of strings.

    Returns:
        list of lists of [int, int]: One list of spans per row, in row order.
    """
    joined_locations = [
        [';'.join(locations)] if locations else []
        for locations in df['location']
    ]
    # Assign the whole column at once. An object Series keeps each cell a list and
    # avoids per-cell `.loc` assignment of nested lists.
    df['location_for_create_labels'] = pd.Series(joined_locations, index=df.index, dtype=object)

    truths = []
    for location_list in joined_locations:
        truth = []
        if len(location_list) > 0:
            for span in location_list[0].split(';'):
                bounds = span.split()
                truth.append([int(bounds[0]), int(bounds[1])])
        truths.append(truth)

    return truths

def get_char_probs(texts, predictions, tokenizer):
    """
    Spread per-token predictions over the characters each token covers.

    Every text is tokenized on its own with offset mapping enabled; the i-th
    prediction value is written to the character range reported for the i-th
    token. Characters no token covers keep the initial value 0.

    Args:
        texts: Sequence of strings.
        predictions: Per-text sequences of token-level values, paired
            positionally with the tokenizer's offsets.
        tokenizer: Callable returning a mapping with an 'offset_mapping' entry.

    Returns:
        list of np arrays: One array per text, of length len(text).
    """
    results = [np.zeros(len(text)) for text in texts]

    for char_probs, text, prediction in zip(results, texts, predictions):
        offsets = tokenizer(text,
                            add_special_tokens=True,
                            return_offsets_mapping=True)['offset_mapping']
        for span, token_pred in zip(offsets, prediction):
            char_probs[span[0]:span[1]] = token_pred

    return results


def get_results(char_probs, th=0.5):
    """
    Threshold character probabilities and encode the hits as location strings.

    Positions whose probability is >= th are shifted by +1, grouped into runs
    of consecutive values, and each run is written as "<first> <last>". Runs
    are joined with ';'; a text with no hit yields an empty string.

    Args:
        char_probs: Iterable of 1-D np arrays of probabilities.
        th (float): Inclusive threshold. Defaults to 0.5.

    Returns:
        list of str: One location string per input array.
    """
    results = []
    for char_prob in char_probs:
        hits = np.where(char_prob >= th)[0] + 1
        if hits.size:
            # Cut wherever two neighbouring hits are not consecutive integers.
            cuts = np.where(np.diff(hits) != 1)[0] + 1
            runs = np.split(hits, cuts)
        else:
            runs = []
        results.append(";".join(f"{run[0]} {run[-1]}" for run in runs))
    return results


def get_predictions(results):
    """
    Parse location strings back into lists of [start, end] integer pairs.

    Args:
        results (list of str): Strings such as "2 3;5 5"; "" means no span.

    Returns:
        list of lists of [int, int]: One list of spans per input string.
    """
    def _parse(result):
        # The empty string stands for "nothing predicted".
        if result == "":
            return []
        pieces = (chunk.split() for chunk in result.split(';'))
        return [[int(piece[0]), int(piece[1])] for piece in pieces]

    return [_parse(result) for result in results]

def get_score(y_true, y_pred):
    """
    Span-level micro F1 between two lists of span lists.

    The arguments are forwarded positionally to span_micro_f1 in
    (y_true, y_pred) order.
    """
    return span_micro_f1(y_true, y_pred)


In [39]:
# Peek at the raw training annotations; 'location' is still a string-encoded list at this point.
train = pd.read_csv(root_dir+'dataset/train.csv')
train.head(2)

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']


In [82]:
# Load the ground-truth annotations and a file of predicted locations for the same ids.
train = pd.read_csv(root_dir+'dataset/train.csv')
train_preds = pd.read_csv(root_dir+'dataset/train_all_predicted.csv')
# train['location'] = train['location'].apply(ast.literal_eval)
# 'location' is still the raw CSV string here, so rows without any annotation are exactly '[]'; drop them.
train = train[train['location'] != '[]']
train.reset_index(drop = True, inplace = True)
train = train[['id', 'location']]
# Left-join on id. Both frames carry a 'location' column, so the result has
# location_x (from train) and location_y (from train_preds).
final_train = train.merge(train_preds, on=['id'], how='left')
# Drop every row that has a missing value in any column after the join.
final_train.dropna(inplace = True)

In [83]:
def create_list_tp(data):
    """Wrap a string in the source text of a one-element list literal, e.g. '1 2' -> "['1 2']"."""
    return "['" + data + "']"
# location_x (from train.csv) is a string-encoded Python list; parse it into a real list.
final_train['location_x'] = final_train['location_x'].apply(ast.literal_eval)
# location_y (from the predictions file) is first wrapped into a one-element list literal,
# then parsed the same way, so both columns end up holding lists of strings.
final_train['location_y'] = final_train['location_y'].apply(create_list_tp)
final_train['location_y'] = final_train['location_y'].apply(ast.literal_eval)
# Split into two frames with the same row order and a fresh index, each exposing
# the 'location' column that create_labels_for_scoring reads.
train = final_train[['id', 'location_x']].copy().reset_index(drop = True)
train.columns = ['id', 'location']
train_preds = final_train[['id', 'location_y']].copy().reset_index(drop = True)
train_preds.columns = ['id', 'location']
# Convert both sides to lists of [start, end] spans.
train_labels = create_labels_for_scoring(train)
train_pred_labels = create_labels_for_scoring(train_preds)

In [86]:
# Span-level micro F1 between the annotated spans and the spans read from the predictions file.
get_score(train_labels, train_pred_labels)

0.9094147632298403

# Data Loading

In [None]:
def preprocess_features(features):
    """
    Overwrite the feature text of the row labelled 27 with a fixed string.

    The frame is modified in place and also returned.

    Args:
        features (pd.DataFrame): Frame with a 'feature_text' column.

    Returns:
        pd.DataFrame: The same `features` object.
    """
    row_label, column = 27, 'feature_text'
    features.loc[row_label, column] = "Last-Pap-smear-1-year-ago"
    return features

# Load the training table that already carries a 'fold' column.
train = pd.read_csv(root_dir+'dataset/train_with_folds.csv')
# 'Unnamed: 0' is presumably a saved index column from an earlier to_csv — TODO confirm.
train.drop(columns = ['Unnamed: 0'], inplace = True)
# Both columns are stored as string-encoded Python lists; parse them into real lists.
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)

def roundgg(data):
    """Return str(data) left-padded with zeros to at least 5 characters."""
    return str(data).zfill(5)

def roundgg2(data):
    """Return str(data) left-padded with zeros to at least 3 characters."""
    return str(data).zfill(3)

# Rebuild the row id as "<pn_num padded to 5 digits>_<feature_num padded to 3 digits>".
train['id'] = train['pn_num'].apply(roundgg) +'_'+ train['feature_num'].apply(roundgg2)
# `test` is a label-free copy of the same rows (inputs plus fold); the fold loops below
# filter it with the same fold mask they apply to `train`.
test = train[['id', 'case_num', 'pn_num', 'feature_num', 'feature_text', 'pn_history', 'fold']].copy()

# test = pd.read_csv(root_dir+'dataset/test.csv')
# submission = pd.read_csv(root_dir+'dataset/sample_submission.csv')
# features = pd.read_csv(root_dir+'dataset/features.csv')
# patient_notes = pd.read_csv(root_dir+'dataset/patient_notes.csv')
# features = preprocess_features(features)
# test = test.merge(features, on=['feature_num', 'case_num'], how='left')
# test = test.merge(patient_notes, on=['pn_num', 'case_num'], how='left')

# Dataset

In [None]:
def prepare_input(cfg, text, feature_text):
    """
    Tokenize a (text, feature_text) pair and convert every field to a long tensor.

    The pair is encoded with special tokens and padded to cfg.max_len; offset
    mappings are not requested. No truncation is requested, so an input longer
    than cfg.max_len is presumably returned at its full length — confirm
    against the tokenizer's behaviour.

    Args:
        cfg: Settings object providing `tokenizer` and `max_len`.
        text (str): First sequence (the patient note in this notebook).
        feature_text (str): Second sequence (the feature description).

    Returns:
        The tokenizer output with each value replaced by a torch.long tensor.
    """
    # Read the length from the cfg that was passed in, not from the global CFG,
    # so the function honours whichever configuration the caller supplies.
    inputs = cfg.tokenizer(text, feature_text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)

    return inputs


class TestDataset(Dataset):
    """Label-free dataset yielding tokenized (pn_history, feature_text) pairs."""

    def __init__(self, cfg, df):
        # Keep only the two text columns, as plain arrays, plus the settings object.
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values

    def __len__(self):
        """Number of rows in the source frame."""
        return len(self.feature_texts)

    def __getitem__(self, item):
        """Tokenize row `item`; the note is the first sequence, the feature text the second."""
        note = self.pn_historys[item]
        feature = self.feature_texts[item]
        return prepare_input(self.cfg, note, feature)

# Model

In [None]:
class ScoringModel(nn.Module):
    """Transformer backbone followed by dropout and a one-unit linear head applied to every token."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        """
        Args:
            cfg: Settings object providing `model` and `fc_dropout`.
            config_path: Path of a saved config object loaded with torch.load.
                When None, the config is fetched for `cfg.model` instead.
            pretrained (bool): Load pretrained backbone weights when True,
                otherwise build the backbone from the config alone.
        """
        super().__init__()
        self.cfg = cfg

        # NOTE(review): torch.load unpickles the file; only use config files from a trusted source.
        self.config = (
            AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            if config_path is None
            else torch.load(config_path)
        )
        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )

        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module; other module types are left untouched."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                # The padding row is reset to zeros after the random init.
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and return the first element of its output."""
        return self.model(**inputs)[0]

    def forward(self, inputs):
        """Apply dropout and the linear head to the backbone features."""
        hidden = self.feature(inputs)
        return self.fc(self.fc_dropout(hidden))

# Inference

In [None]:
def inference_fn(test_loader, model, device):
    """
    Run the model over every batch and collect sigmoid outputs.

    The model is switched to eval mode and moved to `device`. Each batch's
    tensors are moved to `device` in place, the forward pass runs without
    gradient tracking, and the sigmoid of the output is copied to the CPU.

    Args:
        test_loader: Iterable of batches, each a mapping of name -> tensor.
        model: Module called with the whole batch mapping.
        device: Target device for the model and the inputs.

    Returns:
        np array: Per-batch outputs concatenated along the first axis.
    """
    model.eval()
    model.to(device)

    batch_outputs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        for name, tensor in inputs.items():
            inputs[name] = tensor.to(device)

        with torch.no_grad():
            logits = model(inputs)

        batch_outputs.append(logits.sigmoid().to('cpu').numpy())

    return np.concatenate(batch_outputs)

In [None]:
# test_dataset = TestDataset(CFG, test)
# test_loader = DataLoader(test_dataset,
#                          batch_size=CFG.batch_size,
#                          shuffle=False,
#                          num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
# predictions = []
# for fold in CFG.trn_fold:
#     model = ScoringModel(CFG, config_path=CFG.config_path, pretrained=False)
    
#     state = torch.load(CFG.path+f"microsoft-deberta-v3-large_fold{fold}_best.pth",
#                            map_location=torch.device('cpu'))
       
#     model.load_state_dict(state['model'])
#     prediction = inference_fn(test_loader, model, device)
#     prediction = prediction.reshape((len(test), CFG.max_len))
#     char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
#     predictions.append(char_probs)
#     del model, state, prediction, char_probs
#     gc.collect()
#     torch.cuda.empty_cache()
    
# predictions = np.mean(predictions, axis=0)
# np.save(root_dir+'folds_5preds', predictions)

In [None]:
# For every fold: load that fold's checkpoint and score it on the rows whose 'fold' equals it.
for fold in CFG.trn_fold:
    # `test` (inputs) and `train` (labels) are filtered with the same condition and re-indexed.
    # This assumes both frames still hold the same rows in the same order — TODO confirm.
    test_temp = test[test['fold'] == fold].copy()
    test_temp.reset_index(drop = True, inplace = True)
    train_temp = train[train['fold'] == fold].copy()
    train_temp.reset_index(drop = True, inplace = True)
    test_dataset = TestDataset(CFG, test_temp)
    # shuffle=False and drop_last=False keep the predictions in row order and complete.
    test_loader = DataLoader(test_dataset,
                            batch_size=CFG.batch_size,
                            shuffle=False,
                            num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    # NOTE(review): this list is not used anywhere in this loop.
    predictions = []

    # Build the architecture from the saved config only; the weights come from the checkpoint below.
    model = ScoringModel(CFG, config_path=CFG.config_path, pretrained=False)

    # Checkpoints are loaded onto the CPU first; inference_fn moves the model to `device`.
    state = torch.load(CFG.path+f"microsoft-deberta-v3-large_fold{fold}_best.pth",
                            map_location=torch.device('cpu'))
        
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    # One row per example and one value per token position (the head emits a single value per token).
    prediction = prediction.reshape((len(test_temp), CFG.max_len))
    # Token-level values -> character-level values, using offsets of the note text alone.
    char_probs = get_char_probs(test_temp['pn_history'].values, prediction, CFG.tokenizer)
    np.save(root_dir+f'fold_valid/fold_{fold}prediction', prediction)
    # NOTE(review): char_probs is a list with one array per note; if the notes differ in length,
    # recent NumPy versions may refuse to save such a ragged list — confirm on the NumPy version in use.
    np.save(root_dir+f'fold_valid/fold_{fold}char_probs', char_probs)
    # Threshold at 0.45, parse the location strings into spans, and score against the labels.
    results = get_results(char_probs, th=0.45)
    preds = get_predictions(results)
    valid_labels = create_labels_for_scoring(train_temp)
    score = get_score(valid_labels, preds)
    print(f'Fold: {fold} ---> Score: {score}')
    # Release the model and intermediates before the next checkpoint is loaded.
    del model, state, prediction, char_probs, results, preds, valid_labels
    gc.collect()
    torch.cuda.empty_cache()

  0%|          | 0/86 [00:00<?, ?it/s]

Fold: 0 ---> Score: 0.8824369861167273


  0%|          | 0/83 [00:00<?, ?it/s]

Fold: 1 ---> Score: 0.886221888847217


  0%|          | 0/94 [00:00<?, ?it/s]

Fold: 2 ---> Score: 0.8643931950979886


  0%|          | 0/91 [00:00<?, ?it/s]

Fold: 3 ---> Score: 0.8850388477093247


  0%|          | 0/95 [00:00<?, ?it/s]

Fold: 4 ---> Score: 0.8842552989259576


In [None]:
# For every fold: load that fold's checkpoint and score it on all rows whose 'fold' differs from it.
for fold in CFG.trn_fold:
    # `test` (inputs) and `train` (labels) are filtered with the same condition and re-indexed.
    # This assumes both frames still hold the same rows in the same order — TODO confirm.
    test_temp = test[test['fold'] != fold].copy()
    test_temp.reset_index(drop = True, inplace = True)
    train_temp = train[train['fold'] != fold].copy()
    train_temp.reset_index(drop = True, inplace = True)
    test_dataset = TestDataset(CFG, test_temp)
    # shuffle=False and drop_last=False keep the predictions in row order and complete.
    test_loader = DataLoader(test_dataset,
                            batch_size=CFG.batch_size,
                            shuffle=False,
                            num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    # NOTE(review): this list is not used anywhere in this loop.
    predictions = []

    # Build the architecture from the saved config only; the weights come from the checkpoint below.
    model = ScoringModel(CFG, config_path=CFG.config_path, pretrained=False)

    # Checkpoints are loaded onto the CPU first; inference_fn moves the model to `device`.
    state = torch.load(CFG.path+f"microsoft-deberta-v3-large_fold{fold}_best.pth",
                            map_location=torch.device('cpu'))
        
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    # One row per example and one value per token position (the head emits a single value per token).
    prediction = prediction.reshape((len(test_temp), CFG.max_len))
    # Token-level values -> character-level values, using offsets of the note text alone.
    char_probs = get_char_probs(test_temp['pn_history'].values, prediction, CFG.tokenizer)
    # Same outputs as the previous loop, stored under a "_train" suffix.
    np.save(root_dir+f'fold_valid/fold_{fold}prediction_train', prediction)
    # NOTE(review): char_probs is a list with one array per note; if the notes differ in length,
    # recent NumPy versions may refuse to save such a ragged list — confirm on the NumPy version in use.
    np.save(root_dir+f'fold_valid/fold_{fold}char_probs_train', char_probs)
    # Threshold at 0.45, parse the location strings into spans, and score against the labels.
    results = get_results(char_probs, th=0.45)
    preds = get_predictions(results)
    valid_labels = create_labels_for_scoring(train_temp)
    score = get_score(valid_labels, preds)
    print(f'Fold: {fold} ---> Score: {score}')
    # Release the model and intermediates before the next checkpoint is loaded.
    del model, state, prediction, char_probs, results, preds, valid_labels
    gc.collect()
    torch.cuda.empty_cache()

  0%|          | 0/362 [00:00<?, ?it/s]

Fold: 0 ---> Score: 0.9574270131536734


  0%|          | 0/364 [00:00<?, ?it/s]

Fold: 1 ---> Score: 0.9528289327712448


  0%|          | 0/354 [00:00<?, ?it/s]

Fold: 2 ---> Score: 0.9582332304011123


  0%|          | 0/357 [00:00<?, ?it/s]

Fold: 3 ---> Score: 0.9514983391628338


  0%|          | 0/352 [00:00<?, ?it/s]

Fold: 4 ---> Score: 0.9521381449260127


In [None]:
# Threshold `predictions` at 0.45 and store the resulting location strings on `submission`.
# (A stray `test` token in the argument list made this cell a SyntaxError.)
# NOTE(review): `submission` is only assigned in a commented-out line of the data-loading
# cell — confirm it is loaded before this cell runs.
results = get_results(predictions, th=0.45)
submission['location'] = results

In [None]:
# Reload the fold-annotated training table from disk and keep only fold 1.
train = pd.read_csv(root_dir+'dataset/train_with_folds.csv')
train.drop(columns = ['Unnamed: 0'], inplace = True)
# Both columns are stored as string-encoded Python lists; parse them into real lists.
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)
train = train[train['fold'] == 1]
# Fresh 0..n-1 index after the filter.
train.reset_index(drop = True, inplace = True)

In [None]:
# Load saved predictions for fold 1. allow_pickle=True is needed for object arrays;
# only load files from a trusted source, since unpickling can execute code.
# NOTE(review): no visible cell writes 'folds_1preds.npy' — confirm where this file comes from.
predictions = np.load(root_dir+'fold_valid/folds_1preds.npy', allow_pickle=True)

In [None]:
# Length of every prediction array: measure the current element, not always predictions[0].
mx = []
for i in predictions:
    mx.append(i.shape[0])

In [None]:
# Largest value collected in mx.
np.max(mx)

950

In [None]:
# Convert each prediction array to a plain list, right-padded with 0.0 up to 950 entries.
# Rows that already have 950 or more entries are kept unchanged.
final = []
for probs in predictions:
    row = probs.tolist()
    # A non-positive repeat count yields an empty list, so long rows are not altered.
    row.extend([0.0] * (950 - len(row)))
    final.append(row)

In [None]:
# Write the padded rows to a CSV in the current working directory (one row per example).
pd.DataFrame(final).to_csv('fold1predana.csv')

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
train.reset_index(drop = True, inplace = True)

In [None]:
# Write the current `train` frame to a CSV in the current working directory.
train.to_csv('fold1preddetail.csv')

In [None]:
# Threshold the loaded predictions at 0.5 into "start end;..." location strings.
results = get_results(predictions, th=0.5)

In [None]:
# Parse the location strings into lists of [start, end] integer pairs.
preds = get_predictions(results)

In [None]:
# Ground-truth spans for the rows currently held in `train`.
valid_labels = create_labels_for_scoring(train)

In [None]:
def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    NOTE(review): this repeats the definition given in the scoring-helpers
    cell earlier in the notebook and rebinds the same name.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds, bin_truths = [], []

    for pred, truth in zip(preds, truths):
        # Pairs with nothing on either side contribute nothing.
        if len(pred) == 0 and len(truth) == 0:
            continue

        pred_end = np.max(pred) if len(pred) else 0
        truth_end = np.max(truth) if len(truth) else 0
        length = max(pred_end, truth_end)

        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))

    return micro_f1(bin_preds, bin_truths)

In [None]:
# Per-row score table: [f1 for this row, predicted spans rendered as "start end;start end"].
# Rows with neither a prediction nor a label get the placeholder [0, ''].
scores = []
for pr, tr in zip(preds, valid_labels):
    if len(pr) == 0 and len(tr) == 0:
        scores.append([0, ''])
        continue

    span_text = ';'.join(' '.join(str(j) for j in span) for span in pr)
    scores.append([span_micro_f1([tr], [pr]), span_text])

In [None]:
# Write the per-row scores to a CSV in the current working directory.
pd.DataFrame(scores).to_csv('fold1scores.csv')

In [None]:
# Spot check: score of the first row on its own.
span_micro_f1([preds[0]], [valid_labels[0]])

1.0

In [None]:
# Ground-truth spans of the first row.
[valid_labels[0]]

[[[716, 735]]]

In [None]:
# Predicted spans of the first row.
[preds[0]]

[[[716, 735]]]

In [None]:
strt = '''17 yo M college student comes to the clinic due to heart pounding.  He states the he has had episodes of heart pounding for the last 3-4 months.  Nothing makes it better or worse and it goes away on its own.  he denies any chest pain but does report chest pressure during some of the episodes.  He denies warmth or sweating, recent illness, abdominal pain or N/V, tingling in extremities, recent illness or trauma.  He is new to college and has been taking aderol a few times a week.  The last time he took aderol was 2 days ago, which was also the last time he had episode of palpitations.  
ROS: negative except for above
PMH, PSH: none, medications: aderol self administered.  knda, FH: Mother thyroid disease, father heart attach at 52 is okay now.  SH: no changes in weight or appetite, he is on the meal plan and tries to be healthy, he plays intermural soccer and runs, no smoking, alcohol hx.  Triend marijuanna, uses aderol. Stress college'''

In [None]:
# Show which characters of the sample note the offsets 315..323 (end exclusive) cover.
strt[315:323]

'sweating'

In [None]:
# Summary statistics (max, mean, median) of every prediction array.
preds_score = [[np.max(i), np.mean(i), np.median(i)] for i in predictions]

In [None]:
# Write the per-example summary statistics to a CSV in the current working directory.
pd.DataFrame(preds_score, columns = ['max', 'mean', 'median']).to_csv('preds.csv')

In [None]:
# Preview the first two rows of the current `train` frame.
train.head(2)

Unnamed: 0,pn_num,pn_history,feature_text,annotation,location,case_num,feature_num,fold,id
0,16,HPI: 17yo M presents with palpitations. Patien...,17-year,[17yo],[5 9],0,11,2,00016_011
1,16,HPI: 17yo M presents with palpitations. Patien...,Adderall-use,"[adderall, adderrall, adderrall]","[321 329, 404 413, 652 661]",0,6,2,00016_006


In [None]:
# Per-feature threshold search: for every fold and feature text, score the saved
# predictions of that feature's rows at each threshold and write one CSV per fold.
# The thresholds do not depend on the fold, so they are built once.
thres = np.arange(0.05,0.95, 0.05)
for fold in CFG.trn_fold:
    final = []
    # NOTE(review): allow_pickle=True unpickles the file; only load trusted files.
    predictions = np.load(root_dir+f'fold_valid/folds_{fold}preds.npy', allow_pickle=True)
    # The rows of this fold do not depend on the feature, so they are selected once per fold.
    train_temp = train[train['fold'] == fold].copy()
    train_temp.reset_index(drop = True, inplace = True)
    # NOTE(review): this iterates over every row's feature_text, so a feature that occurs in
    # several rows is evaluated (and written) once per occurrence — confirm this is intended.
    for fet in tqdm(train['feature_text']):
        feature_mask = train_temp['feature_text'] == fet
        # Positions of this feature's rows inside train_temp; used to index `predictions`.
        idx = np.array(train_temp[feature_mask].index)
        temp_train = train_temp[feature_mask].copy()
        temp_train.reset_index(drop = True, inplace = True)
        for tho in thres:
            try:
                results = get_results(predictions[idx], th=tho)
                preds = get_predictions(results)
                valid_labels = create_labels_for_scoring(temp_train)
                score = get_score(valid_labels, preds)
                final.append([fet, tho, score])
            except Exception:
                # Best effort: combinations that cannot be scored are skipped.
                # `Exception` (not a bare except) lets KeyboardInterrupt stop the search.
                continue
    pd.DataFrame(final, columns = ['fet', 'thres', 'score']).to_csv(root_dir+f'fold_valid/fet_thres_search{fold}.csv')

  0%|          | 0/9901 [00:00<?, ?it/s]

  0%|          | 0/9901 [00:00<?, ?it/s]

  0%|          | 0/9901 [00:00<?, ?it/s]

  0%|          | 0/9901 [00:00<?, ?it/s]

  0%|          | 0/9901 [00:00<?, ?it/s]

In [None]:
# Threshold the current `predictions` at 0.46 and attach the location strings to `test`.
# The list is assigned positionally, so it must have one entry per row of `test`.
results = get_results(predictions, th=0.46)
test['pred_location'] = results

In [None]:
# Copy the annotated locations next to the predictions. Series assignment aligns on the index,
# so this assumes `test` and `train` share the same index — TODO confirm.
test['act_location'] = train['location']

In [None]:
def con_list(data):
    """Join a list of strings into one ';'-separated string."""
    return ';'.join(data)

# Flatten each list of location strings into a single ';'-separated string.
test['act_location'] = test['act_location'].apply(con_list)

In [None]:
# Save predicted and annotated locations side by side under root_dir.
test.to_csv(root_dir+'5foldpreds_ana.csv')

# Submission

In [None]:
# Threshold `predictions` at 0.45, store the location strings on `submission`,
# preview it, and write the id/location columns to submission.csv.
# NOTE(review): `submission` is only assigned in a commented-out line of the data-loading
# cell — confirm it is loaded before this cell runs.
results = get_results(predictions, th=0.45)
submission['location'] = results
display(submission.head())
submission[['id', 'location']].to_csv('submission.csv', index=False)

Unnamed: 0,id,location
0,00016_000,696 724
1,00016_001,668 693
2,00016_002,203 217
3,00016_003,70 91;176 183
4,00016_004,222 258
