In [None]:
# Set up Git LFS for the current user before cloning.
# NOTE(review): presumably required so the large model/embedding files in the
# repository cloned in the next cell are fetched via LFS — confirm the repo uses LFS.
!git lfs install

In [None]:
# Clone the model repository into ./rubert-base-cased-mcn. Later cells read
# ConceptVectorizer/concept_to_idx.json and the thesaurus embeddings from that directory.
!git clone https://huggingface.co/sagteam/rubert-base-cased-mcn

In [1]:
from vectorization import ConceptVectorizer
import json
from tools.parse_RDRS import simple_parse_sagnlp_RDRS
from dataset import SimpleMedNormDataset
import torch
from models import CADEC_SoTa
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score
import pandas as pd

In [2]:
# Whether PyTorch can use a CUDA device. This flag is passed to the dataset
# (use_cuda=...) and decides which device the model is evaluated on.
USE_CUDA = torch.cuda.is_available()

<h2>Prepare data</h2>

In [3]:
# Load the concept -> index mapping shipped with the model and build its inverse,
# which is used later to turn a predicted index back into a concept code.
with open('./rubert-base-cased-mcn/ConceptVectorizer/concept_to_idx.json', 'r') as map_f:
    concept_to_idx = json.load(map_f)
idx_to_concept = {idx: concept for concept, idx in concept_to_idx.items()}

# Keep only the rows tagged 'ADR' whose 'fold id' is 4: this is the test split.
RDRS_df = pd.read_csv('./Data/Full_corps/RDRS.csv')
is_adr = RDRS_df['tag'] == 'ADR'
in_fold_4 = RDRS_df['fold id'] == 4
RDRS_df = RDRS_df[is_adr & in_fold_4]

# Mentions are the model inputs; 'pt code' values are stringified to serve as labels.
test_phrases = RDRS_df['mention'].to_list()
test_concepts = RDRS_df['pt code'].apply(str).to_list()

In [4]:
# Wrap the test mentions and their codes in the project dataset class.
# NOTE(review): the third argument looks like the tokenizer/model identifier — confirm
# against SimpleMedNormDataset's signature.
ds_ts = SimpleMedNormDataset(
    test_phrases,
    test_concepts,
    'sagteam/rubert-base-cased-mcn',
    concept_to_idx,
    use_cuda=USE_CUDA,
)

# One sample per batch, in dataset order; the evaluation loop relies on batch_size=1.
tsloader = torch.utils.data.DataLoader(
    ds_ts,
    batch_size=1,
    shuffle=False,
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


<h2>Evaluate the model on RDRS fold 4 (ADR subset)</h2>

In [5]:
# Evaluate the model on the test loader: load the precomputed MedDRA thesaurus
# embeddings, build the network, and collect one predicted code per test mention.
meddra_thesaurus_embeddings_path = './rubert-base-cased-mcn/ConceptVectorizer/thesaurus_embeddings_meddra_origin.pt'
device = 'cpu' if not USE_CUDA else 'cuda'

# The second argument of torch.load is `map_location`, which must describe a device.
# The CUDA branch used to pass a file path there; both branches now map the stored
# tensors onto the evaluation device explicitly.
thesaurus_embeddings = torch.load(
    meddra_thesaurus_embeddings_path,
    map_location=torch.device(device),
)

net = CADEC_SoTa('./rubert-base-cased-mcn', thesaurus_embeddings)
net.to(device)
net.eval()  # inference mode for the module (no training-time behaviour)

model_answers = []  # predicted concept codes, one per test sample
real_codes = []     # gold concept codes as yielded by the loader

for data in tqdm(tsloader):
    inputs = data['tokenized_phrases']
    with torch.no_grad():
        outputs_dict = net(inputs)
        # argmax() without `dim` indexes the flattened tensor, so this is a
        # per-sample prediction only because tsloader uses batch_size=1.
        pred_meddra_code = idx_to_concept[int(outputs_dict['output'].argmax())]
    model_answers.append(pred_meddra_code)
    # NOTE(review): assumes data['label_codes'] is an iterable holding this batch's
    # gold code(s) — confirm against SimpleMedNormDataset.
    real_codes.extend(data['label_codes'])

  0%|          | 0/984 [00:00<?, ?it/s]

In [6]:
# Micro-averaged F1 of the predicted codes against the gold codes.
micro_f1 = f1_score(real_codes, model_answers, average='micro')
print(f"f1-micro: {micro_f1}")

f1-micro: 0.7134146341463414
