In [1]:

import os
import sys
import argparse
import math
import numpy as np
import timeit
import torch
import torch.utils.data as data_utils
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.model_selection import train_test_split
sys.path.append("/home/jovyan/20230406_ArticleClassifier/ArticleClassifier")

import src.general.global_variables as gv
from src.general.utils import cc_path

sys.path.append(gv.PROJECT_PATH)


sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname('data_loader.py'), os.path.pardir)))
from src.data.data_loader import DataLoader as OwnDataLoader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler



In [2]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
from transformers import BertTokenizer, BertModel, AutoTokenizer
MAX_LEN = 512
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # create empty lists to store outputs
    input_ids = []
    attention_masks = []
    
    #for every sentence...
    
    for sent in tqdm(data):
        # 'encode_plus will':
        # (1) Tokenize the sentence
        # (2) Add the `[CLS]` and `[SEP]` token to the start and end
        # (3) Truncate/Pad sentence to max length
        # (4) Map tokens to their IDs
        # (5) Create attention mask
        # (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text = sent,   #preprocess sentence
            add_special_tokens = True,         #Add `[CLS]` and `[SEP]`
            max_length= MAX_LEN  ,             #Max length to truncate/pad
            pad_to_max_length = True,          #pad sentence to max length 
            return_attention_mask= True,       #Return attention mask 
            truncation=True
        )
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))
        
    #convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    
    return input_ids,attention_masks

In [3]:
def load_bert_model(model_path):
    do_lower_case = True
    if model_path == 'scibert_scivocab_uncased':
        model = BertModel.from_pretrained(model_version)
    else:
        model = torch.load(cc_path(model_path))

    return model.base_model

# # path options
# 'scibert_scivocab_uncased'
# f'models/embedders/finetuned_bert_56k_20e_3lay_best_iter.pt'
# f'models/embedders/litcovid_finetuned_bert_56k_20e_3lay_best_iter_meta.pt'
# f'models/embedders/litcovid_pretrained_best_iter_meta_stopwords.pt'
# f'models/baselines/paula_finetuned_bert_56k_10e_tka.pt')

model_path = f'models/embedders/litcovid_pretrained_best_iter_meta_stopwords.pt'
BERTmodel = load_bert_model(model_path)


In [4]:
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')


In [5]:
def load_all_canary_data():
    # load all the data
    loc_dict = {
        'processed_csv': cc_path('data/processed/canary/articles_cleaned.csv'),
        'abstract_embeddings': cc_path('data/processed/canary/embeddings_fasttext_20230410.csv'),
        'keyword_network': cc_path('data/processed/canary/keyword_network_weighted.pickle'),
        'author_network': cc_path('data/processed/canary/author_network.pickle')
    }
    data_loader = OwnDataLoader(loc_dict)
    processed_df = data_loader.load_processed_csv()

    processed_df['pui'] = processed_df['pui'].astype(str)

    label_columns = processed_df.loc[:, ~processed_df.columns.isin(
        ['file_name', 'title', 'keywords', 'abstract', 'abstract_2', 'authors', 'organization', 'chemicals',
         'num_refs', 'date-delivered', 'labels_m', 'labels_a'])]
    label_columns.loc[:, label_columns.columns.difference(['pui'])] = label_columns.loc[
                                                                      :, label_columns.columns.difference(['pui'])].astype(str)

    with open(cc_path(f'data/train_indices.txt')) as f:
        train_puis = f.read().splitlines()
    with open(cc_path(f'data/val_indices.txt')) as f:
        val_puis = f.read().splitlines()
    with open(cc_path(f'data/test_indices.txt')) as f:
        test_puis = f.read().splitlines()

    return processed_df, train_puis, val_puis, test_puis

def generate_canary_embedding_text(df):
    df['str_keywords'] = df['keywords'].str.replace('[', ' ').str.replace(']', ' ').str.replace(', ', ' ').str.replace("'", '')
    df['embedding_text'] = df['title'] + df['str_keywords'] + df['abstract']

    return df

processed_df, train_puis, val_puis, test_puis = load_all_canary_data()
processed_df = generate_canary_embedding_text(processed_df)




--------------------------------------------------
Loading data...


In [5]:
# Load the custom dataset
def load_all_litcovid_data():
    loc_dict = {
        'processed_csv': cc_path('data/processed/litcovid/litcovid_articles_cleaned.csv'),
        'scibert_embeddings': cc_path('data/processed/litcovid/litcovid_embeddings_scibert_finetuned_20230529_meta_stopwords.csv'),
        'keyword_network': cc_path('data/processed/litcovid/litcovid_keyword_network_weighted.pickle'),
        'xml_embeddings': cc_path('data/processed/litcovid/litcovid_embeddings_xml_20230518_68.ftr'),
        'label_network': cc_path('data/processed/litcovid/litcovid_label_network_weighted.pickle')
    }
    data_loader = OwnDataLoader(loc_dict)
    processed_df = data_loader.load_processed_csv()
    processed_df.dropna(subset=['abstract'], inplace=True)

    label_columns = processed_df.loc[:, ~processed_df.columns.isin(
        ['file_name', 'title', 'keywords', 'abstract', 'abstract_2', 'authors', 'organization', 'chemicals',
         'num_refs', 'date-delivered', 'labels_m', 'labels_a', 'journal', 'pub_type', 'doi', 'label', 'label_m', 'list_label'])]
    label_columns.loc[:, label_columns.columns.difference(['pui'])] = label_columns.loc[:,
        label_columns.columns.difference(['pui'])].astype(int)

    with open(cc_path(f'data/litcovid_train_indices.txt')) as f:
        train_puis = f.read().splitlines()
    with open(cc_path(f'data/litcovid_val_indices.txt')) as f:
        val_puis = f.read().splitlines()
    with open(cc_path(f'data/litcovid_test_indices.txt')) as f:
        test_puis = f.read().splitlines()

    return processed_df, train_puis, val_puis, test_puis

def generate_litcovid_embedding_text(df):

    df['str_keywords'] = df['keywords'].str.replace('[', ' ').str.replace(']', ' ').str.replace(', ', ' ').str.replace("'", '')
    df['embedding_text'] = df['title'] + " " + df['journal'] + " " + df['pub_type'].str.replace(';', ' ') + " " + df['str_keywords'] + df['abstract']
    return df

processed_df, train_puis, val_puis, test_puis = load_all_litcovid_data()
processed_df = generate_litcovid_embedding_text(processed_df)


Start loading data...


100%|██████████| 24946/24946 [00:27<00:00, 919.91it/s] 
100%|██████████| 6236/6236 [00:06<00:00, 969.06it/s] 
100%|██████████| 2489/2489 [00:02<00:00, 973.47it/s] 


In [8]:
def generate_dataloader_objects(label_columns, processed_df, train_puis, val_puis, test_puis, batch_size=32):
    train_set, train_masks = preprocessing_for_bert(processed_df.loc[processed_df.pui.isin(train_puis), 'embedding_text'])
    val_set, val_masks = preprocessing_for_bert(processed_df.loc[processed_df.pui.isin(val_puis), 'embedding_text'])
    test_set, test_masks = preprocessing_for_bert(processed_df.loc[processed_df.pui.isin(test_puis), 'embedding_text'])

    train_labels = torch.tensor(label_columns.loc[processed_df.pui.isin(train_puis), label_columns.columns.difference(['pui'])].to_numpy(dtype=np.int8))
    val_labels = torch.tensor(label_columns.loc[processed_df.pui.isin(val_puis), label_columns.columns.difference(['pui'])].to_numpy(dtype=np.int8))
    test_labels = torch.tensor(label_columns.loc[processed_df.pui.isin(test_puis), label_columns.columns.difference(['pui'])].to_numpy(dtype=np.int8))

    train_data = TensorDataset(train_set.to(device),train_masks.to(device), train_labels.to(device))
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    val_data = TensorDataset(val_set.to(device), val_masks.to(device), val_labels.to(device))
    val_sampler = RandomSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    test_data = TensorDataset(test_set.to(device), test_masks.to(device), test_labels.to(device))
    test_sampler = RandomSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    return {'train': train_dataloader, 'val': val_dataloader, 'test': test_dataloader}, {'train': train_set, 'val': val_set, 'test': test_set}

In [None]:

dataloaders, datasets = generate_dataloader_objects(label_columns, processed_df, train_puis, val_puis, test_puis, batch_size=32)

In [7]:
train_labels

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0]], dtype=torch.int8)

In [9]:


# create Network structure

In [10]:

class BasicModule(nn.Module):
    def __init__(self):
        super(BasicModule, self).__init__()
        self.model_name = str(type(self))

    def load(self, path):
        self.load_state_dict(torch.load(path))

    def save(self, path=None):
        if path is None:
            raise ValueError('Please specify the saving road!!!')
        torch.save(self.state_dict(), path)
        return path


# In[9]:


def get_embedding_layer(embedding_weights):
    word_embeddings = nn.Embedding(num_embeddings=embedding_weights.size(0), embedding_dim=embedding_weights.size(1))
    word_embeddings.weight.data.copy_(embedding_weights)
    word_embeddings.weight.requires_grad = False  # not train
    return word_embeddings


class Hybrid_XML(BasicModule):
    def __init__(self, num_labels=3714, vocab_size=30001, embedding_size=300, embedding_weights=None,
                 max_seq=300, hidden_size=256, d_a=256, label_emb=None):
        super(Hybrid_XML, self).__init__()
        self.embedding_size = embedding_size
        self.num_labels = num_labels
        self.max_seq = max_seq
        self.hidden_size = hidden_size

        self.bert = BERTmodel
        for name, param in self.bert.named_parameters():
            param.required_grad = False
            if 'layer.11' in name:
                param.requires_grad = True
            else:
                param.requires_grad = False

        # interaction-attention layer
        self.key_layer = torch.nn.Linear(self.embedding_size, self.hidden_size)
        self.query_layer = torch.nn.Linear(7, self.hidden_size)

        # self-attn layer
        self.linear_first = torch.nn.Linear(self.embedding_size, d_a)
        self.linear_second = torch.nn.Linear(d_a, self.num_labels)

        # weight adaptive layer
        self.linear_weight1 = torch.nn.Linear(self.embedding_size, 1)
        self.linear_weight2 = torch.nn.Linear(self.embedding_size, 1)

        # shared for all attention component
        self.linear_final = torch.nn.Linear(768, self.hidden_size)
        self.decrease_emb_size = torch.nn.Linear(self.embedding_size, 768)
        self.output_layer = torch.nn.Linear(self.hidden_size, 1)

        label_embedding = torch.FloatTensor(self.num_labels, 7)
        
#         label_emb = torch.nn.functional.pad(label_emb, pad=(0, 384-52), mode='constant', value=0)
        if label_emb is None:
            nn.init.xavier_normal_(label_embedding)
        else:
            label_embedding.copy_(label_emb)
        self.label_embedding = nn.Parameter(label_embedding, requires_grad=False)

    def init_hidden(self, batch_size):
        if torch.cuda.is_available():
            return (
            torch.zeros(2, batch_size, self.hidden_size).cuda(), torch.zeros(2, batch_size, self.hidden_size).cuda())
        else:
            return (torch.zeros(2, batch_size, self.hidden_size), torch.zeros(2, batch_size, self.hidden_size))

    def forward(self, input_ids, attention_mask, embedding_generation=False):

#         emb = self.word_embeddings(x)

#         hidden_state = self.init_hidden(emb.size(0))
#         output, hidden_state = self.lstm(emb, hidden_state)  # [batch,seq,2*hidden]
        
        output = self.bert(input_ids=input_ids,
                            attention_mask = attention_mask)[0]

        # get attn_key
        attn_key = self.key_layer(output)  # [batch,seq,hidden]
        attn_key = attn_key.transpose(1, 2)  # [batch,hidden,seq]
        
        # get attn_query
        label_emb = self.label_embedding.expand(
            (attn_key.size(0), self.label_embedding.size(0), self.label_embedding.size(1)))  # [batch,L,label_emb]
        label_emb = self.query_layer(label_emb)  # [batch,L,label_emb]
        
        # attention
        similarity = torch.bmm(label_emb, attn_key)  # [batch,L,seq]
        similarity = F.softmax(similarity, dim=2)
        out1 = torch.bmm(similarity, output)  # [batch,L,label_emb]

        # self-attn output
        self_attn = torch.tanh(self.linear_first(output))  # [batch,seq,d_a]
        self_attn = self.linear_second(self_attn)  # [batch,seq,L]
        self_attn = F.softmax(self_attn, dim=1)
        self_attn = self_attn.transpose(1, 2)  # [batch,L,seq]
        out2 = torch.bmm(self_attn, output)  # [batch,L,hidden]

        factor1 = torch.sigmoid(self.linear_weight1(out1))
        factor2 = torch.sigmoid(self.linear_weight2(out2))
        factor1 = factor1 / (factor1 + factor2)
        factor2 = 1 - factor1

        out = factor1 * out1 + factor2 * out2
        
        out = self.decrease_emb_size(out)
        
        if embedding_generation:
            return out
        
        out = F.relu(self.linear_final(out))
        out = torch.sigmoid(self.output_layer(out).squeeze(-1))  # [batch,L]

        return out


In [11]:
def get_label_embeddings(path, embedding_size):
    label_emb = np.zeros(embedding_size)
    label_index_mapping = {}
    with open(cc_path(path)) as f:
        for index, i in enumerate(f.readlines()):
            if index == 0:
                continue
            i = i.rstrip('\n')
            n = i.split(',')[0]
            content = i.split(',')[1].split(' ')
            label_index_mapping[index-1] = n
            label_emb[index-1] = [float(value) for value in content]

    # label_emb = (label_emb - label_emb.mean()) / label_emb.std()
    label_emb = torch.from_numpy(label_emb).float()
    return label_emb

label_emb = get_label_embeddings(f'notebooks/litcovid_label_embedding_window3.txt', embedding_size=(7, 7))


In [12]:
print(label_emb)

tensor([[-0.0077,  0.0034,  0.0729,  0.1289, -0.1331, -0.1019,  0.0924],
        [ 0.1282, -0.0717, -0.0537,  0.1057, -0.0221, -0.0650,  0.0938],
        [-0.0694, -0.0260,  0.0412,  0.0144, -0.1186, -0.1352,  0.1047],
        [ 0.0725,  0.0966,  0.0110,  0.0909, -0.0488, -0.0137,  0.0826],
        [-0.1075, -0.0562, -0.1073, -0.0133,  0.1363, -0.1045, -0.0334],
        [-0.0277,  0.1154, -0.0847,  0.0007, -0.0679, -0.1372,  0.0715],
        [-0.1251, -0.0627, -0.0005, -0.0042, -0.1094,  0.1374,  0.0712]])


In [17]:
# # canary
# model = Hybrid_XML(num_labels=7, vocab_size=0, embedding_size=768, embedding_weights=0,
#                    max_seq=200, hidden_size=16, d_a=52, label_emb=label_emb).to(device)

# litcovid 
model = Hybrid_XML(num_labels=7, vocab_size=0, embedding_size=768, embedding_weights=0,
                   max_seq=200, hidden_size=16, d_a=7, label_emb=label_emb).to(device)

In [18]:
print(model)

Hybrid_XML(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [19]:
from sklearn.metrics import f1_score, recall_score, precision_score
import copy
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=0.00001, weight_decay=1e-4)
criterion = torch.nn.BCELoss(reduction='mean')

In [20]:

epoch = 60
best_acc = 0.0
pre_acc = 0.0

# if not os.path.isdir('./rcv_log'):
#     os.makedirs('./rcv_log')
# trace_file='./rcv_log/trace_rcv.txt'



num_labels = 7

def train_epoch(model, dataloader, dataset, pbar_description, optimizer, num_labels, batch_size):
    model.train()
    train_loss = 0
    train_score = 0
    predictions = np.zeros((len(dataset), num_labels))
    real_labels = np.zeros((len(dataset), num_labels))
    for i, (data, atts, labels) in (pbar := tqdm(enumerate(dataloader), position=0)):
        # print('new batch: ', i)
        optimizer.zero_grad()

        # data = data.cuda()
        # labels = labels.cuda()

        pred = model(data, atts)
        loss = criterion(pred, labels.float()) / pred.size(0)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        train_loss += float(loss)
#         train_score += f1_score(labels.detach().cpu().numpy(), np.round(pred.detach().cpu().numpy()), average='macro', zero_division=0)
        predictions[i*batch_size: (i+1)*batch_size, :] = np.round(pred.detach().cpu().numpy())
        real_labels[i*batch_size: (i+1)*batch_size, :] = labels.detach().cpu().numpy()
        pbar.set_description(pbar_description + f', train_loss: {loss}')

    train_score = f1_score(real_labels, predictions, average='micro', zero_division=0)
    train_loss /= i + 1

    return train_loss, train_score

def evaluation(model, dataloader, dataset, pbar_description, optimizer, num_labels, batch_size):
        test_loss = 0
        test_predictions = np.zeros((len(dataset), num_labels))
        test_real_labels = np.zeros((len(dataset), num_labels))

        model.eval()
        with torch.no_grad():
            for i, (data, atts, labels) in enumerate(dataloader):
                # data = data.cuda()
                # labels = labels.cuda()
                pred = model(data, atts)
                loss = criterion(pred, labels.float()) / pred.size(0)

                # metric
                labels_cpu = labels.data.cpu().numpy()
                pred_cpu = np.round(pred.data.cpu().numpy())

                test_loss += float(loss)
                test_predictions[i*batch_size: (i+1)*batch_size, :] = pred_cpu
                test_real_labels[i*batch_size: (i+1)*batch_size, :] = labels_cpu

        batch_num = i + 1
        test_loss /= batch_num
    #     test_score /= batch_num
        test_score = f1_score(test_real_labels, test_predictions, average='micro', zero_division=0)

        return test_score, test_loss, test_predictions, test_real_labels


def train_model(model, dataloaders, datasets, optimizer, num_labels, batch_size):
    best_val_score = 0
    not_improved = 0

    val_loss = 0
    val_score  = 0

    for ep in range(1, epoch + 1):

        pbar_description = f"epoch {ep}, train_loss = {train_loss:.4f}, test_loss = {val_loss:.4f}, train_f1 = {train_score:.4f}, val_f1 = {val_score:.4f}"

        train_loss, train_score, batch_num = train_epoch(model, dataloaders['train'], datasets['train'], pbar_description, optimizer, num_labels, batch_size)

        val_score, val_loss, _, _ = evaluation(model, dataloaders['val'], datasets['val'], pbar_description, optimizer, num_labels, batch_size)

        print('The current test score: ', val_score)
        if val_score > best_val_score:
            best_val_score = val_score
            best_model = copy.deepcopy(model)
            not_improved = 0
        else:
            not_improved += 1

        if not_improved == 5:
            break

        return best_model

best_model = train_model(model, dataloaders, datasets, optimizer, num_labels, batch_size)


epoch 1, train_loss = 0.0000, test_loss = 0.0000, train_f1 = 0.0000, test_f1 = 0.0000, train_loss: 0.004096423275768757: : 780it [05:47,  2.24it/s] 


The current test score:  0.8868498086016662


epoch 2, train_loss = 0.0059, test_loss = 0.0043, train_f1 = 0.8059, test_f1 = 0.8868, train_loss: 0.0040702312253415585: : 780it [05:48,  2.24it/s]


The current test score:  0.8829308909242298


epoch 3, train_loss = 0.0027, test_loss = 0.0043, train_f1 = 0.9209, test_f1 = 0.8829, train_loss: 0.004857152700424194: : 780it [05:48,  2.24it/s] 


The current test score:  0.8861720538156644


epoch 4, train_loss = 0.0027, test_loss = 0.0041, train_f1 = 0.9180, test_f1 = 0.8862, train_loss: 0.0035712234675884247: : 780it [05:48,  2.24it/s]


The current test score:  0.8792535675082327


epoch 5, train_loss = 0.0029, test_loss = 0.0044, train_f1 = 0.9126, test_f1 = 0.8793, train_loss: 0.0022972405422478914: : 780it [05:48,  2.24it/s]


The current test score:  0.880641592920354


epoch 6, train_loss = 0.0029, test_loss = 0.0043, train_f1 = 0.9098, test_f1 = 0.8806, train_loss: 0.0076494961977005005: : 780it [05:48,  2.24it/s]


The current test score:  0.8902195608782435


epoch 7, train_loss = 0.0030, test_loss = 0.0037, train_f1 = 0.9082, test_f1 = 0.8902, train_loss: 0.0035466747358441353: : 780it [05:48,  2.24it/s]


The current test score:  0.885441660621617


epoch 8, train_loss = 0.0031, test_loss = 0.0041, train_f1 = 0.9048, test_f1 = 0.8854, train_loss: 0.00452516321092844: : 780it [05:48,  2.24it/s]  


The current test score:  0.8921067575241342


epoch 9, train_loss = 0.0032, test_loss = 0.0038, train_f1 = 0.9019, test_f1 = 0.8921, train_loss: 0.00777384964749217: : 780it [05:48,  2.24it/s]  


The current test score:  0.8889526542324246


epoch 10, train_loss = 0.0032, test_loss = 0.0037, train_f1 = 0.9018, test_f1 = 0.8890, train_loss: 0.002711153356358409: : 780it [05:48,  2.24it/s] 


The current test score:  0.8774193548387097


epoch 11, train_loss = 0.0032, test_loss = 0.0043, train_f1 = 0.9011, test_f1 = 0.8774, train_loss: 0.004980372730642557: : 780it [05:48,  2.24it/s] 


The current test score:  0.8878552385274263


epoch 12, train_loss = 0.0033, test_loss = 0.0039, train_f1 = 0.8987, test_f1 = 0.8879, train_loss: 0.006095762364566326: : 780it [05:48,  2.24it/s] 


The current test score:  0.8863328822733424


epoch 13, train_loss = 0.0033, test_loss = 0.0039, train_f1 = 0.8987, test_f1 = 0.8863, train_loss: 0.005344081670045853: : 780it [05:48,  2.24it/s] 


The current test score:  0.8897507557177894


In [21]:
test_score, test_loss, test_predictions, test_real_labels = evaluation(model, dataloaders['test'], datasets['test'], pbar_description, optimizer, num_labels, batch_size)


78it [00:22,  3.48it/s]


In [22]:
#     test_score /= batch_num
macro_f1_test_score = f1_score(test_real_labels, test_predictions, average='macro', zero_division=0)
micro_f1_test_score = f1_score(test_real_labels, test_predictions, average='micro', zero_division=0)
macro_recall_test_score = recall_score(test_real_labels, test_predictions, average='macro', zero_division=0)
micro_recall_test_score = recall_score(test_real_labels, test_predictions, average='micro', zero_division=0)
macro_precision_test_score = precision_score(test_real_labels, test_predictions, average='macro', zero_division=0)
micro_precision_test_score = precision_score(test_real_labels, test_predictions, average='micro', zero_division=0)

In [23]:
print(macro_f1_test_score, macro_recall_test_score, macro_precision_test_score, micro_f1_test_score, micro_recall_test_score, micro_precision_test_score)

0.8408344272973511 0.8876959043060806 0.8134234067703165 0.8970251716247141 0.9255555555555556 0.870201096892139


In [54]:
torch.cuda.empty_cache()

In [25]:
torch.save(best_model, cc_path(f'models/xml_embedding/litcovid_xlm_embedder_20230529_stopwords.pt'))


In [7]:
best_model = torch.load(cc_path(f'models/xml_embedding/litcovid_xlm_embedder_20230518_all_data.pt'), map_location=device)


In [24]:
del train_data, train_sampler, train_dataloader, val_data, val_sampler, val_dataloader, test_data, test_dataloader

In [26]:
emb_batch_size = 256
# abstracts_to_embed = np.array(processed_df['embedding_text'], dtype=int)

# embedding_data = data_utils.TensorDataset(torch.from_numpy(abstracts_to_embed).type(torch.LongTensor), 
#                                           torch.from_numpy(puis_to_embed).type(torch.LongTensor))
# final_data = data_utils.DataLoader(embedding_data, emb_batch_size, drop_last=False)


full_set = processed_df.dropna(subset=['embedding_text'])

puis_to_embed = np.array(full_set.loc[:, 'pui'].to_list(), dtype=int)

final_set, final_masks = preprocessing_for_bert(full_set.loc[:, 'embedding_text'])
final_data = TensorDataset(final_set.to(device), final_masks.to(device),  torch.from_numpy(puis_to_embed).type(torch.LongTensor).to(device))
final_dataloader = DataLoader(final_data, batch_size=emb_batch_size)

100%|██████████| 33671/33671 [00:37<00:00, 905.97it/s] 


In [27]:
import pandas as pd
import gc

In [28]:
embedding_columns =  [f'd_{i}' for i in range(7*768)]
xml_embedding_df = pd.DataFrame(columns=embedding_columns, index=full_set['pui'].to_numpy(dtype=int)).astype(np.float16)
# xml_embedding_df['embedding'] = xml_embedding_df['embedding'].astype(object)
np.set_printoptions(threshold = 100000000000000)


In [29]:
num_of_embedding_dim = 104
best_model.eval()


with torch.no_grad():
    for i, (data, att_masks, pui) in enumerate(tqdm(final_dataloader)):
        pred = best_model(data, att_masks, embedding_generation=True)
        right_puis =  list(pui.detach().cpu().numpy())
        numpy_preds = pred.detach().cpu().numpy()
        # print(numpy_preds.reshape(numpy_preds.shape[0], numpy_preds.shape[1] * numpy_preds.shape[2]).shape)
        # for idx_batch in range(numpy_preds.shape[0]):
        xml_embedding_df.loc[right_puis, :] = numpy_preds.reshape(numpy_preds.shape[0], numpy_preds.shape[1] * numpy_preds.shape[2])
                
        gc.collect()
        torch.cuda.empty_cache()



100%|██████████| 132/132 [06:19<00:00,  2.88s/it]


In [30]:
xml_embedding_df.reset_index(inplace=True)


  xml_embedding_df.reset_index(inplace=True)


In [24]:
xml_embedding_df.iloc[0]

index     6.245314e+08
d_0       1.072229e+00
d_1       1.634674e-01
d_2      -1.355221e+00
d_3       1.438000e+00
              ...     
d_3531    1.978726e+00
d_3532   -3.448576e-01
d_3533   -1.053946e+00
d_3534    7.694158e+00
d_3535    9.000473e+00
Name: 0, Length: 3537, dtype: float64

In [31]:

xml_embedding_df.to_feather(cc_path('data/processed/litcovid/litcovid_embeddings_xml_20230529_768.ftr'))

In [61]:
pd.options.display.width = 1000

In [100]:
xml_embedding_df

Unnamed: 0,d_0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,...,d_42,d_43,d_44,d_45,d_46,d_47,d_48,d_49,d_50,d_51
624531411,"[-0.016383378, -0.11197758, -0.037253555, -0.2...","[0.010617012, -0.09705223, -0.01200425, -0.120...","[0.028110279, -0.08243386, -0.016131265, -0.18...","[0.0022158436, -0.11320748, -0.010183357, -0.1...","[0.005926796, -0.10500478, -0.010475147, -0.12...","[0.0113420375, -0.09601049, -0.014044623, -0.1...","[0.016601732, -0.09619771, -0.009100688, -0.14...","[0.11544543, -0.005443737, -0.005527966, -0.18...","[0.02586015, -0.086351044, -0.010884014, -0.16...","[0.009882777, -0.09707534, -0.021866538, -0.18...",...,"[0.18155344, 0.026362767, -0.01475883, -0.2147...","[0.0069437977, -0.09799223, -0.02816795, -0.23...","[0.18892401, 0.041114308, -0.00097465096, -0.1...","[0.11469094, -0.015334345, 0.006475803, -0.087...","[0.027374564, -0.08158906, -0.01762917, -0.206...","[0.04705087, -0.065758094, -0.018019354, -0.21...","[0.3474198, 0.15047382, 0.0039090053, -0.13961...","[0.11787519, -0.0064049475, -0.0025091413, -0....","[0.069682136, -0.054890573, -0.011223326, -0.1...","[-0.011351064, -0.100169934, -0.034848228, -0...."
625340088,"[0.06821822, -0.05620522, -0.065800205, -0.222...","[0.10693053, -0.017095909, -0.044121563, -0.09...","[0.10340122, -0.019759048, -0.055330824, -0.16...","[0.044841465, -0.09599258, -0.047074445, -0.14...","[0.042860292, -0.08393292, -0.043275695, -0.10...","[0.038487554, -0.07838504, -0.053617507, -0.16...","[0.05473292, -0.075368986, -0.042547263, -0.12...","[0.18368945, 0.09457679, -0.079515636, -0.1797...","[0.1327912, 0.019922812, -0.06676929, -0.15574...","[0.1424106, 0.018941574, -0.052223623, -0.1458...",...,"[0.24319527, 0.12634419, -0.05570019, -0.18991...","[0.12861286, 0.009207115, -0.063367575, -0.198...","[0.24438602, 0.14896263, -0.07208059, -0.14055...","[0.1749304, 0.08700106, -0.05070314, -0.066946...","[0.100104034, -0.02964319, -0.0711903, -0.2004...","[0.13491955, 0.01794957, -0.067108504, -0.1916...","[0.41909015, 0.33678198, -0.07781066, -0.12629...","[0.17230125, 0.06469397, -0.08423577, -0.16590...","[0.15537813, 0.036530755, -0.06381854, -0.1704...","[0.02809963, -0.08056283, -0.07836892, -0.2682..."
625805682,"[0.028528668, -0.060206894, 0.07548873, -0.348...","[0.042353593, -0.07522996, 0.045501076, -0.324...","[0.043712128, -0.07069669, 0.053283475, -0.341...","[0.0912251, -0.05125811, 0.106232695, -0.31711...","[0.071796775, -0.050604787, 0.09280021, -0.337...","[0.076959535, -0.04865962, 0.090105645, -0.341...","[0.077228464, -0.051366046, 0.099378556, -0.32...","[0.051017053, -0.06567388, 0.10099919, -0.2431...","[0.04003511, -0.047263972, 0.09858238, -0.3281...","[0.054769102, -0.05078943, 0.08115368, -0.2868...",...,"[0.18210159, 0.063631855, 0.05955499, -0.32273...","[0.057935737, -0.034007467, 0.08288933, -0.324...","[0.21283509, 0.080757335, 0.084314525, -0.2392...","[0.099942975, -0.049863294, 0.13453224, -0.256...","[0.046176944, -0.0769093, 0.06805101, -0.29428...","[0.037154716, -0.05500753, 0.088356555, -0.317...","[0.33445185, 0.20716941, 0.06621084, -0.234082...","[0.07703457, -0.032216277, 0.1353079, -0.22812...","[0.06902366, -0.051700268, 0.06834428, -0.3177...","[0.03009884, -0.06154553, 0.08346843, -0.34348..."
626662493,"[0.059955165, -0.02994335, -0.03311316, -0.143...","[0.10139962, 0.009669423, -0.022478202, -0.038...","[0.0790334, -0.01266022, -0.02287779, -0.10124...","[0.030228794, -0.07793985, -0.028091416, -0.13...","[0.036995795, -0.06270175, -0.0315825, -0.1039...","[0.027064249, -0.0709749, -0.031441428, -0.146...","[0.043158136, -0.059222713, -0.027855176, -0.1...","[0.13726412, 0.05951462, 0.010710327, -0.14864...","[0.083741456, 0.0010779575, 0.00037083589, -0....","[0.12572579, 0.037525304, -0.019792218, -0.073...",...,"[0.15025625, 0.0521947, -0.026349314, -0.17327...","[0.11670811, 0.030976295, -0.02641234, -0.1190...","[0.19017154, 0.094812796, -0.003392328, -0.117...","[0.13538006, 0.04384046, 0.006757291, -0.05630...","[0.07334437, -0.024985045, -0.021543493, -0.13...","[0.097782776, 0.006623201, -0.026629627, -0.12...","[0.3373683, 0.24063022, -0.0021311226, -0.1236...","[0.15725388, 0.042757872, -0.008270113, -0.101...","[0.12123768, 0.029323883, -0.025152106, -0.115...","[0.022587577, -0.067165665, -0.03826805, -0.19..."
626822402,"[0.27494726, -0.061647326, 0.18791562, -0.2993...","[0.3305084, -0.046140015, 0.18127444, -0.38189...","[0.30549046, -0.052407146, 0.18561853, -0.3563...","[0.33089703, -0.05900829, 0.19056454, -0.28315...","[0.3212989, -0.053632278, 0.18682936, -0.28965...","[0.33208808, -0.05664587, 0.18075578, -0.28696...","[0.32238388, -0.05355747, 0.18911934, -0.28512...","[0.2843451, -0.075362384, 0.17306942, -0.26655...","[0.26771063, -0.066218354, 0.17441167, -0.2618...","[0.28459507, -0.07167025, 0.17930306, -0.29999...",...,"[0.34554872, -0.114436746, 0.16829908, -0.2782...","[0.25267428, -0.080995634, 0.17711401, -0.2627...","[0.3340537, -0.093892336, 0.1561747, -0.314642...","[0.3147732, -0.065012425, 0.17388539, -0.25686...","[0.29047766, -0.05335179, 0.18333776, -0.34369...","[0.2530854, -0.07009541, 0.18177888, -0.267674...","[0.3673926, -0.15189306, 0.14653346, -0.250014...","[0.27901053, -0.060020022, 0.20105505, -0.2875...","[0.2838174, -0.074696735, 0.17902729, -0.29193...","[0.25677127, -0.058577746, 0.19382006, -0.2792..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011621972,,,,,,,,,,,...,,,,,,,,,,
2011622024,,,,,,,,,,,...,,,,,,,,,,
2011622065,,,,,,,,,,,...,,,,,,,,,,
2011626864,,,,,,,,,,,...,,,,,,,,,,
