# Getting the Embedding Sequences 
For the following models:
1) NLP Baseline
2) KG Baseline 
3) STonKGs

In [1]:
# Imports 
import getpass
import os
import sys
import time

import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

from stonkgs.constants import (
    CELL_LINE_DIR,
    CELL_TYPE_DIR,
    EMBEDDINGS_PATH,
    DISEASE_DIR,
    LOCATION_DIR,
    MISC_DIR,
    NLP_MODEL_TYPE,
    ORGAN_DIR,
    PRETRAINED_STONKGS_DUMMY_PATH,
    RANDOM_WALKS_PATH,
    SPECIES_DIR,
    VISUALIZATIONS_DIR,
)
from stonkgs.models.kg_baseline_model import _prepare_df, INDRAEntityDataset
from stonkgs.models.nlp_baseline_model import INDRAEvidenceDataset
from stonkgs.models.stonkgs_model import STonKGsForPreTraining

Record details

In [2]:
print(getpass.getuser())
print(sys.version)
print(time.asctime())

hbalabin
3.8.8 (default, Feb 24 2021, 21:46:12) 
[GCC 7.3.0]
Thu Jul  1 13:13:41 2021


## 0. Helper functions

In [3]:
def preprocess_stonkgs_data(unprocessed_df):
    sep_id = 102
    kg_name_to_idx = {key: i for i, key in enumerate(embeddings_dict.keys())}
    
    # Convert random walk sequences to list of numeric indices
    random_walk_idx_dict = {k: [kg_name_to_idx[node] for node in v] for k, v in random_walks_dict.items()}
    
    # Get the length of the text or entity embedding sequences (2 random walks + 2 = entity embedding sequence length)
    random_walk_length = len(next(iter(random_walk_idx_dict.values())))
    half_length = random_walk_length * 2 + 2
    
    # Initialize the preprocessed data
    fine_tuning_preprocessed = []

    # Log progress with a progress bar
    for _, row in tqdm(
        unprocessed_df.iterrows(),
        total=unprocessed_df.shape[0],
        desc='Preprocessing the fine-tuning dataset',
    ):
        # 1. "Token type IDs": 0 for text tokens, 1 for entity tokens
        token_type_ids = [0] * half_length + [1] * half_length

        # 2. Tokenization for getting the input ids and attention masks for the text
        # Use encode_plus to also get the attention mask ("padding" mask)
        encoded_text = tokenizer.encode_plus(
            row['evidence'],
            padding='max_length',
            truncation=True,
            max_length=half_length,
        )
        text_token_ids = encoded_text['input_ids']
        text_attention_mask = encoded_text['attention_mask']

        # 3. Get the random walks sequence and the node indices, add the SEP (usually with id=102) in between
        # Use a sequence of UNK tokens if the node is not contained in the dictionary of the nodes from pre-training
        random_w_source = random_walk_idx_dict[
            row['source']
        ] if row['source'] in random_walk_idx_dict.keys() else [unk_id] * random_walk_length
        random_w_target = random_walk_idx_dict[
            row['target']
        ] if row['target'] in random_walk_idx_dict.keys() else [unk_id] * random_walk_length
        random_w_ids = random_w_source + [sep_id] + random_w_target + [sep_id]

        # 4. Total attention mask (attention mask is all 1 for the entity sequence)
        attention_mask = text_attention_mask + [1] * half_length

        # 5. Total input_ids = half text ids + half entity ids
        input_ids = text_token_ids + random_w_ids

        # Add all the features to the preprocessed data
        fine_tuning_preprocessed.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,  # Remove the MLM, ELM and NSP labels since it's not needed anymore
        })

    # Put the preprocessed data into a dataframe
    fine_tuning_preprocessed_df = pd.DataFrame(fine_tuning_preprocessed)

    return fine_tuning_preprocessed_df

## 1. Load some example sequences

In [4]:
task_dir = SPECIES_DIR
number_unique_tags = 3
dataset_version = "species_no_duplicates.tsv"
number_entries = 800

In [5]:
task_specific_dataset = pd.read_csv(os.path.join(task_dir, dataset_version), sep="\t", index_col=None)
if "Unnamed: 0" in task_specific_dataset.columns.values:
    task_specific_dataset.drop(columns=["Unnamed: 0"], inplace=True)

Filter out unseen nodes

In [6]:
embeddings_dict = _prepare_df(EMBEDDINGS_PATH)
random_walks_dict = _prepare_df(RANDOM_WALKS_PATH)

In [7]:
task_specific_dataset = task_specific_dataset[
    task_specific_dataset['source'].isin(embeddings_dict.keys()) & task_specific_dataset['target'].isin(embeddings_dict.keys())
].reset_index(drop=True)

In [8]:
task_specific_dataset['class'].value_counts()

9606     18633
10090     2857
10116      275
Name: class, dtype: int64

Sample the present classes equally

In [68]:
sampled_df = pd.DataFrame()

for cls in np.unique(task_specific_dataset["class"]):
    cls_specific_samples = task_specific_dataset[task_specific_dataset['class'] == cls].sample(
        n=number_entries//number_unique_tags)
    sampled_df = sampled_df.append(cls_specific_samples)
    
sampled_df.reset_index(drop=True, inplace=True)

In [69]:
sampled_df["class"].value_counts()

9606     266
10090    266
10116    266
Name: class, dtype: int64

## 2. Load all three models 
1) NLP Baseline

In [46]:
nlp_baseline = BertModel.from_pretrained(NLP_MODEL_TYPE)
tokenizer = BertTokenizer.from_pretrained(NLP_MODEL_TYPE, model_max_length=512)
labels = sampled_df["class"].tolist()

In [53]:
evidence = tokenizer(sampled_df.iloc[0]["evidence"], return_tensors="pt", padding='max_length', truncation=True)
dummy_nlp_example = (nlp_baseline(**evidence, output_hidden_states=True).last_hidden_state[0], torch.tensor(labels[0]))
print(dummy_nlp_example[0].shape)
dummy_nlp_example

torch.Size([512, 768])


(tensor([[ 0.3622,  0.0237, -0.3031,  ..., -0.2709,  0.2806, -0.0564],
         [ 0.1962, -0.0894,  0.1676,  ...,  0.1158,  0.2761, -0.2423],
         [ 0.4463, -0.1143,  0.1978,  ...,  0.3631,  0.5647, -0.0344],
         ...,
         [ 0.0092,  0.2662, -0.1773,  ...,  0.1612,  0.2934,  0.2623],
         [ 0.3508,  0.0314, -0.0435,  ...,  0.3190,  0.2344,  0.1220],
         [ 0.6748, -0.2057,  0.3180,  ..., -0.6108, -0.0396, -0.7527]],
        grad_fn=<SelectBackward>),
 tensor(9606))

2) KG Baseline (embedding dict)

Since it's based on static embeddings, we only need the "INDRAEntityDataset"

In [13]:
kg_baseline = INDRAEntityDataset(
    embeddings_dict,
    random_walks_dict,
    sampled_df["source"],
    sampled_df["target"],
    sampled_df["class"],
)

In [57]:
dummy_kg_example = kg_baseline[0]
print(dummy_kg_example[0].shape)
dummy_kg_example

torch.Size([254, 768])


(tensor([[ 0.0353,  0.0467, -0.0145,  ..., -0.0841, -0.0041, -0.0330],
         [ 0.2181, -0.9316, -0.0855,  ...,  0.4753, -0.5085, -0.0165],
         [-0.0089, -0.0398,  0.2235,  ...,  0.8814, -0.5135, -0.6484],
         ...,
         [-0.0089, -0.0398,  0.2235,  ...,  0.8814, -0.5135, -0.6484],
         [ 0.1518,  0.2211,  0.3233,  ..., -0.0662,  0.5883,  0.0953],
         [-0.3169,  0.0883,  0.3486,  ...,  0.1628, -0.3894, -0.1904]]),
 tensor(9606))

3) STonKGs (LARGE)

In [15]:
stonkgs = STonKGsForPreTraining.from_pretrained(
    pretrained_model_name_or_path=PRETRAINED_STONKGS_DUMMY_PATH,
)

In [16]:
stonkgs_data = preprocess_stonkgs_data(sampled_df)

Preprocessing the fine-tuning dataset: 100%|██████████| 798/798 [00:01<00:00, 794.03it/s] 


In [58]:
data_entry = {key: torch.tensor([value]) for key, value in dict(stonkgs_data.iloc[0]).items()}
dummy_stonkgs_example = (stonkgs(**data_entry, return_dict=True).hidden_states[0], torch.tensor(labels[0]))
print(dummy_stonkgs_example[0].shape)
dummy_stonkgs_example

torch.Size([512, 768])


(tensor([[ 0.2761, -0.2027,  0.7588,  ..., -0.0876,  0.6474,  0.7084],
         [-0.5997,  0.2369,  0.6981,  ...,  0.8298,  0.0673,  0.3779],
         [-0.4040,  0.4752,  1.0936,  ..., -0.1941, -0.5627,  1.1466],
         ...,
         [-0.8811,  0.7375,  0.1416,  ...,  1.6385, -0.0081, -0.2853],
         [ 0.1901,  0.8254,  0.1284,  ...,  0.8028, -1.2105, -1.4304],
         [-1.7002,  0.2221,  0.2442,  ..., -0.7222, -0.6268,  1.1990]],
        grad_fn=<SelectBackward>),
 tensor(9606))

## 3. Get the embeddings

1. NLP embeddings

In [70]:
def get_nlp_embeddings(list_of_indices):
    """Returns a list of (embedding_sequence, label) pairs."""
    all_embed_sequences = []
    
    for idx in list_of_indices:
        nlp_evidence = tokenizer(sampled_df.iloc[idx]["evidence"], return_tensors="pt", padding='max_length', truncation=True)
        nlp_hidden_states = (nlp_baseline(**nlp_evidence, output_hidden_states=True).last_hidden_state[0],
                             torch.tensor(sampled_df.iloc[idx]["class"]))
        all_embed_sequences.append(nlp_hidden_states)
        
    return all_embed_sequences

2. KG embeddings

In [62]:
def get_kg_embeddings(list_of_indices):
    """Returns a list of (embedding_sequence, label) pairs."""
    all_embed_sequences = []
    
    for idx in list_of_indices:
        all_embed_sequences.append(kg_baseline[idx])
        
    return all_embed_sequences

3. STonKGs

In [63]:
def get_stonkgs_embeddings(list_of_indices):
    """Returns a list of (embedding_sequence, label) pairs."""
    all_embed_sequences = []
    
    for idx in list_of_indices:
        data_entry = {key: torch.tensor([value]) for key, value in dict(stonkgs_data.iloc[idx]).items()}
        stonkgs_hidden_states = (stonkgs(**data_entry, return_dict=True).hidden_states[0],
                                 torch.tensor(sampled_df.iloc[idx]["class"]))
        all_embed_sequences.append(stonkgs_hidden_states)
        
    return all_embed_sequences

Testing the functions

In [77]:
get_nlp_embeddings([1,5,15])
get_kg_embeddings([105,150,400])
get_kg_embeddings([50,20,600])

[(tensor([[ 0.0272,  0.0478, -0.0016,  ..., -0.0485,  0.0049, -0.0204],
          [ 0.2156,  0.3641, -0.0936,  ..., -0.0673, -0.5435, -0.2532],
          [ 0.4665,  0.1521,  0.0888,  ...,  0.6257, -0.1580, -0.1126],
          ...,
          [ 0.1234,  0.1534, -0.1528,  ..., -0.3529, -0.1285,  0.0032],
          [-0.3890,  0.3860,  0.3015,  ..., -0.3013,  0.1033, -0.1343],
          [-0.9419, -1.2984,  0.9635,  ...,  0.3786, -0.6852,  0.2591]]),
  tensor(9606)),
 (tensor([[ 1.1285e-02,  6.1332e-02,  5.4512e-04,  ..., -7.6015e-02,
           -1.5385e-03, -3.7382e-02],
          [-3.0418e-01,  8.4933e-02,  5.8552e-02,  ..., -5.3119e-01,
            1.6921e-01,  3.8191e-02],
          [-2.8218e-01, -2.7288e-01,  1.7001e-01,  ...,  1.4877e-01,
           -2.5559e-02, -9.1580e-02],
          ...,
          [-6.3383e-01, -1.7323e+00,  6.2708e-01,  ...,  7.5040e-01,
           -5.1296e-01, -1.4195e-02],
          [-1.3717e-01,  3.3589e-01,  4.2335e-01,  ...,  9.9479e-01,
            7.6927e-01