In [1]:
import json
import os
import tarfile

import numpy as np
import pandas as pd
import torch
from huggingface_hub import HfApi, hf_hub_download
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, DistilBertModel, BertModel, AutoModel, XLMRobertaModel

# Hook tqdm into pandas (this is what provides the `progress_apply` family).
# NOTE(review): no cell visible in this notebook uses those methods — confirm
# the registration is still needed.
tqdm.pandas()

## 1. Loading the Pre-trained Model

In [2]:
# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoints tried earlier are kept below for reference. Enable exactly one
# tokenizer/model pair at a time.
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
# model = BertModel.from_pretrained("google-bert/bert-base-uncased")

# pretrained on 880M words
# tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# pretrained on 1.2B words and fine-tuned on 300M words
# tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
# model = AutoModel.from_pretrained("medicalai/ClinicalBERT")

# tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")
# model = BertModel.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")

# alternate way for loading bluebert
# tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_mimic_uncased_L-24_H-1024_A-16")
# model = AutoModel.from_pretrained("bionlp/bluebert_pubmed_mimic_uncased_L-24_H-1024_A-16")

# Single source of truth for the active checkpoint: the tokenizer and the
# model are loaded from the same repo id, so they cannot drift apart and the
# shared files are not fetched under two different names.
MODEL_NAME = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = XLMRobertaModel.from_pretrained(MODEL_NAME).to(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [3]:
# # from transformers import BertTokenizer, BertForPreTraining
# from transformers import BertTokenizer, BertModel


# # load the trained model from huggingface
# repo_id = "bionlp/bluebert_pubmed_mimic_uncased_L-24_H-1024_A-16"
# bert_bin = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin")
# bert_config = hf_hub_download(repo_id=repo_id, filename="config.json")
# bert_vocab = hf_hub_download(repo_id=repo_id, filename="vocab.txt")

# pretrained_model_path = "./pretrained_model"
# os.system(f"mkdir -p {pretrained_model_path}")
# os.system(f"cp {bert_bin} {pretrained_model_path}")
# os.system(f"cp {bert_config} {pretrained_model_path}")
# os.system(f"cp {bert_vocab} {pretrained_model_path}")
# os.system(f"ls -alh {pretrained_model_path}")


# tokenizer = BertTokenizer.from_pretrained(pretrained_model_path)
# model = BertModel.from_pretrained(pretrained_model_path)
# model.eval()

# # input_text = "This is a sample input."
# # inputs = tokenizer(input_text, return_tensors="pt")
# # outputs = model(**inputs)
# # print(outputs)

In [4]:
# Place the active model on the selected device. The XLM-R model above is
# already moved at load time; the disabled BlueBERT loader in the previous
# cell is not, so this keeps both paths consistent.
model = model.to(device=device)

## 2. Loading the Pre-processed Dataset

In [5]:
repo_id = "alibababeig/nlp-hw4-dataset"
dataset_filename = "PMC-dataset-preprocessed.tar.gz"
dataset_path = hf_hub_download(repo_id=repo_id, filename=dataset_filename)

# Unpack straight from the cached download into the working directory.
# Compared with shelling out to cp/tar/rm this raises on failure instead of
# returning an ignored exit code, needs no temporary copy of the archive, and
# does not depend on Unix tools or on the path being shell-safe.
with tarfile.open(dataset_path) as archive:
    if hasattr(tarfile, "data_filter"):
        # Extraction filters exist on this Python: refuse members that would
        # escape the destination directory.
        archive.extractall(path=".", filter="data")
    else:
        archive.extractall(path=".")

PMC-dataset-preprocessed.tar.gz:   0%|          | 0.00/70.4M [00:00<?, ?B/s]

0

In [6]:
dataset_dir = "PMC-dataset-preprocessed"
filenames = os.listdir(dataset_dir)

# Keep regular files only and sort them so the record order (and therefore
# the row order of the DataFrame built from it) is deterministic.
filepaths = sorted(
    path
    for path in (os.path.join(dataset_dir, name) for name in filenames)
    if os.path.isfile(path)
)

data_list = []
for filepath in tqdm(filepaths):
    # Decode explicitly as UTF-8: the records contain non-ASCII characters
    # (e.g. Greek letters in titles), and the platform default encoding is
    # not guaranteed to handle them.
    with open(filepath, "r", encoding="utf-8") as fp:
        data_record = json.load(fp)
    data_list.append(data_record)

  0%|          | 0/122564 [00:00<?, ?it/s]

## 3. Additional Pre-processing of the Dataset

In [7]:
# One row per loaded JSON record.
df = pd.DataFrame(data_list)

# Title and abstract are joined into the single text field that is tokenized
# later on.
df = df.assign(textual_data=df["full_title"] + " " + df["abstract"])
display(df)

Unnamed: 0,full_title,abstract,textual_data
0,Dissociation in performance of children with A...,Attention deficit hyperactivity disorder (ADHD...,Dissociation in performance of children with A...
1,Maladaptation and the Paradox of Robustness in...,Background Organisms use a variety of mechanis...,Maladaptation and the Paradox of Robustness in...
2,Moderate Neonatal Stress Decreases Within-Grou...,Background The significance of behavioral neur...,Moderate Neonatal Stress Decreases Within-Grou...
3,Procollagen Triple Helix Assembly: An Unconven...,Fibers composed of type I collagen triple heli...,Procollagen Triple Helix Assembly: An Unconven...
4,Wnt and TGF-β Expression in the Sponge Amphim...,Background The origin of metazoan development ...,Wnt and TGF-β Expression in the Sponge Amphim...
...,...,...,...
122559,Laparoscopic repair of ventral / incisional he...,"Despite its significant prevalence, there is l...",Laparoscopic repair of ventral / incisional he...
122560,Attenuated Stress Response to Acute Restraint ...,Arginine vasopressin (AVP) synthesised in the ...,Attenuated Stress Response to Acute Restraint ...
122561,Asthma and Allergic Diseases in Pregnancy: A ...,Asthma and allergic disorders can affect the c...,Asthma and Allergic Diseases in Pregnancy: A ...
122562,Structural Analysis of HIV-1 Maturation Using ...,"HIV-1 buds form infected cells in an immature,...",Structural Analysis of HIV-1 Maturation Using ...


In [8]:
# Treat empty strings as missing, drop every record lacking a title or an
# abstract, and renumber the surviving rows from 0. `df` itself is left
# untouched.
clean_df = (
    df.replace("", np.nan)
    .dropna(subset=["full_title", "abstract"])
    .reset_index(drop=True)
)

# clean_df = clean_df.iloc[:200]  # FIXME: Remove this line later.

display(clean_df)

Unnamed: 0,full_title,abstract,textual_data
0,Dissociation in performance of children with A...,Attention deficit hyperactivity disorder (ADHD...,Dissociation in performance of children with A...
1,Maladaptation and the Paradox of Robustness in...,Background Organisms use a variety of mechanis...,Maladaptation and the Paradox of Robustness in...
2,Moderate Neonatal Stress Decreases Within-Grou...,Background The significance of behavioral neur...,Moderate Neonatal Stress Decreases Within-Grou...
3,Procollagen Triple Helix Assembly: An Unconven...,Fibers composed of type I collagen triple heli...,Procollagen Triple Helix Assembly: An Unconven...
4,Wnt and TGF-β Expression in the Sponge Amphim...,Background The origin of metazoan development ...,Wnt and TGF-β Expression in the Sponge Amphim...
...,...,...,...
111244,Laparoscopic repair of ventral / incisional he...,"Despite its significant prevalence, there is l...",Laparoscopic repair of ventral / incisional he...
111245,Attenuated Stress Response to Acute Restraint ...,Arginine vasopressin (AVP) synthesised in the ...,Attenuated Stress Response to Acute Restraint ...
111246,Asthma and Allergic Diseases in Pregnancy: A ...,Asthma and allergic disorders can affect the c...,Asthma and Allergic Diseases in Pregnancy: A ...
111247,Structural Analysis of HIV-1 Maturation Using ...,"HIV-1 buds form infected cells in an immature,...",Structural Analysis of HIV-1 Maturation Using ...


## 4. Creating Dataset and Dataloader

In [9]:
class MyDataset(Dataset):
    """Map-style dataset that tokenizes the ``textual_data`` column row by row.

    Args:
        df: DataFrame holding a ``textual_data`` column.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that
            returns a mapping of tensors with a leading batch axis of size 1.
        max_length: Length every sample is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_length):
        self._df = df
        self._tokenizer = tokenizer
        self._max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self._df)

    def __getitem__(self, idx):
        """Tokenize the row at *position* ``idx``.

        Returns:
            dict mapping each tokenizer output name to its tensor with the
            leading batch axis removed.
        """
        # Positional lookup: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the frame has a fresh RangeIndex.
        # `.iloc` keeps this correct for filtered or re-indexed frames too.
        txt = self._df["textual_data"].iloc[idx]
        tokens = self._tokenizer(
            txt,
            padding="max_length",
            truncation=True,
            max_length=self._max_length,
            return_tensors="pt",
        )
        # Drop the batch axis added by return_tensors="pt"; the DataLoader
        # re-adds one when it collates samples.
        tokens = {k: v.squeeze(0) for k, v in tokens.items()}
        return tokens

In [10]:
max_length = 512
batch_size = 32
num_workers = 1

dataset = MyDataset(df=clean_df, tokenizer=tokenizer, max_length=max_length)

# shuffle=False keeps batches in row order, so the embeddings collected later
# line up one-to-one with the rows of `clean_df`.
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
)

## 5. Computing CLS Token's Embedding for Data Records

In [11]:
model = model.to(device)
model.eval()

# One embedding (a plain list of floats) per dataset row, in row order.
cls_tokens = []
for batch in tqdm(dataloader):
    input_ids = batch["input_ids"].to(device)
    att_mask = batch["attention_mask"].to(device)

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=att_mask)

    if "pooler_output" in outputs:
        # NOTE(review): pooler_output is the first-token state after the
        # model's pooling head, not the raw CLS hidden state — confirm this is
        # the embedding intended for retrieval.
        cls_embedding = outputs.pooler_output
    elif "last_hidden_state" in outputs:
        # Indexing position 0 already yields a (batch, hidden) tensor. Do not
        # squeeze it: with a final batch of size 1 a squeeze would drop the
        # batch axis and the row would be appended as individual floats
        # instead of one vector.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
    else:
        raise RuntimeError("No CLS Token found in the given model")

    cls_tokens.extend(cls_embedding.cpu().numpy().tolist())

# Every row must have received exactly one embedding before they are attached
# to the DataFrame.
assert len(cls_tokens) == len(dataset), "embedding count does not match dataset size"
print(len(cls_tokens))

  0%|          | 0/3477 [00:00<?, ?it/s]

111249


## 6. Adding the Computed CLS Embeddings to the Dataset

In [12]:
# Store each embedding as a NumPy array and attach it as a new column; the
# list order matches the row order of `clean_df`.
cls_tokens = [np.asarray(vector) for vector in cls_tokens]
clean_df["cls"] = cls_tokens
display(clean_df)

Unnamed: 0,full_title,abstract,textual_data,cls
0,Dissociation in performance of children with A...,Attention deficit hyperactivity disorder (ADHD...,Dissociation in performance of children with A...,"[-0.05976047366857529, 0.2253987044095993, 0.1..."
1,Maladaptation and the Paradox of Robustness in...,Background Organisms use a variety of mechanis...,Maladaptation and the Paradox of Robustness in...,"[-0.04111213609576225, 0.2373562753200531, 0.1..."
2,Moderate Neonatal Stress Decreases Within-Grou...,Background The significance of behavioral neur...,Moderate Neonatal Stress Decreases Within-Grou...,"[-0.05662888288497925, 0.2256246656179428, 0.1..."
3,Procollagen Triple Helix Assembly: An Unconven...,Fibers composed of type I collagen triple heli...,Procollagen Triple Helix Assembly: An Unconven...,"[-0.05236496776342392, 0.2418883591890335, 0.1..."
4,Wnt and TGF-β Expression in the Sponge Amphim...,Background The origin of metazoan development ...,Wnt and TGF-β Expression in the Sponge Amphim...,"[-0.056209903210401535, 0.23780013620853424, 0..."
...,...,...,...,...
111244,Laparoscopic repair of ventral / incisional he...,"Despite its significant prevalence, there is l...",Laparoscopic repair of ventral / incisional he...,"[-0.057635437697172165, 0.2483735978603363, 0...."
111245,Attenuated Stress Response to Acute Restraint ...,Arginine vasopressin (AVP) synthesised in the ...,Attenuated Stress Response to Acute Restraint ...,"[-0.04109165444970131, 0.22945654392242432, 0...."
111246,Asthma and Allergic Diseases in Pregnancy: A ...,Asthma and allergic disorders can affect the c...,Asthma and Allergic Diseases in Pregnancy: A ...,"[-0.050739776343107224, 0.24599896371364594, 0..."
111247,Structural Analysis of HIV-1 Maturation Using ...,"HIV-1 buds form infected cells in an immature,...",Structural Analysis of HIV-1 Maturation Using ...,"[-0.0647159069776535, 0.24797338247299194, 0.1..."


In [13]:
# Persist the frame, including the embedding column, with 15 decimal places
# for floating-point values.
clean_df.to_json(path_or_buf="PMC-dataset-with-CLS.json", double_precision=15)

## 7. Loading the Fully Pre-processed Dataset

In [14]:
# repo_id = "alibababeig/nlp-hw4-dataset"
# # dataset_filename = "PMC-dataset-with-CLS-BioClinicalBERT.json"  # IMPORTANT: Set this according to the selected model.
# # dataset_filename = "PMC-dataset-with-CLS-BlueBERT.json"  # IMPORTANT: Set this according to the selected model.
# dataset_filename = "PMC-dataset-with-CLS-ClinicalBERT.json"  # IMPORTANT: Set this according to the selected model.

# dataset_path = hf_hub_download(repo_id=repo_id, filename=dataset_filename)

# os.system(f"cp {dataset_path} ./PMC-dataset-with-CLS.json")

In [15]:
# loaded_df = pd.read_json("PMC-dataset-with-CLS.json")
# display(loaded_df)

In [16]:
# import matplotlib.pyplot as plt

# text_lengths = loaded_df['textual_data'].str.split().str.len()
# # print(text_lengths)
# text_lengths.plot.hist(bins=50)
# plt.xlabel('Length of textual_data')
# plt.ylabel('Number of rows')
# plt.title('Distribution of Overall Textual Data Length')
# plt.show()

In [17]:
# print(loaded_df['full_title'][111247])
# print()
# print(loaded_df['abstract'][111247])

## 8. Computing the CLS Token's Embedding for the Input Query

In [18]:
# def encode_text(text, tokenizer, model, max_length=512):
#     tokens = tokenizer(
#         text,
#         padding="max_length",
#         truncation=True,
#         max_length=max_length,
#         return_tensors="pt",
#     ).to(device)

#     with torch.no_grad():
#         outputs = model(**tokens)

#     if "pooler_output" in outputs:
#         cls_embedding = outputs.pooler_output
#     elif "last_hidden_state" in outputs:
#         cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze()
#     else:
#         raise Exception("No CLS Token found in the given model")
        

#     return cls_embedding.cpu()

In [19]:
# def cosine_similarity(query, dataset):
#     query_norm = query / np.linalg.norm(query)
#     dataset_norm = dataset / np.linalg.norm(dataset, axis=1)[:, np.newaxis]    
#     similarities = np.dot(dataset_norm, query_norm)
#     return similarities 

# def MSE_similarity(query, dataset):
#     dists = ((dataset - query) ** 2).sum(axis=1)
#     return (1.0 / dists) # inverse of distance scores are equivalent to similarity

# def k_nearest_embeddings(query, dataset, k, similarity_metric=cosine_similarity):
#     similarities = similarity_metric(query, dataset)
    
#     # Get the indices of the top k highest similarities
#     nearest_indices = np.argpartition(similarities, -k)[-k:]
    
#     # Sort these indices by the actual similarities
#     nearest_indices = nearest_indices[np.argsort(similarities[nearest_indices])[::-1]]
    
#     # Get the top k similarities and corresponding embeddings
#     top_k_similarities = similarities[nearest_indices]
#     top_k_embeddings = dataset[nearest_indices]
    
#     return nearest_indices, top_k_embeddings, top_k_similarities


# k = 3
# query = "Structural Analysis of HIV-1 Maturation Using Cryo-Electron Tomography"
# # query = "Structural Analysis of HIV-1 Maturation Using Cryo-Electron Tomography HIV-1 buds form infected cells in an immature, non-infectious form. Maturation into an infectious virion requires proteolytic cleavage of the Gag polyprotein at five positions, leading to a dramatic change in virus morphology. Immature virions contain an incomplete spherical shell where Gag is arranged with the N-terminal MA domain adjacent to the membrane, the CA domain adopting a hexameric lattice below the membrane, and beneath this, the NC domain and viral RNA forming a disordered layer. After maturation, NC and RNA are condensed within the particle surrounded by a conical CA core. Little is known about the sequence of structural changes that take place during maturation, however. Here we have used cryo-electron tomography and subtomogram averaging to resolve the structure of the Gag lattice in a panel of viruses containing point mutations abolishing cleavage at individual or multiple Gag cleavage sites. These studies describe the structural intermediates correlating with the ordered processing events that occur during the HIV-1 maturation process. After the first cleavage between SP1 and NC, the condensed NC-RNA may retain a link to the remaining Gag lattice. Initiation of disassembly of the immature Gag lattice requires cleavage to occur on both sides of CA-SP1, while assembly of the mature core also requires cleavage of SP1 from CA. Author Summary HIV-1 buds from the plasma membrane of infected cells in an immature form with the polyprotein Gag as its major component. Maturation into an infectious form requires cleavage of Gag in five positions. This process is an important target for antiretroviral drugs. Here we studied changes in the structure of the virus that occur during maturation, making use of virus variants in which different combinations of cleavage sites were mutated to prevent cleavage at those sites. 
We used cryo-electron tomography and sub-tomogram averaging to visualise the arrangement of Gag in 3D. We show that the fastest cleavage event leads to condensation of the RNA genome complexed with viral proteins. This inner RNA/protein structure appears to maintain a link with the remaining Gag lattice. Processing on both sides of CA-SP1, the main structural module of Gag, is required for disassembly of the immature Gag lattice, while removal of SP1 is needed in addition for mature core formation. The results provide structural correlates of the ordered processing events during HIV-1 maturation and shed light on the mechanism of action of bevirimat, an inhibitor of CA-SP1 cleavage in clinical trials."
# # query = "protein in the nucleus of a cell"

# cls_emb = encode_text(query, tokenizer, model).numpy().squeeze()

In [20]:
# nearest_indices, _, nearest_similarities = k_nearest_embeddings(cls_emb, np.asarray(loaded_df['cls'].tolist()), k, similarity_metric=MSE_similarity)
# print("Row indices of the k nearest embeddings:", nearest_indices)
# print("MSE similarities of the k nearest embeddings:", nearest_similarities)
# mins_mse = loaded_df.iloc[nearest_indices]
# mins_mse.reset_index(drop=True, inplace=True)
# display(mins_mse)

In [21]:
# nearest_indices, _, nearest_similarities = k_nearest_embeddings(cls_emb, np.asarray(loaded_df['cls'].tolist()), k, similarity_metric=cosine_similarity)
# print("Row indices of the k nearest embeddings:", nearest_indices)
# print("Cosine similarities of the k nearest embeddings:", nearest_similarities)
# mins_cosine = loaded_df.iloc[nearest_indices]
# mins_cosine.reset_index(drop=True, inplace=True)
# display(mins_cosine)

In [22]:
# idx = 0
# print(mins_cosine['full_title'][idx])
# print(mins_cosine['abstract'][idx])

## 0. Debugging Stuff :))

In [23]:
# # Sample documents
# # documents = ["football goal", "doctor patient", "thief rob", "tall tree"]
# documents = ["extreme headache due to acute sinusitis", "visual impairment caused by macular degeneration", "ultrasound tissue scanning device", "beautiful tall trees with green leaves"]
# doc_embeddings = [encode_text(doc, tokenizer, model) for doc in documents]

# # Query
# query = "painkillers"
# query_embedding = encode_text(query, tokenizer, model)

# # Convert embeddings to numpy arrays
# query_embedding_np = query_embedding.numpy().reshape(1, -1)
# doc_embeddings_np = [doc_emb.numpy().reshape(1, -1) for doc_emb in doc_embeddings]


# # Compute cosine similarities
# similarities = [
#     cosine_similarity(query_embedding_np, doc_emb_np).item()
#     for doc_emb_np in doc_embeddings_np
# ]

# # Rank documents by similarity
# ranked_docs = sorted(zip(similarities, documents), reverse=True, key=lambda x: x[0])

# print(f"query: {query}")

# print("Ranked Documents:")
# for score, doc in ranked_docs:
#     print(f"Score: {score:.4f}, Document: {doc}")

In [24]:
# !rm -rf pretrained_model
# !rm -rf PMC-dataset-preprocessed

In [25]:
# Give the export a model-specific name before uploading. Done in Python
# rather than with a shell `mv` so a failure raises instead of being printed
# and ignored, and so the cell is portable.
generic_json_path = "PMC-dataset-with-CLS.json"
model_json_path = "PMC-dataset-with-CLS-XLMRoberta.json"

if os.path.exists(generic_json_path):
    os.replace(generic_json_path, model_json_path)
elif not os.path.exists(model_json_path):
    # Neither the fresh export nor a previously renamed one is present.
    raise FileNotFoundError(generic_json_path)
# Otherwise the rename already happened on an earlier run; nothing to do.

In [26]:
# SECURITY: never paste an access token into the notebook. Generate one with
# write access under Profile > Settings > Access Tokens and expose it through
# the HF_TOKEN environment variable (or log in with `huggingface-cli login`).
# When the variable is unset, `token=None` lets huggingface_hub fall back to
# the locally saved login.
# The token that was previously committed in this cell must be treated as
# leaked and revoked.
api = HfApi(token=os.environ.get("HF_TOKEN"))
api.upload_file(
    path_or_fileobj="./PMC-dataset-with-CLS-XLMRoberta.json",
    path_in_repo="PMC-dataset-with-CLS-XLMRoberta.json",
    repo_id="alibababeig/nlp-hw4-dataset",
    repo_type="model",
)

PMC-dataset-with-CLS-XLMRoberta.json:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alibababeig/nlp-hw4-dataset/commit/901a164c87c3ed190a034ce7c4adb51aeb01f144', commit_message='Upload PMC-dataset-with-CLS-XLMRoberta.json with huggingface_hub', commit_description='', oid='901a164c87c3ed190a034ce7c4adb51aeb01f144', pr_url=None, pr_revision=None, pr_num=None)