In [1]:
!pip install datasets faiss-cpu transformers tensorflow gensim

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━

In [4]:
from datasets import load_dataset
from gensim.utils import simple_preprocess

# Load the PubMedQA dataset
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

# Extract questions and associated abstracts for embedding
questions = ds['train']['question']
abstracts = ds['train']['context']
print(f"Loaded {len(abstracts)} abstracts from PubMedQA.")

# Extract 'contexts' field from each abstract and preprocess it
def preprocess_text(text):
    # Use gensim's simple_preprocess to tokenize and clean text
    tokens = simple_preprocess(text, deacc=True)  # deacc=True removes punctuation
    return ' '.join(tokens)

# Apply preprocessing to each abstract
cleaned_abstracts = []
for abstract in abstracts:
    # Concatenate all sections within 'contexts' into a single string
    full_text = ' '.join(abstract['contexts'])
    cleaned_text = preprocess_text(full_text)
    cleaned_abstracts.append(cleaned_text)

print("Text preprocessing completed.")


Loaded 1000 abstracts from PubMedQA.
Text preprocessing completed.


In [6]:
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
import numpy as np

# Set TensorFlow to use GPU if available
device_name = tf.config.experimental.list_logical_devices('GPU')
print("Using device:", "GPU" if device_name else "CPU")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = TFAutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Function to generate embeddings with GPU in batches
def get_embeddings_in_batches(texts, batch_size=16):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        # Tokenize and process each batch
        inputs = tokenizer(batch_texts, return_tensors="tf", padding=True, truncation=True, max_length=512)

        # Move computation to GPU if available
        with tf.device('/GPU:0' if device_name else '/CPU:0'):
            outputs = model(inputs).last_hidden_state
            batch_embeddings = tf.reduce_mean(outputs, axis=1)  # Mean pooling

        # Collect batch embeddings
        all_embeddings.extend(batch_embeddings.numpy())

    return np.array(all_embeddings)

# Generate embeddings for all abstracts
embeddings = get_embeddings_in_batches(cleaned_abstracts)
print(f"Generated embeddings for {len(embeddings)} abstracts.")


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Using device: GPU


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['embeddings.position_ids']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Generated embeddings for 1000 abstracts.


In [7]:
import faiss

# Define the dimension of embeddings
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)  # L2 similarity for retrieval

# Ensure embedding_matrix has the correct shape for FAISS
embedding_matrix = embeddings.reshape(-1, embedding_dim).astype('float32')

# Add embeddings to FAISS index
index.add(embedding_matrix)
print(f"Total embeddings indexed: {index.ntotal}")


Total embeddings indexed: 1000


In [10]:
def retrieve_similar_abstracts(query, top_k=5):
    # Embed the query
    query_embedding = get_embeddings_in_batches([query])

    # Search FAISS for similar documents
    distances, indices = index.search(query_embedding.reshape(1, -1), top_k)

    # Retrieve and print the top similar abstracts
    results = []
    for idx in indices[0]:
        results.append(cleaned_abstracts[idx])
    return results

# Test the function with a sample question
sample_query = "Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?"
similar_abstracts = retrieve_similar_abstracts(sample_query)

print("Top similar abstracts:")
for i, abstract in enumerate(similar_abstracts, 1):
    print(f"{i}. {abstract}")


Top similar abstracts:
1. the technique of induced sputum has allowed to subdivide asthma patients into inflammatory phenotypes according to their level of granulocyte airway infiltration there are very few studies which looked at detailed sputum and blood cell counts in large cohort of asthmatics divided into inflammatory phenotypes the purpose of this study was to analyze sputum cell counts blood leukocytes and systemic inflammatory markers in these phenotypes and investigate how those groups compared with healthy subjects we conducted retrospective cross sectional study on asthmatics recruited from the university asthma clinic of liege and compared them with healthy subjects asthmatics were classified into inflammatory phenotypes the total non squamous cell count per gram of sputum was greater in mixed granulocytic and neutrophilic phenotypes as compared to eosinophilic asthma and healthy subjects sputum eosinophils in absolute values and percentages were increased in all asthma phe