In [12]:
from pymongo import MongoClient
from sklearn.neighbors import NearestNeighbors
import numpy as np
from pathlib import Path
import os
import json
from tqdm import tqdm

# Connect to MongoDB
client = MongoClient('localhost', 27017)
db = client['PubMed']
collection = db['all_docs']

In [13]:
collections = db.list_collection_names()
print("Collections in der Datenbank:", collections)

Collections in der Datenbank: ['Docs']


Loading 10% of the PubMed corpus into MongoDB (6.8 GB).

In [14]:
# Define the path for the source directory
source_directory = Path('C:/Users/linus/big_data/pubmed/chunk/')

# Check if the source directory exists
if not source_directory.exists():
    print("The source directory does not exist.")
else:
    # Iterate through each file in the source directory
    for file_name in tqdm(os.listdir(source_directory)):
        if file_name.endswith('.jsonl'):
            source_file = source_directory / file_name
            
            # Create a list to store the documents
            documents = []
            
            # Open and read the JSONL file containing the PubMed documents
            with open(source_file, 'r') as json_file:
                for line in json_file:
                    data = json.loads(line)
                    documents.append(data)
                    
                    if len(documents) == 1000: # Bulk loading 1000 docs
                        collection.insert_many(documents)
                        documents = []  # Clear the list after insertion

            # Insert any remaining documents
            if documents:
                collection.insert_many(documents)

print("Data loading complete")

100%|██████████| 1166/1166 [30:55<00:00,  1.59s/it]

Data loading complete





- Inserted 3.2 mio documents in 4 min 13 sec into collection. 
- Inserted 23.9 mio documents in 30 min 55 sec into collection.

Now creating text index on collection for full text search using TF-IDF Ranking

In [15]:
# create text index
#collection.create_index([('content', 'text')])
print(list(collection.index_information()))

['_id_', 'content_text']


- Index for 3.2 mio docs took 6 min and 28 sec
- Index for 23.9 mio docs took 77 min and 33 sec

### Full Text Search

Creating query function for TF-IDF ranked results

In [19]:
def search_TF_IDF(query):
    results = collection.find({"$text": {"$search": query}}).limit(100)
    return results

### Hybrid Search

Initializing Embedder to embedd query

In [3]:
from Embedding import TextEmbedder
embedder = TextEmbedder()

Defining functions for hybrid search

In [4]:
def find_nearest_embeddings(query_embedding, embeddings, n=10):
    # Initialization of KNN
    nearest_neighbors = NearestNeighbors(n_neighbors=n, algorithm='auto').fit(embeddings)
    # Find k nearest neighbors
    distances, indices = nearest_neighbors.kneighbors([query_embedding])
    return indices[0], distances[0]

def search_and_retrieve_embeddings(query, k):

    results = collection.find(
        {"$text": {"$search": query}},
        {'PMID': 1, 'embeddings': 1, '_id': 0}
    ).limit(k)
    
    # Extrcat PMIDs
    pmids = []
    embeddings = []
    for doc in results:
        pmids.append(doc['PMID'])
        embeddings.append(doc['embeddings'])

    # Konvertieren Sie die Embeddings-Liste in ein NumPy-Array
    embeddings_array = np.array(embeddings)

    return pmids, embeddings_array

conducting TF-IDF ranking

In [22]:
pmid_liste = search_TF_IDF("Is it possible to visualize subtahalamic nucleus by using transcranial ultrasound?")

In [23]:
for i in pmid_liste:
    print(i)

{'_id': ObjectId('6617c760fb3d2cdc7c0ab02e'), 'id': 'pubmed23n0045_2126', 'title': 'Distribution of somatostatin-28 (1-12) in the cat brainstem: an immunocytochemical study.', 'content': 'We studied the distribution of somatostatin-28 (1-12)-immunoreactive fibers and cell bodies in the cat brainstem. A moderate density of cell bodies containing the peptide was observed in the ventral nucleus of the lateral lemniscus, accessory dorsal tegmental nucleus, retrofacial nucleus and in the lateral reticular nucleus, whereas a low density of such perikarya was found in the interpeduncular nucleus, nucleus incertus, nucleus sagulum, gigantocellular tegmental field, nucleus of the trapezoid body, nucleus praepositus hypoglosii, lateral and magnocellular tegmental fields, nucleus of the solitary tract, nucleus ambiguous and in the nucleus intercalatus. Moreover, a moderate density of somatostatin-28 (1-12)-immunoreactive processes was found in the dorsal nucleus of the raphe, dorsal tegmental nuc

Hybrid search

In [8]:
such_text = "List signaling molecules (ligands) that interact with the receptor EGFR?"

# retrieving 1000 docs incl. embedding based on full text search
pmids, embedding_matrix = search_and_retrieve_embeddings(such_text, 1000)

# transform query into bioBERT embedding
suchtext_embedding = embedder.embed(such_text)

# find 10 nearest embeddings in small corpus (1000) using KNN
indices, distances = find_nearest_embeddings(suchtext_embedding, embedding_matrix, 10)

# show results
for i, index in enumerate(indices):
    print(f"{i + 1}: PMID = {pmids[index]}, Distanz = {distances[i]}")

1: PMID = 2212053, Distanz = 6.397121366951664
2: PMID = 1312153, Distanz = 6.632475605271574
3: PMID = 2324400, Distanz = 6.636741492047409
4: PMID = 1377727, Distanz = 6.655961150640634
5: PMID = 2002128, Distanz = 6.695614806634344
6: PMID = 2376212, Distanz = 6.723127154803946
7: PMID = 2673976, Distanz = 6.73307600296995
8: PMID = 2002220, Distanz = 6.7608914712881285
9: PMID = 2002216, Distanz = 6.783224931817288
10: PMID = 2034916, Distanz = 6.803948796184079
