In [15]:
from pymongo import MongoClient
from sklearn.neighbors import NearestNeighbors
import numpy as np
from pathlib import Path
import os
import json
from tqdm import tqdm

# Connect to MongoDB
client = MongoClient('localhost', 27018)
db = client['RAG']
collection = db['PubMedEmbedded']

In [16]:
collections = db.list_collection_names()
print("Collections in der Datenbank:", collections)

Collections in der Datenbank: ['PubMedEmbedded']


Load data into MongoDB

In [7]:
# Define the path for the source directory
source_directory = Path('C:/Users/linus/big_data/pubmed/first100JSONLembedded/')

# Check if the source directory exists
if not source_directory.exists():
    print("The source directory does not exist.")
else:
    # Iterate through each file in the source directory
    for file_name in tqdm(os.listdir(source_directory)):
        if file_name.endswith('.jsonl'):
            source_file = source_directory / file_name
            
            # Create a list to store the documents
            documents = []
            
            # Open and read the JSONL file containing the PubMed documents
            with open(source_file, 'r') as json_file:
                for line in json_file:
                    data = json.loads(line)
                    documents.append(data)
                    
                    if len(documents) == 1000: # Bulk loading 1000 docs
                        collection.insert_many(documents)
                        documents = []  # Clear the list after insertion

            # Insert any remaining documents
            if documents:
                collection.insert_many(documents)

print("Data loading complete")

100%|██████████| 100/100 [20:12<00:00, 12.12s/it]

Indexing complete





Creating text index on collection for full text search using TF-IDF Ranking

In [21]:
# create text index
tqdm(collection.create_index([('content', 'text')]))
print(list(collection.index_information()))

  0%|          | 0/12 [00:00<?, ?it/s]

['_id_', 'content_text']





### Full Text Search

Creating query function for TF-IDF ranked results

In [22]:
def search_TF_IDF(query):
    results = collection.find({"$text": {"$search": query}}).limit(10)
    return results

### Hybrid Search

Initializing Embedder to embedd query

In [3]:
from Embedding import TextEmbedder
embedder = TextEmbedder()

Defining functions for hybrid search

In [4]:
def find_nearest_embeddings(query_embedding, embeddings, n=10):
    # Initialization of KNN
    nearest_neighbors = NearestNeighbors(n_neighbors=n, algorithm='auto').fit(embeddings)
    # Find k nearest neighbors
    distances, indices = nearest_neighbors.kneighbors([query_embedding])
    return indices[0], distances[0]

def search_and_retrieve_embeddings(query, k):

    results = collection.find(
        {"$text": {"$search": query}},
        {'PMID': 1, 'embeddings': 1, '_id': 0}
    ).limit(k)
    
    # Extrcat PMIDs
    pmids = []
    embeddings = []
    for doc in results:
        pmids.append(doc['PMID'])
        embeddings.append(doc['embeddings'])

    # Konvertieren Sie die Embeddings-Liste in ein NumPy-Array
    embeddings_array = np.array(embeddings)

    return pmids, embeddings_array

conducting TF-IDF ranking

In [23]:
pmid_liste = search_TF_IDF("Is Rheumatoid Arthritis more common in men or women?")

In [24]:
for i in pmid_liste:
    print(i)

{'_id': ObjectId('661682eb9531bb32b9a5a191'), 'id': 'pubmed23n0070_3063', 'title': 'Radiographic changes in the temporomandibular joint of patients with rheumatoid arthritis, psoriatic, arthritis, and ankylosing spondylitis.', 'content': 'Sixty-one subjects with rheumatoid arthritis, 61 with psoriatic arthritis, 61 with ankylosing spondylitis, and 77 healthy controls were examined using orthopantomography to determine the frequency of radiographic changes in the condyle of the temporomandibular joint. Radiographic changes were found significantly more often in subjects with rheumatoid arthritis (66%), psoriatic arthritis (38%), and ankylosing spondylitis (30%) than in controls (12%). Subjects with rheumatoid arthritis also had significantly more radiographic changes, especially cortical erosions and subcortical cysts, than subjects with psoriatic arthritis or ankylosing spondylitis. It may be concluded that rheumatoid arthritis is a more severe disease than psoriatic arthritis or ankyl

Hybrid search

In [8]:
such_text = "List signaling molecules (ligands) that interact with the receptor EGFR?"

# retrieving 1000 docs incl. embedding based on full text search
pmids, embedding_matrix = search_and_retrieve_embeddings(such_text, 1000)

# transform query into bioBERT embedding
suchtext_embedding = embedder.embed(such_text)

# find 10 nearest embeddings in small corpus (1000) using KNN
indices, distances = find_nearest_embeddings(suchtext_embedding, embedding_matrix, 10)

# show results
for i, index in enumerate(indices):
    print(f"{i + 1}: PMID = {pmids[index]}, Distanz = {distances[i]}")

1: PMID = 2212053, Distanz = 6.397121366951664
2: PMID = 1312153, Distanz = 6.632475605271574
3: PMID = 2324400, Distanz = 6.636741492047409
4: PMID = 1377727, Distanz = 6.655961150640634
5: PMID = 2002128, Distanz = 6.695614806634344
6: PMID = 2376212, Distanz = 6.723127154803946
7: PMID = 2673976, Distanz = 6.73307600296995
8: PMID = 2002220, Distanz = 6.7608914712881285
9: PMID = 2002216, Distanz = 6.783224931817288
10: PMID = 2034916, Distanz = 6.803948796184079
