In [1]:
import json
from elasticsearch import Elasticsearch, helpers
import urllib3
import os


# Password of the built-in 'elastic' user, taken from the environment
# (None when ELASTIC_PASSWORD is not set).
elstic_password = os.environ.get('ELASTIC_PASSWORD')

# Client for the local HTTPS node; certificate verification is switched off.
es = Elasticsearch(
    hosts=['https://localhost:9200'],
    verify_certs=False,
    basic_auth=('elastic', elstic_password),
)

# Silence the warnings urllib3 would otherwise emit.
urllib3.disable_warnings()

  _transport = transport_class(


### Indexing Documents into Elasticsearch

### Indexing Documents with Embeddings into Elasticsearch for Vector Similarity Search

Initialize the index with the appropriate mapping for the dense vector field.

In [5]:
# drop the index if it already exists
# if es.indices.exists(index='pubmed_emb_index'):
#    es.indices.delete(index='pubmed_emb_index')

In [6]:
# Index that stores the PubMed documents together with their embeddings
index_name = "pubmed_emb_index"

# Create the index only when it is missing, so this cell can be re-run safely
if not es.indices.exists(index=index_name):
    # The only explicitly mapped field is 'embeddings', a 768-dimensional
    # dense vector; adjust 'dims' to the length of the stored vectors and
    # add further field mappings here if needed.
    mapping = {
        "mappings": {
            "properties": {
                "embeddings": dict(type="dense_vector", dims=768),
            },
        },
    }

    es.indices.create(index=index_name, body=mapping)

Load the JSONL files containing the PubMed documents (each document already carries its embedding) and index the documents into Elasticsearch.

In [7]:
from pathlib import Path
import os
import json
from tqdm import tqdm

# Path to the source directory that is scanned for .jsonl files.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
source_directory = Path('C:/Users/linus/big_data/pubmed/chunk_embedded/')

# Target index for the documents read from source_directory
index_name = "pubmed_emb_index"

def bulk_index_documents(source_directory, index_name, batch_size=400):
    """Index every document found in the ``.jsonl`` files of a directory.

    Each non-empty line of each ``*.jsonl`` file directly inside
    ``source_directory`` is parsed as one JSON document and sent to
    Elasticsearch through ``helpers.bulk`` in batches.

    Args:
        source_directory: ``pathlib.Path`` of the directory holding the files.
        index_name: Name of the target Elasticsearch index.
        batch_size: Number of documents collected before a bulk request is
            sent. Defaults to 400, the value that used to be hard-coded.

    Returns:
        None. Progress and errors are reported with ``print``.

    Notes:
        Lines that are not valid JSON are reported and skipped. If a bulk
        request for a full batch fails, the error is reported and that batch
        is discarded so the buffer stays bounded; previously a failed request
        left the buffer above the flush threshold, which was tested with
        ``==``, so no further batch was ever sent and all remaining documents
        piled up in memory. A failure while sending the final partial batch
        still propagates to the caller.
    """
    if not source_directory.exists():
        print("The source directory does not exist.")
        return

    actions = []  # documents waiting for the next bulk request

    # Iterate over every entry of the source directory
    for file_name in tqdm(os.listdir(source_directory)):
        if not file_name.endswith('.jsonl'):
            continue
        source_file = source_directory / file_name

        # JSON text is UTF-8; do not rely on the platform's default encoding
        # (which is a locale code page on Windows).
        with open(source_file, 'r', encoding='utf-8') as json_file:
            for line in json_file:
                if not line.strip():
                    continue  # blank lines carry no document

                try:
                    doc = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")
                    continue

                actions.append({"_index": index_name, "_source": doc})

                # Send the batch once it is full
                if len(actions) >= batch_size:
                    try:
                        helpers.bulk(es, actions)
                    except Exception as e:
                        print(f"An error occurred: {e}")
                    # Always start a fresh batch, even after a failure
                    actions = []

    # Index whatever is left over after the last full batch
    if actions:
        helpers.bulk(es, actions)

    print('Indexing complete')

# Call the function to index the documents
bulk_index_documents(source_directory, index_name)

100%|██████████| 50/50 [31:25<00:00, 37.71s/it]


Indexing complete


In [8]:
# Ask Elasticsearch how many documents the index holds
count_result = es.count(index='pubmed_emb_index')

# Print the count (the "50 JSONL Chunks" figure is hard-coded in the message, not measured)
print(f"Index contains 50 JSONL Chunks with {count_result['count']} documents.")

Index contains 50 JSONL Chunks with 709071 documents.


In [11]:
# Fetch the index statistics and read the total store size in bytes
response = es.indices.stats(index='pubmed_emb_index')
index_size = response['_all']['total']['store']['size_in_bytes']

# Report the size in decimal gigabytes (10^9 bytes), rounded to two places
print(f"Die Größe des Indexes ist {round(index_size/1000000000, 2)} GB.")

Die Größe des Indexes ist 20.13 GB.


### Define query functions for BM25 and Vector Similarity Search

Define a function to perform a BM25 search using the match query.

In [12]:
# Define a search query
def bm25_search(query: str, k: int = 5, index: str = 'pubmed_index'):
    """Run a full-text ``match`` query against the ``content`` field.

    Args:
        query: Free-text search string.
        k: Maximum number of hits to return.
        index: Name of the index to search. Defaults to ``'pubmed_index'``,
            the value that used to be hard-coded. NOTE(review): that index is
            not created in this notebook (which builds ``pubmed_emb_index``);
            pass the index explicitly if a different one is intended.

    Returns:
        The response returned by ``es.search``.
    """
    # Built under its own name so the ``query`` argument is not overwritten
    search_body = {
        "size": k,
        "query": {
            "match": {
                "content": str(query)
            }
        }
    }
    # Elasticsearch uses the BM25 model by default to score document relevance
    return es.search(index=index, body=search_body)

Initialize the text embedder that produces the bioBERT embeddings used to convert a query into a vector.

In [13]:
from Embedding import TextEmbedder
# Shared embedder instance; cosine_similarity reads it as a module-level global
embedder = TextEmbedder()

Define a helper that converts a query to a vector, and a function that performs a vector similarity search using the cosine similarity between the query vector and the embeddings in the indexed documents.

In [14]:
def query_to_vector(text, embedder):
    """Return whatever ``embedder.embed`` produces for ``text``."""
    return embedder.embed(text)


def cosine_similarity(index, query: str, k: int = 5):
    """Rank the documents of ``index`` by cosine similarity to ``query``.

    The query text is embedded with the module-level ``embedder`` and compared
    against each document's ``embeddings`` field in a ``script_score`` query.

    Args:
        index: Name of the index to search.
        query: Free-text query to embed.
        k: Number of results to return.

    Returns:
        The response returned by ``es.search``.
    """
    scoring_script = {
        # +1.0 ensures that all values are positive
        "source": "cosineSimilarity(params.query_vector, 'embeddings') + 1.0",
        "params": {"query_vector": query_to_vector(query, embedder)},
    }
    search_body = {
        "size": k,  # number of results to return
        "query": {
            "script_score": {
                # match_all scores every document; a narrower query can be used instead
                "query": {"match_all": {}},
                "script": scoring_script,
            }
        },
    }
    return es.search(index=index, body=search_body)

#### Perform BM25 and Vector Similarity Searches

First, perform a BM25 search using the match query.

In [17]:
# Perform a BM25 search for the ten best-matching documents
results = bm25_search("List signaling molecules (ligands) that interact with the receptor EGFR?", k=10)

# Print score, PMID and title of every hit
for hit in results['hits']['hits']:
    print(f"Score: {hit['_score']}, PMID: {hit['_source']['PMID']}, Title: {hit['_source']['title']}") 

Score: 25.100676, PMID: 286303, Title: Molecules in mammalian brain that interact with the colchicine site on tubulin.
Score: 17.860699, PMID: 83828, Title: Interaction of phenolsulphonphthalein dyes with rabbit plasma and rabbit serum albumin.
Score: 17.588778, PMID: 33989, Title: Regulation of adenylate cyclase of neuroblastoma x glioma hybrid cells by alpha-adrenergic receptors. I. Inhibition of adenylate cyclase mediated by alpha receptors.
Score: 16.900421, PMID: 277503, Title: Effect of perceived control on stress reduction in adult dental patients.
Score: 16.361063, PMID: 278976, Title: Functional consequences of ligand-dependent conformational changes in trypsin-solubilized and in membrane particle constrained-acetylcholinesterase.
Score: 16.233095, PMID: 95173, Title: A theoretical model for adhesion between cells mediated by multivalent ligands.
Score: 15.64755, PMID: 74731, Title: Corticotropin-like peptides in central nerves and in endocrine cells of gut and pancreas.
Score

#### Vector Similarity Search
Now, perform a vector similarity search using the cosine similarity between the query vector and the embeddings in the indexed documents.

In [18]:
# Define the index name
index_name = "pubmed_emb_index"

# Run the query for the ten most similar documents
results = cosine_similarity(index_name, "List signaling molecules (ligands) that interact with the receptor EGFR?", k=10)

# Print score, PMID and title of every hit
for hit in results['hits']['hits']:
    print(f"Score: {hit['_score']},  PMID: {hit['_source']['PMID']}, Title: {hit['_source']['title']}")

Score: 1.9210962,  PMID: 1501243, Title: Epidermal growth factor receptor: elements of intracellular communication.
Score: 1.9161748,  PMID: 1329870, Title: The junction between cytokines and cell adhesion.
Score: 1.9159176,  PMID: 1368709, Title: Analysing lymphokine-receptor interactions of IL-1 and IL-2 by recombinant-DNA technology.
Score: 1.9153762,  PMID: 1283879, Title: The role of integrin adhesion receptors in gingival wound healing.
Score: 1.9147724,  PMID: 1476596, Title: Candidate natural killer cell receptors.
Score: 1.914465,  PMID: 1356015, Title: The role of erbB-2 and its ligands in growth control of malignant breast epithelium.
Score: 1.9139742,  PMID: 1421421, Title: Can the insulin-like growth factors regulate breast cancer growth?
Score: 1.9132408,  PMID: 233264, Title: Hormonal regulation of peptide receptors and target cell responses.
Score: 1.9123619,  PMID: 1374612, Title: The mechanism of action of cyclosporin A and FK506.
Score: 1.911896,  PMID: 1462099, Titl