In [8]:
from elasticsearch import Elasticsearch
import urllib3
import os


# Read the password from the environment so no credential is stored in the notebook.
elastic_password = os.getenv('ELASTIC_PASSWORD_SERVER2')
if elastic_password is None:
    # Fail fast with a clear message rather than handing None to the client as a password.
    raise RuntimeError(
        "Environment variable ELASTIC_PASSWORD_SERVER2 is not set; "
        "it must contain the password of the 'elastic' user."
    )

# Client for the Elasticsearch node at https://localhost:9200, authenticated as 'elastic'.
# NOTE(review): verify_certs=False disables TLS certificate verification even though a CA
# file is supplied via ca_certs, and the CA path is an absolute, machine-specific path --
# confirm whether verification can be enabled and the path made configurable.
es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=('elastic', elastic_password),
    verify_certs=False,
    ca_certs="C:/Users/linus/http_ca.crt",
    request_timeout=60
)

# Name of the index queried by the cells below.
index_name = "pubmed_index"

# Silence urllib3 warnings for the rest of the session.
urllib3.disable_warnings()

# Connectivity check: print the cluster information returned by the server.
print(es.info())

{'name': '54bb3107153d', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'tl0aceAXRDaQ4NbSC-Je7Q', 'version': {'number': '8.13.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '16cc90cd2d08a3147ce02b07e50894bc060a4cbf', 'build_date': '2024-04-05T14:45:26.420424304Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [12]:
# Check the number of documents in the index; the count response is displayed as the cell output.
es.count(index=index_name)

ObjectApiResponse({'count': 1795307, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

### Define query functions for BM25 and Vector Similarity Search

Define a function to perform a BM25 search using the match query.

In [13]:
# Define a search query
def bm25_search(query: str, k: int = 5, index: str = "pubmed_index"):
    """Run a full-text ``match`` query against the ``content`` field.

    Args:
        query: Free-text query string.
        k: Maximum number of hits to return (sent as ``size``).
        index: Name of the index to search. Defaults to ``"pubmed_index"``,
            the index this function previously had hard-coded.

    Returns:
        The response of ``es.search``; each hit's ``_source`` is limited to
        the ``PMID`` and ``title`` fields.
    """
    # Use a separate name for the request body instead of rebinding the
    # ``query`` parameter, so the user's text stays available and readable.
    search_body = {
        "size": k,
        "query": {
            "match": {
                "content": f"{query}"
            }
        },
        "_source": ["PMID", "title"]
    }
    # Elasticsearch uses the BM25 model by default to compute document relevance.
    return es.search(index=index, body=search_body)

Initialize the text embedder and define a function that converts a query into a vector using the BioBERT embeddings.

In [14]:
# TextEmbedder is provided by the project-local Embedding module.
from Embedding import TextEmbedder
# Single embedder instance, used by the vector-search helpers defined in later cells.
embedder = TextEmbedder()

def query_to_vector(text, embedder):
    """Embed ``text`` with the given embedder.

    Args:
        text: Text handed to ``embedder.embed``.
        embedder: Object exposing an ``embed(text)`` method.

    Returns:
        Whatever ``embedder.embed(text)`` returns, unchanged.
    """
    return embedder.embed(text)

Define a function to perform a vector similarity search using the cosine similarity between the query vector and the embeddings in the indexed documents.

In [4]:
def cosine_similarity(index, query: str, k: int = 5):
    """Score every document by cosine similarity to the embedded query.

    The query text is converted to a vector with ``query_to_vector`` and the
    module-level ``embedder``; a ``script_score`` query over ``match_all``
    then scores each document against its ``embeddings`` field.

    Args:
        index: Name of the index to search.
        query: Query text to embed.
        k: Number of hits to return (sent as ``size``).

    Returns:
        The response of ``es.search``.
    """
    # The + 1.0 in the script source ensures that all scores are positive.
    scoring_script = {
        "source": "cosineSimilarity(params.query_vector, 'embeddings') + 1.0",
        "params": {"query_vector": query_to_vector(query, embedder)},
    }

    search_body = {
        "size": k,  # number of results to return
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": scoring_script,
            }
        },
    }
    return es.search(index=index, body=search_body)

In [5]:
def knn_search(index, query: str, k: int = 10, num_candidates: int = 100):
    """Run an approximate k-nearest-neighbour search on the ``embeddings`` field.

    Args:
        index: Name of the index to search.
        query: Query text; embedded with ``query_to_vector`` and the
            module-level ``embedder``.
        k: Number of nearest neighbours to retrieve and return.
        num_candidates: Number of candidates to consider per shard. Values
            below ``k`` are raised to ``k``. Defaults to 100, the value that
            was previously hard-coded.

    Returns:
        The response of ``es.search``.
    """
    # Convert the query text into a vector.
    query_vector = query_to_vector(query, embedder)

    # Build the k-NN search request.
    knn_query = {
        # Without an explicit size the search returns only the default 10 hits,
        # so a larger k would be silently truncated.
        "size": k,
        "knn": {
            "field": "embeddings",  # the field that holds the vectors
            "query_vector": query_vector,
            "k": k,
            # Elasticsearch rejects requests where num_candidates < k.
            "num_candidates": max(num_candidates, k)
        }
    }

    # Execute the k-NN search.
    return es.search(index=index, body=knn_query)

#### Perform BM25 and Vector Similarity Searches

First, perform a BM25 search using the match query.

In [21]:
# Run the BM25 search and keep the top 100 hits.
# NOTE(review): "Epstein-Bar" is probably a typo for "Epstein-Barr". The spelling is part of
# the query text sent to Elasticsearch and the recorded output below was produced with it,
# so it is left unchanged here -- confirm before fixing.
results = bm25_search("is the Epstein-Bar virus deadly?", k=100)

In [22]:
# Show score, PMID and title for each BM25 hit, one hit per line.
for hit in results['hits']['hits']:
    source = hit['_source']
    print(f"Score: {hit['_score']}, PMID: {source['PMID']}, Title: {source['title']}")

Score: 20.796297, PMID: 1389547, Title: [Acute viral hepatitis A and C: report of a case].
Score: 19.720222, PMID: 78913, Title: EBV antigens in lymphocytes of patients with exudative tonsillitis, infectious mononucleosis and Hodgkin's disease.
Score: 19.435518, PMID: 2075630, Title: An outbreak of acute hepatitis A infection in rural Saudi Arabia.
Score: 18.543007, PMID: 1847985, Title: Detection of Epstein-Barr virus DNA in formalin-fixed paraffin-embedded tissue of nasopharyngeal carcinoma using polymerase chain reaction and in situ hybridization.
Score: 18.529373, PMID: 2847613, Title: Epstein-Barr virus in the bone marrow of patients with aplastic anemia.
Score: 18.523855, PMID: 3008616, Title: Chronic Epstein-Barr virus infection associated with fever and interstitial pneumonitis. Clinical and serologic features and response to antiviral chemotherapy.
Score: 18.41818, PMID: 2541162, Title: Impaired late suppression of Epstein-Barr virus (EBV)-induced immunoglobulin synthesis: a c

#### Vector Similarity Search
Now, perform a vector similarity search using the cosine similarity between the query vector and the embeddings in the indexed documents.

In [30]:
# Run the cosine-similarity query and keep the 10 best-scoring documents.
results = cosine_similarity(index_name, "List signaling molecules (ligands) that interact with the receptor EGFR?", k=10)

# Show score, PMID and title for each hit, one hit per line.
for hit in results['hits']['hits']:
    source = hit['_source']
    print(f"Score: {hit['_score']},  PMID: {source['PMID']}, Title: {source['title']}")

Score: 1.9210962,  PMID: 1501243, Title: Epidermal growth factor receptor: elements of intracellular communication.
Score: 1.9196633,  PMID: 2042633, Title: The Egr family of nuclear signal transducers.
Score: 1.9190896,  PMID: 1633422, Title: Crossed signals: oncogenic transcription factors.
Score: 1.9177192,  PMID: 2955833, Title: Phorbol esters as signal transducers and tumor promoters.
Score: 1.9172626,  PMID: 2824532, Title: PDGF-like growth factors in autocrine stimulation of growth.
Score: 1.9162145,  PMID: 1622545, Title: The regulation and function of p21ras in T cells.
Score: 1.9161748,  PMID: 1329870, Title: The junction between cytokines and cell adhesion.
Score: 1.9161192,  PMID: 1645965, Title: Overexpression of human TRK proto-oncogene into mouse cells using an inducible vector system.
Score: 1.9159867,  PMID: 1675819, Title: The expanding family of guanylyl cyclases.
Score: 1.9159176,  PMID: 1368709, Title: Analysing lymphokine-receptor interactions of IL-1 and IL-2 by 

#### k-NN Search
Perform a k-NN search using the `knn` option of the search API.

In [45]:
# Run the k-NN search for the 10 nearest neighbours of the embedded query.
results = knn_search(index_name, "List signaling molecules (ligands) that interact with the receptor EGFR?", k=10)

# Show score, PMID and title for each hit, one hit per line.
for hit in results['hits']['hits']:
    source = hit['_source']
    print(f"Score: {hit['_score']},  PMID: {source['PMID']}, Title: {source['title']}")

Score: 0.96054816,  PMID: 1501243, Title: Epidermal growth factor receptor: elements of intracellular communication.
Score: 0.9598316,  PMID: 2042633, Title: The Egr family of nuclear signal transducers.
Score: 0.9595448,  PMID: 1633422, Title: Crossed signals: oncogenic transcription factors.
Score: 0.9588597,  PMID: 2955833, Title: Phorbol esters as signal transducers and tumor promoters.
Score: 0.9586312,  PMID: 2824532, Title: PDGF-like growth factors in autocrine stimulation of growth.
Score: 0.95810723,  PMID: 1622545, Title: The regulation and function of p21ras in T cells.
Score: 0.9580873,  PMID: 1329870, Title: The junction between cytokines and cell adhesion.
Score: 0.9579934,  PMID: 1675819, Title: The expanding family of guanylyl cyclases.
Score: 0.9579588,  PMID: 1368709, Title: Analysing lymphokine-receptor interactions of IL-1 and IL-2 by recombinant-DNA technology.
Score: 0.9578283,  PMID: 2103500, Title: Cellular and viral ligands that interact with the EGF receptor.


### ELSER - Elastic Learned Sparse EncodeR

ELSER is a retrieval model developed by Elastic for semantic search. Instead of dense vector representations, ELSER represents text as sparse vectors.