In [28]:
from elasticsearch import Elasticsearch
import urllib3
import os


# Credentials and TLS settings are read from the environment so that neither
# secrets nor machine-specific paths have to be edited inside the notebook.
elastic_password = os.getenv('ELASTIC_PASSWORD_SERVER')
if not elastic_password:
    # Fail fast with a clear message instead of handing a None password to the client.
    raise RuntimeError(
        "Environment variable ELASTIC_PASSWORD_SERVER is not set; "
        "it must contain the password of the 'elastic' user."
    )

# CA certificate of the cluster. ELASTIC_CA_CERT overrides the previous
# hard-coded location, which is kept as the default.
elastic_ca_cert = os.getenv('ELASTIC_CA_CERT', "C:/Users/linus/http_ca.crt")

# NOTE(review): certificate verification stays disabled by default to preserve
# the previous behaviour. Set ELASTIC_VERIFY_CERTS=true to verify the server
# certificate against `elastic_ca_cert`.
elastic_verify_certs = (
    os.getenv('ELASTIC_VERIFY_CERTS', 'false').strip().lower() in ('1', 'true', 'yes')
)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=('elastic', elastic_password),
    verify_certs=elastic_verify_certs,
    ca_certs=elastic_ca_cert,
    request_timeout=60
)

# Name of the index passed to the vector search calls in this notebook.
index_name = "pubmed_emb_index"

# Suppress urllib3's warnings for the rest of the session.
urllib3.disable_warnings()

### Define query functions for BM25 and Vector Similarity Search

Define a function to perform a BM25 search using the match query.

In [4]:
# Define a search query
def bm25_search(query: str, k: int = 5, index: str = "pubmed_emb_index"):
    """Run a full-text ``match`` query against the ``content`` field.

    Args:
        query: Free-text search string.
        k: Maximum number of hits to return (sent as the request ``size``).
        index: Name of the index to search. Defaults to the index this
            function was previously hard-wired to, so existing calls are
            unaffected.

    Returns:
        The response returned by ``es.search``.
    """
    # A separate name keeps the ``query`` argument from being overwritten.
    search_body = {
        "size": k,
        "query": {
            "match": {
                "content": str(query)
            }
        }
    }
    # Elasticsearch uses the BM25 model by default to compute document relevance.
    return es.search(index=index, body=search_body)

Initialize the text embedder and define a function that converts a query into a vector using the BioBERT embeddings.

In [6]:
from Embedding import TextEmbedder
# Shared embedder instance; the vector search functions in this notebook read it as a global.
embedder = TextEmbedder()

def query_to_vector(text, embedder):
    """Return whatever ``embedder.embed`` produces for ``text``.

    Args:
        text: The query text to embed.
        embedder: Any object exposing an ``embed(text)`` method.

    Returns:
        The value returned by ``embedder.embed(text)``, unchanged.
    """
    return embedder.embed(text)

Define a function to perform a vector similarity search using the cosine similarity between the query vector and the embeddings in the indexed documents.

In [39]:
def cosine_similarity(index, query: str, k: int = 5):
    """Score all documents of ``index`` by cosine similarity to ``query``.

    The query text is embedded with the module-level ``embedder`` and sent as
    a ``script_score`` query wrapped around ``match_all``.

    Args:
        index: Name of the index to search.
        query: Query text to embed.
        k: Number of results to return.

    Returns:
        The response returned by ``es.search``.
    """
    # Script compares the query vector with the stored 'embeddings' field;
    # the +1.0 offset ensures that all scores are positive.
    similarity_script = {
        "source": "cosineSimilarity(params.query_vector, 'embeddings') + 1.0",
        "params": {"query_vector": query_to_vector(query, embedder)},
    }

    search_body = {
        "size": k,  # number of results to return
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": similarity_script,
            }
        },
    }
    return es.search(index=index, body=search_body)

In [44]:
def knn_search(index, query: str, k: int = 10, num_candidates: int = 100):
    """Run a k-NN search on the ``embeddings`` field.

    Args:
        index: Name of the index to search.
        query: Query text; embedded with the module-level ``embedder``.
        k: Number of nearest neighbours to return.
        num_candidates: Number of candidates requested from the k-NN search.
            Raised to ``k`` when smaller, since Elasticsearch rejects
            requests where ``num_candidates`` is below ``k``.

    Returns:
        The response returned by ``es.search``.
    """
    # Convert the query text into a vector
    query_vector = query_to_vector(query, embedder)

    # Build the k-NN request
    knn_query = {
        "knn": {
            "field": "embeddings",  # field that holds the vectors
            "query_vector": query_vector,
            "k": k,
            "num_candidates": max(num_candidates, k)
        },
        # Without an explicit size the response is capped at the default page
        # size, so fewer than k hits would come back for larger k.
        "size": k
    }

    # Execute the k-NN search
    return es.search(index=index, body=knn_query)

#### Perform BM25 and Vector Similarity Searches

First, perform a BM25 search using the match query.

In [31]:
# Run the BM25 query for the EGFR ligand question
results = bm25_search(
    "List signaling molecules (ligands) that interact with the receptor EGFR?",
    k=10,
)

# One line per hit: relevance score, PubMed ID and title
for hit in results['hits']['hits']:
    print(f"Score: {hit['_score']}, PMID: {hit['_source']['PMID']}, Title: {hit['_source']['title']}")

Score: 29.752335, PMID: 1349015, Title: Heterodimerization of c-erbB2 with different epidermal growth factor receptor mutants elicits stimulatory or inhibitory responses.
Score: 29.646183, PMID: 2009534, Title: Genes for epidermal growth factor receptor, transforming growth factor alpha, and epidermal growth factor and their expression in human gliomas in vivo.
Score: 29.362312, PMID: 2181668, Title: EGF receptor and erbB-2 tyrosine kinase domains confer cell specificity for mitogenic signaling.
Score: 28.399578, PMID: 1538276, Title: The epidermal growth factor receptor in human pancreatic cancer.
Score: 26.986626, PMID: 1965067, Title: Receptor tyrosine kinases: genetic evidence for their role in Drosophila and mouse development.
Score: 26.858585, PMID: 1965146, Title: The role of phosphatases in signal transduction.
Score: 26.646797, PMID: 1537871, Title: Expression of the human epidermal growth factor receptor in a murine T-cell hybridoma. A transmembrane protein tyrosine kinase ca

#### Vector Similarity Search
Now, perform a vector similarity search using the cosine similarity between the query vector and the embeddings in the indexed documents.

In [30]:
# Run the cosine-similarity query for the EGFR ligand question
results = cosine_similarity(
    index_name,
    "List signaling molecules (ligands) that interact with the receptor EGFR?",
    k=10,
)

# One line per hit: similarity score, PubMed ID and title
for hit in results['hits']['hits']:
    print(f"Score: {hit['_score']},  PMID: {hit['_source']['PMID']}, Title: {hit['_source']['title']}")

Score: 1.9210962,  PMID: 1501243, Title: Epidermal growth factor receptor: elements of intracellular communication.
Score: 1.9196633,  PMID: 2042633, Title: The Egr family of nuclear signal transducers.
Score: 1.9190896,  PMID: 1633422, Title: Crossed signals: oncogenic transcription factors.
Score: 1.9177192,  PMID: 2955833, Title: Phorbol esters as signal transducers and tumor promoters.
Score: 1.9172626,  PMID: 2824532, Title: PDGF-like growth factors in autocrine stimulation of growth.
Score: 1.9162145,  PMID: 1622545, Title: The regulation and function of p21ras in T cells.
Score: 1.9161748,  PMID: 1329870, Title: The junction between cytokines and cell adhesion.
Score: 1.9161192,  PMID: 1645965, Title: Overexpression of human TRK proto-oncogene into mouse cells using an inducible vector system.
Score: 1.9159867,  PMID: 1675819, Title: The expanding family of guanylyl cyclases.
Score: 1.9159176,  PMID: 1368709, Title: Analysing lymphokine-receptor interactions of IL-1 and IL-2 by 

#### k-NN Search
Perform a k-NN search using the k-NN search API.

In [45]:
# Run the k-NN query for the EGFR ligand question
results = knn_search(
    index_name,
    "List signaling molecules (ligands) that interact with the receptor EGFR?",
    k=10,
)

# One line per hit: k-NN score, PubMed ID and title
for hit in results['hits']['hits']:
    print(f"Score: {hit['_score']},  PMID: {hit['_source']['PMID']}, Title: {hit['_source']['title']}")

Score: 0.96054816,  PMID: 1501243, Title: Epidermal growth factor receptor: elements of intracellular communication.
Score: 0.9598316,  PMID: 2042633, Title: The Egr family of nuclear signal transducers.
Score: 0.9595448,  PMID: 1633422, Title: Crossed signals: oncogenic transcription factors.
Score: 0.9588597,  PMID: 2955833, Title: Phorbol esters as signal transducers and tumor promoters.
Score: 0.9586312,  PMID: 2824532, Title: PDGF-like growth factors in autocrine stimulation of growth.
Score: 0.95810723,  PMID: 1622545, Title: The regulation and function of p21ras in T cells.
Score: 0.9580873,  PMID: 1329870, Title: The junction between cytokines and cell adhesion.
Score: 0.9579934,  PMID: 1675819, Title: The expanding family of guanylyl cyclases.
Score: 0.9579588,  PMID: 1368709, Title: Analysing lymphokine-receptor interactions of IL-1 and IL-2 by recombinant-DNA technology.
Score: 0.9578283,  PMID: 2103500, Title: Cellular and viral ligands that interact with the EGF receptor.


### ELSER - Elastic Learned Sparse EncodeR

ELSER is a retrieval model developed by Elastic for semantic search. Instead of dense vector representations, ELSER uses sparse vectors to represent text data.