In [None]:
!pip install transformers torch numpy llama_index opensearch-py bert-score llama-index-embeddings-huggingface llama-index-embeddings-instructor sacrebleu
!python -m spacy download de_core_news_lg

Collecting llama_index
  Downloading llama_index-0.10.58-py3-none-any.whl.metadata (11 kB)
Collecting opensearch-py
  Downloading opensearch_py-2.6.0-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.2.2-py3-none-any.whl.metadata (769 bytes)
Collecting llama-index-embeddings-instructor
  Downloading llama_index_embeddings_instructor-0.1.3-py3-none-any.whl.metadata (810 bytes)
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m390.1 kB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_r

In [None]:
# Standard Library
import math
import os
import re

# Third-Party Libraries
import numpy as np
import requests
import sacrebleu
import spacy
import torch
from bert_score import score
from opensearchpy import OpenSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline

# Specific Libraries
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from openai import OpenAI

In [None]:
# Connect to OpenSearch instance.
# The credentials are read from the environment so that no secret is stored in
# the notebook: set OPENSEARCH_USER and OPENSEARCH_PASSWORD before running.
# NOTE(review): the password that used to be hardcoded here has been exposed
# and should be rotated.
OPENSEARCH_USER = os.environ.get("OPENSEARCH_USER")
OPENSEARCH_PASSWORD = os.environ.get("OPENSEARCH_PASSWORD")
if not OPENSEARCH_USER or not OPENSEARCH_PASSWORD:
    raise RuntimeError(
        "Set the OPENSEARCH_USER and OPENSEARCH_PASSWORD environment variables "
        "before running this cell."
    )

dbclient = OpenSearch(
    hosts=[{'host': 'opensearch-ds.ifi.uni-heidelberg.de', 'port': 443}],
    http_auth=(OPENSEARCH_USER, OPENSEARCH_PASSWORD),
    use_ssl=True,
    # NOTE(review): TLS certificate verification is switched off (and the
    # warning about it silenced). Kept as-is because the server's certificate
    # setup is not visible from here; enable verification if possible.
    verify_certs=False,
    ssl_show_warn=False
)

# Load the spacy model for keyword extraction.
# extract_keyword_ner uses this pipeline for named entities (doc.ents) and
# part-of-speech tags (token.pos_).
nlp = spacy.load("de_core_news_lg")

# Terms that are never used as search keywords.
# extract_keyword_ner compares the lower-cased keyword against this set.
STOPWORDS = {"mg", "symptome", "behandlung"}

def extract_keyword_ner(query):
    """Return the search keywords found in ``query``.

    The named entities reported by the spaCy pipeline ``nlp`` are used as
    keywords. If the pipeline reports no entities, the texts of all tokens
    tagged ``NOUN`` or ``PROPN`` are used instead. Candidates whose lower-cased
    text is contained in ``STOPWORDS`` are dropped.

    Args:
        query: The question to analyse.

    Returns:
        list[str]: The remaining keywords, in document order. The list is also
        printed.
    """
    parsed = nlp(query)

    candidates = [entity.text for entity in parsed.ents]
    if len(candidates) == 0:
        # No entities at all: fall back to part-of-speech tags.
        candidates = [tok.text for tok in parsed if tok.pos_ in ('NOUN', 'PROPN')]

    result = []
    for candidate in candidates:
        # Stopword comparison is case-insensitive.
        if candidate.lower() in STOPWORDS:
            continue
        result.append(candidate)

    print(result)
    return result

# Search for all documents related to query
def find_title(index, keyword):
    """Collect the text of all documents whose title matches the keyword(s).

    For every keyword a ``match`` query on the ``title`` field is sent to the
    global ``dbclient``. Title and URL of each new hit are printed. A failed
    search is reported on stdout and does not abort the remaining keywords.

    Args:
        index: Name of the OpenSearch index to search.
        keyword: A single keyword (``str``) or an iterable of keywords. A bare
            string is treated as one keyword; it is not iterated character by
            character.

    Returns:
        str: The ``text`` field of every matching document, each followed by a
        newline. A document that is hit by several keywords is included only
        once. Empty string if nothing was found.
    """
    # Iterating a plain string would issue one search per character.
    if isinstance(keyword, str):
        keyword = [keyword]

    texts = []
    seen_ids = set()  # documents already collected for an earlier keyword
    for word in keyword:
        # Define the search query
        query = {
            "query": {
                "match": {
                    "title": word
                }
            }
        }

        try:
            # Execute the search query
            response = dbclient.search(index=index, body=query)

            # Check if we got any hits
            if response['hits']['total']['value'] > 0:
                for hit in response['hits']['hits']:
                    source = hit['_source']
                    doc_key = hit.get('_id', source.get('url'))
                    if doc_key is not None:
                        if doc_key in seen_ids:
                            continue
                        seen_ids.add(doc_key)
                    texts.append(source['text'] + "\n")
                    print(f"Title: {source['title']}")
                    print(f"URL: {source['url']}")
                    print("-" * 80)
            else:
                print("Keine Ergebnisse gefunden")
        except Exception as e:
            # Deliberately best-effort: report the problem and go on with the
            # next keyword.
            print(f"Fehler beim Ausführen der Suchanfrage: {e}")

    return "".join(texts)

# Demo: extract the keywords of a sample question and fetch the text of the
# documents in the simple-speech index whose titles match those keywords.
query = "Welche gemeinsamen Symptome haben Diabetes und Mumps?"
keywords = extract_keyword_ner(query)
docs = find_title("kic_apothekenumschau_simple_speech_articles", keywords)

['Diabetes', 'Mumps']
Title: Diabetes Typ 1
URL: https://www.apotheken-umschau.de/einfache-sprache/krankheiten/diabetes-typ-1-862049.html
--------------------------------------------------------------------------------
Title: Diabetes Typ 2
URL: https://www.apotheken-umschau.de/einfache-sprache/krankheiten/diabetes-typ-2-868609.html
--------------------------------------------------------------------------------
Title: Mumps
URL: https://www.apotheken-umschau.de/einfache-sprache/krankheiten/mumps-937211.html
--------------------------------------------------------------------------------
Diabetes Typ 1 Dieser Text informiert in Einfacher Sprache zum Thema: Diabetes Typ 1. Von Forschungsstelle Leichte Sprache , 21.04.2022 Was ist Diabetes Typ 1? Was passiert bei Diabetes Typ 1? Woran können Sie Diabetes Typ 1 erkennen? Was sind die Ursachen von Diabetes Typ 1? Wo bekommen Sie noch mehr Informationen? Was ist Diabetes Typ 1? Diabetes Typ 1 ist eine Autoimmunerkrankung. Das heißt: Das Imm

In [None]:
# Helper functions

# Chunking text passages
def chunk_text(text, chunk_size):
    """Split ``text`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        text: The sequence (here: a string) to split.
        chunk_size: Length of every chunk except possibly the last one.

    Returns:
        list: The chunks in their original order; empty if ``text`` is empty.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        end = start + chunk_size
        pieces.append(text[start:end])
    return pieces

# Any embedding model can be used here
# Already constructed models, keyed by model name, so that each model is only
# instantiated once per kernel session instead of once per query.
_EMBED_MODEL_CACHE = {}

def get_embedding_model(model_name="intfloat/multilingual-e5-large"):
    """Return the ``HuggingFaceEmbedding`` for ``model_name``.

    The model object is created on first use and then reused on later calls
    with the same name.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbedding``.
            Defaults to the model this notebook has always used.

    Returns:
        The (cached) embedding model instance.
    """
    if model_name not in _EMBED_MODEL_CACHE:
        _EMBED_MODEL_CACHE[model_name] = HuggingFaceEmbedding(model_name=model_name)
    return _EMBED_MODEL_CACHE[model_name]

# Compute the embeddings
def get_embeddings(embed_model, text: str):
    """Embed ``text`` by delegating to ``embed_model.get_text_embedding``.

    Args:
        embed_model: Object providing a ``get_text_embedding(text)`` method.
        text: The text to embed.

    Returns:
        Whatever ``get_text_embedding`` returns for ``text``.
    """
    return embed_model.get_text_embedding(text)

def dot_product(vec1, vec2):
    """Return the sum of the element-wise products of ``vec1`` and ``vec2``.

    The elements are paired with ``zip``, so surplus elements of the longer
    input are ignored.
    """
    products = [left * right for left, right in zip(vec1, vec2)]
    return sum(products)

def magnitude(vec):
    """Return the Euclidean length of ``vec`` (square root of the sum of squares)."""
    squared_sum = sum(component ** 2 for component in vec)
    return math.sqrt(squared_sum)

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Computed as ``dot_product(vec1, vec2)`` divided by the product of both
    magnitudes.

    Returns:
        The similarity, or ``0`` if either vector has magnitude zero.
    """
    numerator = dot_product(vec1, vec2)
    norm1 = magnitude(vec1)
    norm2 = magnitude(vec2)

    # A zero-length vector has no direction; report 0 instead of dividing by zero.
    if 0 in (norm1, norm2):
        return 0

    return numerator / (norm1 * norm2)

In [None]:
def execute_query(query, index, chunk_size=512, k=5):
    """Retrieve the text passages of ``index`` that are most similar to ``query``.

    Steps:
      1. Extract keywords from the query (``extract_keyword_ner``).
      2. Fetch the text of all documents whose title matches a keyword
         (``find_title``).
      3. Split that text into chunks of ``chunk_size`` characters.
      4. Embed every chunk and the query, and rate each chunk by cosine
         similarity to the query.
      5. Keep the ``k`` best-rated chunks.

    Args:
        query: The user question.
        index: Name of the OpenSearch index to search.
        chunk_size: Chunk length in characters (default 512).
        k: Maximum number of chunks to return (default 5).

    Returns:
        str: The selected chunks joined by single spaces, most similar first.
        Empty string if no document text was found.
    """
    keywords = extract_keyword_ner(query)

    # find_title takes the whole keyword list. Handing it one keyword string
    # at a time made it loop over the characters of that string.
    text = find_title(index, keywords)
    if not text:
        # Nothing retrieved: skip the (expensive) embedding step entirely.
        return ""

    chunks = chunk_text(text, chunk_size=chunk_size)

    # Generate embeddings
    embd_model = get_embedding_model()
    vdb = [get_embeddings(embd_model, chunk) for chunk in chunks]

    # Compute cosine similarities
    q_embd = get_embeddings(embd_model, query)
    ratings = [cosine_similarity(q_embd, x) for x in vdb]

    # Rank chunk indices by similarity, best first. Slicing copes with fewer
    # than k chunks, where np.argpartition(ratings, -k) raises a ValueError.
    ranked = sorted(range(len(ratings)), key=ratings.__getitem__, reverse=True)
    top = ranked[:max(k, 0)]

    # Concatenate the top-k relevant chunks
    relevant_info = " ".join(chunks[i] for i in top)

    return relevant_info

In [None]:
def response(query, relevant_info, index):
    """Answer ``query`` from ``relevant_info`` in two styles and save both answers.

    Two chat completions are requested from the global ``client`` with model
    ``GPT_MODEL`` at temperature 0 (most likely answer, minimal randomness):
    one with a plain system prompt and one that asks for an answer in simple
    language. Both answers are written to a UTF-8 text file below ``data/``.

    Args:
        query: The user question; also used to build the output file name.
        relevant_info: Retrieved context handed to the model.
        index: Name of the index the context came from. It selects the file
            name: ``data/<query>.txt`` for ``kic_apothekenumschau_articles``,
            ``data/<query>-simple.txt`` for
            ``kic_apothekenumschau_simple_speech_articles`` and
            ``data/<query>-<index>.txt`` for any other index.

    Returns:
        dict: The answers under the keys ``"Standard response"`` and
        ``"Simple prompt"``.
    """
    system_prompts = {
        "Standard response": 'Du bist ein intelligenter Agent. Dir wird eine Frage gestellt und Du erhälst die relevanten Informationen. Deine Aufgabe besteht darin, die Frage mit den bereitgestellten Informationen zu beantworten.',
        "Simple prompt": 'Du bist ein intelligenter Agent. Dir wird eine Frage gestellt und Du erhälst die relevanten Informationen. Deine Aufgabe besteht darin, die Frage mit den bereitgestellten Informationen in leichter Sprache zu beantworten.',
    }

    # Generate one answer per prompt using OpenAI's API
    responses = {}
    for label, system_prompt in system_prompts.items():
        completion = client.chat.completions.create(
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'assistant', 'content': 'Relevante Informationen: ' + relevant_info},
                {'role': 'user', 'content': query},
            ],
            model=GPT_MODEL,
            temperature=0,
        )
        responses[label] = completion.choices[0].message.content

    # Path separators in the query would otherwise point into a non-existing
    # sub-directory; all other characters are kept so file names stay stable.
    safe_query = query.replace("/", "_").replace("\\", "_")
    if index == "kic_apothekenumschau_articles":
        file_name = f"data/{safe_query}.txt"
    elif index == "kic_apothekenumschau_simple_speech_articles":
        file_name = f"data/{safe_query}-simple.txt"
    else:
        # Any other index used to leave file_name unbound (UnboundLocalError).
        safe_index = str(index).replace("/", "_").replace("\\", "_")
        file_name = f"data/{safe_query}-{safe_index}.txt"

    # Write the responses to a text file
    os.makedirs("data", exist_ok=True)
    with open(file_name, "w", encoding="utf-8") as f:
        for key, value in responses.items():
            f.write(f"{key}:\n{value}\n\n")

    print(f"Responses saved to {file_name}")

    return responses


In [None]:
# Chat model used for every completion request in this notebook.
GPT_MODEL = "gpt-3.5-turbo"

# The API key is taken from the environment only; no key is stored in the
# notebook.
# NOTE(review): the key that used to be hardcoded here as a fallback has been
# exposed and should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
# Run the full pipeline for one question: retrieve the relevant passages, then
# generate and save the answers.
query = "Was ist sekundärer Diabetes?"
# Variant against the simple-speech index (currently disabled):
#relevant_info1 = execute_query(query, "kic_apothekenumschau_simple_speech_articles")
#simpleResponses = response(query, relevant_info1, "kic_apothekenumschau_simple_speech_articles")

# Variant against the "kic_apothekenumschau_articles" index.
relevant_info2 = execute_query(query, "kic_apothekenumschau_articles")
responses = response(query, relevant_info2, "kic_apothekenumschau_articles")

['Diabetes', 'Typ']
Title: Diabetes Typ 1
URL: https://www.apotheken-umschau.de/einfache-sprache/krankheiten/diabetes-typ-1-862049.html
--------------------------------------------------------------------------------
Title: Diabetes Typ 2
URL: https://www.apotheken-umschau.de/einfache-sprache/krankheiten/diabetes-typ-2-868609.html
--------------------------------------------------------------------------------
Title: Diabetes Typ 1
URL: https://www.apotheken-umschau.de/einfache-sprache/krankheiten/diabetes-typ-1-862049.html
--------------------------------------------------------------------------------
Title: Diabetes Typ 2
URL: https://www.apotheken-umschau.de/einfache-sprache/krankheiten/diabetes-typ-2-868609.html
--------------------------------------------------------------------------------
Responses saved to data/Was sind die Symptome von Diabetes Typ 2?-simple.txt
['Diabetes', 'Typ']
Title: Diabetes
URL: https://www.apotheken-umschau.de/krankheiten-symptome/diabetes/
----------