In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import re
import time
import json
import chromadb
import time
import pandas as pd

from dotenv import load_dotenv

from llama_index.llms.groq import Groq
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


os.environ["TOKENIZERS_PARALLELISM"] = "false"

def rdf_data_indexing_to_chromadb(model, api_key, temperature, embed_model, rag_files, collection_name):
    """
    Index the RDF data into an in-memory ChromaDB collection and build a query engine.

    Args:
        model: Name of the Groq-hosted model used to answer queries.
        api_key: Groq API key passed to the LLM client.
        temperature: Sampling temperature passed to the LLM client.
        embed_model: HuggingFace model name used to embed the documents.
        rag_files: Path handed to SimpleDirectoryReader to load the documents.
        collection_name: Name of the Chroma collection to (re)create.

    Returns:
        The query engine produced by ``index.as_query_engine``.

    Side effects:
        Sets the global ``Settings.llm`` and prints progress messages.
    """
    # Build a single LLM carrying BOTH the API key and the temperature, and use
    # it everywhere. Previously the query engine received an LLM created
    # without the temperature, so the `temperature` argument had no effect on
    # the queries, and Settings.llm was created without the API key.
    llm = Groq(model=model, api_key=api_key, temperature=temperature)

    # create client and a new collection
    chroma_client = chromadb.EphemeralClient()

    # Clear any past collection of the same name. This is deliberately
    # best-effort: the delete is expected to fail when the collection does not
    # exist yet. `except Exception` (instead of a bare `except`) keeps
    # KeyboardInterrupt / SystemExit propagating.
    try:
        chroma_client.delete_collection(collection_name)
    except Exception:
        pass
    # create new collection
    chroma_collection = chroma_client.create_collection(collection_name)

    # define embedding function (kept in its own local so the `embed_model`
    # argument is not rebound to a different type)
    embedding = HuggingFaceEmbedding(model_name=embed_model)

    # load documents from the given path
    print(f"Loading data from {rag_files}...")
    documents = SimpleDirectoryReader(rag_files).load_data()
    print("Data loaded successfully!")

    # set up ChromaVectorStore and load in data
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    print("Indexing data...")
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, embed_model=embedding
    )
    print("Data indexed successfully!")

    # Keep the global default LLM in sync with the one used by the engine.
    Settings.llm = llm
    query_engine = index.as_query_engine(llm=llm)
    print("Query Engine created successfully!")
    return query_engine


def get_response_from_llm(query_engine, prompt):
    """
    Send ``prompt`` to ``query_engine`` and hand back whatever it returns.

    Any exception raised by ``query_engine.query`` propagates to the caller.
    """
    return query_engine.query(prompt)


def read_test_questions(question_file):
    """
    Load the benchmark test questions from a CSV file.

    The CSV must provide a ``question`` column and an ``id`` column.
    Returns a ``(questions, question_ids)`` pair of plain Python lists,
    both in file order.
    """
    frame = pd.read_csv(question_file)
    return frame['question'].tolist(), frame['id'].tolist()

In [None]:
# Load environment variables (via python-dotenv) before reading the API key.
load_dotenv()

# ---- run configuration ----
# Alternative model: "llama-3.3-70b-versatile"
model = "deepseek-r1-distill-llama-70b"
api_key = os.getenv('GROQ_API_KEY')
temperature = 0.5
embed_model = "BAAI/bge-small-en"
rag_files = "xueli_data/rdf-dump/"
collection_name = 'qa-kg'

# ---- build the index and time how long it takes ----
print("Indexing RDF data to ChromaDB...")
starttime = time.time()
query_engine = rdf_data_indexing_to_chromadb(
    model=model,
    api_key=api_key,
    temperature=temperature,
    embed_model=embed_model,
    rag_files=rag_files,
    collection_name=collection_name,
)
endtime = time.time()
print(f"Time taken for indexing: {endtime - starttime} seconds")
print("RDF data indexed successfully!\n")

In [None]:
# Run every test question through the query engine and save each raw response
# to "<output_dir>/<question_id>.txt".
questions, question_ids = read_test_questions("xueli_data/test_questions.csv")
total_questions = len(questions)
print(f"Total number of questions: {total_questions}")

output_dir = "results/rag_groq/prompt2"
os.makedirs(output_dir, exist_ok=True)

# Prompt template, built once instead of on every loop iteration.
# The "{question}" marker on the last line is filled in per question with
# str.replace(). An f-string or str.format() cannot be used here because the
# JSON example is full of literal braces. Previously the marker was never
# substituted, so the LLM received the literal text "{question}" every time.
PROMPT_TEMPLATE = """Given a natural language question, your task is to retrieve the top 5 most similar candidate entities or properties from the provided ORKG RDF data dump in Turtle format. Follow these steps to complete the task:

    1. Extract Relevant Terms:
    - Identify key entities and properties explicitly or implicitly mentioned in the question.
    - Entities typically include datasets, models, or concepts (e.g., "ACL Anthology dataset", "models").
    - Properties typically describe relationships or actions (e.g., "evaluated on").

    2. Compute Similarity:
    - Retrieve the top 10 most similar entities or properties from the RDF data dump by calculating cosine similarity scores based on their textual representations.

    3. Select Top Candidates:
    - For each extracted entity or property, return the 5 most relevant matches with the highest similarity scores.

    ### Example Question:
    The input natural language question is: "What models are being evaluated on the ACL Anthology dataset?"

    ### Expected JSON Output Format
    Return a JSON object where results are grouped by the extracted terms. The structure should be:

    {
    "question": "What models are being evaluated on the ACL Anthology dataset?",
    "extracted_terms": [
        {
        "rdfs:label": "ACL Anthology dataset",
        "uri": "orkgr:ACL_Anthology_dataset",
        "type": "Dataset",
        "rdf:type": "orkgc:Resource"
        },
        {
        "rdfs:label": "models",
        "type": "Class",
        "rdf:type": "rdfs:Class"
        },
        {
        "rdfs:label": "evaluated on",
        "type": "Property",
        "rdf:type": "orkgp:Predicate"
        }
    ],
    "candidates": {
        "ACL Anthology dataset": [
        {
            "uri": "orkgr:ACL_Dataset",
            "rdfs:label": "ACL Anthology Corpus",
            "score": 0.92,
            "type": "Dataset",
            "rdf:type": "orkgc:Resource"
        },
        {
            "uri": "orkgr:NLP_Benchmark",
            "rdfs:label": "NLP Benchmark Dataset",
            "score": 0.89,
            "type": "Dataset",
            "rdf:type": "orkgc:Resource"
        }
        ],
        "models": [
        {
            "uri": "orkgc:BERT",
            "rdfs:label": "BERT",
            "score": 0.95,
            "type": "Model",
            "rdf:type": "rdfs:Class"
        },
        {
            "uri": "orkgc:GPT",
            "rdfs:label": "GPT",
            "score": 0.91,
            "type": "Model",
            "rdf:type": "rdfs:Class"
        }
        ],
        "evaluated on": [
        {
            "uri": "orkgp:Evaluation_Method",
            "rdfs:label": "Evaluation Method",
            "score": 0.90,
            "type": "Property",
            "rdf:type": "orkgp:Predicate"
        },
        {
            "uri": "orkgp:Performance_Assessment",
            "rdfs:label": "Performance Assessment",
            "score": 0.87,
            "type": "Property",
            "rdf:type": "orkgp:Predicate"
        }
        ]
    }
    }

    ### Additional Refinements:
    - **Use RDF Prefixes for URIs**:
    - `"orkgp:"` for properties (predicates).
    - `"orkgc:"` for classes (concepts).
    - `"orkgr:"` for specific named resources (datasets, papers, models, etc.).
    - `"rdf:"` for general RDF terms.
    - `"rdfs:"` for schema-related terms.

    - **Replace 'label' with `rdfs:label`**:
    - Ensure the label of each entity or property is stored under `"rdfs:label"`.

    - **Include Entity Types (`rdf:type`)**:
    - `"rdfs:Class"` for general categories.
    - `"orkgp:Predicate"` for relationships/properties.
    - `"orkgc:Resource"` for dataset, models, or research objects.

    - Ensure:
    - Extracted terms are accurate representations of key entities and properties in the question.
    - The similarity scores reflect the degree of relevance.
    - The results are structured correctly for further processing in SPARQL query generation.

    The input natural language question is: "{question}"
    """

for i, (question, question_id) in enumerate(zip(questions, question_ids)):
    # Show the real question count instead of a hard-coded 513.
    print(f"Processing the {i}/{total_questions} question...............")

    # Insert the actual question text into the prompt.
    prompt = PROMPT_TEMPLATE.replace("{question}", str(question))
    print("Getting response from LLM...\n")

    try:
        response = get_response_from_llm(query_engine, prompt)
    except Exception as e:
        # Most likely a rate limit: wait briefly and retry exactly once.
        # A second failure propagates and stops the run; responses already
        # written to disk are kept.
        print(f"Error: {e}")
        time.sleep(10)
        response = get_response_from_llm(query_engine, prompt)
    print("------------------------------------------------------------")

    # Save the response to a txt file in the output directory. The explicit
    # UTF-8 encoding keeps non-ASCII model output from failing on platforms
    # whose default encoding is not UTF-8.
    with open(f"{output_dir}/{question_id}.txt", "w", encoding="utf-8") as f:
        f.write(str(response))
    print(f"Response for question {question_id} is saved successfully!")
    print("------------------------------------------------------------\n")
print("All questions are processed")