## Translating DBpedia queries to Wikidata with LLMs

In [4]:
# import all the necessary libraries
import re
import json
from openai import OpenAI

Create dataset for asking the LLMs

In [None]:
# Load the dataset from 100_complete_entries.json
with open("100_complete_entries.json", "r") as file:
    data = json.load(file)

# Create input dataset for the LLM
llm_inputs = []

for entry in data:
    # Extract required fields
    question = entry.get("question", "")  # Natural language question
    dbpedia_query = entry.get("dbpedia_query", "")  # SPARQL query for DBpedia
    
    # Extract entities and relations (ER2) in Wikidata
    er2 = [
        {
            "dbpedia_id": er["dbpedia_id"],
            "wikidata_ids": er["wikidata_ids"]
        }
        for er in entry.get("mapped_entities_relations", {}).get("entities_relations", [])
        if er["wikidata_ids"]  # Only include non-empty Wikidata mappings
    ]
    
    # Skip if there are no valid Wikidata mappings
    if not er2:
        continue

    # Construct the input for the LLM
    llm_input = {
        "context": {
            "natural_language_question": question,
            "sparql_query_kg1": dbpedia_query,
            "kg1_name": "DBpedia",
            "kg2_name": "Wikidata",
            "er2": er2,
            "instruction": "Given the information above, produce a SPARQL query for KG2."
        }
    }
    llm_inputs.append(llm_input)

# Save the processed dataset to a new JSON file
with open("llm_input_dataset.json", "w") as file:
    json.dump(llm_inputs, file, indent=4)

print(f"Processed dataset saved to 'llm_input_dataset.json'.")

Processed dataset saved to 'llm_input_dataset.json'.


## First test with 10 sample queries
Using meta-llama-3.1-8b-instruct for the first test of translating the queries from DBpedia to Wikidata. The connection works and the model is able to translate the queries. The output is quite large, because the model also explains what as been done. Should the prompt be changed to get a more concise output, only the complete query?

**meta-llama-3.1-8b-instruct is the smallest available model at Academic Cloud https://chat-ai.academiccloud.de/chat, with 8billion parameters**

In [None]:
# Set up the LLM API connection
api_key = '###'
base_url = "https://chat-ai.academiccloud.de/v1"
model = "meta-llama-3.1-8b-instruct"  # Replace with the appropriate model

# Start OpenAI client
client = OpenAI(
    api_key=api_key,
    base_url=base_url
)

# Load the input dataset
with open("llm_input_dataset.json", "r") as file:
    llm_input_data = json.load(file)

# Initialize the list to store the responses
translated_dataset = []

# Limit to the first 10 queries
limited_input_data = llm_input_data[:10]

# Query the LLM for each entry in the limited dataset
for entry in limited_input_data:
    context = entry["context"]
    
    # Create the prompt for the LLM
    prompt = (
        f"Context:\n"
        f"Natural Language Question: {context['natural_language_question']}\n"
        f"SPARQL Query for KG1 ({context['kg1_name']}):\n"
        f"{context['sparql_query_kg1']}\n"
        f"Knowledge Graph 1 Name: {context['kg1_name']}\n"
        f"Knowledge Graph 2 Name: {context['kg2_name']}\n"
        f"Entity and Relation Mapping (ER2):\n{json.dumps(context['er2'], indent=2)}\n"
        f"Instruction: {context['instruction']}"
    )
    
    try:
        # Query the LLM
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            model=model
        )
        
        # Extract the generated SPARQL query from the response
        response_text = response.choices[0].message.content
        
        # Append the response to the translated dataset
        translated_entry = {
            "context": context,
            "sparql_query_kg2": response_text  # Add the translated SPARQL query
        }
        translated_dataset.append(translated_entry)
    
    except Exception as e:
        print(f"Error querying LLM for question ID {context.get('natural_language_question', 'unknown')}: {e}")
        continue

# Save the translated dataset to a new JSON file
with open("translated_llm_output_10_queries_meta-llama-3.1-8b.json", "w") as file:
    json.dump(translated_dataset, file, indent=4)

print("Translated SPARQL queries for 10 questions saved to 'translated_llm_output_10_queries_meta-llama-3.1-8b.json'.")

Translated SPARQL queries for 10 questions saved to 'translated_llm_output_10_queries.json'.


## Test for Mistral-Large-Instruct 
The model includes 123billion parameters and is the largest available model at Academic Cloud https://chat-ai.academiccloud.de

In [None]:
# Set up the LLM API connection
api_key = '###'
base_url = "https://chat-ai.academiccloud.de/v1"
model = "mistral-large-instruct"  # Updated to use the new model

# Start OpenAI client
client = OpenAI(
    api_key=api_key,
    base_url=base_url
)

# Load the input dataset
with open("llm_input_dataset.json", "r") as file:
    llm_input_data = json.load(file)

# Initialize the list to store the responses
translated_dataset = []

# Limit to the first 10 queries
limited_input_data = llm_input_data[:10]

# Query the LLM for each entry in the limited dataset
for entry in limited_input_data:
    context = entry["context"]
    
    # Create the prompt for the LLM
    prompt = (
        f"Context:\n"
        f"Natural Language Question: {context['natural_language_question']}\n"
        f"SPARQL Query for KG1 ({context['kg1_name']}):\n"
        f"{context['sparql_query_kg1']}\n"
        f"Knowledge Graph 1 Name: {context['kg1_name']}\n"
        f"Knowledge Graph 2 Name: {context['kg2_name']}\n"
        f"Entity and Relation Mapping (ER2):\n{json.dumps(context['er2'], indent=2)}\n"
        f"Instruction: {context['instruction']}"
    )
    
    try:
        # Query the LLM
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            model=model
        )
        
        # Extract the generated SPARQL query from the response
        response_text = response.choices[0].message.content
        
        # Append the response to the translated dataset
        translated_entry = {
            "context": context,
            "sparql_query_kg2": response_text  # Add the translated SPARQL query
        }
        translated_dataset.append(translated_entry)
    
    except Exception as e:
        print(f"Error querying LLM for question ID {context.get('natural_language_question', 'unknown')}: {e}")
        continue

# Save the translated dataset to a new JSON file
with open("translated_llm_output_10_queries_mistral.json", "w") as file:
    json.dump(translated_dataset, file, indent=4)

print("Translated SPARQL queries for 10 questions saved to 'translated_llm_output_10_queries_mistral.json'.")

Translated SPARQL queries for 10 questions saved to 'translated_llm_output_10_queries_mistral.json'.


## Testing meta-llama-3.1-8b-instruct with 100 queries

In [None]:
# Set up the LLM API connection
api_key = '###'
base_url = "https://chat-ai.academiccloud.de/v1"
model = "meta-llama-3.1-8b-instruct"  # Replace with the appropriate model

# Start OpenAI client
client = OpenAI(
    api_key=api_key,
    base_url=base_url
)

# Load the input dataset
with open("llm_input_dataset.json", "r") as file:
    llm_input_data = json.load(file)

# Initialize the list to store the responses
translated_dataset = []

# Query the LLM for each entry in the dataset
for entry in llm_input_data:
    context = entry["context"]
    
    # Create the prompt for the LLM
    prompt = (
        f"Context:\n"
        f"Natural Language Question: {context['natural_language_question']}\n"
        f"SPARQL Query for KG1 ({context['kg1_name']}):\n"
        f"{context['sparql_query_kg1']}\n"
        f"Knowledge Graph 1 Name: {context['kg1_name']}\n"
        f"Knowledge Graph 2 Name: {context['kg2_name']}\n"
        f"Entity and Relation Mapping (ER2):\n{json.dumps(context['er2'], indent=2)}\n"
        f"Instruction: {context['instruction']}"
    )
    
    try:
        # Query the LLM
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            model=model
        )
        
        # Extract the generated SPARQL query from the response
        response_text = response.choices[0].message.content
        
        # Append the response to the translated dataset
        translated_entry = {
            "context": context,
            "sparql_query_kg2": response_text  # Add the translated SPARQL query
        }
        translated_dataset.append(translated_entry)
    
    except Exception as e:
        print(f"Error querying LLM for question ID {context.get('natural_language_question', 'unknown')}: {e}")
        continue

# Save the translated dataset to a new JSON file
with open("translated_llm_output_meta-llama-3.1-8b.json", "w") as file:
    json.dump(translated_dataset, file, indent=4)

print("Translated SPARQL queries for all questions saved to 'translated_llm_output_meta-llama-3.1-8b.json'.")


Translated SPARQL queries for all questions saved to 'translated_llm_output_meta-llama-3.1-8b.json'.


## Testing Mistral-Large-instruct with 100 queries

In [None]:
# Set up the LLM API connection
api_key = '###'
base_url = "https://chat-ai.academiccloud.de/v1"
model = "mistral-large-instruct"

# Start OpenAI client
client = OpenAI(
    api_key=api_key,
    base_url=base_url
)

# Load the input dataset
with open("llm_input_dataset.json", "r") as file:
    llm_input_data = json.load(file)

# Initialize the list to store the responses
translated_dataset = []

# Query the LLM for each entry in the dataset
for entry in llm_input_data:
    context = entry["context"]
    
    # Create the prompt for the LLM
    prompt = (
        f"Context:\n"
        f"Natural Language Question: {context['natural_language_question']}\n"
        f"SPARQL Query for KG1 ({context['kg1_name']}):\n"
        f"{context['sparql_query_kg1']}\n"
        f"Knowledge Graph 1 Name: {context['kg1_name']}\n"
        f"Knowledge Graph 2 Name: {context['kg2_name']}\n"
        f"Entity and Relation Mapping (ER2):\n{json.dumps(context['er2'], indent=2)}\n"
        f"Instruction: {context['instruction']}"
    )
    
    try:
        # Query the LLM
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            model=model
        )
        
        # Extract the generated SPARQL query from the response
        response_text = response.choices[0].message.content
        
        # Append the response to the translated dataset
        translated_entry = {
            "context": context,
            "sparql_query_kg2": response_text  # Add the translated SPARQL query
        }
        translated_dataset.append(translated_entry)
    
    except Exception as e:
        print(f"Error querying LLM for question ID {context.get('natural_language_question', 'unknown')}: {e}")
        continue

# Save the translated dataset to a new JSON file
with open("translated_llm_output_mistral.json", "w") as file:
    json.dump(translated_dataset, file, indent=4)

print("Translated SPARQL queries for all questions saved to 'translated_llm_output_mistral.json'.")


Translated SPARQL queries for all questions saved to 'translated_llm_output_mistral.json'.


## Analysis of the results
For now it can be seen that the extraction process for the larger model (mistral-large-instruct) is working better than for the smaller model (meta-llama-3.1-8b-instruct). The output of the smaller model is not that structured making it harder to extract the queries. On the first sight it also seems that the queries from the larger model are working better.


Extracting the sprql queries from LLM output for **mistral-large-instruct**.

In [26]:
# Load the input JSON file
file_path = "translated_llm_output_mistral.json"
with open(file_path, "r") as file:
    data = json.load(file)

# Function to extract SPARQL query from the `sparql_query_kg2` field
def extract_sparql_query(entry):
    raw_query = entry.get("sparql_query_kg2", "")
    # Extract content between SPARQL code blocks (```sparql and ``` or similar markers)
    match = re.search(r"```sparql\n(.*?)\n```", raw_query, re.DOTALL)
    if match:
        return match.group(1).replace("\n", " ").strip()  # Remove newline characters and trim whitespace
    return None

# Extract SPARQL queries
queries_with_context = []
for entry in data:
    sparql_query = extract_sparql_query(entry)
    if sparql_query:
        queries_with_context.append({
            "natural_language_question": entry["context"]["natural_language_question"],
            "sparql_query_kg2": sparql_query
        })

# Save the extracted queries to a new JSON file
output_file = "mistral_wiki_trans_sparql_queries.json"
with open(output_file, "w") as file:
    json.dump(queries_with_context, file, indent=4)

print(f"Extracted SPARQL queries for the first 10 entries have been saved to {output_file}.")

Extracted SPARQL queries for the first 10 entries have been saved to mistral_wiki_trans_sparql_queries.json.


Extracting the sprql queries from LLM output for **lama-3.1-8b-instruct**.

Working best so far for lama-3.1-8b-instruct but still need adjustments to extract the queries more reliably. 

In [27]:
# Load the JSON file
input_file_path = 'translated_llm_output_meta-llama-3.1-8b.json'
output_file_path = 'lama_wiki_trans_sparql_queries.json'

with open(input_file_path, 'r') as file:
    data = json.load(file)

# Function to extract clean SPARQL queries
def extract_sparql_query(sparql_raw):
    # Extract query inside code block or with SPARQL syntax markers
    sparql_match = re.search(r"```sparql\n(.*?)\n```|PREFIX.*?WHERE\s*\{.*?\}", sparql_raw, re.DOTALL)
    if sparql_match:
        # Clean query by removing extra whitespace and comments
        query = sparql_match.group(0)
        query = re.sub(r"#.*", "", query)  # Remove comments
        query = re.sub(r"\s+", " ", query).strip()  # Remove extra spaces/newlines
        return query
    return None

# Process the data to extract required fields
result = []
for entry in data:
    context = entry.get('context', {})
    natural_language_question = context.get('natural_language_question', None)
    sparql_query_raw = entry.get('sparql_query_kg2', '')

    # Extract and clean SPARQL query
    sparql_query = extract_sparql_query(sparql_query_raw)

    if natural_language_question and sparql_query:
        result.append({
            "natural_language_question": natural_language_question,
            "sparql_query": sparql_query
        })

# Save the cleaned and extracted data to a new JSON file
with open(output_file_path, 'w') as output_file:
    json.dump(result, output_file, indent=4)

print(f"Cleaned and extracted data has been saved to {output_file_path}.")


Cleaned and extracted data has been saved to lama_wiki_trans_sparql_queries.json.


Not working that well but could maybe help later on. 

In [None]:
# Load the JSON file
file_path = "translated_llm_output_meta-llama-3.1-8b.json"
with open(file_path, "r") as file:
    data = json.load(file)

# Function to extract SPARQL query from the sparql_query_kg2 field
def extract_sparql_query(sparql_text):
    try:
        # Find the start of the SPARQL query
        start_idx = sparql_text.find("```sparql")
        if start_idx == -1:
            return None  # No SPARQL query found
        
        # Find the end of the SPARQL query
        end_idx = sparql_text.find("```", start_idx + 8)
        if end_idx == -1:
            end_idx = len(sparql_text)
        
        # Extract the SPARQL query
        sparql_query = sparql_text[start_idx + 8:end_idx].strip()
        return sparql_query
    except Exception as e:
        print(f"Error extracting SPARQL query: {e}")
        return None

# Extract SPARQL queries for all entries
extracted_queries = []
for entry in data:
    sparql_text = entry.get("sparql_query_kg2", "")
    sparql_query = extract_sparql_query(sparql_text)
    if sparql_query:
        extracted_queries.append(sparql_query)

# Save the extracted queries to a new JSON file
output_file = "lama2_wiki_trans_sparql_queries.json"
with open(output_file, "w") as output_file:
    json.dump(extracted_queries, output_file, indent=4)

print(f"Extracted SPARQL queries saved to {output_file}.")

Extracted SPARQL queries saved to <_io.TextIOWrapper name='lama2_wiki_trans_sparql_queries.json' mode='w' encoding='cp1252'>.
