## Translating DBLP queries to OpenAlex with LLMs

In [10]:
# import libraries
import re
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON
from openai import OpenAI

## Testing meta-llama-3.1-8b-instruct with 100 queries
The model has 8 billion parameters and is one of the smallest models available at Academic Cloud.

In [None]:
# Connection settings for the chat endpoint and the model to query
api_key = '###'
base_url = "https://chat-ai.academiccloud.de/v1"
model = "meta-llama-3.1-8b-instruct"

# OpenAI client pointed at the custom base URL
client = OpenAI(api_key=api_key, base_url=base_url)

# Where the prompts come from and where the answers go
input_file = "../../data/DBLP_to_OpenAlex_input.json"
output_file = "zero_shot_entity_aligned_output_llama_dblp_openalex.json"

with open(input_file, "r", encoding="utf-8") as src:
    llm_input_data = json.load(src)

# One record is collected per entry the LLM answered without an error
translated_dataset = []

for entry in llm_input_data:
    context = entry["context"]

    # The prompt is assembled from pieces that already carry their own
    # newlines, so they are joined with an empty separator.
    prompt_parts = [
        f"Context:\n",
        f"Natural Language Question: {context['natural_language_question']}\n",
        f"SPARQL Query for KG1 ({context['kg1_name']}):\n",
        f"{context['sparql_query_kg1']}\n",
        f"Knowledge Graph 1 Name: {context['kg1_name']}\n",
        f"Knowledge Graph 2 Name: {context['kg2_name']}\n",
        f"Entity and Relation Mapping (ER2):\n{json.dumps(context['er2'], indent=2)}\n",
        f"Instruction: {context['instruction']}",
    ]
    prompt = "".join(prompt_parts)

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    try:
        # Send the request and keep the raw text of the first choice
        response = client.chat.completions.create(model=model, messages=messages)
        response_text = response.choices[0].message.content

        # Store the untouched LLM answer next to the context it was produced from
        translated_entry = {"context": context, "sparql_query_kg2": response_text}
        translated_dataset.append(translated_entry)
    except Exception as e:
        # Best effort: report the failing question and move on to the next
        # entry; entries that fail are absent from the output file.
        print(f"Error querying LLM for question: {context.get('natural_language_question', 'unknown')}: {e}")

# Persist every collected answer in a single JSON file
with open(output_file, "w", encoding="utf-8") as dst:
    json.dump(translated_dataset, dst, indent=4)

print(f"Translated SPARQL queries saved to '{output_file}'.")

Translated SPARQL queries saved to 'zero_shot_entity_aligned_output_llama_dblp_openalex.json'.


## Testing Mistral-Large-instruct with 100 queries
The model has 123 billion parameters and is the largest model available at Academic Cloud (https://chat-ai.academiccloud.de).

In [None]:
# Connection settings for the chat endpoint and the model to query
api_key = '###'
base_url = "https://chat-ai.academiccloud.de/v1"
model = "mistral-large-instruct"

# OpenAI client pointed at the custom base URL
client = OpenAI(api_key=api_key, base_url=base_url)

# Where the prompts come from and where the answers go
input_file = "../../data/DBLP_to_OpenAlex_input.json"
output_file = "zero_shot_entity_aligned_output_mistral_dblp_openalex.json"

with open(input_file, "r", encoding="utf-8") as src:
    llm_input_data = json.load(src)

# One record is collected per entry the LLM answered without an error
translated_dataset = []

for entry in llm_input_data:
    context = entry["context"]

    # The prompt is assembled from pieces that already carry their own
    # newlines, so they are joined with an empty separator.
    prompt_parts = [
        f"Context:\n",
        f"Natural Language Question: {context['natural_language_question']}\n",
        f"SPARQL Query for KG1 ({context['kg1_name']}):\n",
        f"{context['sparql_query_kg1']}\n",
        f"Knowledge Graph 1 Name: {context['kg1_name']}\n",
        f"Knowledge Graph 2 Name: {context['kg2_name']}\n",
        f"Entity and Relation Mapping (ER2):\n{json.dumps(context['er2'], indent=2)}\n",
        f"Instruction: {context['instruction']}",
    ]
    prompt = "".join(prompt_parts)

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    try:
        # Send the request and keep the raw text of the first choice
        response = client.chat.completions.create(model=model, messages=messages)
        response_text = response.choices[0].message.content

        # Store the untouched LLM answer next to the context it was produced from
        translated_entry = {"context": context, "sparql_query_kg2": response_text}
        translated_dataset.append(translated_entry)
    except Exception as e:
        # Best effort: report the failing question and move on to the next
        # entry; entries that fail are absent from the output file.
        print(f"Error querying LLM for question: {context.get('natural_language_question', 'unknown')}: {e}")

# Persist every collected answer in a single JSON file
with open(output_file, "w", encoding="utf-8") as dst:
    json.dump(translated_dataset, dst, indent=4)

print(f"Translated SPARQL queries saved to '{output_file}'.")

Translated SPARQL queries saved to 'zero_shot_entity_aligned_output_mistral_dblp_openalex.json'.


## Second Version Test
Both models struggle with the way OpenAlex models authorship, where `hasAuthorship` and `hasAuthor` have to be combined, which leads to poor performance. Since this pattern is very common in OpenAlex, a second version of the input dataset is used in which the pattern is also spelled out in the instruction given to the LLM.

In [None]:
# Define the new instruction.
# Adjacent string literals are concatenated verbatim, so the first fragment
# has to end with a space; without it the instruction reads "final,complete".
new_instruction = ("Given the information above, produce a SPARQL query for KG2. In your answer please highlight the final, "
                    "complete SPARQL query within the tags '<sparql>' and '</sparql>'. For the author use the following logic: hasAuthorship ?authorship . ?authorship :hasAuthor ?author . ?author and the ORCID \"https://dbpedia.org/ontology/orcidId\".")

# Load the input dataset (version 1) and choose the path for version 2
input_file = "../../data/DBLP_to_OpenAlex_input.json"
output_file = "../../data/DBLP_to_OpenAlex_input_v2.json"

with open(input_file, "r", encoding="utf-8") as file:
    dataset = json.load(file)

# Replace the instruction field in each entry; all other context fields are
# left as loaded.
for entry in dataset:
    entry["context"]["instruction"] = new_instruction

# Save the modified dataset as a separate file so the version 1 input stays intact
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(dataset, file, indent=4)

print(f"Modified dataset saved to: {output_file}")


Modified dataset saved to: ../data/DBLP_to_OpenAlex_input_v2.json


In [None]:
# Set up the LLM API connection
api_key = '###' 
base_url = "https://chat-ai.academiccloud.de/v1"
model = "meta-llama-3.1-8b-instruct"

# Start OpenAI client
client = OpenAI(
    api_key=api_key,
    base_url=base_url
)

# Load the input dataset (version 2, with the extended instruction)
input_file = "../../data/DBLP_to_OpenAlex_input_v2.json"
output_file = "zero_shot_entity_aligned_output_llama_dblp_openalex_v2.json"

with open(input_file, "r", encoding="utf-8") as file:
    llm_input_data = json.load(file)

# Limit to the first `max_queries` queries
max_queries = 10
llm_input_data = llm_input_data[:max_queries]

# The slice can yield fewer than `max_queries` entries, so the progress
# message reports the real number of queries instead of a fixed "10".
total_queries = len(llm_input_data)

# Initialize the list to store the responses
translated_dataset = []

# Query the LLM for each entry in the dataset
for i, entry in enumerate(llm_input_data):
    context = entry["context"]
    
    # Create the prompt for the LLM
    prompt = (
        f"Context:\n"
        f"Natural Language Question: {context['natural_language_question']}\n"
        f"SPARQL Query for KG1 ({context['kg1_name']}):\n"
        f"{context['sparql_query_kg1']}\n"
        f"Knowledge Graph 1 Name: {context['kg1_name']}\n"
        f"Knowledge Graph 2 Name: {context['kg2_name']}\n"
        f"Entity and Relation Mapping (ER2):\n{json.dumps(context['er2'], indent=2)}\n"
        f"Instruction: {context['instruction']}"
    )
    
    try:
        # Query the LLM
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            model=model
        )
        
        # Keep the raw text of the first choice; the SPARQL query is not
        # extracted from it in this cell.
        response_text = response.choices[0].message.content
        
        # Append the response to the translated dataset
        translated_entry = {
            "context": context,
            "sparql_query_kg2": response_text
        }
        translated_dataset.append(translated_entry)
        
        print(f"Processed query {i+1}/{total_queries}")

    except Exception as e:
        # Best effort: report the failing question and continue; entries that
        # fail are absent from the output file.
        print(f"Error querying LLM for question: {context.get('natural_language_question', 'unknown')}: {e}")
        continue

# Save the translated dataset to a new JSON file
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(translated_dataset, file, indent=4)

print(f"Translated SPARQL queries saved to '{output_file}'.")

Processed query 1/10
Processed query 2/10
Processed query 3/10
Processed query 4/10
Processed query 5/10
Processed query 6/10
Processed query 7/10
Processed query 8/10
Processed query 9/10
Processed query 10/10
Translated SPARQL queries saved to 'zero_shot_entity_aligned_output_llama_dblp_openalex_v2.json'.


In [None]:
# Set up the LLM API connection
api_key = '###' 
base_url = "https://chat-ai.academiccloud.de/v1"
model = "mistral-large-instruct"

# Start OpenAI client
client = OpenAI(
    api_key=api_key,
    base_url=base_url
)

# Load the input dataset (version 2, with the extended instruction)
input_file = "../../data/DBLP_to_OpenAlex_input_v2.json"
output_file = "zero_shot_entity_aligned_output_mistral_dblp_openalex_v2.json"

with open(input_file, "r", encoding="utf-8") as file:
    llm_input_data = json.load(file)

# Limit to the first `max_queries` queries
max_queries = 10
llm_input_data = llm_input_data[:max_queries]

# The slice can yield fewer than `max_queries` entries, so the progress
# message reports the real number of queries instead of a fixed "10".
total_queries = len(llm_input_data)

# Initialize the list to store the responses
translated_dataset = []

# Query the LLM for each entry in the dataset
for i, entry in enumerate(llm_input_data):
    context = entry["context"]
    
    # Create the prompt for the LLM
    prompt = (
        f"Context:\n"
        f"Natural Language Question: {context['natural_language_question']}\n"
        f"SPARQL Query for KG1 ({context['kg1_name']}):\n"
        f"{context['sparql_query_kg1']}\n"
        f"Knowledge Graph 1 Name: {context['kg1_name']}\n"
        f"Knowledge Graph 2 Name: {context['kg2_name']}\n"
        f"Entity and Relation Mapping (ER2):\n{json.dumps(context['er2'], indent=2)}\n"
        f"Instruction: {context['instruction']}"
    )
    
    try:
        # Query the LLM
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            model=model
        )
        
        # Keep the raw text of the first choice; the SPARQL query is not
        # extracted from it in this cell.
        response_text = response.choices[0].message.content
        
        # Append the response to the translated dataset
        translated_entry = {
            "context": context,
            "sparql_query_kg2": response_text
        }
        translated_dataset.append(translated_entry)
        
        print(f"Processed query {i+1}/{total_queries}")

    except Exception as e:
        # Best effort: report the failing question and continue; entries that
        # fail are absent from the output file.
        print(f"Error querying LLM for question: {context.get('natural_language_question', 'unknown')}: {e}")
        continue

# Save the translated dataset to a new JSON file
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(translated_dataset, file, indent=4)

print(f"Translated SPARQL queries saved to '{output_file}'.")

Processed query 1/10
Processed query 2/10
Processed query 3/10
Processed query 4/10
Processed query 5/10
Processed query 6/10
Processed query 7/10
Processed query 8/10
Processed query 9/10
Processed query 10/10
Translated SPARQL queries saved to 'zero_shot_entity_aligned_output_mistral_dblp_openalex_v2.json'.


## Analysis of the results
So far, extracting the queries works better for the larger model (mistral-large-instruct) than for the smaller model (meta-llama-3.1-8b-instruct). The output of the smaller model is less structured, which makes the queries harder to extract. At first sight, the queries from the larger model also seem to work better.


Extracting the SPARQL queries from LLM output for **mistral-large-instruct**.