# Data preparation: DBLP to OpenAlex

In [None]:
import json
import random
import requests
import time

### DBLP Query selection 
**Purpose**
This script extracts a balanced selection of 100 questions from the DBLP-QuAD dataset, ensuring that:

- Only specific templates (TP01, TP02, etc.) are included, namely those whose questions can also be answered with OpenAlex.
- Questions from TP17 must have a valid ORCID (fetched via the DBLP SPARQL endpoint). The ORCID is stored with these questions so that it can later be included in the prompts created for OpenAlex.
- Questions are evenly distributed among template IDs.

In [None]:
# URL of the DBLP-QuAD training questions (raw JSON hosted on GitHub)
url = "https://raw.githubusercontent.com/awalesushil/DBLP-QuAD/refs/heads/main/data/DBLP-QuAD/train/questions.json"

# DBLP SPARQL endpoint used by the ORCID lookup and the query execution below
SPARQL_ENDPOINT = "https://sparql.dblp.org/sparql"

# Fetch the JSON data from the URL.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of trying to parse an error
# page as JSON.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

# List of template_ids to filter
template_ids = [
    "TP01", "TP02", "TP03", "TP05", "TP12",
    "TP17", "TP34", "TP52"
]

# Group the questions by template_id in a single pass over the dataset.
# Every requested template gets an entry (possibly an empty list); questions
# keep their original dataset order within each group.
filtered_questions = {tid: [] for tid in template_ids}
for q in data["questions"]:
    bucket = filtered_questions.get(q["template_id"])
    if bucket is not None:
        bucket.append(q)

# Function to query SPARQL endpoint for ORCID using full author URL
def get_orcid_from_sparql(author_url):
    """Look up the ORCID of a DBLP author via the DBLP SPARQL endpoint.

    Args:
        author_url: Full DBLP author IRI without the surrounding angle
            brackets (e.g. "https://dblp.org/pid/89/4335").

    Returns:
        The ORCID value of the first result binding, or None if no ORCID is
        found, the request fails, or the response cannot be interpreted.
    """
    query = f"""
    SELECT ?orcid WHERE {{
      <{author_url}> <https://dblp.org/rdf/schema#orcid> ?orcid
    }}
    """
    params = {"query": query, "format": "json"}

    try:
        response = requests.get(SPARQL_ENDPOINT, params=params, timeout=10)
        response.raise_for_status()
        payload = response.json()

        # Extract ORCID if found
        bindings = payload.get("results", {}).get("bindings", [])
        if bindings:
            return bindings[0]["orcid"]["value"]

    except (requests.exceptions.RequestException, ValueError, KeyError) as e:
        # ValueError: body was not valid JSON (older requests versions raise a
        # plain ValueError subclass here); KeyError: binding lacks the expected
        # "orcid"/"value" fields. Either way, treat it as "no ORCID" so one bad
        # response does not abort the whole selection run.
        print(f"Error querying ORCID for {author_url}: {e}")

    return None  # Return None if ORCID is not found

# Keep only those TP17 questions where at least one author resolves to an ORCID.
# Each kept question gets its "entities" list replaced by annotated dicts:
#   {"author": <original entity>, "orcid": <ORCID or None>} for author entities,
#   {"entity": <original entity>} for everything else.
valid_tp17_questions = []
for tp17_question in filtered_questions["TP17"]:
    annotated_entities = []

    for raw_entity in tp17_question["entities"]:
        iri = raw_entity.replace("<", "").replace(">", "")  # Strip the < > wrapper

        # Entities without "/pid/" are not authors: pass them through untouched
        if "/pid/" not in iri:
            annotated_entities.append({"entity": raw_entity})
            continue

        print(f"Fetching ORCID for author: {iri}")
        annotated_entities.append({"author": raw_entity, "orcid": get_orcid_from_sparql(iri)})
        time.sleep(1)  # Prevent API rate limiting

    # A question qualifies as soon as one of its authors has a non-empty ORCID
    if any(item.get("orcid") for item in annotated_entities):
        tp17_question["entities"] = annotated_entities
        valid_tp17_questions.append(tp17_question)

# From here on, the TP17 pool only contains the ORCID-validated questions
filtered_questions["TP17"] = valid_tp17_questions

# Determine the number of entries to select from each template_id
num_entries_per_template = 100 // len(template_ids)

# Randomly sample the per-template quota (or the whole pool if it is smaller)
selected_questions = {
    tid: random.sample(
        filtered_questions[tid],
        min(num_entries_per_template, len(filtered_questions[tid])),
    )
    for tid in template_ids
}

# Flatten the list of selected questions
final_selection = [q for questions in selected_questions.values() for q in questions]

# If the total number of selected questions is less than 100, sample additional questions.
# The top-up draws from the already filtered pools rather than the raw dataset,
# so TP17 questions that failed the ORCID check cannot slip back in. Questions
# that were already picked are excluded via their "id".
if len(final_selection) < 100:
    already_selected = {q["id"] for q in final_selection}
    remaining_questions = [
        q
        for tid in template_ids
        for q in filtered_questions[tid]
        if q["id"] not in already_selected
    ]
    additional_needed = 100 - len(final_selection)
    final_selection.extend(
        random.sample(remaining_questions, min(additional_needed, len(remaining_questions)))
    )

# Safety cap: never keep more than 100 questions
final_selection = final_selection[:100]

# Tally how many selected questions belong to each template_id
# (templates with no selected question are reported with a count of 0)
template_counts = dict.fromkeys(template_ids, 0)
for picked in final_selection:
    template_counts[picked["template_id"]] += 1

# Report the tally in template_id order
print("\nNumber of selected questions per template_id:")
for tid in sorted(template_counts):
    print(f"{tid}: {template_counts[tid]}")

# Wrap the selection in the same top-level structure as the source dataset
# and write it to disk
output_filename = "DBLP_100_questions.json"
final_data = {"questions": final_selection}
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(final_data, f, indent=4, ensure_ascii=False)

print(f"\nTotal selected questions: {len(final_selection)}")
print(f"Saved selected questions to {output_filename}")


Fetching ORCID for author: https://dblp.org/pid/89/4335
Fetching ORCID for author: https://dblp.org/pid/77/5249
Fetching ORCID for author: https://dblp.org/pid/33/2148
Fetching ORCID for author: https://dblp.org/pid/269/7922
Fetching ORCID for author: https://dblp.org/pid/88/6436
Fetching ORCID for author: https://dblp.org/pid/16/4029
Fetching ORCID for author: https://dblp.org/pid/159/5093
Fetching ORCID for author: https://dblp.org/pid/04/3170
Fetching ORCID for author: https://dblp.org/pid/126/0296
Fetching ORCID for author: https://dblp.org/pid/06/3884
Fetching ORCID for author: https://dblp.org/pid/51/7361
Fetching ORCID for author: https://dblp.org/pid/200/7948
Fetching ORCID for author: https://dblp.org/pid/31/117
Fetching ORCID for author: https://dblp.org/pid/18/6200
Fetching ORCID for author: https://dblp.org/pid/38/10526
Fetching ORCID for author: https://dblp.org/pid/98/7910
Fetching ORCID for author: https://dblp.org/pid/73/7523
Fetching ORCID for author: https://dblp.org/

**Purpose**
The goal of this script is to execute SPARQL queries from DBLP_100_questions.json against the DBLP SPARQL endpoint and store the responses. This allows for:
- Retrieving structured knowledge (e.g., authors, publications, venues) from DBLP.
- Ensuring data completeness by associating each question with its corresponding SPARQL result.
- Generating a structured dataset (DBLP_100_results.json) that combines questions and responses for further analysis.

In [None]:
# Load queries from the JSON file
input_filename = "DBLP_100_questions.json"
output_filename = "DBLP_100_results.json"

with open(input_filename, "r", encoding="utf-8") as f:
    data = json.load(f)

# One entry per question: the original question fields plus a "response" field
results = []

# Iterate over each question and execute the SPARQL query
for question in data["questions"]:
    query = question["query"]["sparql"]  # Extract the SPARQL query
    query_id = question["id"]  # Get the question ID

    # Define the request parameters
    params = {
        "query": query,
        "format": "json"
    }

    print(f"Querying {query_id}...")

    try:
        # Send request to the DBLP SPARQL endpoint
        response = requests.get(SPARQL_ENDPOINT, params=params, timeout=10)
        response.raise_for_status()
        query_result = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException. Record the failure instead
        # of aborting, so the results collected so far are not lost.
        print(f"Error querying {query_id}: {e}")
        query_result = {"error": str(e)}

    # Store a shallow copy of the original question together with either the
    # endpoint's response or the error message
    question_with_response = question.copy()
    question_with_response["response"] = query_result
    results.append(question_with_response)

# Save the results to a JSON file
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump({"questions": results}, f, indent=4, ensure_ascii=False)

print(f"\nSaved query results to {output_filename}")

Querying Q3731...
Querying Q3790...
Querying Q3633...
Querying Q3841...
Querying Q3806...
Querying Q3738...
Querying Q3612...
Querying Q3848...
Querying Q3781...
Querying Q3683...
Querying Q3595...
Querying Q3817...
Querying Q3518...
Querying Q3566...
Querying Q3536...
Querying Q3666...
Querying Q3822...
Querying Q3537...
Querying Q3578...
Querying Q3699...
Querying Q3762...
Querying Q3846...
Querying Q3686...
Querying Q3630...
Querying Q3775...
Querying Q3644...
Querying Q3771...
Querying Q3516...
Querying Q3832...
Querying Q3802...
Querying Q3576...
Querying Q3547...
Querying Q3663...
Querying Q3613...
Querying Q3709...
Querying Q3665...
Querying Q3548...
Querying Q3768...
Querying Q3641...
Querying Q3581...
Querying Q3585...
Querying Q3568...
Querying Q3844...
Querying Q3584...
Querying Q3606...
Querying Q3597...
Querying Q3769...
Querying Q3690...
Querying Q3999...
Querying Q3923...
Querying Q4057...
Querying Q4140...
Querying Q3932...
Querying Q3854...
Querying Q3852...
Querying Q

Next Steps:
- Run the corresponding queries against OpenAlex and collect the results.