In [2]:
import ast
import json
import logging
import sys
import urllib.parse

import requests
from bs4 import BeautifulSoup
from SPARQLWrapper import SPARQLWrapper, JSON
from tenacity import retry, wait_random_exponential, before_sleep_log
from tqdm import tqdm

# Send log records handled by the root logger to stderr.
logging.basicConfig(stream=sys.stderr)

# Logger passed to tenacity's before_sleep hook below; it only lets
# ERROR-level (and more severe) records through.
logger = logging.getLogger(name='Verifier')
logger.setLevel(level=logging.ERROR)

In [3]:
@retry(wait=wait_random_exponential(multiplier=1, max=60), before_sleep=before_sleep_log(logger, logging.ERROR))
def get_property_description(property_id):
    """Return the English description of a Wikidata property, or '' if none is found.

    Sends a SPARQL query to the public Wikidata endpoint asking for the English
    label and English description of ``wd:<property_id>`` and returns the
    description from the first result row.

    Args:
        property_id: Wikidata identifier, interpolated verbatim into the query
            (e.g. "P27").

    Returns:
        The description string, or '' when the query yields no row. The query
        pattern requires both an English label and an English description, so
        an item lacking either produces no row.

    Note:
        The ``@retry`` decorator has no stop condition: any exception raised
        in here (an HTTP 429, but also a malformed query) is retried
        indefinitely with randomized exponential backoff capped at 60 s, and
        each retry is logged at ERROR level.
    """
    # Initialize the SPARQL wrapper with the endpoint
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    # Define the SPARQL query
    query = f"""
    SELECT ?property ?propertyLabel ?propertyDescription
    WHERE {{
      wd:{property_id} rdfs:label ?propertyLabel .
      wd:{property_id} schema:description ?propertyDescription .
      FILTER (lang(?propertyLabel) = 'en')
      FILTER (lang(?propertyDescription) = 'en')
    }}
    LIMIT 1
    """

    # Set the query to the wrapper
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    # Execute the query and parse the results
    bindings = sparql.query().convert()["results"]["bindings"]
    if not bindings:
        return ''

    # '' is the single "missing" sentinel: a truthy placeholder string would be
    # indistinguishable from a real description for callers that test truthiness.
    return bindings[0].get("propertyDescription", {}).get("value", '')

# Example usage: look up one property; the cell displays the returned description.
property_id = "P27"  # any Wikidata property ID can be substituted here
get_property_description(property_id)

'the object is a country that recognizes the subject as its citizen'

In [4]:
# Build {relation ID -> English label} from a file holding one record per line.
relation_mapping = {}
with open("id2name_mappings/relation_mapping.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Never eval() file contents: parse as JSON, and fall back to a safe
        # Python-literal parse for lines that are dict reprs rather than JSON.
        try:
            mapping = json.loads(line)
        except json.JSONDecodeError:
            mapping = ast.literal_eval(line)
        relation_mapping[mapping["id"]] = mapping["en_label"]

len(relation_mapping)

1187

In [5]:
# Look up the description of every relation ID, one request per ID.
# The dict is filled inside the loop, so entries gathered so far remain
# available if the cell is interrupted.
rel2desc = {}

for rel_id in tqdm(relation_mapping):
    rel2desc[rel_id] = get_property_description(rel_id)

  6%|▌         | 74/1187 [00:19<07:18,  2.54it/s]ERROR:Verifier:Retrying __main__.get_property_description in 0.8186214600629081 seconds as it raised HTTPError: HTTP Error 429: Too Many Requests.
  9%|▊         | 103/1187 [00:28<04:58,  3.64it/s]ERROR:Verifier:Retrying __main__.get_property_description in 0.013864225393150242 seconds as it raised HTTPError: HTTP Error 429: Too Many Requests.
  9%|▉         | 107/1187 [00:30<05:26,  3.30it/s]ERROR:Verifier:Retrying __main__.get_property_description in 0.4323052546917714 seconds as it raised HTTPError: HTTP Error 429: Too Many Requests.
 10%|▉         | 115/1187 [00:32<05:15,  3.40it/s]ERROR:Verifier:Retrying __main__.get_property_description in 0.9185032659469747 seconds as it raised HTTPError: HTTP Error 429: Too Many Requests.
 11%|█▏        | 134/1187 [00:38<04:22,  4.00it/s]ERROR:Verifier:Retrying __main__.get_property_description in 0.9180896668923295 seconds as it raised HTTPError: HTTP Error 429: Too Many Requests.
 12%|█▏       

In [6]:
# Sanity check: the loop above stores one entry per key of relation_mapping.
len(rel2desc)

1187

In [7]:
# Write the relation ID -> description dict to disk as JSON.
with open('data/relation2description.json', 'w') as out_file:
    out_file.write(json.dumps(rel2desc))

In [8]:
@retry(wait=wait_random_exponential(multiplier=1, max=60), before_sleep=before_sleep_log(logger, logging.ERROR))
def get_entity_description(entity_id):
    """Return the English description of a Wikidata entity, or '' if none is found.

    Queries the public Wikidata SPARQL endpoint for the English label and
    English description of ``wd:<entity_id>``; the query pattern requires both
    to exist, otherwise no row comes back and '' is returned.

    Args:
        entity_id: Wikidata identifier, interpolated verbatim into the query.

    Returns:
        The description string of the first result row, or ''.

    Note:
        Any exception raised here is retried by the ``@retry`` decorator, which
        has no stop condition, with randomized exponential backoff (max 60 s).
    """
    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    endpoint.setReturnFormat(JSON)
    endpoint.setQuery(f"""
    SELECT ?entity ?entityLabel ?entityDescription
    WHERE {{
      wd:{entity_id} rdfs:label ?entityLabel .
      wd:{entity_id} schema:description ?entityDescription .
      FILTER (lang(?entityLabel) = 'en')
      FILTER (lang(?entityDescription) = 'en')
    }}
    LIMIT 1
    """)

    rows = endpoint.query().convert()["results"]["bindings"]
    # Guard clause: no matching row means no description.
    if not rows:
        return ''
    return rows[0]["entityDescription"]["value"]

# Example usage. When this notebook was run, querying this ID directly found
# no description; resolving it through get_redirected_id first did.
entity_id = "Q18154878"
description = get_entity_description(entity_id)
print(
    f"Description for entity {entity_id}: {description}"
    if description
    else f"No description found for entity {entity_id}"
)

No description found for entity Q18154878


In [9]:
entity_id = "Q18154878"  # same example ID as in the previous cell

@retry(wait=wait_random_exponential(multiplier=1, max=60), before_sleep=before_sleep_log(logger, logging.ERROR))
def get_redirected_id(entity_id):
    """Resolve a Wikidata ID to the ID named in the page's ``Link`` response header.

    Requests ``https://www.wikidata.org/wiki/<entity_id>`` and extracts the
    last path segment (without file extension) of the first target listed in
    the ``Link`` header.

    Args:
        entity_id: Wikidata identifier to look up.

    Returns:
        The extracted ID string, or None when the page does not answer with
        HTTP 200 or carries no usable ``Link`` header.

    Raises:
        Nothing reaches the caller in practice: HTTP 429/5xx responses and
        network errors raise inside the function so that the ``@retry``
        decorator (which has no stop condition) backs off and tries again.
    """
    url = f"https://www.wikidata.org/wiki/{entity_id}"
    # Without a timeout a stalled connection would block the notebook forever;
    # a timeout raises and is handled by the retry decorator.
    response = requests.get(url, timeout=30)

    # requests does not raise on HTTP error statuses by itself, so throttling
    # and server errors must be raised explicitly for @retry to see them.
    # Otherwise a rate-limited lookup would be reported as "no such entity".
    if response.status_code == 429 or response.status_code >= 500:
        response.raise_for_status()
    if response.status_code != 200:
        return None

    # A missing header is a definitive answer, not a transient failure:
    # indexing it directly would raise KeyError and be retried endlessly.
    link_header = response.headers.get('link')
    if not link_header:
        return None

    # First link target, presumably of the form <.../Special:EntityData/Q123.json>.
    # Keep the last path segment, drop the extension and any stray angle bracket.
    first_target = link_header.split(";")[0]
    new_id = first_target.split("/")[-1].split(".")[0].strip("<> ")
    return new_id or None

In [10]:
# Resolve the ID via get_redirected_id first, then fetch the description of the result.
get_entity_description(get_redirected_id('Q18154878'))

'battery-powered PC in a horizontal clamshell design with integrated keyboard and display, about the size of a pocket calculator'

In [11]:
# Build {entity ID -> English label} from a file holding one record per line.
entity_mapping = {}
with open("id2name_mappings/entity_mapping.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Never eval() file contents: parse as JSON, and fall back to a safe
        # Python-literal parse for lines that are dict reprs rather than JSON.
        try:
            mapping = json.loads(line)
        except json.JSONDecodeError:
            mapping = ast.literal_eval(line)
        entity_mapping[mapping["id"]] = mapping["en_label"]

len(entity_mapping)

6427497

In [12]:
# Load entities.json; a later cell iterates over it and uses each element as
# a key into ent2desc.
entities = []
with open("constrained_worlds/genie/entities.json", "r") as entities_file:
    entities = json.loads(entities_file.read())
len(entities)

2724925

In [13]:
# Load the pre-built lookup keyed by entity ID; a later cell copies its
# values for the entities it contains.
ent2desc = {}
with open("id2label_and_desc.json", "r") as lookup_file:
    ent2desc = json.loads(lookup_file.read())
len(ent2desc)

107674454

In [14]:
# Split the entities into those present in ent2desc (their record is copied so
# the big lookup is never aliased) and those that are missing from it.
id2desc_intersected = {}
not_found = []
for ent in tqdm(entities):
    try:
        record = ent2desc[ent]
    except KeyError:
        not_found.append(ent)
    else:
        id2desc_intersected[ent] = record.copy()

100%|██████████| 2724925/2724925 [00:06<00:00, 423747.71it/s]


In [15]:
# For every entity missing from ent2desc: resolve its ID via get_redirected_id
# and fetch the description of the resolved ID; '' marks "nothing found".
not_found_desc = {}

for ent_id in tqdm(not_found):
    new_id = get_redirected_id(ent_id)
    desc = get_entity_description(new_id) if new_id else ''
    not_found_desc[ent_id] = desc
    

  0%|          | 1/5406 [00:00<36:20,  2.48it/s]ERROR:Verifier:Retrying __main__.get_entity_description in 0.1902415194169934 seconds as it raised HTTPError: HTTP Error 429: Too Many Requests.
  0%|          | 2/5406 [00:01<59:30,  1.51it/s]ERROR:Verifier:Retrying __main__.get_entity_description in 0.7452260995701065 seconds as it raised HTTPError: HTTP Error 429: Too Many Requests.
  0%|          | 5/5406 [00:03<57:34,  1.56it/s]  ERROR:Verifier:Retrying __main__.get_entity_description in 0.041377366729226206 seconds as it raised HTTPError: HTTP Error 429: Too Many Requests.
  1%|          | 38/5406 [00:18<42:09,  2.12it/s]ERROR:Verifier:Retrying __main__.get_entity_description in 0.7214968881992617 seconds as it raised HTTPError: HTTP Error 429: Too Many Requests.
  1%|          | 53/5406 [00:26<42:49,  2.08it/s]  ERROR:Verifier:Retrying __main__.get_entity_description in 0.9007073127980582 seconds as it raised HTTPError: HTTP Error 429: Too Many Requests.
 24%|██▍       | 1311/5406 

In [16]:
# Write the descriptions gathered for the missing entities to disk as JSON.
with open('not_found_desc.json', 'w') as out_file:
    out_file.write(json.dumps(not_found_desc))