# Searching for articles in Wikidata

In [10]:
import requests
import numpy as np
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

WIKIDATA_URL = "https://www.wikidata.org/w/api.php"
model = SentenceTransformer('KBLab/sentence-bert-swedish-cased', device='cpu')


In [17]:
def search_wikidata(query, language="sv", limit=10, timeout=10):
    """Search Wikidata entities whose label or alias matches *query*.

    Args:
        query: Text to search for.
        language: Language code used both for matching (``language``) and
            for the returned labels/descriptions (``uselang``).
        limit: Maximum number of search results to request.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list stored under ``"search"`` in the API response (one dict per
        matching entity); an empty list if that key is absent.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        RuntimeError: If the API reports an error in the response payload.
    """
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": language,  # Language dictating how searches are made
        "uselang": language,   # Language of item description
        "limit": limit,        # Number of search results
        "search": query,
        # NOTE(review): the documented value for ``props`` on this action is
        # ``url``; a property id here is probably not a filter -- confirm.
        "props": "P625",
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(WIKIDATA_URL, params=params, timeout=timeout)
    response.raise_for_status()

    payload = response.json()
    if "error" in payload:
        # Surface the API's own message instead of an opaque KeyError.
        raise RuntimeError(f"Wikidata search failed: {payload['error']}")
    return payload.get("search", [])

# def get_wikipedia_article_from_qid(qid: str):
#     params = {
#         "action": "wbgetentities",
#         "format": "json",
#         # "lang": "sv",  # Language dictating how searches are made
#         "props": "sitelinks",
#         "ids": qid,
#         "sitefilter": "svwiki",
#         # "uselang": "sv",   # Language of item description
#         # "limit": 10,       # Number of search results
#         # "search": query,
#     }
#     response = requests.get(WIKIDATA_URL, params=params)
#     return response.json()

def search_property(qid, prop='P625', timeout=10):
    """Fetch the value of the first *prop* statement of a Wikidata entity.

    Args:
        qid: Entity id to look up (e.g. ``"Q90"``).
        prop: Property id to read; defaults to ``P625``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``value`` of the first statement's main snak, or ``None`` (after
        printing a short explanation) when the entity is not found, has no
        statement for *prop*, or the statement carries no concrete value.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
    """
    params = {
        "action": "wbgetentities",
        "format": "json",
        # "sv" is Swedish, matching the rest of this notebook ("se" is
        # Northern Sami).
        "languages": "sv",
        "ids": qid,
        "props": "claims",
    }
    response = requests.get(WIKIDATA_URL, params=params, timeout=timeout)
    response.raise_for_status()

    # An error payload has no "entities" key; treat that as "not found"
    # instead of raising KeyError.
    data = response.json().get('entities', {}).get(qid)
    # NOTE(review): the API appears to flag unknown entities with a "missing"
    # key -- confirm against the wbgetentities documentation.
    if not data or 'missing' in data:
        print(f"QID: {qid} was not found")
        return None

    prop_claim = data.get('claims', {}).get(prop)
    if not prop_claim:
        print(f"Entity: {qid} does not have property: {prop}")
        return None

    # Only the first statement is used. Statements marked "no value" or
    # "unknown value" have no datavalue, so guard against the missing key.
    datavalue = prop_claim[0].get('mainsnak', {}).get('datavalue')
    if datavalue is None:
        print(f"Entity: {qid} has no concrete value for property: {prop}")
        return None

    return datavalue['value']

def get_qid(entity):
    """Return the value stored under the ``'id'`` key of *entity*.

    Raises ``KeyError`` when *entity* has no ``'id'`` key.
    """
    qid = entity['id']
    return qid

def get_description(entity):
    """Return ``entity['display']['description']['value']`` or ``''``.

    Any missing (or empty) level of the nested structure yields an empty
    string instead of raising.
    """
    # The fallbacks must be dicts: a string fallback has no ``.get`` and
    # would raise AttributeError on the next lookup.
    display = entity.get('display') or {}
    description = display.get('description') or {}
    return description.get('value', '')


In [6]:
# Sample inputs for the matching experiment below. Each entry is a
# [search term, text] pair: the first element is sent to the Wikidata search,
# the second is embedded and compared against the candidate descriptions.
# The texts contain HTML markup and appear to be cut off mid-sentence.
examples = [
    ["Paris", "<b>Paris</b> [franskt utt. pari], Frankrikes hufvudstad, näst London Europas folkrikaste stad, ligger under 2° 20' 15\" ö. lgd samt 48° 50' 11,2\" n. br. (nationalobservatoriet), på båda sidor om Seine,"],
    ["Fantasi", "<b>Fantasi</b> (Grek. <i>fantasia,</i> af <i>fantazein,</i> göra synbar), föreställningsförmåga, inbillningskraft; diktningsgåfva; inbillning, infall, nyck, hugskott; feberdröm; musikstycke utan bestä"],
    ["Åkarp", "<b>Åkarp.</b> 1. Socknar. Se Norra Åkarp och Södra Åkarp. - 2. Municipalsamhälle (jämlikt k. br. 21 nov. 1913) i Malmöhus län, Burlöfs och Tottarps socknar, beläget vid statens järnvägar, 8 km. från L"],
]

In [None]:
# search_term = "åkarp"
# result = search_wikidata(search_term)

# for search_item in result:
#     print(search_item)

## Compare with Wikidata descriptions using cosine similarity

In [27]:
# For every example: search Wikidata for the term, embed the description of
# each candidate with KB-SBERT, pick the candidate whose description is most
# similar (cosine similarity) to the example text, and print its coordinates.
results = []
for e in examples:
    search_term = e[0]
    result = search_wikidata(search_term)
    example_vector = model.encode(e[1])

    # Candidates without a description get an all-zero vector sized like the
    # example embedding (no hard-coded dimension), so indices in `vectors`
    # still line up with indices in `result`.
    vectors = [
        model.encode(item["description"])
        if "description" in item
        else np.zeros_like(example_vector)
        for item in tqdm(result)
    ]
    if not vectors:
        # cosine_similarity cannot handle an empty candidate list.
        print(f"{e[0]}: no search results")
        continue

    scores = cosine_similarity([example_vector], vectors)[0]

    # First index with the highest score.
    best_match_index = int(np.argmax(scores))

    qid = get_qid(result[best_match_index])
    coords = search_property(qid)
    if coords is None:
        print(f"{e[0]}: None")
    else:
        print(f"{e[0]}: {coords['latitude']}, {coords['longitude']}")

    # results.append(result[best_match_index])

# results

# TODO: return the candidate with the highest cosine similarity;
# if it has the P625 property (coordinate location), update the stored
# record (the Qdrant entry, the JSON file, or similar).


  0%|          | 0/10 [00:00<?, ?it/s]

Paris: 48.85666666666667, 2.352222222222222


  0%|          | 0/10 [00:00<?, ?it/s]

Entity: Q96634588 does not have property: P625
Fantasi: None


  0%|          | 0/10 [00:00<?, ?it/s]

Åkarp: 55.653998464109, 13.110953208687


In [21]:

# Embed only the text of each example (element 1); encoding the whole
# [term, text] pair would yield two vectors per example.
example_vectors = [model.encode(e[1]) for e in examples]

# `vectors` and `result` are left over from the last iteration of the search
# loop in the previous cell, so they belong to the last example. Score that
# example's text against its candidates and keep the scores.
scores = cosine_similarity([example_vectors[-1]], vectors)[0]
best_match_index = int(np.argmax(scores))

result[best_match_index]
# top_scores_ids = np.argsort(scores)[-5:][::-1]  # Select top-5 with vectors the largest scores   



{'id': 'Q2402092',
 'title': 'Q2402092',
 'pageid': 2320476,
 'display': {'label': {'value': 'Åkarp', 'language': 'sv'},
  'description': {'value': 'tätort i Burlövs kommun, Sverige',
   'language': 'sv'}},
 'repository': 'wikidata',
 'concepturi': 'http://www.wikidata.org/entity/Q2402092',
 'label': 'Åkarp',
 'description': 'tätort i Burlövs kommun, Sverige',
 'match': {'type': 'label', 'language': 'sv', 'text': 'Åkarp'}}