# Searching for articles in Wikidata

In [1]:
import requests
import json
import numpy as np
from qdrant_client import QdrantClient
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Endpoint of the Wikidata api.php web API; the request helpers in this notebook send their queries here.
WIKIDATA_URL = "https://www.wikidata.org/w/api.php"
# Sentence-embedding model (Swedish, per its name) used to turn texts into vectors for
# cosine-similarity comparison. Explicitly loaded on the CPU.
model = SentenceTransformer('KBLab/sentence-bert-swedish-cased', device='cpu')


In [23]:
def search_wikidata(query, limit=10, language="sv", timeout=30):
    """Search Wikidata entities by label/alias using ``wbsearchentities``.

    Args:
        query: Text to search for.
        limit: Maximum number of search results to request.
        language: Language code used both for matching the query and for the
            labels/descriptions in the returned items.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The list stored under the ``'search'`` key of the API response, or
        ``None`` when the response has no such key (e.g. an API error payload).
    """
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": language,  # Language dictating how searches are made
        "uselang": language,   # Language of item description
        "limit": limit,        # Number of search results
        "search": query,
    }
    response = requests.get(WIKIDATA_URL, params=params, timeout=timeout)
    # Decode the body once; the payload is reused for both the check and the result.
    payload = response.json()
    return payload.get('search')

# def get_wikipedia_article_from_qid(qid: str):
#     params = {
#         "action": "wbgetentities",
#         "format": "json",
#         # "lang": "sv",  # Language dictating how searches are made
#         "props": "sitelinks",
#         "ids": qid,
#         "sitefilter": "svwiki",
#         # "uselang": "sv",   # Language of item description
#         # "limit": 10,       # Number of search results
#         # "search": query,
#     }
#     response = requests.get(WIKIDATA_URL, params=params)
#     return response.json()

def search_property(qid, prop='P625', timeout=30):
    """Fetch the value of one property of a Wikidata entity via ``wbgetentities``.

    Args:
        qid: Entity identifier to look up (e.g. ``'Q2402092'``).
        prop: Property identifier whose value is wanted. Defaults to ``'P625'``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``value`` of the first claim's main snak for ``prop``, or ``None``
        (after printing a short message) when the entity is not found, has no
        claim for ``prop``, or the first claim carries no concrete value.
    """
    params = {
        "action": "wbgetentities",
        "format": "json",
        "languages": "sv",  # was "se", which is not the code for Swedish
        "ids": qid,
        "props": "claims",
    }
    response = requests.get(WIKIDATA_URL, params=params, timeout=timeout)
    # An API error payload has no 'entities' key at all, so default to an empty mapping.
    entities = response.json().get('entities', {})
    data = entities.get(qid)
    # Unknown ids are reported as an entity stub flagged 'missing' without any claims.
    if not data or 'missing' in data:
        print(f"QID: {qid} was not found")
        return None

    prop_claim = data.get('claims', {}).get(prop)
    if not prop_claim:
        print(f"Entity: {qid} does not have property: {prop}")
        return None

    # Snaks of type 'novalue' / 'somevalue' have no 'datavalue' entry.
    datavalue = prop_claim[0]['mainsnak'].get('datavalue')
    if datavalue is None:
        print(f"Entity: {qid} has no concrete value for property: {prop}")
        return None

    return datavalue['value']

def get_qid(entity):
    """Return the identifier stored under the ``'id'`` key of a search-result entity.

    Raises:
        KeyError: If ``entity`` has no ``'id'`` key.
    """
    qid = entity['id']
    return qid

def get_description(entity):
    """Return the description text found at ``entity['display']['description']['value']``.

    Every level is optional: if ``'display'``, ``'description'`` or ``'value'``
    is absent (or ``None``), an empty string is returned instead of raising.
    """
    # Fall back to empty dicts (not strings) so the chained lookups stay valid.
    display = entity.get('display') or {}
    description = display.get('description') or {}
    return description.get('value', '')


In [4]:
# Read the list of entries from disk and preview the first one.
with open('e1_linked.json', mode='r', encoding='utf-8') as infile:
    e1_items = json.load(infile)
e1_items[0]
# examples = [
#     ["Paris", "<b>Paris</b> [franskt utt. pari], Frankrikes hufvudstad, näst London Europas folkrikaste stad, ligger under 2° 20' 15\" ö. lgd samt 48° 50' 11,2\" n. br. (nationalobservatoriet), på båda sidor om Seine,"],
#     ["Fantasi", "<b>Fantasi</b> (Grek. <i>fantasia,</i> af <i>fantazein,</i> göra synbar), föreställningsförmåga, inbillningskraft; diktningsgåfva; inbillning, infall, nyck, hugskott; feberdröm; musikstycke utan bestä"],
#     ["Åkarp", "<b>Åkarp.</b> 1. Socknar. Se Norra Åkarp och Södra Åkarp. - 2. Municipalsamhälle (jämlikt k. br. 21 nov. 1913) i Malmöhus län, Burlöfs och Tottarps socknar, beläget vid statens järnvägar, 8 km. från L"],
# ]

{'headword': 'A',
 'entryid': 'e1_aa_9_0',
 'text': '<b>A</b> är den första <i>bokstafven</i> i alla indoeuropeiska språks alfabet utom i den vanliga runföljden, der det innehar det tionde rummet.  Det är tillika det renaste och klaraste af alla <i>språ',
 'type': 0,
 'qid': '0',
 'second_edition_key': 'e2_ba_13_0',
 'fourth_edition_key': ''}

In [None]:
# search_term = "åkarp"
# result = search_wikidata(search_term)

# for search_item in result:
#     print(search_item)

## Compare entries with Wikidata descriptions using cosine similarity

In [30]:
# Link entries to Wikidata. For each entry:
#   1. search Wikidata for the headword,
#   2. embed the description of every candidate with the sentence model,
#   3. keep the candidate whose description has the highest cosine similarity
#      to the entry text,
#   4. if that entity has the P625 property (coordinate location), mark the
#      entry with type 1 and store its coordinates.
# The chosen QID is stored on the entry either way, and the result is written to JSON.
# TODO: decide where the data should be read from / written to (Qdrant or the JSON file).
MAX_ITEMS = 10  # Only the first MAX_ITEMS entries are processed; raise for a full run

for entry in tqdm(e1_items[:MAX_ITEMS], desc="Linking entries"):
    search_term = entry['headword']
    if not search_term:
        continue
    result = search_wikidata(search_term)
    if not result:
        continue

    example_vector = model.encode(entry['text'])
    # Candidates without a description get a zero vector sized to the model
    # output (instead of a hard-coded dimension) so they still occupy a slot.
    zero_vector = [0] * len(example_vector)
    vectors = [
        model.encode(item["description"]).tolist() if "description" in item else zero_vector
        for item in result
    ]

    scores = cosine_similarity([example_vector], vectors)[0]
    best_match_index = int(np.argmax(scores))  # first index of the highest score

    qid = get_qid(result[best_match_index])
    coords = search_property(qid)
    if coords is None:
        print(f"{entry['headword']}: None")
    else:
        print(f"{entry['headword']}: {coords['latitude']}, {coords['longitude']}")
        entry['type'] = 1
        entry['coordinates'] = [coords['latitude'], coords['longitude']]
    entry['qid'] = qid

with open('e1_linked2.json', 'w', encoding='utf-8') as outfile:
    json.dump(e1_items, outfile, ensure_ascii=False, indent=4)


  0%|          | 0/10 [00:00<?, ?it/s]

Entity: Q9659 does not have property: P625
A: None


  0%|          | 0/10 [00:00<?, ?it/s]

Entity: Q9659 does not have property: P625
A: None


  0%|          | 0/10 [00:00<?, ?it/s]

Aa: 59.423611111111, 27.1525


  0%|          | 0/10 [00:00<?, ?it/s]

Aabenraa: 55.044444444444, 9.4180555555556


  0%|          | 0/10 [00:00<?, ?it/s]

Aachen: 50.75, 6.25


  0%|          | 0/2 [00:00<?, ?it/s]

Aafjord: 63.918888888889, 10.073611111111


  0%|          | 0/1 [00:00<?, ?it/s]

Entity: Q455518 does not have property: P625
Aagesön: None


  0%|          | 0/10 [00:00<?, ?it/s]

Entity: Q300815 does not have property: P625
Aak: None


  0%|          | 0/10 [00:00<?, ?it/s]

Aakirkeby: 55.061836146049, 14.90781569946


In [21]:

# Worked example: find the Wikidata candidate that best matches a single entry.
# Each example is [headword, entry text]. Defined here so the cell runs on a
# fresh kernel without relying on variables left over from other cells.
examples = [
    ["Åkarp", "<b>Åkarp.</b> 1. Socknar. Se Norra Åkarp och Södra Åkarp. - 2. Municipalsamhälle (jämlikt k. br. 21 nov. 1913) i Malmöhus län, Burlöfs och Tottarps socknar, beläget vid statens järnvägar, 8 km. från L"],
]

best_matches = []
for headword, text in examples:
    candidates = search_wikidata(headword)
    if not candidates:
        best_matches.append(None)
        continue

    text_vector = model.encode(text)  # embed the entry text only, not the headword
    zero_vector = [0] * len(text_vector)  # placeholder for candidates lacking a description
    candidate_vectors = [
        model.encode(item["description"]).tolist() if "description" in item else zero_vector
        for item in candidates
    ]

    # Score every candidate description against the entry text.
    scores = cosine_similarity([text_vector], candidate_vectors)[0]
    # top_scores_ids = np.argsort(scores)[-5:][::-1]  # Select top-5 vectors with the largest scores
    best_matches.append(candidates[int(np.argmax(scores))])

best_matches[-1]



{'id': 'Q2402092',
 'title': 'Q2402092',
 'pageid': 2320476,
 'display': {'label': {'value': 'Åkarp', 'language': 'sv'},
  'description': {'value': 'tätort i Burlövs kommun, Sverige',
   'language': 'sv'}},
 'repository': 'wikidata',
 'concepturi': 'http://www.wikidata.org/entity/Q2402092',
 'label': 'Åkarp',
 'description': 'tätort i Burlövs kommun, Sverige',
 'match': {'type': 'label', 'language': 'sv', 'text': 'Åkarp'}}