# Searching for articles in Wikidata

In [None]:
import os
os.chdir('../../')
print(os.getcwd())

import requests
from utils import json_helpers as jh
from utils.paths import *
import regex as re
import numpy as np
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# MediaWiki API endpoints queried by the request helpers in this notebook.
WIKIDATA_URL = "https://www.wikidata.org/w/api.php"
WIKIPEDIA_URL = "https://sv.wikipedia.org/w/api.php"
# Sentence-embedding model used to compare entry texts with candidate texts; loaded on the CPU.
model = SentenceTransformer('KBLab/sentence-bert-swedish-cased', device='cpu')

# Number of Wikidata search results requested per headword.
SEARCH_LIMIT = 5
# Minimum cosine similarity the best candidate must exceed to be accepted as a match.
MATCH_THRESHOLD = 0.6

In [None]:
def search_wikidata(query: str, limit: int):
    """Search Wikidata entities via the ``wbsearchentities`` API action.

    Args:
        query: Text to search for.
        limit: Maximum number of search results to request.

    Returns:
        The value stored under the ``search`` key of the JSON response, or
        ``None`` when the response has no such key.
    """
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": "sv",  # Language dictating how searches are made
        "uselang": "sv",   # Language of item description
        "limit": limit,    # Number of search results
        "search": query,
    }
    response = requests.get(WIKIDATA_URL, params=params)
    # Decode the JSON body once and reuse it for both the check and the lookup.
    payload = response.json()
    return payload.get('search')

def get_wikipedia_title_from_qid(qid: str):
    """Look up the Swedish Wikipedia (``svwiki``) page title linked to a Wikidata item.

    Args:
        qid: Wikidata entity id, sent as the ``ids`` parameter of ``wbgetentities``.

    Returns:
        The ``title`` of the entity's ``svwiki`` sitelink, or ``None`` when the
        response holds no entity under ``qid`` or that entity has no ``svwiki``
        sitelink.
    """
    params = {
        "action": "wbgetentities",
        "format": "json",
        "props": "sitelinks",
        "ids": qid,
        "sitefilter": "svwiki",
        # NOTE(review): the rest of this file uses "sv" for Swedish; "se" is
        # kept unchanged here — confirm which code is intended.
        "languages": "se",
    }
    response = requests.get(WIKIDATA_URL, params=params)

    # A response without an 'entities' key (e.g. an API error payload) is
    # treated the same as "entity not found" instead of raising KeyError.
    entities = response.json().get('entities') or {}
    entity_data = entities.get(qid)
    if not entity_data:
        return None

    # 'sitelinks' may be absent or empty for an entity; both mean "no link".
    sitelinks = entity_data.get('sitelinks') or {}
    wikipedia_dict = sitelinks.get('svwiki')
    if not wikipedia_dict:
        return None

    return wikipedia_dict['title']

def get_first_paragraph_text_wikipedia(qid: str):
    """Fetch the first ``<p>`` paragraph of the Swedish Wikipedia page of a Wikidata item.

    Args:
        qid: Wikidata entity id whose ``svwiki`` page should be read.

    Returns:
        The text of the first ``<p>...</p>`` element of the rendered page with
        HTML tags and square-bracketed spans removed and surrounding
        whitespace stripped, or ``None`` when the item has no ``svwiki``
        title, the API reports an error, or no such paragraph is found.
    """
    page_title = get_wikipedia_title_from_qid(qid)
    if not page_title:
        # Without a title there is no page to parse; skip the HTTP request.
        return None

    # Step 1: Get the rendered Wikipedia page content
    page_params = {
        "action": "parse",
        "format": "json",
        "page": page_title,
        "prop": "text",
    }
    page_response = requests.get(WIKIPEDIA_URL, params=page_params)
    page_data = page_response.json()

    if 'error' in page_data:
        return None

    # Step 2: Extract the text of the first paragraph
    page_text = page_data['parse']['text']['*']

    # Only a bare '<p>' opening tag (no attributes) is matched by this pattern.
    first_paragraph_match = re.search(r'<p>(.*?)</p>', page_text, re.DOTALL)
    if not first_paragraph_match:
        return None

    # Drop HTML tags, then anything enclosed in square brackets.
    first_paragraph = re.sub(r'<.*?>', '', first_paragraph_match.group(1))
    first_paragraph = re.sub(r'\[.*?\]', '', first_paragraph)
    return first_paragraph.strip()

def search_property(qid: str, prop: str = 'P625'):
    """Return the value of the first claim of a property on a Wikidata item.

    Args:
        qid: Wikidata entity id, sent as the ``ids`` parameter of ``wbgetentities``.
        prop: Property id to read from the entity's claims. Defaults to
            ``'P625'``.

    Returns:
        The ``value`` of the ``datavalue`` of the main snak of the first claim
        for ``prop``, or ``None`` when the entity, its claims, the property or
        the data value is missing.
    """
    params = {
        "action": "wbgetentities",
        "format": "json",
        "languages": "se",
        "ids": qid,
        "props": "claims",
    }
    response = requests.get(WIKIDATA_URL, params=params)

    # Tolerate responses without 'entities' / 'claims' instead of raising KeyError.
    entities = response.json().get('entities') or {}
    data = entities.get(qid)
    if not data:
        return None

    claims = data.get('claims') or {}
    prop_claim = claims.get(prop)
    if not prop_claim:
        return None

    # A main snak without a concrete value carries no 'datavalue' entry.
    datavalue = prop_claim[0].get('mainsnak', {}).get('datavalue')
    if not datavalue:
        return None

    return datavalue.get('value')

def get_qid(entity):
    """Return the value stored under the ``id`` key of ``entity``."""
    qid = entity['id']
    return qid

def get_description(entity):
    """Return ``entity['display']['description']['value']``, or ``''`` if any level is missing."""
    display = entity.get('display', {})
    description = display.get('description', {})
    return description.get('value', '')


## Compare entry texts with Wikipedia/Wikidata descriptions using cosine similarity

In [None]:
# take the description of each item, compute embedding with kb-sbert
# where should we take the data from? qdrant or json or what
def search(edition_file: str, search_limit: int, match_threshold: float):
    """Link the entries of an edition file to Wikidata items and write them back.

    For each entry that is not skipped, the headword is searched on Wikidata.
    Every candidate is embedded from the first 200 characters of its Swedish
    Wikipedia first paragraph or, failing that, of its Wikidata description.
    The candidates are compared with the embedding of the entry's ``text`` by
    cosine similarity. When the best score exceeds ``match_threshold``, the
    candidate's id is stored under ``'qid'``; if ``search_property`` returns a
    value for it, its ``latitude`` and ``longitude`` are stored as well.
    Finally all items are written back to ``edition_file``.

    Args:
        edition_file: Path passed to ``jh.read_items`` and ``jh.write_items``.
        search_limit: Number of Wikidata search results to request per headword.
        match_threshold: Score the best candidate must exceed to be accepted.
    """
    items = jh.read_items(edition_file)

    for e in tqdm(items):
        # Skip entries of class 0 and cross references.
        if e['class'] == 0 or e['cross_ref_key'] != "":
            continue

        result = search_wikidata(e['headword'], search_limit)
        if not result:
            continue

        entry_vector = model.encode(e['text'])
        # Placeholder for candidates without any text; sized from the entry
        # embedding so it always matches the model's output dimension.
        empty_vector = [0] * len(entry_vector)

        vectors = []
        for item in result:
            # Prefer the Wikipedia paragraph, fall back to the Wikidata description.
            candidate_text = (
                get_first_paragraph_text_wikipedia(get_qid(item))
                or get_description(item)
            )
            if candidate_text:
                vectors.append(model.encode(candidate_text[:200]).tolist())
            else:
                vectors.append(empty_vector)

        scores = cosine_similarity([entry_vector], vectors)[0]
        # argmax yields the first index of the highest score.
        best_match_index = int(np.argmax(scores))

        if scores[best_match_index] > match_threshold:
            qid = get_qid(result[best_match_index])
            coords = search_property(qid)
            if coords is not None:
                e['latitude'] = coords['latitude']
                e['longitude'] = coords['longitude']

            e['qid'] = qid

    jh.write_items(items, edition_file)


### Try wiki searcher against test data to decide match threshold

In [None]:
def valid_location(entry: dict) -> bool:
    """Tell whether an entry's predicted coordinates agree with the correct ones.

    Args:
        entry: Dict with the keys ``correct_lat`` and ``correct_lon`` and,
            optionally, the predicted ``latitude`` and ``longitude``.

    Returns:
        ``True`` when all four values are ``None``, or when all four are set
        and both the latitude and the longitude differ by less than 0.5.
        ``False`` otherwise, including when only some of the values are set.

    Raises:
        KeyError: If ``correct_lat`` or ``correct_lon`` is missing.
    """
    lat = entry['correct_lat']
    lon = entry['correct_lon']
    # search() only writes these keys when coordinates were found, so a
    # missing key is treated the same as an explicit None.
    pred_lat = entry.get('latitude')
    pred_lon = entry.get('longitude')

    values = (lat, lon, pred_lat, pred_lon)
    if all(value is None for value in values):
        return True
    if any(value is None for value in values):
        return False

    # Tolerance of 0.5 chosen intuitively.
    return abs(lat - pred_lat) < 0.5 and abs(lon - pred_lon) < 0.5

def eval_wiki_searcher(testfile: str, edition_nbr: int, match_threshold: float):
    """Run the wiki searcher on a test file and append its accuracy to a stats file.

    ``search`` is run on ``testfile`` (which rewrites that file), the entries
    are read back, and the share of entries for which ``valid_location`` holds
    is appended to ``e{edition_nbr}_stats_wiki.txt`` in ``WIKI_STATS_FOLDER``.

    Args:
        testfile: Path of the test items file.
        edition_nbr: Edition number used in the stats file name.
        match_threshold: Threshold forwarded to ``search`` and logged.
    """
    # NOTE(review): 5 equals the module-level SEARCH_LIMIT — consider using the constant.
    search(edition_file=testfile, search_limit=5, match_threshold=match_threshold)

    entries = jh.read_items(testfile)
    nr_entries = len(entries)
    nr_correct = sum(1 for entry in entries if valid_location(entry))

    # An empty test file yields an accuracy of 0.0 instead of a ZeroDivisionError.
    accuracy = nr_correct / nr_entries if nr_entries else 0.0

    with open(f'{WIKI_STATS_FOLDER}/e{edition_nbr}_stats_wiki.txt', 'a', encoding='utf-8') as file:
        file.write(f'Match threshold: {match_threshold}\n')
        file.write(f'Accuracy score: {accuracy}\n')
        file.write('------------\n')


In [None]:
# Evaluation runs on the test sets of both editions; each run appends its
# accuracy to the edition's stats file. We found 0.6 to work best for both.
# A high threshold is preferable because, after the location classifier,
# some non-locations may still be classified as locations.
# (MATCH_THRESHOLD = 0.6 is defined in the first cell.)
eval_wiki_searcher(f'{WIKI_TEST_FOLDER}/e1_test_wiki', 1, MATCH_THRESHOLD)
eval_wiki_searcher(f'{WIKI_TEST_FOLDER}/e2_test_wiki', 2, MATCH_THRESHOLD)

### Link editions with Wikidata

In [None]:
# Link both editions to Wikidata; each call writes the updated items back to
# the same file path it read them from.
search(edition_file=f'{ENCYCLOPEDIAS_JSON_FOLDER}/e1', search_limit=SEARCH_LIMIT, match_threshold=MATCH_THRESHOLD)
search(edition_file=f'{ENCYCLOPEDIAS_JSON_FOLDER}/e2', search_limit=SEARCH_LIMIT, match_threshold=MATCH_THRESHOLD)