# Searching for articles in Wikidata

In [None]:
import requests
import json_helpers as jh
import regex as re
import numpy as np
from qdrant_client import QdrantClient
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# MediaWiki API endpoint of Wikidata; used below for entity search and entity lookup.
WIKIDATA_URL = "https://www.wikidata.org/w/api.php"
# MediaWiki API endpoint of the Swedish Wikipedia; used below to fetch parsed page HTML.
WIKIPEDIA_URL = "https://sv.wikipedia.org/w/api.php"
# Sentence-embedding model shared by the cells below; loaded on the CPU.
model = SentenceTransformer('KBLab/sentence-bert-swedish-cased', device='cpu')


In [None]:
def search_wikidata(query: str):
    """Search Wikidata for entities matching ``query``.

    Sends a ``wbsearchentities`` request to ``WIKIDATA_URL`` with Swedish as
    both the search language and the display language, asking for at most
    10 hits.

    Args:
        query: Free-text search term.

    Returns:
        The value stored under the ``'search'`` key of the JSON response
        (the list of hits), or ``None`` when the response has no such key.
    """
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": "sv",  # Language dictating how searches are made
        "uselang": "sv",   # Language of item description
        "limit": 10,       # Number of search results
        "search": query,
    }
    response = requests.get(WIKIDATA_URL, params=params)
    # Decode the body a single time; .get() yields None when 'search' is absent.
    return response.json().get('search')

def get_wikipedia_title_from_qid(qid: str):
    """Return the Swedish Wikipedia page title linked from a Wikidata entity.

    Sends a ``wbgetentities`` request to ``WIKIDATA_URL`` asking only for the
    entity's ``svwiki`` sitelink.

    Args:
        qid: Wikidata entity id to look up.

    Returns:
        The ``'title'`` of the ``svwiki`` sitelink, or ``None`` (after printing
        a short message) when the response holds no data for ``qid`` or the
        entity has no ``svwiki`` sitelink.
    """
    params = {
        "action": "wbgetentities",
        "format": "json",
        "props": "sitelinks",
        "ids": qid,
        "sitefilter": "svwiki",
        "languages": "sv",  # Swedish is "sv"; "se" is the code for Northern Sami
    }
    response = requests.get(WIKIDATA_URL, params=params)
    # Tolerate a payload without an 'entities' key (e.g. an API error response)
    # instead of raising KeyError.
    entities = response.json().get('entities') or {}
    entity_data = entities.get(qid)

    if not entity_data:
        print("No data found.")
        return None

    # 'sitelinks' may be absent or an empty non-dict container; normalise both
    # to an empty dict so the lookup below cannot raise.
    sitelinks = entity_data.get('sitelinks') or {}
    wikipedia_dict = sitelinks.get('svwiki')

    if not wikipedia_dict:
        print("No svwiki found")
        return None

    return wikipedia_dict['title']

def get_first_paragraph_text_wikipedia(qid: str):
    """Return the plain text of the first ``<p>`` of an entity's svwiki article.

    The article title is resolved with ``get_wikipedia_title_from_qid``, the
    rendered HTML is fetched through the ``parse`` action of ``WIKIPEDIA_URL``,
    and the first attribute-less ``<p>...</p>`` element is reduced to text:
    tags are stripped, HTML entities are decoded and bracketed spans such as
    ``[1]`` are removed.

    Args:
        qid: Wikidata entity id whose Swedish Wikipedia article is wanted.

    Returns:
        The cleaned, stripped paragraph text, or ``None`` when the entity has
        no Swedish article title, the API reports an error, or no ``<p>``
        element is found.
    """
    import html  # stdlib; imported locally so the function is self-contained

    page_title = get_wikipedia_title_from_qid(qid)
    if not page_title:
        # Nothing to fetch; avoid sending a request without a page name.
        return None

    # Step 1: Get the Wikipedia page content
    page_params = {
        "action": "parse",
        "format": "json",
        "page": page_title,
        "prop": "text",
    }
    page_response = requests.get(WIKIPEDIA_URL, params=page_params)
    page_data = page_response.json()

    if 'error' in page_data:
        print("Error:", page_data['error']['info'])
        return None

    # Step 2: Extract the text of the first paragraph
    page_text = page_data['parse']['text']['*']

    # Non-greedy match so only the first paragraph is captured; DOTALL lets
    # the paragraph span several lines.
    first_paragraph_match = re.search(r'<p>(.*?)</p>', page_text, re.DOTALL)
    if not first_paragraph_match:
        print("First paragraph not found.")
        return None

    # Strip tags first so that decoded entities such as &lt; are never
    # mistaken for markup.
    first_paragraph = re.sub(r'<.*?>', '', first_paragraph_match.group(1))
    # Decode entities before removing bracketed spans, so brackets that were
    # entity-encoded in the HTML are caught as well.
    first_paragraph = html.unescape(first_paragraph)
    first_paragraph = re.sub(r'\[.*?\]', '', first_paragraph)
    return first_paragraph.strip()

def search_property(qid: str, prop: str = 'P625'):
    """Return the value of the first claim of ``prop`` on a Wikidata entity.

    Sends a ``wbgetentities`` request to ``WIKIDATA_URL`` asking for the
    entity's claims.

    Args:
        qid: Wikidata entity id to inspect.
        prop: Property id to read; defaults to ``'P625'`` (coordinate
            location).

    Returns:
        The ``value`` of the first claim's main snak, or ``None`` (after
        printing a short message) when the entity is not found, it has no
        claim for ``prop``, or the first claim carries no data value.
    """
    params = {
        "action": "wbgetentities",
        "format": "json",
        "languages": "sv",  # Swedish is "sv"; "se" is the code for Northern Sami
        "ids": qid,
        "props": "claims",
    }
    response = requests.get(WIKIDATA_URL, params=params)
    # Tolerate a payload without an 'entities' key (e.g. an API error response)
    # and an entity record without 'claims' instead of raising KeyError.
    entities = response.json().get('entities') or {}
    data = entities.get(qid)
    if not data or 'claims' not in data:
        print(f"QID: {qid} was not found")
        return None

    # An empty claims container is normalised to a dict before the lookup.
    claims = data['claims'] or {}
    prop_claim = claims.get(prop)
    if not prop_claim:
        print(f"Entity: {qid} does not have property: {prop}")
        return None

    # A main snak without a 'datavalue' has no concrete value to return.
    datavalue = prop_claim[0].get('mainsnak', {}).get('datavalue')
    if datavalue is None:
        print(f"Entity: {qid} has no known value for property: {prop}")
        return None

    return datavalue['value']

def get_qid(entity):
    """Return the value stored under ``'id'`` in ``entity``.

    Raises:
        KeyError: If ``entity`` has no ``'id'`` key.
    """
    qid = entity['id']
    return qid

def get_description(entity):
    """Return ``entity['display']['description']['value']``, or ``''``.

    Any missing (or empty) level of the nested structure yields the empty
    string instead of raising.

    Args:
        entity: Mapping that may contain a nested
            ``display -> description -> value`` entry.

    Returns:
        The nested ``value``, or ``''`` when any level is absent.
    """
    # The fallbacks must be dicts: a '' default would break the chained .get().
    display = entity.get('display') or {}
    description = display.get('description') or {}
    return description.get('value', '')


## Compare with the Wikidata description using cosine similarity

In [None]:
# take the description of each item, compute embedding with kb-sbert
# where should we take the data from? qdrant or json or what
def search(edition: str, max_items: int = 20):
    """Match items of one edition to Wikidata entities and attach coordinates.

    For each of the first ``max_items`` items, the item's ``'headword'`` is
    searched on Wikidata with ``search_wikidata``. Every hit's description is
    embedded with ``model`` and compared, by cosine similarity, with the
    embedding of the item's ``'text'``; the highest-scoring hit is the match.

    Each matched item is updated in place:

    * ``'qid'`` is set to the id of the matched entity;
    * ``'latitude'`` / ``'longitude'`` are set from the entity's P625
      (coordinate location) value, or to ``None`` when it has none;
    * ``'type'`` is set to ``1`` when coordinates were found.

    Items with an empty headword or without search hits are left untouched.
    All items are finally written with ``jh.write_items`` under the name
    ``f'{edition}_linked2'``.

    Args:
        edition: ``'e1'`` reads ``'e1_linked'``; any other value reads
            ``'e2_linked'``.
        max_items: Number of leading items to process (skipped items count
            towards the limit). Defaults to 20.
    """
    source = 'e1_linked' if edition == 'e1' else 'e2_linked'
    items = jh.read_items(source)

    for index, e in enumerate(items):
        if index >= max_items:
            break

        search_term = e['headword']
        if not search_term:
            continue
        result = search_wikidata(search_term)
        if not result:
            continue

        example_vector = model.encode(e['text'])
        # Hits without a description get an all-zero placeholder whose length
        # follows the model's real embedding size rather than a fixed number.
        placeholder = [0] * len(example_vector)

        vectors = []
        for item in tqdm(result):
            if "description" in item:
                vectors.append(model.encode(item["description"]).tolist())
            else:
                vectors.append(placeholder)

        scores = cosine_similarity([example_vector], vectors)[0]
        # First index holding the maximum score.
        best_match_index = list(scores).index(max(scores))

        qid = get_qid(result[best_match_index])
        coords = search_property(qid)
        if coords is None:
            print(f"{e['headword']}: None")

            e['latitude'] = None
            e['longitude'] = None
        else:
            print(f"{e['headword']}: {coords['latitude']}, {coords['longitude']}")

            e['type'] = 1
            e['latitude'] = coords['latitude']
            e['longitude'] = coords['longitude']

        e['qid'] = qid

    jh.write_items(items, f'{edition}_linked2')


In [None]:
# Run the linking pass for both editions, 'e1' first and then 'e2'.
for edition in ('e1', 'e2'):
    search(edition=edition)

In [None]:

# Scratch cell. NOTE(review): `examples`, `vectors` and `result` are not
# defined at module level in this file; they are presumably left over from an
# earlier interactive session -- confirm before running.
example_vectors = [model.encode(e) for e in examples]

# One row of cosine similarities per example, scored against every vector.
# The rows are stored here: computing them without keeping them leaves
# `scores` empty, and max() then fails on the empty list.
scores = [cosine_similarity([e], vectors)[0] for e in example_vectors]

# For every example, the index of its highest-scoring vector (first maximum).
best_match_index = [list(row).index(max(row)) for row in scores]

# The best-matching entry of `result` for each example, in example order.
[result[i] for i in best_match_index]
# top_scores_ids = np.argsort(scores)[-5:][::-1]  # Select top-5 with vectors the largest scores

