### Method 1

In [81]:
import sys
from qwikidata.sparql import return_sparql_query_results
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api
from SPARQLWrapper import SPARQLWrapper, JSON
import json
from tqdm import tqdm
import re
import requests as r
import pandas as pd
from collections import defaultdict

In [3]:
class Translation_Api():
    """Minimal client for the Microsoft Translator v3 REST API (English -> Hindi).

    API reference:
    https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-reference

    Credentials are not stored in the source. The subscription key is taken
    from the ``subscription_key`` argument or, when that is ``None``, from the
    ``AZURE_TRANSLATOR_KEY`` environment variable; the region comes from
    ``region`` or ``AZURE_TRANSLATOR_REGION`` (default ``"centralus"``).

    NOTE(review): a subscription key used to be hard-coded in this class;
    treat that key as leaked and rotate it.
    """

    KEY_ENV_VAR = "AZURE_TRANSLATOR_KEY"
    REGION_ENV_VAR = "AZURE_TRANSLATOR_REGION"
    # Seconds to wait for the service before giving up on a request.
    REQUEST_TIMEOUT = 30

    def __init__(self, subscription_key=None, region=None):
        # Local import: keeps this cell runnable regardless of which import
        # cells have already been executed.
        import os

        if subscription_key is None:
            subscription_key = os.environ.get(self.KEY_ENV_VAR, "")
        if not region:
            region = os.environ.get(self.REGION_ENV_VAR, "centralus")
        self.headers = {
                "Ocp-Apim-Subscription-Key": subscription_key,
                "Content-Type": "application/json",
                "Ocp-Apim-Subscription-Region": region
            }
        self.transliterate_url = "https://api.cognitive.microsofttranslator.com/transliterate?api-version=3.0&language=hi&fromScript=Latn&toScript=Deva"
        self.translate_url = "https://api.cognitive.microsofttranslator.com/translate?api-version=3.0&to=hi&from=en&toScript=Deva"

    @staticmethod
    def _normalize_brackets(text):
        """Rewrite every single or double square bracket as a double bracket."""
        # Collapse "[[" / "]]" first so already-doubled brackets are not quadrupled.
        collapsed = text.replace("[[", "[").replace("]]", "]")
        return collapsed.replace("[", "[[").replace("]", "]]")

    def _call(self, url, data, pick_text):
        """Send ``data`` to ``url`` and return the bracket-normalised result text.

        Text containing ``{{`` is returned unchanged, and anything from the
        first ``<ref>`` onwards is dropped; if nothing remains, the empty
        string is returned and no request is made.

        ``pick_text`` extracts the result string from the first item of the
        JSON response.

        Raises:
            RuntimeError: if no subscription key is configured.
            requests.HTTPError: if the service answers with an error status.
        """
        if "{{" in data:
            return data
        data = data.split("<ref>")[0]
        if data == "":
            return data
        if not self.headers.get("Ocp-Apim-Subscription-Key"):
            raise RuntimeError(
                "No Translator subscription key configured; set the "
                + self.KEY_ENV_VAR
                + " environment variable or pass subscription_key=..."
            )
        response = r.post(
            url,
            json=[{"Text": data}],
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on HTTP errors instead of crashing later on an
        # unexpected error payload.
        response.raise_for_status()
        first_item = json.loads(response.text)[0]
        return self._normalize_brackets(pick_text(first_item))

    def get_translation(self,data):
        """Translate English ``data`` to Hindi (see ``_call`` for pass-through rules)."""
        return self._call(
            self.translate_url,
            data,
            lambda item: item['translations'][0]['text'],
        )

    def get_transliteration(self,data):
        """Transliterate Latin-script ``data`` to Devanagari (see ``_call`` for pass-through rules)."""
        return self._call(
            self.transliterate_url,
            data,
            lambda item: item['text'],
        )
translator = Translation_Api()

In [4]:
# Biography field name -> Wikidata property id used in the SPARQL queries.
biography = dict(
    image="P18",
    gender="P21",
    residence="P551",
    birth_place="P19",
    birth_date="P569",
    profession="P106",
    notable_works="P800",
    education="P69",
    positions="P39",
    awards="P166",
    spouse="P26",
    nationality="P27",
)

# Field name -> Hindi display label.
translation = dict(
    name="नाम",
    description="विवरण",
    image="चित्र",
    gender="लिंग",
    residence="निवास",
    birth_place="जन्म स्थान",
    birth_date="जन्मतारीख",
    profession="व्यवसाय",
    notable_works="उल्लेखनीय कार्य",
    education="शिक्षा",
    positions="पद",
    awards="पुरस्कार",
    spouse="पति या पत्नी",
    other_available_information="अन्य उपलब्ध जानकारी",
    main_info="मुख्य जानकारी",
    nationality="राष्ट्रीयता",
)

# Fields that method1_infobox transliterates instead of translating.
to_transliteration = [
    'name',
    'birth_place',
    'spouse',
    'notable_works',
]

In [17]:
def get_results(endpoint_url, query):
    """Run ``query`` against the SPARQL endpoint and return the converted JSON result."""
    major, minor = sys.version_info[0], sys.version_info[1]
    client = SPARQLWrapper(
        endpoint_url,
        agent="WDQS-example Python/%s.%s" % (major, minor),
    )
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()

# SPARQL endpoint of the Wikidata Query Service.
endpoint_url = "https://query.wikidata.org/sparql"

In [46]:
def getEntityInfo(eid):
    """Return the entity dict for ``eid`` via qwikidata's ``get_entity_dict_from_api``."""
    entity_dict = get_entity_dict_from_api(eid)
    return entity_dict

def extractName(info):
    """Return the English ('en') label of an entity dict, or "" when it has none."""
    labels = info.get('labels', {})
    english = labels.get('en', {})
    return english.get('value', "")

def extractDescription(info):
    """Return the English ('en') description of an entity dict, or "" when it has none."""
    descriptions = info.get('descriptions', {})
    english = descriptions.get('en', {})
    return english.get('value', "")

def printNameAndDescription(info, trans):
    """Print the transliterated English name and translated English description.

    Each printed line is prefixed with the matching label from ``trans``;
    a value that is the empty string is skipped.
    """
    entity_name = extractName(info)
    if not entity_name == "":
        name_prefix = trans['name'] + ":"
        print(name_prefix, translator.get_transliteration(entity_name))

    entity_desc = extractDescription(info)
    if not entity_desc == "":
        desc_prefix = trans['description'] + ":"
        print(desc_prefix, translator.get_translation(entity_desc))
        
def printOtherInfo(entity, bio):
    """Print the claims of ``entity`` whose property is not listed in ``bio``.

    For every such property that has an English label, the item-valued claims
    are resolved to "<label>(<description>)" strings and printed as
    ``<property label>: <value>, <value>, ...``. Properties without an English
    label, and values without an id or English label, are skipped.

    Args:
        entity: entity dict as returned by ``getEntityInfo``; a missing
            ``'claims'`` entry is treated as "no claims".
        bio: mapping whose values are the property ids to leave out.
    """
    claims = entity.get('claims', {})
    # Build the exclusion set once instead of rescanning bio.values() per property.
    skipped_properties = set(bio.values())
    for prop_id, statements in claims.items():
        if prop_id in skipped_properties:
            continue
        prop_name = extractName(getEntityInfo(prop_id))
        if prop_name == "":
            continue
        values = []
        for statement in statements:
            datavalue = statement.get('mainsnak', {}).get('datavalue', {}).get('value', {})
            # Only dict values can carry an item id; plain strings etc. are skipped.
            if not isinstance(datavalue, dict):
                continue
            item_id = datavalue.get('id', "")
            if item_id == "":
                continue
            item_info = getEntityInfo(item_id)
            item_name = extractName(item_info)
            if item_name == "":
                continue
            item_desc = extractDescription(item_info)
            if item_desc != "":
                item_name += f'({item_desc})'
            values.append(item_name)
        # Only print the property when at least one value could be resolved;
        # separate multiple values so they do not run together.
        if values:
            print(prop_name + ": " + ", ".join(values))
    
def method1_infobox(wd, bio, trans):
    """Build a Hindi infobox dict for Wikidata item ``wd`` from its English data.

    The English label and description of the item are transliterated /
    translated, then each property in ``bio`` is queried through SPARQL and
    its English value labels are joined (each followed by a comma) and
    transliterated when the field is in ``to_transliteration``, translated
    otherwise. Fields without any usable value are left out.

    Args:
        wd: Wikidata item id, e.g. ``"Q1058"``.
        bio: mapping of field name -> Wikidata property id.
        trans: unused; kept so existing callers keep working.

    Returns:
        dict of field name -> Hindi text.
    """
    result = {}
    entity_info = getEntityInfo(wd)
    result['name'] = translator.get_transliteration(extractName(entity_info))
    result['description'] = translator.get_translation(extractDescription(entity_info))
    # Explicitly query using SPARQL to get the main biography data.
    for field, wdt in bio.items():
        query = f"SELECT ?entity ?entityLabel ?entityDescription WHERE {{ wd:{wd} wdt:{wdt} ?entity; SERVICE wikibase:label {{ bd:serviceParam wikibase:language \"en\". }} }}"
        bindings = get_results(endpoint_url, query)['results']['bindings']
        labels = []
        for binding in bindings:
            label = binding.get('entityLabel', {}).get('value', "")
            # Skip only bare Q-ids (what the label service returns when no
            # label exists); a plain "'Q' in label" test would also drop
            # real labels such as "Qatar".
            if label != '' and not re.fullmatch(r'Q\d+', label):
                labels.append(label)
        if not labels:
            continue
        # Each value is followed by a comma, matching the format the
        # baseline infobox uses for the same field.
        joined = ''.join(label + ',' for label in labels)
        if field in to_transliteration:
            result[field] = translator.get_transliteration(joined)
        else:
            result[field] = translator.get_translation(joined)
    return result

### Method 2

In [6]:
import wptools
import json
from tqdm import tqdm
import re
import requests as r
from collections import defaultdict

In [7]:
def update(infobox , translator, max_fields=15):
    """Translate / transliterate a Wikipedia infobox into a new dict.

    A fixed list of well-known fields is processed first, in a fixed order.
    If that yields fewer than ``max_fields`` non-empty entries, the remaining
    infobox fields are transliterated in their original order until
    ``max_fields`` non-empty entries have been collected.

    Args:
        infobox: mapping of infobox field -> raw text; ``None`` is treated as
            an empty infobox. The mapping itself is not modified.
        translator: object providing ``get_translation(text)`` and
            ``get_transliteration(text)``.
        max_fields: size up to which the result is padded with extra fields.

    Returns:
        dict of field -> converted text, without empty values.
    """
    # (field, action) in output order: 'translate', 'transliterate' or 'copy'.
    known_fields = (
        ('name', 'transliterate'),
        ('image', 'copy'),
        ('caption', 'translate'),
        ('fullname', 'transliterate'),
        ('nickname', 'transliterate'),
        ('residence', 'transliterate'),
        ('birth_date', 'translate'),
        ('birth_place', 'transliterate'),
        ('death_date', 'translate'),
        ('death_place', 'transliterate'),
        ('country', 'transliterate'),
        ('nationality', 'translate'),
        ('occupation', 'translate'),
        ('profession', 'translate'),
        ('positions', 'translate'),
        ('heightm', 'copy'),
        ('gender', 'translate'),
        ('spouse', 'transliterate'),
        ('children', 'transliterate'),
        ('parents', 'transliterate'),
        ('father', 'transliterate'),
        ('mother', 'transliterate'),
        ('party', 'translate'),
        ('awards', 'transliterate'),
        ('relations', 'translate'),
        ('known_for', 'translate'),
        ('notable_works', 'transliterate'),
        ('alma_mater', 'translate'),
        ('education', 'translate'),
    )
    converters = {
        'translate': translator.get_translation,
        'transliterate': translator.get_transliteration,
        'copy': lambda text: text,
    }

    # Work on a copy so the caller's infobox is left untouched.
    source = dict(infobox or {})
    if 'birth_date' in source:
        source['birth_date'] = source['birth_date'].replace("df|","df").replace("|yes","yes")

    updated_infobox = {}
    for field, action in known_fields:
        raw = source.get(field, '')
        if raw == '':
            continue
        converted = converters[action](raw)
        if converted != '':
            updated_infobox[field] = converted

    # Pad with the remaining fields. Only non-empty results count toward the
    # cap, so blank values cannot crowd out real ones.
    for field, raw in source.items():
        if len(updated_infobox) >= max_fields:
            break
        if field in updated_infobox:
            continue
        converted = translator.get_transliteration(raw)
        if converted != '':
            updated_infobox[field] = converted
    return updated_infobox

In [40]:
def method2_infobox(name):
    """Parse the page called ``name`` with wptools and convert its infobox via ``update``."""
    parsed_page = wptools.page(name).get_parse()
    return update(parsed_page.data['infobox'] , translator)

### Baseline

In [73]:
def getEntityInfo(eid):
    """Return the entity dict for ``eid`` via qwikidata's ``get_entity_dict_from_api``.

    Same behaviour as the function of the same name defined in the Method 1
    section; this later definition replaces it when the cell runs.
    """
    entity_dict = get_entity_dict_from_api(eid)
    return entity_dict

def extractNameBaseline(info):
    """Return the Hindi ('hi') label of an entity dict, or "" when it has none."""
    labels = info.get('labels', {})
    hindi = labels.get('hi', {})
    return hindi.get('value', "")

def extractDescriptionBaseline(info):
    """Return the Hindi ('hi') description of an entity dict, or "" when it has none."""
    descriptions = info.get('descriptions', {})
    hindi = descriptions.get('hi', {})
    return hindi.get('value', "")

def printNameAndDescriptionBaseline(info, trans):
    """Print the Hindi label and description already stored on the entity.

    Each printed line is prefixed with the matching label from ``trans``;
    a value that is the empty string is skipped.
    """
    # Use the Hindi ('hi') extractors: the baseline reports existing Hindi
    # data, not the English label/description.
    name = extractNameBaseline(info)
    if name != "":
        print(trans['name'] + ":", name)
    desc = extractDescriptionBaseline(info)
    if desc != "":
        print(trans['description'] + ":", desc)
    
def baseline_infobox(wd, bio, trans):
    """Build the baseline infobox dict for ``wd`` from existing Hindi Wikidata data.

    The Hindi label and description of the item are stored as-is, then each
    property in ``bio`` is queried through SPARQL with the label language set
    to ``hi`` and its value labels are joined, each followed by a comma.
    Fields without any usable value are left out.

    Args:
        wd: Wikidata item id, e.g. ``"Q1058"``.
        bio: mapping of field name -> Wikidata property id.
        trans: unused; kept so existing callers keep working.

    Returns:
        dict of field name -> Hindi text.
    """
    result = {}
    entity_info = getEntityInfo(wd)
    result['name'] = extractNameBaseline(entity_info)
    result['description'] = extractDescriptionBaseline(entity_info)
    # Explicitly query using SPARQL to get the main biography data.
    for field, wdt in bio.items():
        query = f"SELECT ?entity ?entityLabel ?entityDescription WHERE {{ wd:{wd} wdt:{wdt} ?entity; SERVICE wikibase:label {{ bd:serviceParam wikibase:language \"hi\". }} }}"
        bindings = get_results(endpoint_url, query)['results']['bindings']
        labels = []
        for binding in bindings:
            label = binding.get('entityLabel', {}).get('value', "")
            # Skip only bare Q-ids (what the label service returns when no
            # Hindi label exists); a plain "'Q' in label" test would also
            # drop any real label that happens to contain the letter Q.
            if label != '' and not re.fullmatch(r'Q\d+', label):
                labels.append(label)
        if labels:
            result[field] = ''.join(label + ',' for label in labels)
    return result
            

In [69]:
def change_format(infobox):
    """Render an infobox dict as ``{{Infobox person ...}}`` wikitext.

    Every entry becomes one ``|key = value`` line between the opening and
    closing template lines.
    """
    rows = []
    for key, val in infobox.items():
        rows.append("|" + key + " = " + val)
    body = "\n".join(rows)
    return "{{Infobox person\n" + body + "\n}}"

In [32]:
from libindic import inexactsearch
# from People_translator import Translation_Api

In [50]:
# translator = Translation_Api()
# translated_infobox = translator.get_infobox(page_name="Virat Kohli")
# actual_infobox = translator.get_page(page_name="विराट कोहली" ,language="hi").data['infobox']
# print(actual_infobox)

### Method 1 vs Method 2

In [33]:
# Shared libindic InexactSearch instance; its compare() and search() methods are called in the cells below.
inst = inexactsearch.InexactSearch()

In [43]:
# Per-method tallies of field names filled in by the evaluation loop:
#   'C' - field shared with the baseline whose similarity score is above the threshold
#   'S' - field shared with the baseline whose similarity score is not
#   'D' - baseline field the method did not produce
markings_m1 = {label: [] for label in ('C', 'S', 'D')}
markings_m2 = {label: [] for label in ('C', 'S', 'D')}

In [44]:
# Example [Wikidata item id, Wikipedia page title] pairs to evaluate.
wiki_people = [
    ['Q1058', 'Narendra Modi'],
    ['Q213854', 'Virat Kohli'],
]

In [78]:
import csv

# A field counts as matching the baseline when inst.compare() scores above this.
SIMILARITY_THRESHOLD = 0.70
RESULT_COLUMNS = ["QId", "Name", "InfoBox"]
BASELINE_CSV = 'Baseline_results.csv'
METHOD1_CSV = 'Method1_results.csv'
METHOD2_CSV = 'Method2_results.csv'


def _mark_against_baseline(candidate, baseline, markings):
    """Tally the fields of ``candidate`` against ``baseline`` into ``markings``.

    Shared fields go to 'C' when their similarity is above the threshold and
    to 'S' otherwise; baseline fields missing from ``candidate`` go to 'D'.
    """
    for key, value in candidate.items():
        if key in baseline:
            score = inst.compare(value, baseline[key])
            markings['C' if score > SIMILARITY_THRESHOLD else 'S'].append(key)
    for key in baseline:
        if key not in candidate:
            markings['D'].append(key)


def _append_row(path, row):
    """Append one row to the CSV file at ``path``."""
    # newline='' is what the csv module expects; UTF-8 keeps the Hindi text
    # intact regardless of the platform's default encoding.
    with open(path, 'a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow(row)


# Start every run from files that contain only the header row ...
for path in (BASELINE_CSV, METHOD1_CSV, METHOD2_CSV):
    with open(path, 'w', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow(RESULT_COLUMNS)
# ... and from empty tallies, so re-running this cell does not double-count.
for markings in (markings_m1, markings_m2):
    for marked_keys in markings.values():
        marked_keys.clear()

for qid,name in wiki_people:
    method1 = method1_infobox(qid, biography,translation)
    baseline = baseline_infobox(qid,biography,translation)
    _mark_against_baseline(method1, baseline, markings_m1)

    method2 = method2_infobox(name)
    _mark_against_baseline(method2, baseline, markings_m2)

    _append_row(BASELINE_CSV, [qid,name,change_format(baseline)])
    _append_row(METHOD1_CSV, [qid,name,change_format(method1)])
    # Method 2's file must contain Method 2's infobox (it used to get method1).
    _append_row(METHOD2_CSV, [qid,name,change_format(method2)])

en.wikipedia.org (parse) Narendra Modi
en.wikipedia.org (imageinfo) File:Prime Minister, Shri Narendra M...
Narendra Modi (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Prime Min...
  infobox: <dict(46)> image, image_size, order, office, president,...
  iwlinks: <list(3)> https://commons.wikimedia.org/wiki/Category:N...
  pageid: 444222
  parsetree: <str(272284)> <root><template><title>short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Narendra Modi
  wikibase: Q1058
  wikidata_url: https://www.wikidata.org/wiki/Q1058
  wikitext: <str(217521)> {{short description|14th and current Pri...
}
en.wikipedia.org (parse) Virat Kohli
en.wikipedia.org (imageinfo) File:The President, Shri Pranab Mukh...
Virat Kohli (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:The Presi...
  infobox: <dict(97)> name, image, caption, birth_date, birth_plac...
  iwlinks: <list(2)> https://commons.wikimedia.org/wiki/Category:V...
  pageid: 16017429
  pars

In [48]:
def print_results(markings):
    """Print precision and recall computed from a markings tally.

    precision = |C| / (|C| + |S|) and recall = |C| / (|C| + |S| + |D|),
    where ``markings`` maps 'C', 'S' and 'D' to lists. A ratio whose
    denominator is zero (nothing tallied yet) is reported as 0.0 instead of
    raising ZeroDivisionError.
    """
    matched = len(markings['C'])
    compared = matched + len(markings['S'])
    expected = compared + len(markings['D'])
    precision = matched / compared if compared else 0.0
    recall = matched / expected if expected else 0.0
    print('Precisions :',precision)
    print('Recall :',recall)

In [49]:
# Report precision and recall for each method in turn.
for title, tally in (('Method 1', markings_m1), ('Method 2', markings_m2)):
    print(title)
    print_results(tally)

Method 1
Precisions : 0.6944444444444444
Recall : 0.6944444444444444
Method 2
Precisions : 0.1
Recall : 0.043478260869565216


In [84]:
# Load the baseline results CSV written by the evaluation loop and display it.
df=pd.read_csv('Baseline_results.csv')
df

Unnamed: 0,QId,Name,InfoBox
0,Q1058,Narendra Modi,{{Infobox person\n|name = नरेन्द्र मोदी\n|desc...
1,Q213854,Virat Kohli,{{Infobox person\n|name = विराट कोहली\n|descri...


### Inexact search examples

In [22]:
# search(): the output below maps each word of the first string to its score against the second string.
inst.search("रिसुभ रीसुबह रीसुभ रईसुभ रिसुब" , "रीसुभ")

{'रिसुभ': 0.9,
 'रीसुबह': 0.6666666666666666,
 'रीसुभ': 1.0,
 'रईसुभ': 0.9,
 'रिसुब': 0.9}

In [26]:
# search() on two single words that differ in one part of their spelling.
inst.search("ज्ञानसन्दूक","ज्ञानसंदूक")

{'ज्ञानसन्दूक': 0.7146814404432132}

In [51]:
# compare(): yields a single score for the two whole strings (see output below), unlike search().
inst.compare("रिसुभ रीसुबह रीसुभ रईसुभ रिसुब" , "रीसुभ")

0.1983471074380165

In [52]:
# compare() on two single words, for contrast with the multi-word call above.
inst.compare("रिसुभ" , "रीसुभ")

0.9