In [1]:
import sys; sys.path.insert(0,'../../..')
from wikidata import *
# !pip install wptools googlesearch-python

In [2]:
def wikipedia_opensearch(qstr,lang="en",**kwargs):
    """Look up article titles with the MediaWiki ``opensearch`` action.

    Sends ``qstr`` to ``https://{lang}.wikipedia.org/w/api.php``, restricted to
    namespace 0 and asking for at most 5 results, and reads the second element
    of the JSON reply as the list of matching titles.

    Args:
        qstr: Search string.
        lang: Wikipedia language subdomain (default ``"en"``).
        **kwargs: Accepted and ignored.

    Returns:
        The list of titles when there is more than one match, the single title
        (not wrapped in a list) when there is exactly one, and ``None`` when
        nothing matched.
    """
    import requests

    endpoint = f"https://{lang}.wikipedia.org/w/api.php"
    query = dict(
        action="opensearch",
        namespace="0",
        search=qstr,
        limit="5",
        format="json",
    )
    with requests.Session() as session:
        response = session.get(url=endpoint, params=query)
        matches = response.json()[1]

    if not matches:
        return None
    # A lone hit is unwrapped; several hits are handed back as the list.
    return matches if len(matches) > 1 else matches[0]

def wikipedia_search(qstr,lang="en",qprefix='wikipedia ',topn=1,**kwargs):
    """Find a Wikipedia article URL for ``qstr`` through a Google search.

    The query sent to Google is ``qprefix + qstr``. Only the first ``topn``
    results are inspected; the first one that is a string starting with
    ``https://{lang}.wikipedia.org/wiki/`` is returned. Every inspected result
    that does not qualify is printed with a ``!?`` prefix.

    Args:
        qstr: Search string.
        lang: Language passed to the Google search and used for the expected
            Wikipedia subdomain.
        qprefix: Text prepended to ``qstr`` to bias results towards Wikipedia.
        topn: Number of leading search results to inspect.
        **kwargs: Accepted and ignored.

    Returns:
        The matching URL, or ``None`` when none of the inspected results is a
        Wikipedia article URL.
    """
    # Only googlesearch is needed here; wptools is not used by this function,
    # so it is not imported (the search then works without it installed).
    from googlesearch import search as gsearch

    urlprefix = f'https://{lang}.wikipedia.org/wiki/'
    for i,url in enumerate(gsearch(qprefix+qstr,lang=lang)):
        if i>=topn: break
        if isinstance(url,str) and url.startswith(urlprefix):
            return url
        print('!?',url)
    return None


# def wikidata_search(qstr,lang="en",**kwargs):
#     page = wikipedia_search(qstr,lang=lang,**kwargs)
#     if page is not None:
#         o = page.get_wikidata(0)
#         return o.data.get('wikibase')

def wikidata_query_str(text):
    """Build the ``"<short title> by <author>"`` search string for a text.

    Reads ``text.shorttitle`` and ``text.au`` and passes the combined string
    through ``clean_text``.
    """
    # An earlier variant used ``text.shortauthor`` instead of ``text.au``.
    title, author = text.shorttitle, text.au
    return clean_text(f'{title} by {author}')

def wikidata_gsearch(
        qstr,
        lang="en",
        qprefix='wikidata ',
        topn=1,
        what='work',
        **kwargs):
    """Find a Wikidata item for ``qstr`` through a Google search.

    Google is queried with ``qprefix + qstr`` and only the first ``topn``
    results are inspected. A result qualifies when it is a string starting
    with ``https://www.wikidata.org/wiki/``; its last path segment is taken as
    the item id and the item is fetched with ``wptools``. The item is accepted
    when its ``what`` entry is empty, when ``what`` is empty, or when ``what``
    occurs inside the item's ``what`` entry. Results that are not Wikidata
    URLs are printed with a ``!?`` prefix.

    Args:
        qstr: Search string.
        lang: Language passed to the Google search.
        qprefix: Text prepended to ``qstr``.
        topn: Number of leading search results to inspect.
        what: Substring required in the item's ``what`` description; a falsy
            value disables the check.
        **kwargs: Accepted and ignored.

    Returns:
        ``(qid, data.get('wikidata'))`` for the first accepted item, otherwise
        ``None``.
    """
    from googlesearch import search as gsearch
    import wptools

    urlprefix = 'https://www.wikidata.org/wiki/'
    for rank, hit in enumerate(gsearch(qprefix + qstr, lang=lang)):
        if rank >= topn:
            break

        if not (hit and type(hit) == str and hit.startswith(urlprefix)):
            print('!?', hit)
            continue

        qid = hit.split('/')[-1]
        fetched = wptools.page(wikibase=qid, silent=True).get_wikidata()
        if fetched is None or not hasattr(fetched, 'data'):
            continue

        record = fetched.data
        kind = record.get('what', '')
        if not kind or not what or what in kind:
            return (qid, record.get('wikidata'))

In [3]:
# Load the 'chadwyck' corpus and build the Wikidata query string for one of its texts.
C=Corpus('chadwyck')
# NOTE(review): `C.t` presumably yields a sample text from the corpus — confirm against the corpus API.
text = C.t
qstr=wikidata_query_str(text)
# Display the query string as the cell output.
qstr

[36m[04:15:15][0m [34m[1mlltk.corpus.corpus.init()[0m[36m:263:[0m Initializing from metadata: [Chadwyck](chadwyck)


'The Unclad Horseman by Thompson'

In [4]:
# data = wikidata_gsearch(qstr)
# data

In [11]:


def wikidata_search(
        qstr_or_text,
        lang="en",
        what=["work","manuscript","text"],
        min_match_ratio_au=50,
        min_match_ratio_ti=50,
        **kwargs):
    """Search wikidata.org and return the first hit that looks like the work.

    The query is sent to the wikidata.org site search, the item ids on the
    result page (elements with class ``wb-itemlink-id``) are read in page
    order, and each item is fetched with ``wptools``. An item is returned when:

    * its ``what`` description is empty, or contains one of the ``what``
      terms (an empty term matches anything);
    * it has both an author and a title (the item label is used when there is
      no title statement); and
    * if a text object was given, the fuzzy match of author and of title
      against ``text.shortauthor`` / ``text.shorttitle`` reaches the minimum
      ratios.

    Args:
        qstr_or_text: Either a plain query string or a ``BaseText`` instance,
            in which case the query comes from ``wikidata_query_str``.
        lang: Currently unused by this function.
        what: A term or an iterable of terms, one of which must occur in the
            item's ``what`` description. A single string is treated as one
            term. ``None`` or an empty value disables the type filter.
        min_match_ratio_au: Minimum ``fuzz.token_set_ratio`` for the author;
            a falsy value disables the check. Only applied for text objects.
        min_match_ratio_ti: Same, for the title.
        **kwargs: ``timeout`` (seconds, default 30) is used for the HTTP
            request; everything else is ignored.

    Returns:
        A dict with keys ``id``, ``meta``, ``meta_simple``, ``title``,
        ``author``, ``match_author``, ``match_title`` and ``query``, or
        ``None`` when no item qualifies. The two match values are NaN when
        the input was a plain string.
    """
    import requests,wptools,bs4
    from fuzzywuzzy import fuzz

    if isinstance(qstr_or_text, BaseText):
        text = qstr_or_text
        qstr = wikidata_query_str(qstr_or_text)
    else:
        text = None
        qstr = qstr_or_text

    print('Q',qstr)

    # Normalise `what` to a list of terms. A bare string must be wrapped,
    # otherwise it would be iterated character by character.
    if not what:
        what_terms = ['']
    elif isinstance(what, str):
        what_terms = [what]
    else:
        what_terms = list(what)

    # Let requests build the query string so characters such as '&' or '#'
    # in a title are escaped rather than corrupting the URL.
    search_terms = ' '.join(clean_text(qstr).split())
    with requests.Session() as s:
        html = s.get(
            'https://www.wikidata.org/w/index.php',
            params={'search': search_terms},
            timeout=kwargs.get('timeout', 30),
        ).text
    dom = bs4.BeautifulSoup(html,'html')

    for item in dom.select('.wb-itemlink-id'):
        # The id is rendered as "(Q123)".
        qid = item.text.replace('(','').replace(')','')

        page = wptools.page(wikibase=qid, silent=True)
        o = page.get_wikidata()
        if o is None or not hasattr(o,'data'): continue
        data = o.data

        # Type check, done once per item.
        whatres = data.get('what','')
        if whatres and not any(not w or w in whatres for w in what_terms):
            continue

        # Items without claims may have no 'wikidata' entry at all.
        od = format_wikidata_d(data.get('wikidata') or {})
        wd_au,wd_ti = wikidata_get_author(od), wikidata_get_title(od)
        if not wd_ti: wd_ti = data.get('label')
        if not wd_au or not wd_ti: continue

        if text is not None:
            t_au,t_ti = text.shortauthor, text.shorttitle
            au_ratio = fuzz.token_set_ratio(wd_au, t_au)
            ti_ratio = fuzz.token_set_ratio(wd_ti, t_ti)
            if min_match_ratio_au and au_ratio < min_match_ratio_au: continue
            if min_match_ratio_ti and ti_ratio < min_match_ratio_ti: continue
        else:
            # Nothing to compare against for a plain-string query.
            ti_ratio = au_ratio = float('nan')

        return dict(
            id=qid,
            meta=od,
            meta_simple=format_wikidata_d_simple(od),
            title=wd_ti,
            author=wd_au,
            match_author=au_ratio,
            match_title=ti_ratio,
            query=qstr,
        )
    return None

def format_wikidata_str(o,simple=False):
    """Convert a wptools-style value into ``(id, label)`` form.

    wptools renders entities and properties as ``"label (Q123)"`` or
    ``"label (P45)"``. This splits such strings apart and recurses through
    containers.

    Args:
        o: A string, list, tuple, dict or any other value.
        simple: Select the flattened output forms described below.

    Returns:
        * ``str`` ending in ``(Q<digits>)`` / ``(P<digits>)``:
          ``(id, label)``, or ``"id|label"`` when ``simple``.
        * any other ``str``: ``('', stripped)``, or just the stripped string
          when ``simple``.
        * ``list``: a list with every element converted.
        * 2-``tuple`` (already an ``(id, label)`` pair): returned unchanged,
          or, when ``simple``, the label with spaces replaced by ``_``
          (a non-string label is returned as is).
        * other ``tuple``: a tuple with every element converted.
        * ``dict``: converted with ``format_wikidata_d``.
        * anything else: ``('', o)``.
    """
    if isinstance(o,str):
        if o.endswith(')'):
            # The identifier is the LAST parenthesised group, so split from
            # the right: labels may contain parentheses themselves.
            name,sep,qid = o[:-1].rpartition('(')
            name,qid = name.strip(),qid.strip()
            # Require Q/P followed by digits so that ordinary text such as
            # "Poems (Posthumous)" is not mistaken for an identifier.
            if sep and qid[:1] in ('Q','P') and qid[1:].isdigit():
                return f'{qid}|{name}' if simple else (qid,name)
        return o.strip() if simple else ('',o.strip())
    if isinstance(o,list):
        return [format_wikidata_str(x,simple=simple) for x in o]
    if isinstance(o,tuple):
        if len(o)==2:
            if not simple: return o
            name = o[1]
            # The fallback branch below can produce pairs with non-string
            # labels, e.g. ('', 5); pass those through rather than crash.
            return name.replace(' ','_') if isinstance(name,str) else name
        return tuple(format_wikidata_str(x,simple=simple) for x in o)
    if isinstance(o,dict):
        return format_wikidata_d(o,simple=simple)
    return ('',o)

def format_wikidata_d(d,simple=False):
    """Return a new dict with every key and value of ``d`` run through
    ``format_wikidata_str`` (with the same ``simple`` flag).

    Keys that convert to the same result collapse into one entry; the value
    of the last such key wins.
    """
    return {
        format_wikidata_str(key, simple=simple): format_wikidata_str(val, simple=simple)
        for key, val in d.items()
    }

def wikidata_get_prop(d,prop='',propname=''):
    """Look up one property in formatted Wikidata metadata.

    Args:
        d: One of
            * a dict keyed by ``(property_id, property_label)`` tuples, as
              produced by ``format_wikidata_d``;
            * a dict holding such a dict under ``'meta'``;
            * a 2-tuple whose second element is either of the above.
        prop: Property id to match against the first key element,
            e.g. ``'P50'``. Ignored when empty.
        propname: Property label to match against the second key element,
            e.g. ``'author'``. Ignored when empty.

    Returns:
        The value stored under the first key matching ``prop`` or
        ``propname``. A list value is reduced to its first element, and an
        ``(id, label)`` pair is reduced to its label. Returns ``''`` when
        nothing matches, when the matched list is empty, when neither
        ``prop`` nor ``propname`` is given, or when ``d`` is not a dict.
    """
    # Unwrap (qid, data) pairs and result dicts that carry the data in 'meta'.
    if isinstance(d,tuple) and len(d)==2 and isinstance(d[1],dict): d=d[1]
    if isinstance(d,dict) and 'meta' in d: d=d['meta']
    if not isinstance(d,dict) or not (prop or propname): return ''

    for k,o in d.items():
        # Only (id, label) keys can match; other keys are skipped rather
        # than indexed.
        if not (isinstance(k,tuple) and len(k)==2): continue
        # An empty prop/propname must not match keys with an empty id/label.
        if not ((prop and k[0]==prop) or (propname and k[1]==propname)): continue
        if isinstance(o,list):
            if not o: return ''
            o=o[0]
        if isinstance(o,tuple) and len(o)==2: return o[-1]
        return o
    return ''

def wikidata_get_title(d):
    """Return the title (property ``P1476`` / ``title``) found by ``wikidata_get_prop``."""
    return wikidata_get_prop(d, prop='P1476', propname='title')
def wikidata_get_author(d):
    """Return the author (property ``P50`` / ``author``) found by ``wikidata_get_prop``."""
    return wikidata_get_prop(d, prop='P50', propname='author')

def format_wikidata_d_simple(d):
    """Flatten a metadata dict into its ``simple`` form.

    Every key and value goes through ``format_wikidata_str(..., simple=True)``;
    spaces in the resulting key are additionally replaced by underscores.
    """
    flat = {}
    for key, val in d.items():
        flat_val = format_wikidata_str(val, simple=True)
        flat_key = format_wikidata_str(key, simple=True).replace(' ', '_')
        flat[flat_key] = flat_val
    return flat

In [17]:
# Keep only the texts from the 18th- and 19th-century fiction subcorpora.
texts = [t for t in C.texts() if t.subcorpus in {'Eighteenth-Century_Fiction','Nineteenth-Century_Fiction'}]
# NOTE(review): `random` is not imported in this notebook — presumably it arrives via
# `from wikidata import *`; no seed is set, so the chosen text differs between runs.
text = random.choice(texts)
# Display the query string that would be sent for the chosen text.
wikidata_query_str(text)

'The Diary of a Nobody by Grossmith'

In [18]:
# Run the Wikidata search for the text chosen above and display the result dict.
wikidata_search(text)

Q The Diary of a Nobody by Grossmith


{'id': 'Q864150',
 'meta': {('P31', 'instance of'): ('Q47461344', 'written work'),
  ('P50', 'author'): [('Q1564825', 'George Grossmith'),
   ('Q7979350', 'Weedon Grossmith')],
  ('P136', 'genre'): [('Q465821', 'epistolary novel'),
   ('Q2561390', 'comic novel')],
  ('P840', 'narrative location'): ('Q84', 'London'),
  ('P123', 'publisher'): ('Q6107326', 'J. W. Arrowsmith'),
  ('P646', 'Freebase ID'): ('', '/m/01k430'),
  ('P18', 'image'): ('', 'Charles and Lupin Pooter.gif'),
  ('P495', 'country of origin'): ('Q145', 'United Kingdom'),
  ('P1242', 'Theatricalia play ID'): ('', '4x8'),
  ('P407', 'language of work or name'): ('Q1860', 'English'),
  ('P1417', 'Encyclopædia Britannica Online ID'): ('',
   'topic/The-Diary-of-a-Nobody'),
  ('P1476', 'title'): ('', 'The Diary of a Nobody'),
  ('P747', 'has edition or translation'): ('Q98785962',
   'The Diary of a Nobody')},
 'meta_simple': {'instance_of': 'written_work',
  'author': ['George_Grossmith', 'Weedon_Grossmith'],
  'genre': ['ep

In [53]:
# Look up one specific text by its id in `C.textd` and search Wikidata for it.
text=C.textd['Early_American_Fiction/poeedgar.05']
data = wikidata_search(text)
# Display the result dict.
data

Q Tales by Poe


{'id': 'Q3928506',
 'meta': {('P50', 'author'): ('Q16867', 'Edgar_Allan_Poe'),
  ('P31', 'instance_of'): [('Q7725634', 'literary_work'),
   ('Q1279564', 'short_story_collection')],
  ('P18', 'image'): ('', 'Edgar Allan Poe 1848.jpg'),
  ('P577', 'publication_date'): ('', '+1845-00-00T00:00:00Z'),
  ('P6216', 'copyright_status'): [('Q19652', 'public_domain'),
   ('Q19652', 'public_domain')],
  ('P407', 'language_of_work_or_name'): ('Q1860', 'English'),
  ('P747', 'has_edition_or_translation'): ('Q105269371', 'Racconti!')},
 'meta_simple': {'author': 'Edgar_Allan_Poe',
  'instance_of': ['literary_work', 'short_story_collection'],
  'image': 'Edgar_Allan_Poe_1848.jpg',
  'publication_date': '+1845-00-00T00:00:00Z',
  'copyright_status': ['public_domain', 'public_domain'],
  'language_of_work_or_name': 'English',
  'has_edition_or_translation': 'Racconti!'},
 'title': 'Tales',
 'author': 'Edgar_Allan_Poe',
 'match_author': 53,
 'match_title': 100,
 'query': 'Tales by Poe'}

In [None]:
# Extract the author from the search result stored in `data` by the previous cell.
wikidata_get_author(data)

'Edgar Allan Poe'

In [None]:
# Build a wptools page object for a single Wikidata item id, to inspect the raw data.
import wptools
page = wptools.page(wikibase='Q64027719')
page

<wptools.page.WPToolsPage at 0x172a556d0>

In [None]:
# Fetch the item's Wikidata record and display the raw `data` dict that wptools collected.
o=page.get_wikidata()
o.data

www.wikidata.org (wikidata) Q64027719
www.wikidata.org (labels) Q19091875|P7937|P31|P1476|P50|Q7725634|...
The Landscape Garden (en) data
{
  claims: <dict(8)> P31, P50, P5331, P646, P747, P1085, P1476, P7937
  description: tale by Edgar Allan Poe
  label: The Landscape Garden
  labels: <dict(12)> Q19091875, P7937, P31, P1476, P50, Q7725634, ...
  modified: <dict(1)> wikidata
  requests: <list(2)> wikidata, labels
  title: The_Landscape_Garden
  what: literary work
  wikibase: Q64027719
  wikidata: <dict(8)> instance of (P31), author (P50), OCLC work I...
  wikidata_pageid: 63730877
  wikidata_url: https://www.wikidata.org/wiki/Q64027719
}


{'requests': ['wikidata', 'labels'],
 'labels': {'Q19091875': 'The Landscape Garden',
  'P7937': 'form of creative work',
  'P31': 'instance of',
  'P1476': 'title',
  'P50': 'author',
  'Q7725634': 'literary work',
  'P1085': 'LibraryThing work ID',
  'P747': 'has edition or translation',
  'Q49084': 'short story',
  'P5331': 'OCLC work ID',
  'P646': 'Freebase ID',
  'Q16867': 'Edgar Allan Poe'},
 'wikidata': {'instance of (P31)': 'literary work (Q7725634)',
  'author (P50)': 'Edgar Allan Poe (Q16867)',
  'OCLC work ID (P5331)': '5400911',
  'Freebase ID (P646)': '/m/02gftq0',
  'has edition or translation (P747)': 'The Landscape Garden (Q19091875)',
  'LibraryThing work ID (P1085)': '20353480',
  'title (P1476)': 'The Landscape Garden',
  'form of creative work (P7937)': 'short story (Q49084)'},
 'wikidata_pageid': 63730877,
 'modified': {'wikidata': '2021-11-22T01:45:43Z'},
 'wikibase': 'Q64027719',
 'wikidata_url': 'https://www.wikidata.org/wiki/Q64027719',
 'description': 'tale b