In [1]:
import re
import json
from tqdm import tqdm
from itertools import groupby

import wikipedia

import spacy
import pandas as pd

# import nltk
# from nltk.corpus import stopwords

from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
# stopwords = set(stopwords.words('english'))  # disabled: the nltk imports above are commented out
# Shared SPARQL client for the DBpedia endpoint; query_dbpedia() sends every query through it.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

In [3]:
# Load the `en_core_web_sm` spaCy model; `nlp` is used further down to split page text into sentences.
nlp = spacy.load('en_core_web_sm')

In [3]:
# SPARQL query sent to DBpedia. For every resource typed dbo:Actor whose
# foaf:name matches '([a-zA-Z]+ )+', it returns the name, the wiki page id and
# the wiki link, plus each film the actor stars in (English label only) and
# the label of that film's '<4 digits>_films' subject as the release date.
# Rows are ordered by actor name and capped at 100.
query = '''
select distinct ?actor, ?actor_wiki_id, ?actor_wiki_link, ?movie_title, ?release_date
where {
   ?p a dbo:Actor .
   ?p foaf:name ?actor .
   FILTER(regex(?actor, '([a-zA-Z]+ )+', 'i'))

   ?p dbo:wikiPageID ?actor_wiki_id .
   ?p foaf:isPrimaryTopicOf ?actor_wiki_link .

   ?movie dbo:starring ?p .
   ?movie rdfs:label ?movie_title .
   FILTER(lang(?movie_title) = 'en')

   ?movie dct:subject ?release .
   FILTER(regex(?release, '[0-9]{4}_films', 'i'))

   ?release rdfs:label ?release_date .
}
group by ?actor_wiki_id ?movie_title ?release_date
order by ?actor
limit 100
'''

In [5]:
def query_dbpedia(query):
    """Send `query` through the module-level `sparql` client.

    Parameters
    ----------
    query : str
        SPARQL query text.

    Returns
    -------
    The response after `.convert()` with the JSON return format selected;
    callers in this file read its rows from result['results']['bindings'].
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    response = sparql.query()
    return response.convert()

In [6]:
def process_data(data):
    """Flatten a SPARQL JSON response into one record per result row.

    Rows are grouped by (actor name, wiki page id) via `group_data`, the
    Wikipedia page text is fetched once per group with `get_wiki_page`, and
    that text is attached to every row of the group.

    Parameters
    ----------
    data : dict
        Response whose rows live under data['results']['bindings'].

    Returns
    -------
    list of dict
        Records with the keys 'actor', 'wiki_page' (None when the page could
        not be fetched), 'actor_wiki_id' (int), 'actor_wiki_link',
        'movie_title' and 'release_date'.
    """
    # Materialize the groups first so the progress bar total is the number of
    # groups actually iterated. Counting distinct actor names (as before) does
    # not match the (name, page id) grouping key and can report a wrong total.
    groups = [(key, list(rows)) for key, rows in group_data(data)]

    results = []
    for (actor, pageid), rows in tqdm(groups, total=len(groups)):
        wiki_id = int(pageid)
        # One Wikipedia request per group, shared by all rows of that group.
        wiki_page = get_wiki_page(wiki_id)
        results.extend(
            {
                'actor': actor,
                'wiki_page': wiki_page,
                'actor_wiki_id': wiki_id,
                'actor_wiki_link': row['actor_wiki_link']['value'],
                'movie_title': row['movie_title']['value'],
                'release_date': row['release_date']['value'],
            }
            for row in rows
        )
    return results


def group_data(data):
    """Group SPARQL result rows by (actor name, actor wiki page id).

    The rows are only guaranteed to arrive ordered by actor name, so rows
    sharing the same (name, page id) key are not necessarily adjacent.
    Grouping is therefore done with a dict rather than by consecutive runs:
    every key appears exactly once, in first-seen order, and rows keep their
    original relative order inside each group. For input that is already
    contiguous per key the sequence of groups is the same as before.

    Parameters
    ----------
    data : dict
        Response whose rows live under data['results']['bindings']; each row
        must provide row['actor']['value'] and row['actor_wiki_id']['value'].

    Returns
    -------
    iterator of ((actor, actor_wiki_id), rows)
        `rows` is the list of result rows belonging to that key.
    """
    grouped = {}
    for row in data['results']['bindings']:
        key = (row['actor']['value'], row['actor_wiki_id']['value'])
        # dicts preserve insertion order, which keeps first-seen group order.
        grouped.setdefault(key, []).append(row)
    return iter(grouped.items())


def get_wiki_page(pageid):
    """Fetch the text content of a Wikipedia page by its page id.

    Parameters
    ----------
    pageid : int
        Wikipedia page id.

    Returns
    -------
    str or None
        The page content, or None (after printing a notice) when the id does
        not resolve to a usable article.
    """
    try:
        return wikipedia.page(pageid=pageid).content
    except (
        wikipedia.exceptions.PageError,
        # A page id may also resolve to a disambiguation page. Treat that as
        # "no usable page" too, so one bad id does not abort the whole crawl.
        wikipedia.exceptions.DisambiguationError,
    ):
        print(f'Cannot extract wiki page for {pageid}')
        return None
    

def write_to_json(file_name, data):
    """Serialize `data` as JSON into `file_name`, replacing any existing file."""
    with open(file_name, 'w') as handle:
        json.dump(data, fp=handle)
        
def load_json(file_name):
    """Read `file_name` and return the parsed JSON value.

    A missing file raises FileNotFoundError from `open`.
    """
    with open(file_name, 'r') as handle:
        return json.load(handle)

In [7]:
# Reuse the cached records when 'actors.json' exists; otherwise query DBpedia,
# attach the Wikipedia page text to every row and cache the result.
try:
    data = load_json('actors.json')
except FileNotFoundError:
    data = process_data(query_dbpedia(query))
    write_to_json('actors.json', data)

In [8]:
def clean_text(text):
    """Strip '=='-delimited headings from `text` and collapse whitespace.

    Parameters
    ----------
    text : str or None
        Raw page text. None is accepted because `get_wiki_page` returns None
        for pages it cannot fetch, and that value is stored as 'wiki_page'.

    Returns
    -------
    str
        The cleaned text; an empty string for None or empty input.
    """
    if not text:
        # Previously None reached re.sub and raised TypeError.
        return ''
    # Remove spans such as '== Early life ==' (any run of '=' on both sides).
    without_headings = re.sub(r'=+.*?=+', '', text)
    # `\s` already covers tabs and newlines, so one class is enough.
    return re.sub(r'\s+', ' ', without_headings)

In [10]:
# Preview sentence splitting on the first (actor, page) group only.
for (actor, wiki_page), films in groupby(
    data,
    key=lambda record: (record['actor'], record['wiki_page']),
):
    print(actor)
    wiki_page = clean_text(wiki_page)
    doc = nlp(wiki_page)
    for sent in doc.sents:
        print(sent)
    break  # stop after the first group

(Elvis Tsui)


ValueError: sentence boundary detection requires the dependency parse, which requires data to be installed. If you haven't done so, run: 
python -m spacy.en.download all
to install the data

In [11]:
import spacy
import en_core_web_sm

# Rebind `nlp` by loading the model through its installed package instead of spacy.load().
nlp = en_core_web_sm.load()