In [8]:
import requests


In [54]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

# import sys
import requests
url = 'https://www.wikidata.org/w/api.php'

def search_entities(name, url):
    """
    Get the entity id, using a name, by calling the Wiki API. 
    The output is a list of dict of dictionaries corresponding to the number of
    occurances of the given name 
    """
    params = {
        'action': 'wbsearchentities',
        'format': 'json',
        'language': 'en',
        'search': name
    }
    response = requests.get(url, params=params)
    return response.json()['search']



In [163]:
from SPARQLWrapper import SPARQLWrapper, JSON
import sys

endpoint_url = "https://query.wikidata.org/sparql"

query = """
SELECT ?item ?occupationLabel ?image ?genderLabel ?bdayLabel ?citizenshipLabel
WHERE 
{
  ?item wdt:P106 ?occupation .
  ?item wdt:P21 ?gender .
  ?item wdt:P18 ?image .
  ?item wdt:P569 ?bday .
  ?item wdt:P27 ?citizenship
  FILTER(?item = wd:Q37079)
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


results = get_results(endpoint_url, query)

for result in results["results"]["bindings"]:
    print(result)

{'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q37079'}, 'image': {'type': 'uri', 'value': 'http://commons.wikimedia.org/wiki/Special:FilePath/TomCruiseDec08MTV%20cropped.jpg'}, 'occupationLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'film director'}, 'genderLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'male'}, 'bdayLabel': {'type': 'literal', 'value': '1962-07-03T00:00:00Z'}, 'citizenshipLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'United States of America'}}
{'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q37079'}, 'image': {'type': 'uri', 'value': 'http://commons.wikimedia.org/wiki/Special:FilePath/TomCruiseDec08MTV%20cropped.jpg'}, 'occupationLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'film producer'}, 'genderLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'male'}, 'bdayLabel': {'type': 'literal', 'value': '1962-07-03T00:00:00Z'}, 'citizenshipLabel': {'xml:lang': 'en', 'type': 'literal', 'value':

In [24]:
results["results"]["bindings"][0]["image"]

{'type': 'uri',
 'value': 'http://commons.wikimedia.org/wiki/Special:FilePath/Brad%20Pitt%202019%20by%20Glenn%20Francis.jpg'}

In [51]:
import pandas as pd
# Get all the names and entity_id
data = pd.read_csv("../raw_data/list_act.csv") #read current names from the list_act csv

In [None]:
#new dictionary to ho
new_file_dict = {
    "name":[],
    "wiki_id":[]
}
for name in data["name"]:
    res = search_entities(name,url=url)[0]["id"]
    new_file_dict["name"].append(name)
    new_file_dict["wiki_id"].append(res)
   

In [60]:
df_act = pd.DataFrame(new_file_dict)


In [62]:
df_act.to_csv("../raw_data/celebrity_list.csv")

In [173]:
# Create a new df with all the name taken from the df having wiki_id
from wikidata import WikiDataQueryResults as wdr

new_df = pd.DataFrame(columns=['itemLabel','item', 'image', 'occupationLabel','genderLabel', 'bdayLabel', 'citizenshipLabel'])

for wiki_id in df_act["wiki_id"]:

    query = f"""
    SELECT ?itemLabel ?item ?occupationLabel ?image ?genderLabel ?bdayLabel ?citizenshipLabel
    WHERE 
    {{
    ?item wdt:P106 ?occupation .
    ?item wdt:P21 ?gender .
    ?item wdt:P18 ?image .
    ?item wdt:P569 ?bday .
    ?item wdt:P27 ?citizenship .
    FILTER(?item = wd:{wiki_id})
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """

    data_extracter = wdr(query)
    df = data_extracter.load_as_dataframe()
    new_df = pd.concat([new_df,df[0:1]])
new_df.to_csv("../raw_data/output_wiki/metafile.csv") #write to csv file

In [188]:
new_df

Unnamed: 0,itemLabel,item,image,occupationLabel,genderLabel,bdayLabel,citizenshipLabel
0,Cate Blanchett,http://www.wikidata.org/entity/Q80966,http://commons.wikimedia.org/wiki/Special:File...,actor,female,1969-05-14T00:00:00Z,United States of America
0,Natalie Portman,http://www.wikidata.org/entity/Q37876,http://commons.wikimedia.org/wiki/Special:File...,screenwriter,female,1981-06-09T00:00:00Z,United States of America
0,Uma Thurman,http://www.wikidata.org/entity/Q125017,http://commons.wikimedia.org/wiki/Special:File...,screenwriter,female,1970-04-29T00:00:00Z,United States of America
0,Helena Bonham Carter,http://www.wikidata.org/entity/Q170428,http://commons.wikimedia.org/wiki/Special:File...,stage actor,female,1966-05-26T00:00:00Z,United Kingdom
0,Frances McDormand,http://www.wikidata.org/entity/Q204299,http://commons.wikimedia.org/wiki/Special:File...,actor,female,1957-06-23T00:00:00Z,United States of America
...,...,...,...,...,...,...,...
0,Alan Rickman,http://www.wikidata.org/entity/Q106481,http://commons.wikimedia.org/wiki/Special:File...,screenwriter,male,1946-02-21T00:00:00Z,United Kingdom
0,Edward G. Robinson,http://www.wikidata.org/entity/Q83812,http://commons.wikimedia.org/wiki/Special:File...,character actor,male,1893-12-12T00:00:00Z,United States of America
0,Will Smith,http://www.wikidata.org/entity/Q40096,http://commons.wikimedia.org/wiki/Special:File...,actor,male,1968-09-25T00:00:00Z,United States of America
0,John Goodman,http://www.wikidata.org/entity/Q215072,http://commons.wikimedia.org/wiki/Special:File...,actor,male,1952-06-20T00:00:00Z,United States of America


In [204]:
new_df.to_csv("../raw_data/output_wiki/metafile.csv")

In [206]:
import os
import unicodedata

user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1]) #define agent
headers = {'User-Agent': user_agent}

def strip_accents(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

# Iterate over all celebrities in df and get image and save to output_wiki
for idx, row in new_df.iterrows():
    img_url = new_df["image"].iloc[idx]
    # person_name = new_df["itemLabel"].iloc[idx]
    person_name = "_".join(strip_accents(new_df["itemLabel"].iloc[idx].lower()).split())+".jpg"
    target_path = os.path.join(os.getcwd(),"raw_data","output_wiki",person_name)
    target_image = target_path
    response = requests.get(img_url,stream=True,headers=headers)

    if response.status_code:
        fp = open(target_image, 'wb')
        fp.write(response.content)
        fp.close()
