<a href="https://colab.research.google.com/github/sinawrm/AppII-Entity-disambiguation/blob/main/notebooks/Wikidata_Info.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wikidata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wikidata
  Downloading Wikidata-0.7.0-py3-none-any.whl (29 kB)
Installing collected packages: wikidata
Successfully installed wikidata-0.7.0


In [None]:
!pip install sparqlwrapper pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sparqlwrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib>=6.1.1 (from sparqlwrapper)
  Downloading rdflib-6.3.2-py3-none-any.whl (528 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.1/528.1 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting isodate<0.7.0,>=0.6.0 (from rdflib>=6.1.1->sparqlwrapper)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib, sparqlwrapper
Successfully installed isodate-0.6.1 rdflib-6.3.2 sparqlwrapper-2.0.0


In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

In [None]:
# People to look up on Wikidata. Each string is matched verbatim against the
# English label of a human item, so spelling and punctuation matter.
names = [
    "Barack Obama", "Malala Yousafzai", "Beyoncé", "J.K. Rowling",
    "Oprah Winfrey", "Emma Watson", "Angela Merkel", "Nelson Mandela",
    "Michelle Obama", "Taylor Swift", "Ellen DeGeneres", "Serena Williams",
    "Cristiano Ronaldo", "Billie Eilish", "Kamala Harris", "Priyanka Chopra",
    "Adele", "Stephen Hawking", "Meghan Markle", "Tim Cook",
]


In [None]:
def execute_sparql_query(
    query,
    endpoint="https://query.wikidata.org/sparql",
    user_agent=(
        "AppII-Entity-disambiguation notebook "
        "(https://github.com/sinawrm/AppII-Entity-disambiguation)"
    ),
):
    """Run a SPARQL query and return the endpoint's JSON response as a dict.

    Args:
        query: Complete SPARQL query text.
        endpoint: URL of the SPARQL endpoint. Defaults to the Wikidata
            Query Service, which is what this notebook always queried.
        user_agent: Value for the HTTP ``User-Agent`` header. Wikimedia asks
            automated clients to identify themselves with a descriptive
            agent; pass ``None`` to keep SPARQLWrapper's built-in default.

    Returns:
        The decoded JSON result. Callers in this notebook read the rows from
        ``result["results"]["bindings"]``.

    Raises:
        Whatever SPARQLWrapper raises for network or HTTP failures and for
        queries the endpoint rejects; nothing is caught here.
    """
    # Only override the agent when one was supplied, so `None` falls back to
    # the library default instead of sending an empty header.
    wrapper_kwargs = {"agent": user_agent} if user_agent else {}
    sparql = SPARQLWrapper(endpoint, **wrapper_kwargs)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

In [None]:
# Query Wikidata once per name and flatten every result binding into a row of
# [name, birth year, nationality, occupation].
#
# Query notes:
# - Only humans (P31 = Q5) whose English label equals the name exactly match.
# - "Nationality" is the country (P17) of the place of birth (P19).
# - Place of birth and occupation (P106) are required patterns, so a person
#   lacking either yields no rows; only the birth date is OPTIONAL.
# - A person with several occupations/countries yields one row per combination.
data = []
unmatched_names = []  # names whose query returned no bindings at all
for name in names:
    # Escape backslashes and double quotes so the name stays inside the SPARQL
    # string literal instead of terminating it and corrupting the query.
    label_literal = name.replace("\\", "\\\\").replace('"', '\\"')
    query = f"""
    SELECT DISTINCT ?person ?personLabel ?birthDate ?nationalityLabel ?occupationLabel WHERE {{
        ?person wdt:P31 wd:Q5;
                rdfs:label "{label_literal}"@en.
        OPTIONAL {{ ?person wdt:P569 ?birthDate. }}
        ?person wdt:P19 ?placeOfBirth.
        ?placeOfBirth wdt:P17 ?nationality.
        ?person wdt:P106 ?occupation.
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """
    results = execute_sparql_query(query)
    bindings = results["results"]["bindings"]

    if not bindings:
        # Previously such names vanished silently from the dataset.
        unmatched_names.append(name)
        continue

    for result in bindings:
        person_name = result["personLabel"]["value"]
        # birthDate is OPTIONAL in the query; keep only the leading year digits.
        birth_year = result["birthDate"]["value"][:4] if "birthDate" in result else "Unknown"
        nationality = result["nationalityLabel"]["value"]
        occupation = result["occupationLabel"]["value"]

        data.append([person_name, birth_year, nationality, occupation])

if unmatched_names:
    # Surface lookups that produced nothing (e.g. the label on Wikidata is
    # spelled differently) so gaps in the exported data are visible.
    print(f"No Wikidata match for: {', '.join(unmatched_names)}")

In [None]:
# Tabulate the collected rows; column order mirrors the order in which each
# row's values were appended.
df = pd.DataFrame(
    data,
    columns=["Name", "Year of Birth", "Nationality", "Occupation"],
)

In [None]:
# Bare last expression: the notebook renders the frame as a table instead of
# the wrapped plain-text dump that print() produces.
df

             Name Year of Birth               Nationality  \
0    Barack Obama          1961  United States of America   
1    Barack Obama          1961  United States of America   
2    Barack Obama          1961  United States of America   
3    Barack Obama          1961  United States of America   
4    Barack Obama          1961  United States of America   
..            ...           ...                       ...   
133      Tim Cook          1960  United States of America   
134      Tim Cook          1960  United States of America   
135      Tim Cook          1960  United States of America   
136      Tim Cook          1960  United States of America   
137      Tim Cook          1960  United States of America   

                          Occupation  
0    international forum participant  
1                             lawyer  
2                         politician  
3                           academic  
4                             jurist  
..                               

In [None]:
# Names that actually came back from Wikidata, in first-seen order; compare
# against the input list to spot lookups that returned nothing.
distinct_names = df["Name"].unique()
print(distinct_names)

['Barack Obama' 'Malala Yousafzai' 'Beyoncé' 'Oprah Winfrey' 'Emma Watson'
 'Angela Merkel' 'Nelson Mandela' 'Michelle Obama' 'Taylor Swift'
 'Ellen DeGeneres' 'Serena Williams' 'Cristiano Ronaldo' 'Billie Eilish'
 'Kamala Harris' 'Priyanka Chopra' 'Adele' 'Stephen Hawking' 'Tim Cook']


In [None]:
# Number of rows per name, most frequent first. Each row is one query result
# binding, so this counts bindings per label rather than distinct people.
name_counts = df["Name"].value_counts()
print(name_counts)

Taylor Swift         16
Beyoncé              14
Oprah Winfrey        13
Serena Williams      12
Ellen DeGeneres      11
Barack Obama         10
Tim Cook              9
Emma Watson           8
Adele                 8
Priyanka Chopra       6
Nelson Mandela        5
Malala Yousafzai      5
Michelle Obama        4
Cristiano Ronaldo     4
Billie Eilish         4
Kamala Harris         4
Stephen Hawking       3
Angela Merkel         2
Name: Name, dtype: int64


In [None]:
# Colab-only: attach Google Drive to the runtime's filesystem so the export
# can be written there. The call may prompt for authorization.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Persist the table to the mounted Drive; index=False keeps the positional
# row index out of the CSV.
df.to_csv(
    "/content/drive/My Drive/APP II-FINAL PROJECT/data/wikidata.csv",
    index=False,
)