<a href="https://colab.research.google.com/github/ravi-gopalan/DAND_Data_Wrangling/blob/master/wikidata_sparql_query.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install sparqlwrapper

# https://rdflib.github.io/sparqlwrapper/

Collecting sparqlwrapper
  Downloading https://files.pythonhosted.org/packages/00/9b/443fbe06996c080ee9c1f01b04e2f683b2b07e149905f33a2397ee3b80a2/SPARQLWrapper-1.8.5-py3-none-any.whl
Collecting rdflib>=4.0
[?25l  Downloading https://files.pythonhosted.org/packages/3c/fe/630bacb652680f6d481b9febbb3e2c3869194a1a5fc3401a4a41195a2f8f/rdflib-4.2.2-py3-none-any.whl (344kB)
[K     |████████████████████████████████| 348kB 7.4MB/s 
[?25hCollecting isodate
[?25l  Downloading https://files.pythonhosted.org/packages/9b/9f/b36f7774ff5ea8e428fdcfc4bb332c39ee5b9362ddd3d40d9516a55221b2/isodate-0.6.0-py2.py3-none-any.whl (45kB)
[K     |████████████████████████████████| 51kB 6.4MB/s 
Installing collected packages: isodate, rdflib, sparqlwrapper
Successfully installed isodate-0.6.0 rdflib-4.2.2 sparqlwrapper-1.8.5


In [0]:
import pandas as pd
import numpy as np
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import re
from collections import Counter

def get_results(endpoint_url, query):
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()

    processed_results = json.load(result.response)
    cols = processed_results['head']['vars']

    out = []
    for row in processed_results['results']['bindings']:
      item = []
      for c in cols:
        item.append(row.get(c, {}).get('value'))
      out.append(item)
    return pd.DataFrame(out, columns = cols)

In [4]:
endpoint_url = "https://query.wikidata.org/sparql"

query_noodle = """SELECT ?noodle ?noodleLabel ?country_of_origin ?country_of_originLabel ?instance_of ?instance_ofLabel ?subclass_of ?subclass_ofLabel  ?Commons_category 
WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "   [AUTO_LANGUAGE],en". }
  ?noodle wdt:P279 wd:Q192874.
  OPTIONAL { ?noodle wdt:P495 ?country_of_origin. }
  OPTIONAL { ?noodle wdt:P31 ?instance_of. }
  OPTIONAL { ?noodle wdt:P279 ?subclass_of. }
  OPTIONAL { ?noodle wdt:P373 ?Commons_category. }  
}"""


df_noodle = get_results(endpoint_url, query_noodle)
df_noodle.head()

Unnamed: 0,noodle,noodleLabel,country_of_origin,country_of_originLabel,instance_of,instance_ofLabel,subclass_of,subclass_ofLabel,Commons_category
0,http://www.wikidata.org/entity/Q20065,Spätzle,http://www.wikidata.org/entity/Q142,France,,,http://www.wikidata.org/entity/Q192874,noodle,Spaetzle
1,http://www.wikidata.org/entity/Q20065,Spätzle,http://www.wikidata.org/entity/Q1142,Alsace,,,http://www.wikidata.org/entity/Q192874,noodle,Spaetzle
2,http://www.wikidata.org/entity/Q20065,Spätzle,http://www.wikidata.org/entity/Q142,France,,,http://www.wikidata.org/entity/Q53619707,egg pasta,Spaetzle
3,http://www.wikidata.org/entity/Q20065,Spätzle,http://www.wikidata.org/entity/Q1142,Alsace,,,http://www.wikidata.org/entity/Q53619707,egg pasta,Spaetzle
4,http://www.wikidata.org/entity/Q34156,Korean noodles,http://www.wikidata.org/entity/Q18097,Korea,,,http://www.wikidata.org/entity/Q192874,noodle,Noodles from Korea


In [5]:
noodle_list = sorted([re.sub("q[0-9]+","",item.lower()) \
                         for item in df_noodle.groupby(\
                                                     ['noodleLabel'])['country_of_originLabel']\
                    .agg('count').index.values.tolist()])

noodle_list = sorted([re.sub("\-", " ",item) for item in noodle_list])
noodle_counter = Counter(noodle_list)

noodle_list = []
for key in noodle_counter.keys():
  if len(key) != 0:
    noodle_list.append(key)
print(noodle_list)

['bakmi', 'black noodles', 'boat noodles', 'buckwheat noodles', 'buldak bokkeum myun', 'bánh hỏi', 'cart noodle', 'cellophane noodles', 'char kway teow', 'chinese noodles', 'chinkiang pot cover noodles', 'cold noodle', 'curry noodle', 'dragon beard noodles', 'extruded noodle', 'fried noodles', 'hokkien mee', 'hot dry noodles', 'jajangmyeon', 'japanese noodles', 'kadaif noodles', 'kesme', 'kishimen', 'knife cut noodle', 'korean noodles', 'laghman', 'mi rebus', 'mie ayam', 'mì', 'narrow lapsha', 'pancit', 'phat si io', 'ramen', 'ribbon noodle', 'rice noodles', 'singapore chow mein', 'soba', 'spätzle', 'sōmen', 'udon', 'vietnamese noodles', 'wide lapsha', 'wonton noodles', 'zhajiangmian']


In [6]:
endpoint_url = "https://query.wikidata.org/sparql"
query_soup = """SELECT ?subclass ?subclassLabel ?TasteAtlas_ID ?instance_of ?instance_ofLabel ?country_of_origin ?country_of_originLabel ?topic_s_main_template ?topic_s_main_templateLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?subclass wdt:P279 wd:Q41415.
  OPTIONAL { ?subclass wdt:P5456 ?TasteAtlas_ID. }
  OPTIONAL { ?subclass wdt:P31 ?instance_of. }
  OPTIONAL { ?subclass wdt:P495 ?country_of_origin. }
  OPTIONAL { ?subclass wdt:P1424 ?topic_s_main_template. }
}"""

df_soup = get_results(endpoint_url, query_soup)
df_soup.head()

Unnamed: 0,subclass,subclassLabel,TasteAtlas_ID,instance_of,instance_ofLabel,country_of_origin,country_of_originLabel,topic_s_main_template,topic_s_main_templateLabel
0,http://www.wikidata.org/entity/Q11179944,Stewed dog meat with soft-shelled turtle,,,,http://www.wikidata.org/entity/Q148,People's Republic of China,,
1,http://www.wikidata.org/entity/Q11242258,Soto mie,,http://www.wikidata.org/entity/Q2095,food,http://www.wikidata.org/entity/Q252,Indonesia,,
2,http://www.wikidata.org/entity/Q11265947,Kenoshiru,,,,http://www.wikidata.org/entity/Q17,Japan,,
3,http://www.wikidata.org/entity/Q11266028,Kenchinjiru,,,,,,,
4,http://www.wikidata.org/entity/Q11443995,taipien,,,,,,,


In [7]:
soup_list = sorted([re.sub("q[0-9]+","",item.lower()) \
                         for item in df_soup.groupby(\
                                                     ['subclassLabel'])['country_of_originLabel']\
                    .agg('count').index.values.tolist()])

soup_list = sorted([re.sub("\-", " ",item) for item in soup_list])
soup_counter = Counter(soup_list)

soup_list = []
for key in soup_counter.keys():
  if len(key) != 0:
    soup_list.append(key)
print(soup_list)

['15 bean soup', '2007 vietnam food scare', 'acquacotta', 'aguadito de pollo', 'ajoblanco', 'alicot', 'amiedi', 'amish preaching soup', 'arabaşı soup', 'ash reshteh', 'ashe doogh', 'asian soup', 'aush', 'bacon soup', 'baeksuk', 'bagnun', 'banga', 'batchoy', 'bean soup', 'beer soup', 'beetroot soup', 'beyran', 'binignit', "bird's nest soup", 'bisque', 'black sesame soup', 'black soup', 'blood soup', 'bob chorba', 'bogeo', 'bookbinder soup', 'borş de burechiuşe', 'borș', 'bosintang', 'bouillon', 'bouillon cube', 'bouneschlupp', 'brown windsor soup', 'budae jjigae', 'buddha jumps over the wall', 'bun rieu', 'buridda', 'bún bò huế', 'bún mắm', 'bún ốc', 'cabbage soup', 'caldillo de perro', 'caldo de costilla', 'caldo de pollo', 'caldo de siete mares', 'caldo gallego', 'caldo tlalpeño', 'caldo verde', 'canja de goa', 'cantonese seafood soup', 'caparrones', 'cazuela', 'celimpungan', 'chakna', 'changua', 'chankonabe', 'cheese soup', 'chicken and dumplings', 'chicken soup', 'chikhirtma', 'chin

In [8]:
endpoint_url = "https://query.wikidata.org/sparql"
query_rice_dish = """SELECT ?is_a_list_of ?instance_of ?instance_ofLabel ?subclass_of ?subclass_ofLabel ?country_of_origin ?country_of_originLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?is_a_list_of wdt:P279 wd:Q21976260.
  OPTIONAL {  }
  OPTIONAL { ?is_a_list_of wdt:P5456 ?instance_of. }
  OPTIONAL { ?is_a_list_of wdt:P279 ?subclass_of. }
  OPTIONAL { ?is_a_list_of wdt:P495 ?country_of_origin. }
}
"""

df_rice_dish = get_results(endpoint_url,query_rice_dish)
df_rice_dish.head()

Unnamed: 0,is_a_list_of,instance_of,instance_ofLabel,subclass_of,subclass_ofLabel,country_of_origin,country_of_originLabel
0,http://www.wikidata.org/entity/Q2841189,,,http://www.wikidata.org/entity/Q21976260,rice dish,http://www.wikidata.org/entity/Q668,India
1,http://www.wikidata.org/entity/Q2916401,,,http://www.wikidata.org/entity/Q21976260,rice dish,,
2,http://www.wikidata.org/entity/Q2921726,,,http://www.wikidata.org/entity/Q21976260,rice dish,,
3,http://www.wikidata.org/entity/Q3239751,,,http://www.wikidata.org/entity/Q21976260,rice dish,,
4,http://www.wikidata.org/entity/Q3239751,,,http://www.wikidata.org/entity/Q27994917,chicken dish,,


In [9]:
df_rice_dish.groupby(['subclass_ofLabel', 'instance_ofLabel'], as_index=False)['country_of_originLabel'].agg('count').instance_ofLabel.values.tolist()

rice_dish_list = sorted([re.sub("q[0-9]+","",item.lower()) \
                         for item in df_rice_dish.groupby(\
                                                          ['subclass_ofLabel', 'instance_ofLabel'], as_index=False)\
                         ['country_of_originLabel'].agg('count').instance_ofLabel.values.tolist()])

rice_dish_list = sorted([re.sub("\-", " ",item) for item in rice_dish_list])
rice_dish_counter = Counter(rice_dish_list)

rice_dish_list = []
for key in rice_dish_counter.keys():
  if len(key) != 0:
    rice_dish_list.append(key)
print(rice_dish_list)

['akki rotti', 'arroz caldo', 'arroz carreteiro', 'arroz con coco', 'arroz con leche', 'arroz con pollo', 'arroz doce', 'bibimbap', 'biryani', 'botamochi', 'cabidela', 'calas', 'chazuke', 'chukadon', 'com lam', 'daifuku', 'dal bhat', 'donburi', 'gyudon', 'hayashi rice', 'hoedeopbap', 'hokkien fried rice', 'idli', 'kabsa', 'kamameshi', 'kayu', 'ketupat', 'kheer', 'kiribath', 'lemang', 'loco moco', 'lotus leaf rice', 'mansaf', 'midye dolma', 'mujaddara', 'nasi campur', 'nasi kerabu', 'nasi lemak', 'oyakodon', 'pabellon criollo', 'paella', 'panta bhat', 'perde pilavi', 'platillo moros y cristiano', 'pongal', 'red beans and rice', 'risotto', 'sarma', 'sekihan', 'sindhi biryani', 'sushi', 'sutlijas', 'tacu tacu', 'tamago kake gohan', 'tekkadon', 'unadon', 'warabimochi', 'xoi', 'yangzhou fried rice', 'zongzi', 'zosui']


In [10]:
endpoint_url = "https://query.wikidata.org/sparql"
query_salad = """
SELECT ?subclass ?subclassLabel ?TasteAtlas_ID ?instance_of ?instance_ofLabel ?country_of_origin ?country_of_originLabel ?topic_s_main_template ?topic_s_main_templateLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?subclass wdt:P279 wd:Q9266.
  OPTIONAL { ?subclass wdt:P5456 ?TasteAtlas_ID. }
  OPTIONAL { ?subclass wdt:P31 ?instance_of. }
  OPTIONAL { ?subclass wdt:P495 ?country_of_origin. }
  OPTIONAL { ?subclass wdt:P1424 ?topic_s_main_template. }
}
"""

df_salad = get_results(endpoint_url,query_salad)
df_salad.head()



Unnamed: 0,subclass,subclassLabel,TasteAtlas_ID,instance_of,instance_ofLabel,country_of_origin,country_of_originLabel,topic_s_main_template,topic_s_main_templateLabel
0,http://www.wikidata.org/entity/Q6030071,Gavurdağı salatası,,http://www.wikidata.org/entity/Q2095,food,http://www.wikidata.org/entity/Q43,Turkey,,
1,http://www.wikidata.org/entity/Q6060108,Q6060108,,,,,,,
2,http://www.wikidata.org/entity/Q6076597,Q6076597,pipirrana,,,,,,
3,http://www.wikidata.org/entity/Q6117451,Q6117451,,,,,,,
4,http://www.wikidata.org/entity/Q6128321,Salad cream,,,,http://www.wikidata.org/entity/Q145,United Kingdom,,


In [11]:
salad_list = sorted([re.sub("q[0-9]+","",item.lower()) for item in df_salad.subclassLabel.values.tolist()])
salad_counter = Counter(salad_list)

salad_list = []
for key in salad_counter.keys():
  if len(key) != 0:
    salad_list.append(key)
print(salad_list)

['a thoke', 'acar', 'ahtapot salata', 'arab salad', 'asinan', 'baba ghanoush', 'bean salad', 'beetroot salad', 'beyin salata', 'broccoli slaw', 'buljol', 'caesar salad', 'caprese salad', 'carrot salad', 'celery root salad', 'celery victor', 'cheese slaw', 'chef salad', 'chicken salad', 'chilean salad', 'chinese chicken salad', 'cobb salad', 'coleslaw', 'composed salad', 'cookie salad', 'crab louie', 'curtido', 'dakos', 'dessert salad', 'dressed herring', 'egg salad', 'eggplant salads and appetizers', 'egg–anchovy salad', 'eruca vesicaria', 'escalivada', 'esgarrat', 'esqueixada', 'fattoush', 'fiambre', 'fruit salad', 'gado-gado', 'garden salad', 'gavurdağı salatası', 'glass noodle salad', 'glasswort salad', 'glorified rice', 'golbaengi-muchim', 'goma-ae', 'greek salad', 'ham salad', 'herring salad', 'hmiss', 'israeli eggplant salad', 'israeli salad', 'jello salad', 'karedok', 'kelan antep', 'kinilnat', 'korean carrot salad', 'kosambari', 'kuluban', 'kurkkusalaatti', 'kısır', 'lalab', 'l

In [12]:
endpoint_url = "https://query.wikidata.org/sparql"
query_dish = """
SELECT ?subclass ?TasteAtlas_ID ?subclass_of ?subclass_ofLabel ?instance_of ?instance_ofLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  
  OPTIONAL { ?subclass wdt:P5456 ?TasteAtlas_ID. }
  OPTIONAL { ?subclass wdt:P279 ?subclass_of. }
  OPTIONAL { ?subclass wdt:P31 ?instance_of. }
  
  ?subclass wdt:P279 wd:Q746549.
}"""
df_dish = get_results(endpoint_url,query_dish)
df_dish.head()

Unnamed: 0,subclass,TasteAtlas_ID,subclass_of,subclass_ofLabel,instance_of,instance_ofLabel
0,http://www.wikidata.org/entity/Q177,pizza,http://www.wikidata.org/entity/Q666242,flatbread,http://www.wikidata.org/entity/Q19861951,type of food or dish
1,http://www.wikidata.org/entity/Q177,pizza,http://www.wikidata.org/entity/Q746549,dish,http://www.wikidata.org/entity/Q19861951,type of food or dish
2,http://www.wikidata.org/entity/Q177,pizza,http://www.wikidata.org/entity/Q3245975,finished good,http://www.wikidata.org/entity/Q19861951,type of food or dish
3,http://www.wikidata.org/entity/Q177,pizza,http://www.wikidata.org/entity/Q13485782,convenience food,http://www.wikidata.org/entity/Q19861951,type of food or dish
4,http://www.wikidata.org/entity/Q177,pizza,http://www.wikidata.org/entity/Q26996677,"Pizzas, casseroles",http://www.wikidata.org/entity/Q19861951,type of food or dish


In [13]:
dish_list = sorted([re.sub("q[0-9]+","",item.lower()) \
                    for item in df_dish.groupby(\
                                                ['TasteAtlas_ID', 'subclass_ofLabel'], as_index=False)\
                    .agg('count').TasteAtlas_ID.values.tolist()])

dish_list = sorted([re.sub("\-", " ",item) for item in dish_list])

dish_counter = Counter(dish_list)

dish_list = []
for key in dish_counter.keys():
  if len(key) != 0:
    dish_list.append(key)
print(dish_list)

['a gei', 'agedashi tofu', 'ajapsandali', 'alheira', 'aloo gobi', 'amatriciana', 'ambuyat', 'anticucho', 'aperitivo', 'apohtin', 'apple sauce', 'arroz con coco', 'arroz junto', 'arroz rojo', 'aushak', 'avocado toast', 'bacalhau a bras', 'bacon egg and cheese  sandwich', 'badrijani', 'baingan bharta', 'baked beans', 'balut', 'banana bread', 'banku', 'bannock', 'banosh', 'barreado', 'beef bourguignon', 'beef chow fun', 'beef noodle soup', 'beef stroganoff', 'beef wellington', 'beignets', 'bethmannchen', 'biryani', 'bison burger', 'black peas', 'black sesame soup', 'bolo do caco', 'bouillabaisse', 'briam', 'bruschetta', 'bubur ayam', 'bun cha', 'burger', 'butadon', 'cacik', 'calulu', 'cantonese seafood soup', 'capuns', 'carapulcra', 'carcerato', 'casunziei', 'cauliflower cheese', 'century egg', 'cevapi', 'cha siu bao', 'chahan', 'chairo', 'chanpuru', 'chen mapo doufu', 'chistorra', 'choucroute garnie', 'chow mein', 'chukadon', 'ciccioli', 'ciorba de burta', 'cock a leekie', 'confit de can

In [14]:
full_list = sorted(dish_list + rice_dish_list + soup_list + noodle_list + salad_list)
full_list = sorted([item for item in full_list if not item[0].isdigit()])

full_counter = Counter(full_list)
full_list=[]
for key in full_counter.keys():
  if len(key) != 0:
    full_list.append(key)
print(len(full_list))

871


In [31]:
major_list = []

for item in full_list:
  pattern_list = []
  pattern_dict = {}
  for word in item.split():
    token_dict = {}
    token_dict['LOWER'] = word
    pattern_list.append(token_dict)

  pattern_dict['label'] = 'dish'
  pattern_dict['pattern'] = pattern_list

  major_list.append(pattern_dict)

major_list


[{'label': 'dish', 'pattern': [{'LOWER': 'a'}, {'LOWER': 'gei'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'a'}, {'LOWER': 'thoke'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'acar'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'acquacotta'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'agedashi'}, {'LOWER': 'tofu'}]},
 {'label': 'dish',
  'pattern': [{'LOWER': 'aguadito'}, {'LOWER': 'de'}, {'LOWER': 'pollo'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'ahtapot'}, {'LOWER': 'salata'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'ajapsandali'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'ajoblanco'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'akki'}, {'LOWER': 'rotti'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'alheira'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'alicot'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'aloo'}, {'LOWER': 'gobi'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'amatriciana'}]},
 {'label': 'dish', 'pattern': [{'LOWER': 'ambuyat'}]},
 {'label': 'dish', 'patter

In [0]:
dish_keyword_patterns = np.array(major_list)
np.savez("dish_keyword_patterns", dish_keyword_patterns)

In [0]:
import json

def dump_jsonl(data, output_path, append=False):
    """
    Write list of objects to a JSON lines file.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')
    print('Wrote {} records to {}'.format(len(data), output_path))


def load_jsonl(input_path) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.rstrip('\n|\r')))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [38]:
for item in major_list:
  dump_jsonl([item],'patterns.jsonl', append=True)

Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 record

In [0]:
for url in [
    'https://chordanalytics.ca/',
    'https://github.com/agalea91',
    'https://medium.com/chord-analytics',
]:
    page = requests.get(url)
    webpage_data = {
        'page_url': page.url,
        'status_code': page.status_code,
        'date': datetime.datetime.now().isoformat(),
    }
    print(webpage_data)
    dump_jsonl([webpage_data], 'out.jsonl', append=True)