<a href="https://colab.research.google.com/github/ravi-gopalan/DAND_Data_Wrangling/blob/master/wikidata_sparql_query.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [83]:
!pip install sparqlwrapper

# https://rdflib.github.io/sparqlwrapper/



In [84]:
!pip install wikipedia



In [85]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import re
from collections import Counter
import time
import itertools
import wikipedia
import requests
from pandas.io.json import json_normalize

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

def get_results(endpoint_url, query):
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()

    processed_results = json.load(result.response)
    cols = processed_results['head']['vars']

    out = []
    for row in processed_results['results']['bindings']:
      item = []
      for c in cols:
        item.append(row.get(c, {}).get('value'))
      out.append(item)
    return pd.DataFrame(out, columns = cols)

In [0]:
def query_and_process_results(url, query_parameter):

  q1 = 'SELECT ?item ?itemLabel WHERE {?item wdt:P279 wd:'
  q2 = '. SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}'

  query = q1 + query_parameter + q2
#  print(query)

  df = get_results(url, query)
  df.shape
  df.head()

  res_list =  sorted([re.sub("q[0-9]+","",item.lower()) for item in df.groupby(['itemLabel'])['item'].agg('count').index.values.tolist()])
  res_list = sorted([re.sub("\-", " ",item) for item in res_list])
  res_list = [item for item in res_list if not re.findall("[0-9]+",item)]
  res_counter = Counter(res_list)
  res_list = []
  for key in res_counter.keys():
    if len(key) != 0:
      res_list.append(key)

  print(len(res_list), res_list)

  return df, res_list, res_counter  


In [0]:
def get_wikidata_id(col):
  rx = re.compile(r'(http\:\/\/www\.wikidata\.org\/entity\/)(Q[0-9]+)')
  m = rx.match(col)
  if m is not None:
    return m.group(2)



def get_wikidata_description(row):
  base_string = 'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&languages=en&format=json&props=descriptions&ids='
  query_string = base_string + row
#  print(query_string)
  r = requests.get(query_string)
  return json.loads(r.content.decode('utf-8'))['entities'][row]

def get_wikidata_label(row):
  base_string = 'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&languages=en&format=json&props=labels&ids='
  query_string = base_string + row
#  print(query_string)
  r = requests.get(query_string)
  return json.loads(r.content.decode('utf-8'))['entities'][row]

In [0]:

def dump_jsonl(data, output_path, append=False):
    """
    Write list of objects to a JSON lines file.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')
    print('Wrote {} records to {}'.format(len(data), output_path))


def load_jsonl(input_path) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.rstrip('\n|\r')))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [90]:
cd '/gdrive/My Drive/abv_reviews'

/gdrive/My Drive/abv_reviews


In [91]:
ls

 CoherenceScore.csv          pyLDAvis_13.html
 datalist_corrected.csv      pyLDAvis_14.html
 df_check_de.csv             pyLDAvis_15.html
 df_check_en.csv             pyLDAvis_16.html
 df_check_es.csv             pyLDAvis_17.html
 df_check_fr.csv             pyLDAvis_18.html
 df_check_fy.csv             pyLDAvis_19.html
 df_check_id.csv             pyLDAvis_20.html
 df_check_it.csv             pyLDAvis_21.html
 df_check_ms.csv             pyLDAvis_22.html
 df_check_tr.csv             pyLDAvis_23.html
 df_coffee.csv               pyLDAvis_24.html
 df_consolidated.csv         pyLDAvis_25.html
 df_dish.csv                 pyLDAvis_26.html
 df_drink.csv                pyLDAvis_27.html
 df_hot_bev.csv              pyLDAvis_28.html
 df_noodle.csv               pyLDAvis_29.html
 df_rice.csv                 pyLDAvis_3.html
 df_salad.csv                pyLDAvis_4.html
 df_soup.csv                 pyLDAvis_5.html
 df_taste_atlas.csv          pyLDAvis_6.html
 dictionary                  pyLDAvis_

In [0]:
endpoint_url = "https://query.wikidata.org/sparql"

# Query List

In [0]:
# noodle
query_noodle = """SELECT ?item ?itemLabel WHERE {
  ?item wdt:P279 wd:Q192874.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}"""

# soup
query_soup = """SELECT ?item ?itemLabel WHERE {
  ?item wdt:P279 wd:Q41415.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}"""

# rice dish
query_rice_dish = """SELECT ?item ?itemLabel WHERE {
  ?item wdt:P279 wd:Q21976260.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}"""

# salad
query_salad = """SELECT ?item ?itemLabel WHERE {
  ?item wdt:P279 wd:Q9266.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}"""

# dish
query_dish = """SELECT ?item ?itemLabel WHERE {
  ?item wdt:P279 wd:Q746549.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}"""

# drink
query_drink = """SELECT ?item ?itemLabel WHERE {
  ?item wdt:P279 wd:Q40050.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}"""

# hot bev
query_hot_bev = """SELECT ?item ?itemLabel WHERE {
  ?item wdt:P279 wd:Q19359564.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}"""

# coffee drink
query_coffee_drink = """SELECT ?item ?itemLabel WHERE {
  ?item wdt:P279 wd:Q37756327.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}"""

# tasteatlas
query_taste_atlas = """SELECT ?subclass ?subclassLabel ?TasteAtlas_ID WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  OPTIONAL { ?subclass wdt:P5456 ?TasteAtlas_ID. }
}
"""

In [10]:
cat_list = ['noodle', 'soup', 'rice', 'salad','dish', 'drink','hot_bev', 'coffee']
query_parameter_list = ['Q192874','Q41415', 'Q21976260', 'Q9266', 'Q746549', 'Q40050', 'Q19359564', 'Q37756327']
df_list = []
result_lists = []
result_counters = []


for cat, query_parameter in zip(cat_list, query_parameter_list):
  print("Category query in progress for {}".format(cat))
  df, r_list, r_counter = query_and_process_results(endpoint_url, query_parameter)
  df_list.append(df)
  result_lists.append(r_list)
  result_counters.append(r_counter)
  print("Category {} - completed".format(cat))
  time.sleep(10)

Category query in progress for noodle
44 ['bakmi', 'black noodles', 'boat noodles', 'buckwheat noodles', 'buldak bokkeum myun', 'bánh hỏi', 'cart noodle', 'cellophane noodles', 'char kway teow', 'chinese noodles', 'chinkiang pot cover noodles', 'cold noodle', 'curry noodle', 'dragon beard noodles', 'extruded noodle', 'fried noodles', 'hokkien mee', 'hot dry noodles', 'jajangmyeon', 'japanese noodles', 'kadaif noodles', 'kesme', 'kishimen', 'knife cut noodle', 'korean noodles', 'laghman', 'mi rebus', 'mie ayam', 'mì', 'narrow lapsha', 'pancit', 'phat si io', 'ramen', 'ribbon noodle', 'rice noodles', 'singapore chow mein', 'soba', 'spätzle', 'sōmen', 'udon', 'vietnamese noodles', 'wide lapsha', 'wonton noodles', 'zhajiangmian']
Category noodle - completed
Category query in progress for soup
361 ['acquacotta', 'aguadito de pollo', 'ajoblanco', 'alicot', 'amiedi', 'amish preaching soup', 'arabaşı soup', 'ash reshteh', 'ashe doogh', 'asian soup', 'aush', 'bacon soup', 'baeksuk', 'bagnun', '

In [0]:
for df in df_list:
  df['wikidata_entity_id'] = df['item'].apply(get_wikidata_id)
  df['desc_detail'] = df['wikidata_entity_id'].apply(get_wikidata_description)
  df['label_detail'] = df['wikidata_entity_id'].apply(get_wikidata_label)


In [0]:
for cat, df in zip(cat_list, df_list):
  temp = 'df_'+cat+'.csv'
  df = pd.concat([df,json_normalize(df['desc_detail'],max_level=2),json_normalize(df['label_detail'],max_level=2)],axis=1)
  df.to_csv(temp)

In [27]:
df_list[0].tail()

Unnamed: 0,item,itemLabel,wikidata_entity_id,desc_detail,label_detail
42,http://www.wikidata.org/entity/Q35778572,laghman,Q35778572,"{'type': 'item', 'id': 'Q35778572', 'descripti...","{'type': 'item', 'id': 'Q35778572', 'labels': ..."
43,http://www.wikidata.org/entity/Q47096879,extruded noodle,Q47096879,"{'type': 'item', 'id': 'Q47096879', 'descripti...","{'type': 'item', 'id': 'Q47096879', 'labels': ..."
44,http://www.wikidata.org/entity/Q47149231,knife-cut noodle,Q47149231,"{'type': 'item', 'id': 'Q47149231', 'descripti...","{'type': 'item', 'id': 'Q47149231', 'labels': ..."
45,http://www.wikidata.org/entity/Q65066993,Mì,Q65066993,"{'type': 'item', 'id': 'Q65066993', 'descripti...","{'type': 'item', 'id': 'Q65066993', 'labels': ..."
46,http://www.wikidata.org/entity/Q67440705,ribbon noodle,Q67440705,"{'type': 'item', 'id': 'Q67440705', 'descripti...","{'type': 'item', 'id': 'Q67440705', 'labels': ..."


In [25]:
json_normalize(df_list[0]['desc_detail'], max_level=2)

Unnamed: 0,type,id,descriptions.en.language,descriptions.en.value
0,item,Q20065,en,type of egg noodle
1,item,Q34156,,
2,item,Q44737,en,Chinese noodle dish
3,item,Q234646,en,East Asian noodle
4,item,Q391082,en,Indonesian dish
5,item,Q471861,en,a type of thick wheat flour noodle of Japanese...
6,item,Q701057,en,thin white noodles made of wheat flour
7,item,Q753910,en,thin Japanese noodle made from buckwheat flour
8,item,Q832338,en,"dish from Wuhan, often sold in street carts, c..."
9,item,Q835336,en,Noodle dish which became popular in Hong Kong ...


In [18]:
json_normalize(df_list[0]['label_detail'], max_level=2)

Unnamed: 0,type,id,labels.en.language,labels.en.value
0,item,Q20065,en,Spätzle
1,item,Q34156,en,Korean noodles
2,item,Q44737,en,zhajiangmian
3,item,Q234646,en,ramen
4,item,Q391082,en,Curry noodle
5,item,Q471861,en,udon
6,item,Q701057,en,sōmen
7,item,Q753910,en,soba
8,item,Q832338,en,hot dry noodles
9,item,Q835336,en,cart noodle


In [29]:
json.loads(response.content.decode('utf-8'))

str

In [22]:
result_lists[0]

['bakmi',
 'black noodles',
 'boat noodles',
 'buckwheat noodles',
 'buldak bokkeum myun',
 'bánh hỏi',
 'cart noodle',
 'cellophane noodles',
 'char kway teow',
 'chinese noodles',
 'chinkiang pot cover noodles',
 'cold noodle',
 'curry noodle',
 'dragon beard noodles',
 'extruded noodle',
 'fried noodles',
 'hokkien mee',
 'hot dry noodles',
 'jajangmyeon',
 'japanese noodles',
 'kadaif noodles',
 'kesme',
 'kishimen',
 'knife cut noodle',
 'korean noodles',
 'laghman',
 'mi rebus',
 'mie ayam',
 'mì',
 'narrow lapsha',
 'pancit',
 'phat si io',
 'ramen',
 'ribbon noodle',
 'rice noodles',
 'singapore chow mein',
 'soba',
 'spätzle',
 'sōmen',
 'udon',
 'vietnamese noodles',
 'wide lapsha',
 'wonton noodles',
 'zhajiangmian']

In [28]:
endpoint_url = "https://query.wikidata.org/sparql"
query_taste_atlas = """SELECT ?subclass ?subclassLabel ?TasteAtlas_ID WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  OPTIONAL { ?subclass wdt:P5456 ?TasteAtlas_ID. }
}
"""
df_taste_atlas = get_results(endpoint_url,query_taste_atlas)
print(df_taste_atlas.shape)
df_taste_atlas.head()


(6079, 3)


Unnamed: 0,subclass,subclassLabel,TasteAtlas_ID
0,http://www.wikidata.org/entity/Q178,pasta,pastasicily
1,http://www.wikidata.org/entity/Q177,pizza,pizza
2,http://www.wikidata.org/entity/Q616,Beaujolais wine,beaujolais
3,http://www.wikidata.org/entity/Q732,Francesinha,francesinha
4,http://www.wikidata.org/entity/Q7633290,sudan,sudan


In [29]:
taste_atlas_list = sorted([re.sub("q[0-9]+","",item.lower()) for item in df_taste_atlas.groupby(['subclassLabel'])['subclass'].agg('count').index.values.tolist()])
taste_atlas_list = sorted([re.sub("\-", " ",item) for item in taste_atlas_list])

taste_atlas_counter = Counter(taste_atlas_list)

taste_atlas_list = []
for key in taste_atlas_counter.keys():
  if len(key) != 0:
    taste_atlas_list.append(key)
print(len(taste_atlas_list),taste_atlas_list)

5056 ["'mpanatigghi", "'nduja", "'ota 'ika", '.amaro ramazzotti', '20th century', '7 and 7', 'a gei', 'aachener printen', 'aam panna', 'aamras', 'abbaye de citeaux', 'abbaye de tamié', 'abertam cheese', 'abgoosht', 'abondance', 'aborrajado', 'abricotine', 'absinthe', 'aburaage', 'acar', 'acarajé', 'acceglio (cheese)', 'acini di pepe', 'ackee and saltfish', 'acqua pazza', 'acquacotta', 'adana kebabı', 'adjika', 'adobo', 'afelia', 'affogato', 'afghan biscuit', 'afritada', 'afternoon snack', "afuega'l pitu", 'agedashi dofu', 'agent orange', 'aggala', 'aglianico', 'agliata', 'agneau du périgord', 'agnolotti', 'agrodolce', 'agua de sevilla', 'agua de valencia', 'aguachile', 'aguadito de pollo', 'aguas frescas', 'agwijjim', 'ahlgrens bilar', 'aioli', 'aish merahrah', 'ajapsandali', 'ajdovi žganci', 'aji de gallina', 'ajiaco', 'ajilimójili', 'ajoblanco', 'ajvar', 'akafuku', 'akanés', 'akkawi', 'akki rotti', 'akple', 'akumaki', 'akvavit', 'al pastor', 'alabama slammer', 'alambre', 'albariño', 

In [0]:
df_taste_atlas['wikidata_entity_id'] = df_taste_atlas['subclass'].apply(get_wikidata_id)
df_taste_atlas['desc_detail'] = df_taste_atlas['wikidata_entity_id'].apply(get_wikidata_description)
df_taste_atlas['label_detail'] = df_taste_atlas['wikidata_entity_id'].apply(get_wikidata_label)

In [32]:
df_taste_atlas.head()

Unnamed: 0,subclass,subclassLabel,TasteAtlas_ID,wikidata_entity_id,desc_detail,label_detail
0,http://www.wikidata.org/entity/Q178,pasta,pastasicily,Q178,"{'type': 'item', 'id': 'Q178', 'descriptions':...","{'type': 'item', 'id': 'Q178', 'labels': {'en'..."
1,http://www.wikidata.org/entity/Q177,pizza,pizza,Q177,"{'type': 'item', 'id': 'Q177', 'descriptions':...","{'type': 'item', 'id': 'Q177', 'labels': {'en'..."
2,http://www.wikidata.org/entity/Q616,Beaujolais wine,beaujolais,Q616,"{'type': 'item', 'id': 'Q616', 'descriptions':...","{'type': 'item', 'id': 'Q616', 'labels': {'en'..."
3,http://www.wikidata.org/entity/Q732,Francesinha,francesinha,Q732,"{'type': 'item', 'id': 'Q732', 'descriptions':...","{'type': 'item', 'id': 'Q732', 'labels': {'en'..."
4,http://www.wikidata.org/entity/Q7633290,sudan,sudan,Q7633290,"{'type': 'item', 'id': 'Q7633290', 'descriptio...","{'type': 'item', 'id': 'Q7633290', 'labels': {..."


In [0]:
df_taste_atlas = pd.concat([df_taste_atlas,json_normalize(df_taste_atlas['desc_detail'],max_level=2),json_normalize(df_taste_atlas['label_detail'],max_level=2)],axis=1)
df_taste_atlas.to_csv('df_taste_atlas.csv')

In [34]:
df_taste_atlas.head()

Unnamed: 0,subclass,subclassLabel,TasteAtlas_ID,wikidata_entity_id,desc_detail,label_detail,type,id,descriptions.en.language,descriptions.en.value,type.1,id.1,labels.en.language,labels.en.value
0,http://www.wikidata.org/entity/Q178,pasta,pastasicily,Q178,"{'type': 'item', 'id': 'Q178', 'descriptions':...","{'type': 'item', 'id': 'Q178', 'labels': {'en'...",item,Q178,en,"Italian food made from flour, eggs and water a...",item,Q178,en,pasta
1,http://www.wikidata.org/entity/Q177,pizza,pizza,Q177,"{'type': 'item', 'id': 'Q177', 'descriptions':...","{'type': 'item', 'id': 'Q177', 'labels': {'en'...",item,Q177,en,Italian dish of oven-baked bread with various ...,item,Q177,en,pizza
2,http://www.wikidata.org/entity/Q616,Beaujolais wine,beaujolais,Q616,"{'type': 'item', 'id': 'Q616', 'descriptions':...","{'type': 'item', 'id': 'Q616', 'labels': {'en'...",item,Q616,en,wine from the Beaujolais AOC of France,item,Q616,en,Beaujolais wine
3,http://www.wikidata.org/entity/Q732,Francesinha,francesinha,Q732,"{'type': 'item', 'id': 'Q732', 'descriptions':...","{'type': 'item', 'id': 'Q732', 'labels': {'en'...",item,Q732,en,Portuguese sandwich,item,Q732,en,Francesinha
4,http://www.wikidata.org/entity/Q7633290,sudan,sudan,Q7633290,"{'type': 'item', 'id': 'Q7633290', 'descriptio...","{'type': 'item', 'id': 'Q7633290', 'labels': {...",item,Q7633290,en,traditional Korean punch,item,Q7633290,en,sudan


In [0]:
df_files  = ['df_coffee.csv','df_dish.csv','df_drink.csv','df_hot_bev.csv','df_noodle.csv','df_rice.csv', 'df_salad.csv', 'df_soup.csv', 'df_taste_atlas.csv']

In [48]:
df_consolidated.columns

Index(['Unnamed: 0', 'item', 'itemLabel', 'wikidata_entity_id', 'type', 'id',
       'descriptions.en.language', 'descriptions.en.value', 'type.1', 'id.1',
       'labels.en.language', 'labels.en.value', 'subclass', 'subclassLabel',
       'TasteAtlas_ID'],
      dtype='object')

In [52]:
taste = ['subclass', 'subclassLabel', 'wikidata_entity_id', 'descriptions.en.language', 'descriptions.en.value', 'labels.en.language', 'labels.en.value', 'TasteAtlas_ID']
non_taste = ['item', 'itemLabel', 'wikidata_entity_id', 'descriptions.en.language', 'descriptions.en.value', 'labels.en.language', 'labels.en.value']
df_consolidated = pd.DataFrame()
for file in df_files:
  
  if file == 'df_taste_atlas.csv':
    df = pd.read_csv(file, usecols=taste)    
    df.rename(columns={"subclass": "item", "subclassLabel": "itemLabel"},inplace=True)
  else:
    df = pd.read_csv(file, usecols=non_taste) 
  df_consolidated = pd.concat([df_consolidated, df],axis=0, sort=False)


df_consolidated.head()
df_consolidated.shape

Unnamed: 0,item,itemLabel,wikidata_entity_id,descriptions.en.language,descriptions.en.value,labels.en.language,labels.en.value,TasteAtlas_ID
0,http://www.wikidata.org/entity/Q8486,coffee,Q8486,en,brewed beverage prepared from roasted coffee s...,en,coffee,
1,http://www.wikidata.org/entity/Q20674,Pharisees,Q20674,en,hot drink,en,Pharisees,
2,http://www.wikidata.org/entity/Q22929,latte macchiato,Q22929,en,coffee beverage,en,latte macchiato,
3,http://www.wikidata.org/entity/Q59072,cortado,Q59072,en,beverage consisting of espresso mixed with a r...,en,cortado,
4,http://www.wikidata.org/entity/Q62449,flat white,Q62449,en,espresso-based coffee beverage; prepared by po...,en,flat white,


(8102, 8)

In [53]:
df_consolidated.tail()

Unnamed: 0,item,itemLabel,wikidata_entity_id,descriptions.en.language,descriptions.en.value,labels.en.language,labels.en.value,TasteAtlas_ID
6074,http://www.wikidata.org/entity/Q564722,Charqui,Q564722,,,en,Charqui,charqui
6075,http://www.wikidata.org/entity/Q19947359,Q19947359,Q19947359,,,,,canard-a-foie-gras-du-sud-ouest
6076,http://www.wikidata.org/entity/Q11162,Ciccioli,Q11162,en,traditional dish,en,Ciccioli,ciccioli
6077,http://www.wikidata.org/entity/Q60793741,Tlačenka,Q60793741,,,en,Tlačenka,tlacenka
6078,http://www.wikidata.org/entity/Q122195,tequila,Q122195,en,alcoholic beverage from Mexico,en,tequila,نمر


In [0]:
df_consolidated.to_csv('df_consolidated.csv')

In [12]:
df_consolidated.columns

Index(['Unnamed: 0', 'item', 'itemLabel', 'wikidata_entity_id',
       'descriptions.en.language', 'descriptions.en.value',
       'labels.en.language', 'labels.en.value', 'TasteAtlas_ID'],
      dtype='object')

In [0]:
cons_columns = ['item', 'itemLabel', 'wikidata_entity_id', 'descriptions.en.language', 'descriptions.en.value', 'labels.en.language', 'labels.en.value', 'TasteAtlas_ID']
df_consolidated = pd.read_csv('df_consolidated.csv',usecols=cons_columns)

In [109]:
df_consolidated.sort_values(by=['itemLabel'],inplace=True)
df_consolidated.head()
df_consolidated.shape
df_consolidated.drop_duplicates(inplace=True)
df_consolidated.shape

Unnamed: 0,item,itemLabel,wikidata_entity_id,descriptions.en.language,descriptions.en.value,labels.en.language,labels.en.value,TasteAtlas_ID
3533,http://www.wikidata.org/entity/Q3596214,'Mpanatigghi,Q3596214,,,en,'Mpanatigghi,mpanatigghi
8076,http://www.wikidata.org/entity/Q1505085,'Nduja,Q1505085,en,spicy spreadable sausage made with pork,en,'Nduja,nduja
5618,http://www.wikidata.org/entity/Q3393853,'Ota 'ika,Q3393853,en,Polynesian dish,en,'Ota 'ika,ota-ika
7159,http://www.wikidata.org/entity/Q1758979,.Amaro Ramazzotti,Q1758979,,,en,.Amaro Ramazzotti,ramazzotti
1787,http://www.wikidata.org/entity/Q4550892,15 bean soup,Q4550892,en,packaged dry bean soup product from the N.K. H...,en,15 bean soup,


(8102, 8)

(8025, 8)

In [0]:
def create_df_and_remove_duplicates(input_file, col_names):
  df = pd.read_csv(input_file, usecols=col_names)
  print(df.head())
  print(df.shape)
  df.drop_duplicates(inplace=True)
  print(df.shape)
  return df


In [111]:
cols_to_use = ['name','page','summary','image_ref']

df_check_en = create_df_and_remove_duplicates('df_check_en.csv', cols_to_use)
df_check_de = create_df_and_remove_duplicates('df_check_de.csv', cols_to_use)
df_check_it = create_df_and_remove_duplicates('df_check_it.csv', cols_to_use)
df_check_es = create_df_and_remove_duplicates('df_check_es.csv', cols_to_use)
df_check_fr = create_df_and_remove_duplicates('df_check_fr.csv', cols_to_use)
df_check_fy = create_df_and_remove_duplicates('df_check_fy.csv', cols_to_use)
df_check_tr = create_df_and_remove_duplicates('df_check_tr.csv', cols_to_use)
df_check_id = create_df_and_remove_duplicates('df_check_id.csv', cols_to_use)
df_check_ms = create_df_and_remove_duplicates('df_check_ms.csv', cols_to_use)

df_right = pd.concat([df_check_en, df_check_de,df_check_it, df_check_es,\
                      df_check_fr, df_check_fy,df_check_tr, df_check_id,\
                      df_check_ms], ignore_index=True, sort=False)


                name  ...                                          image_ref
0       'Mpanatigghi  ...  https://upload.wikimedia.org/wikipedia/commons...
1             'Nduja  ...  https://upload.wikimedia.org/wikipedia/commons...
2          'Ota 'ika  ...  https://upload.wikimedia.org/wikipedia/commons...
3  .Amaro Ramazzotti  ...  https://upload.wikimedia.org/wikipedia/commons...
4       15 bean soup  ...  https://upload.wikimedia.org/wikipedia/commons...

[5 rows x 4 columns]
(5574, 4)
(5574, 4)
        name  ...                                          image_ref
0      a gei  ...  https://upload.wikimedia.org/wikipedia/commons...
1  americano  ...  https://upload.wikimedia.org/wikipedia/commons...
2       açma  ...  https://upload.wikimedia.org/wikipedia/commons...
3       baba  ...  https://upload.wikimedia.org/wikipedia/commons...
4     banket  ...  https://upload.wikimedia.org/wikipedia/commons...

[5 rows x 4 columns]
(256, 4)
(256, 4)
          name  ...                       

In [112]:
df_right.head()
df_right.shape
df_right.drop_duplicates(inplace=True)
df_right.shape

Unnamed: 0,name,page,summary,image_ref
0,'Mpanatigghi,https://en.wikipedia.org/wiki/Empanada,An empanada is a type of baked or fried turnov...,https://upload.wikimedia.org/wikipedia/commons...
1,'Nduja,https://en.wikipedia.org/wiki/%27Nduja,'Nduja (Calabrian: [nˈduːja]) is a particularl...,https://upload.wikimedia.org/wikipedia/commons...
2,'Ota 'ika,https://en.wikipedia.org/wiki/%27Ota_%27ika,"'Ota ika is a Polynesian dish, similar to Lati...",https://upload.wikimedia.org/wikipedia/commons...
3,.Amaro Ramazzotti,https://en.wikipedia.org/wiki/Amaro_(liqueur),"Amaro (Italian for ""bitter"") is an Italian her...",https://upload.wikimedia.org/wikipedia/commons...
4,15 bean soup,https://en.wikipedia.org/wiki/15_bean_soup,15 Bean Soup (a registered trademark of the N....,https://upload.wikimedia.org/wikipedia/commons...


(6070, 4)

(6070, 4)

In [113]:
df_consolidated = df_consolidated.merge(df_right, how='left',left_on='itemLabel',right_on='name')
df_consolidated.head()
df_consolidated.shape

Unnamed: 0,item,itemLabel,wikidata_entity_id,descriptions.en.language,descriptions.en.value,labels.en.language,labels.en.value,TasteAtlas_ID,name,page,summary,image_ref
0,http://www.wikidata.org/entity/Q3596214,'Mpanatigghi,Q3596214,,,en,'Mpanatigghi,mpanatigghi,'Mpanatigghi,https://en.wikipedia.org/wiki/Empanada,An empanada is a type of baked or fried turnov...,https://upload.wikimedia.org/wikipedia/commons...
1,http://www.wikidata.org/entity/Q1505085,'Nduja,Q1505085,en,spicy spreadable sausage made with pork,en,'Nduja,nduja,'Nduja,https://en.wikipedia.org/wiki/%27Nduja,'Nduja (Calabrian: [nˈduːja]) is a particularl...,https://upload.wikimedia.org/wikipedia/commons...
2,http://www.wikidata.org/entity/Q3393853,'Ota 'ika,Q3393853,en,Polynesian dish,en,'Ota 'ika,ota-ika,'Ota 'ika,https://en.wikipedia.org/wiki/%27Ota_%27ika,"'Ota ika is a Polynesian dish, similar to Lati...",https://upload.wikimedia.org/wikipedia/commons...
3,http://www.wikidata.org/entity/Q1758979,.Amaro Ramazzotti,Q1758979,,,en,.Amaro Ramazzotti,ramazzotti,.Amaro Ramazzotti,https://en.wikipedia.org/wiki/Amaro_(liqueur),"Amaro (Italian for ""bitter"") is an Italian her...",https://upload.wikimedia.org/wikipedia/commons...
4,http://www.wikidata.org/entity/Q4550892,15 bean soup,Q4550892,en,packaged dry bean soup product from the N.K. H...,en,15 bean soup,,15 bean soup,https://en.wikipedia.org/wiki/15_bean_soup,15 Bean Soup (a registered trademark of the N....,https://upload.wikimedia.org/wikipedia/commons...


(8026, 12)

In [18]:
matched_list = []
for item in load_jsonl('taste_atlas_info_en.jsonl'):
  matched_list.append(item['name'])
matched_list

Loaded 8025 records from taste_atlas_info_en.jsonl


["'Mpanatigghi",
 "'Nduja",
 "'Ota 'ika",
 '.Amaro Ramazzotti',
 '15 bean soup',
 '1519 Tequila',
 '2007 Vietnam food scare',
 '20th Century',
 '3 A.M. Vodka',
 '4 Copas',
 '5-hour Energy',
 '7 and 7',
 'A thoke',
 'A-gei',
 'A-gei',
 'Aachener Printen',
 'Aam panna',
 'Aamras',
 'Abbaye de Tamié',
 'Abertam cheese',
 'Abgoosht',
 'Aborrajado',
 'Abricotine',
 'Aburaage',
 'Acarajé',
 'Acceglio (cheese)',
 'Accelerade',
 'Achar',
 'Ackee and saltfish',
 'Acqua pazza',
 'Acquacotta',
 'Acquacotta',
 'Acquasale',
 'Adana kebabı',
 'Adjika',
 'Afelia',
 'Affogato',
 'Affogato',
 'Afghan biscuit',
 'Afritada',
 "Afuega'l pitu",
 'Agedashi dofu',
 'Agedashi dofu',
 'Agemochi',
 'Agent Orange',
 'Aggala',
 'Aglianico',
 'Agliata',
 'Agneau du Périgord',
 'Agnolotti',
 'Agrodolce',
 'Agua de Sevilla',
 'Agua de Valencia',
 'Aguachile',
 'Aguadito de pollo',
 'Aguadito de pollo',
 'Aguas frescas',
 'Ahlgrens bilar',
 'Ahtapot salata',
 'Air sirap',
 'Aish Merahrah',
 'Ajapsandali',
 'Ajapsanda

In [34]:
df_check_en = pd.read_json('taste_atlas_info_en.jsonl',lines=True)
df_check_en.head()
df_check_en.shape

Unnamed: 0,name,page,summary,image_ref
0,'Mpanatigghi,https://en.wikipedia.org/wiki/Empanada,An empanada is a type of baked or fried turnov...,https://upload.wikimedia.org/wikipedia/commons...
1,'Nduja,https://en.wikipedia.org/wiki/%27Nduja,'Nduja (Calabrian: [nˈduːja]) is a particularl...,https://upload.wikimedia.org/wikipedia/commons...
2,'Ota 'ika,https://en.wikipedia.org/wiki/%27Ota_%27ika,"'Ota ika is a Polynesian dish, similar to Lati...",https://upload.wikimedia.org/wikipedia/commons...
3,.Amaro Ramazzotti,https://en.wikipedia.org/wiki/Amaro_(liqueur),"Amaro (Italian for ""bitter"") is an Italian her...",https://upload.wikimedia.org/wikipedia/commons...
4,15 bean soup,https://en.wikipedia.org/wiki/15_bean_soup,15 Bean Soup (a registered trademark of the N....,https://upload.wikimedia.org/wikipedia/commons...


(8025, 4)

In [26]:
df_check.query('name == "Cantonese seafood soup"')

Unnamed: 0,name,page,summary,image_ref
778,Cantonese seafood soup,https://en.wikipedia.org/wiki/Cantonese_seafoo...,Cantonese seafood soup is one of the main seaf...,https://upload.wikimedia.org/wikipedia/commons...
779,Cantonese seafood soup,https://en.wikipedia.org/wiki/Cantonese_seafoo...,Cantonese seafood soup is one of the main seaf...,https://upload.wikimedia.org/wikipedia/commons...


In [26]:
df_check_en[df_check_en.duplicated(['name'],False)]

Unnamed: 0,name,page,summary,image_ref
13,A-gei,,,
14,A-gei,,,
30,Acquacotta,https://en.wikipedia.org/wiki/Acquacotta,Acquacotta (pronounced [ˌakkwaˈkɔtta]; Italian...,https://upload.wikimedia.org/wikipedia/commons...
31,Acquacotta,https://en.wikipedia.org/wiki/Acquacotta,Acquacotta (pronounced [ˌakkwaˈkɔtta]; Italian...,https://upload.wikimedia.org/wikipedia/commons...
36,Affogato,https://en.wikipedia.org/wiki/Affogato,"An affogato (Italian for ""drowned"") is an Ital...",https://upload.wikimedia.org/wikipedia/commons...
...,...,...,...,...
8007,İnegöl meatballs,https://en.wikipedia.org/wiki/%C4%B0neg%C3%B6l...,"İnegöl köfte, is grilled meatballs (köfte) spe...",https://upload.wikimedia.org/wikipedia/commons...
8012,şakşuka,https://en.wikipedia.org/wiki/%C5%9Eak%C5%9Fuka,Şakşuka is a Turkish side dish or meze made of...,https://upload.wikimedia.org/wikipedia/commons...
8013,şakşuka,https://en.wikipedia.org/wiki/%C5%9Eak%C5%9Fuka,Şakşuka is a Turkish side dish or meze made of...,https://upload.wikimedia.org/wikipedia/commons...
8020,Štruklji,https://en.wikipedia.org/wiki/%C5%A0truklji,"Štruklji are a traditional Slovene dish, compo...",https://upload.wikimedia.org/wikipedia/commons...


In [35]:
df_check_en.drop_duplicates(['name'],inplace=True)
df_check_en.head()
df_check_en.shape

Unnamed: 0,name,page,summary,image_ref
0,'Mpanatigghi,https://en.wikipedia.org/wiki/Empanada,An empanada is a type of baked or fried turnov...,https://upload.wikimedia.org/wikipedia/commons...
1,'Nduja,https://en.wikipedia.org/wiki/%27Nduja,'Nduja (Calabrian: [nˈduːja]) is a particularl...,https://upload.wikimedia.org/wikipedia/commons...
2,'Ota 'ika,https://en.wikipedia.org/wiki/%27Ota_%27ika,"'Ota ika is a Polynesian dish, similar to Lati...",https://upload.wikimedia.org/wikipedia/commons...
3,.Amaro Ramazzotti,https://en.wikipedia.org/wiki/Amaro_(liqueur),"Amaro (Italian for ""bitter"") is an Italian her...",https://upload.wikimedia.org/wikipedia/commons...
4,15 bean soup,https://en.wikipedia.org/wiki/15_bean_soup,15 Bean Soup (a registered trademark of the N....,https://upload.wikimedia.org/wikipedia/commons...


(7378, 4)

In [36]:
list_with_issues = list(df_check_en[df_check_en['page'].isna()]['name'])
list_with_issues = [re.sub("q[0-9]+","",item.lower()) for item in list_with_issues]
list_with_issues = sorted([re.sub("\-", " ",item) for item in list_with_issues])

df_check_en.dropna(axis=0,how='all',subset=['page','summary','image_ref'],inplace=True)
df_check_en.shape

(5574, 4)

In [37]:
issues_counter = Counter(list_with_issues)
list_with_issues = []
for key in issues_counter.keys():
  if len(key) != 0:
    list_with_issues.append(key)



print(len(list_with_issues), list_with_issues)

573 ['a gei', 'achar', 'acqua pazza', 'agneau du périgord', 'ahtapot salata', 'air sirap', 'all sport', 'amba', 'americano', 'anju', 'arabaşı soup', 'aranese cream', 'ardi gasna', 'arjona tuberosa', 'arroz al horno', 'arwa', 'azumadon', 'açma', 'baba', 'baci di dama', 'banga', 'banket', 'bantan', 'barley based dishes', 'bavarian beers', 'bavette', 'bazeen', 'bear claw', 'beaufort', "bee's knees", 'beep', 'bergamottes de nancy', "bey's soup", 'beyin salata', 'beyran', 'binyang dog leg dish', 'biscotti del lagaccio', 'bitter', 'bleu de termignon', 'blood dishes', 'bloody mary', 'bogeo', 'boil up', 'bosna', 'boterkoek', 'bouchee a la reine', 'bouchon de sancerre', 'bourride', 'bouyiourdi', 'brandy sour', 'brass monkey', 'brassica dishes', 'breaded mushrooms', 'briquette de brebis', 'budu', 'budyń', 'burgossan cheese', 'buttery', 'bâtard montrachet', 'bún thịt nướng', 'bún ốc', 'bœuf de bazas igp', 'cable car', 'cachena cattle', 'cadi', 'calas', 'calv steak adlon', 'cannoli', 'capirotada',

In [43]:
list(df_check_en.name)

["'Mpanatigghi",
 "'Nduja",
 "'Ota 'ika",
 '.Amaro Ramazzotti',
 '15 bean soup',
 '1519 Tequila',
 '2007 Vietnam food scare',
 '20th Century',
 '3 A.M. Vodka',
 '4 Copas',
 '5-hour Energy',
 '7 and 7',
 'A thoke',
 'Aachener Printen',
 'Aam panna',
 'Aamras',
 'Abbaye de Tamié',
 'Abertam cheese',
 'Abgoosht',
 'Aborrajado',
 'Abricotine',
 'Aburaage',
 'Acarajé',
 'Acceglio (cheese)',
 'Accelerade',
 'Ackee and saltfish',
 'Acquacotta',
 'Acquasale',
 'Adana kebabı',
 'Adjika',
 'Afelia',
 'Affogato',
 'Afghan biscuit',
 'Afritada',
 "Afuega'l pitu",
 'Agedashi dofu',
 'Agemochi',
 'Agent Orange',
 'Aggala',
 'Aglianico',
 'Agliata',
 'Agnolotti',
 'Agrodolce',
 'Agua de Sevilla',
 'Agua de Valencia',
 'Aguachile',
 'Aguadito de pollo',
 'Aguas frescas',
 'Ahlgrens bilar',
 'Aish Merahrah',
 'Ajapsandali',
 'Ajdovi žganci',
 'Aji de Gallina',
 'Ajiaco',
 'Ajilimójili',
 'Ajoblanco',
 'Akafuku',
 'Akanés',
 'Akkawi',
 'Akki rotti',
 'Akple',
 'Akumaki',
 'Akuri',
 'Akvavit',
 'Al pasto

In [0]:
list_with_issues_en = sorted(list_with_issues)



In [0]:
df_check_en.to_csv('df_check_en.csv')

In [44]:
wikipedia.set_lang("de")
wiki_info_list_de = []
items_with_issues_list_de = []
for item in list_with_issues_en:
  time.sleep(1)
  wiki_dict_de = {}
  try:
    wiki_dict_de['name'] = item
    wiki_dict_de['page'] = wikipedia.page(item).url
    wiki_dict_de['summary'] = wikipedia.summary(item)
    wiki_dict_de['image_ref'] = wikipedia.page(item).images[0]
  except:
    items_with_issues_list_de.append(item)
  wiki_info_list_de.append(wiki_dict_de)

for item in wiki_info_list_de:
  dump_jsonl([item],'taste_atlas_info_de.jsonl', append=True)



  lis = BeautifulSoup(html).find_all('li')


Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 re

In [45]:
df_check_de = pd.read_json('taste_atlas_info_de.jsonl',lines=True)
df_check_de.shape
df_check_de.head()

(573, 4)

Unnamed: 0,name,page,summary,image_ref
0,a gei,https://de.wikipedia.org/wiki/Die_Geier-Wally_...,Die Geier-Wally ist ein Roman von Wilhelmine v...,https://upload.wikimedia.org/wikipedia/commons...
1,achar,,,
2,acqua pazza,,,
3,agneau du périgord,,,
4,ahtapot salata,,,


In [46]:
df_check_de[df_check_de.duplicated(['name'],False)]

Unnamed: 0,name,page,summary,image_ref


In [48]:
list_with_issues_de = list(df_check_de[df_check_de['page'].isna()]['name'])
print(len(list_with_issues_de),list_with_issues_de)

317 ['achar', 'acqua pazza', 'agneau du périgord', 'ahtapot salata', 'air sirap', 'all sport', 'amba', 'anju', 'arabaşı soup', 'aranese cream', 'ardi gasna', 'arjona tuberosa', 'arroz al horno', 'arwa', 'azumadon', 'baci di dama', 'banga', 'bantan', 'barley based dishes', 'bavarian beers', 'bavette', 'bear claw', 'beaufort', "bee's knees", "bey's soup", 'beyin salata', 'beyran', 'binyang dog leg dish', 'biscotti del lagaccio', 'blood dishes', 'bogeo', 'boil up', 'boterkoek', 'bouchon de sancerre', 'bouyiourdi', 'brassica dishes', 'breaded mushrooms', 'briquette de brebis', 'budu', 'budyń', 'burgossan cheese', 'bún thịt nướng', 'bún ốc', 'cable car', 'cachena cattle', 'calv steak adlon', 'caravane', 'carne de la sierra de guadarrama', 'carne de morucha de salamanca', 'carne de vacuno del país vasco', 'carp soup', "carré de l'est", 'casino', 'cassoeula', 'causa a la limeña', 'chacha', 'cham cham', 'chapea', 'chashu', "cheese from l'alt urgell y la cerdanya", 'chexo', 'chilean pisco', 'ch

In [49]:

list_with_issues_de = [re.sub("q[0-9]+","",item.lower()) for item in list_with_issues_de]
list_with_issues_de = sorted([re.sub("\-", " ",item) for item in list_with_issues_de])

df_check_de.dropna(axis=0,how='all',subset=['page','summary','image_ref'],inplace=True)
df_check_de.shape
df_check_de.head()


(256, 4)

Unnamed: 0,name,page,summary,image_ref
0,a gei,https://de.wikipedia.org/wiki/Die_Geier-Wally_...,Die Geier-Wally ist ein Roman von Wilhelmine v...,https://upload.wikimedia.org/wikipedia/commons...
8,americano,https://de.wikipedia.org/wiki/Americano_(Cockt...,Der Americano ist ein klassischer Aperitifcock...,https://upload.wikimedia.org/wikipedia/commons...
17,açma,https://de.wikipedia.org/wiki/Mahlab,"Als Mahlab, auch Mahalab, Mahleb oder Mahaleb,...",https://upload.wikimedia.org/wikipedia/commons...
18,baba,https://de.wikipedia.org/wiki/Baba_Jaga,"Baba Jaga (russisch Ба́ба-Яга́), regional auch...",https://upload.wikimedia.org/wikipedia/commons...
21,banket,https://de.wikipedia.org/wiki/Mahlzeitstillleben,Bei den Mahlzeitstillleben handelt es sich um ...,https://upload.wikimedia.org/wikipedia/commons...


In [0]:
df_check_de.to_csv('df_check_de.csv')

In [61]:
df_check_de[df_check_de['page'].notna()].drop_duplicates()

Unnamed: 0,name,page,summary,image_ref
0,a gei,https://de.wikipedia.org/wiki/Die_Geier-Wally_...,Die Geier-Wally ist ein Roman von Wilhelmine v...,https://upload.wikimedia.org/wikipedia/commons...
3,alu tama,https://de.wikipedia.org/wiki/Einschienenbahn,Eine Einschienenbahn ist eine dem Passagier- o...,https://upload.wikimedia.org/wikipedia/commons...
5,americano,https://de.wikipedia.org/wiki/Americano_(Cockt...,Der Americano ist ein klassischer Aperitifcock...,https://upload.wikimedia.org/wikipedia/commons...
11,açma,https://de.wikipedia.org/wiki/Mahlab,"Als Mahlab, auch Mahalab, Mahleb oder Mahaleb,...",https://upload.wikimedia.org/wikipedia/commons...
12,baba,https://de.wikipedia.org/wiki/Baba_Jaga,"Baba Jaga (russisch Ба́ба-Яга́), regional auch...",https://upload.wikimedia.org/wikipedia/commons...
...,...,...,...,...
886,seafood rice,https://de.wikipedia.org/wiki/Liste_der_Ninten...,Die folgende Liste enthält alle in Europa verö...,https://upload.wikimedia.org/wikipedia/commons...
888,silvana,https://de.wikipedia.org/wiki/Silvana,Silvana ist ein weiblicher Vorname.\n\n,https://upload.wikimedia.org/wikipedia/commons...
898,tavuk şiş,https://de.wikipedia.org/wiki/T%C3%BCrkische_K...,Die türkische Küche hat eine lange Geschichte ...,https://upload.wikimedia.org/wikipedia/commons...
900,turnover,https://de.wikipedia.org/wiki/Turnover_(Americ...,Ein Turnover bedeutet im American Football den...,https://upload.wikimedia.org/wikipedia/commons...


In [52]:
print(len(items_with_issues_list_de), items_with_issues_list_de)

337 ['achar', 'acqua pazza', 'agneau du périgord', 'ahtapot salata', 'air sirap', 'all sport', 'amba', 'anju', 'arabaşı soup', 'aranese cream', 'ardi gasna', 'arjona tuberosa', 'arroz al horno', 'arwa', 'azumadon', 'baci di dama', 'banga', 'bantan', 'barley based dishes', 'bavarian beers', 'bavette', 'bear claw', 'beaufort', "bee's knees", 'beep', "bey's soup", 'beyin salata', 'beyran', 'binyang dog leg dish', 'biscotti del lagaccio', 'blood dishes', 'bogeo', 'boil up', 'boterkoek', 'bouchon de sancerre', 'bouyiourdi', 'brassica dishes', 'breaded mushrooms', 'briquette de brebis', 'budu', 'budyń', 'burgossan cheese', 'bún thịt nướng', 'bún ốc', 'cable car', 'cachena cattle', 'calv steak adlon', 'caravane', 'carne de la sierra de guadarrama', 'carne de morucha de salamanca', 'carne de vacuno del país vasco', 'carp soup', 'carrulim', "carré de l'est", 'casino', 'cassoeula', 'cathare', 'causa a la limeña', 'chacha', 'cham cham', 'chapea', 'chashu', "cheese from l'alt urgell y la cerdanya"

In [53]:
wikipedia.set_lang("it")
wiki_info_list_it = []
items_with_issues_list_it = []
for item in items_with_issues_list_de:
  time.sleep(1)
  wiki_dict_it = {}
  try:
    wiki_dict_it['name'] = item
    wiki_dict_it['page'] = wikipedia.page(item).url
    wiki_dict_it['summary'] = wikipedia.summary(item)
    wiki_dict_it['image_ref'] = wikipedia.page(item).images[0]
  except:
    items_with_issues_list_it.append(item)
  wiki_info_list_it.append(wiki_dict_it)
for item in wiki_info_list_it:
  dump_jsonl([item],'taste_atlas_info_it.jsonl', append=True)



  lis = BeautifulSoup(html).find_all('li')


Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 re

In [54]:
df_check_it = pd.read_json('taste_atlas_info_it.jsonl',lines=True)
df_check_it.shape
df_check_it.head()

(337, 4)

Unnamed: 0,name,page,summary,image_ref
0,achar,https://it.wikipedia.org/wiki/Amrita_Acharia,"Amrita Acharia, in nepalese अमृता आचार्य (Katm...",https://upload.wikimedia.org/wikipedia/commons...
1,acqua pazza,https://it.wikipedia.org/wiki/Pesce_all%27acqu...,L'acqua pazza è una preparazione del pesce tip...,https://upload.wikimedia.org/wikipedia/commons...
2,agneau du périgord,,,
3,ahtapot salata,,,
4,air sirap,,,


In [55]:
df_check_it[df_check_it.duplicated(['name'],False)]

Unnamed: 0,name,page,summary,image_ref


In [56]:
print(len(list(df_check_it[df_check_it['page'].isna()]['name'])),list(df_check_it[df_check_it['page'].isna()]['name']))

231 ['agneau du périgord', 'ahtapot salata', 'air sirap', 'arabaşı soup', 'aranese cream', 'ardi gasna', 'arjona tuberosa', 'arroz al horno', 'azumadon', 'banga', 'barley based dishes', 'bavarian beers', 'bear claw', "bee's knees", "bey's soup", 'beyin salata', 'beyran', 'binyang dog leg dish', 'blood dishes', 'bogeo', 'boil up', 'boterkoek', 'bouchon de sancerre', 'bouyiourdi', 'brassica dishes', 'breaded mushrooms', 'briquette de brebis', 'budyń', 'burgossan cheese', 'bún thịt nướng', 'cachena cattle', 'calv steak adlon', 'caravane', 'carne de morucha de salamanca', 'carne de vacuno del país vasco', 'carp soup', 'carrulim', "carré de l'est", 'causa a la limeña', 'chacha', 'chapea', "cheese from l'alt urgell y la cerdanya", 'chexo', 'chilean pisco', 'chistorramesta', 'cider confit', 'civet de lapin', 'coca de recapte', 'colby cheese', 'collard liquor', 'corn steak', 'cracknel', 'cremas', 'croûte aux morilles', 'culinary speciality', 'dairy drink', 'deditos de queso', 'deram deram', 'd

In [57]:
list_with_issues_it = list(df_check_it[df_check_it['page'].isna()]['name'])
print(len(list_with_issues_it),list_with_issues_it)

231 ['agneau du périgord', 'ahtapot salata', 'air sirap', 'arabaşı soup', 'aranese cream', 'ardi gasna', 'arjona tuberosa', 'arroz al horno', 'azumadon', 'banga', 'barley based dishes', 'bavarian beers', 'bear claw', "bee's knees", "bey's soup", 'beyin salata', 'beyran', 'binyang dog leg dish', 'blood dishes', 'bogeo', 'boil up', 'boterkoek', 'bouchon de sancerre', 'bouyiourdi', 'brassica dishes', 'breaded mushrooms', 'briquette de brebis', 'budyń', 'burgossan cheese', 'bún thịt nướng', 'cachena cattle', 'calv steak adlon', 'caravane', 'carne de morucha de salamanca', 'carne de vacuno del país vasco', 'carp soup', 'carrulim', "carré de l'est", 'causa a la limeña', 'chacha', 'chapea', "cheese from l'alt urgell y la cerdanya", 'chexo', 'chilean pisco', 'chistorramesta', 'cider confit', 'civet de lapin', 'coca de recapte', 'colby cheese', 'collard liquor', 'corn steak', 'cracknel', 'cremas', 'croûte aux morilles', 'culinary speciality', 'dairy drink', 'deditos de queso', 'deram deram', 'd

In [58]:
list_with_issues_it = [re.sub("q[0-9]+","",item.lower()) for item in list_with_issues_it]
list_with_issues_it = sorted([re.sub("\-", " ",item) for item in list_with_issues_it])

df_check_it.dropna(axis=0,how='all',subset=['page','summary','image_ref'],inplace=True)
df_check_it.shape
df_check_it.head()

df_check_it.to_csv('df_check_it.csv')

(106, 4)

Unnamed: 0,name,page,summary,image_ref
0,achar,https://it.wikipedia.org/wiki/Amrita_Acharia,"Amrita Acharia, in nepalese अमृता आचार्य (Katm...",https://upload.wikimedia.org/wikipedia/commons...
1,acqua pazza,https://it.wikipedia.org/wiki/Pesce_all%27acqu...,L'acqua pazza è una preparazione del pesce tip...,https://upload.wikimedia.org/wikipedia/commons...
5,all sport,https://it.wikipedia.org/wiki/Sport,Lo sport è l'insieme di attività che impegna -...,https://upload.wikimedia.org/wikipedia/commons...
6,amba,https://it.wikipedia.org/wiki/Amba_Aradam,"L'Amba Aradam (amarico: አምባ አረደም, Āmiba Āredem...",https://upload.wikimedia.org/wikipedia/commons...
7,anju,https://it.wikipedia.org/wiki/L%27intendente_S...,L'intendente Sansho è un film del 1954 diretto...,https://upload.wikimedia.org/wikipedia/commons...


In [59]:
wikipedia.set_lang("es")
wiki_info_list_es = []
items_with_issues_list_es = []
for item in items_with_issues_list_it:
  time.sleep(1)
  wiki_dict_es = {}
  try:
    wiki_dict_es['name'] = item
    wiki_dict_es['page'] = wikipedia.page(item).url
    wiki_dict_es['summary'] = wikipedia.summary(item)
    wiki_dict_es['image_ref'] = wikipedia.page(item).images[0]
  except:
    items_with_issues_list_es.append(item)
  wiki_info_list_es.append(wiki_dict_es)

for item in wiki_info_list_es:
  dump_jsonl([item],'taste_atlas_info_es.jsonl', append=True)  



  lis = BeautifulSoup(html).find_all('li')


Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 re

In [60]:
df_check_es = pd.read_json('taste_atlas_info_es.jsonl',lines=True)
df_check_es.shape
df_check_es.head()

df_check_es[df_check_es.duplicated(['name'],False)]

list_with_issues_es = list(df_check_es[df_check_es['page'].isna()]['name'])
print(len(list_with_issues_es),list_with_issues_es)

list_with_issues_es = [re.sub("q[0-9]+","",item.lower()) for item in list_with_issues_es]
list_with_issues_es = sorted([re.sub("\-", " ",item) for item in list_with_issues_es])

df_check_es.dropna(axis=0,how='all',subset=['page','summary','image_ref'],inplace=True)
df_check_es.shape
df_check_es.head()

df_check_es.to_csv('df_check_es.csv')

(231, 4)

Unnamed: 0,name,page,summary,image_ref
0,agneau du périgord,,,
1,ahtapot salata,,,
2,air sirap,,,
3,arabaşı soup,,,
4,aranese cream,,,


Unnamed: 0,name,page,summary,image_ref


173 ['agneau du périgord', 'ahtapot salata', 'air sirap', 'arabaşı soup', 'aranese cream', 'ardi gasna', 'banga', 'barley based dishes', 'bavarian beers', "bey's soup", 'beyin salata', 'beyran', 'binyang dog leg dish', 'blood dishes', 'boil up', 'boterkoek', 'bouchon de sancerre', 'bouyiourdi', 'brassica dishes', 'breaded mushrooms', 'briquette de brebis', 'budyń', 'burgossan cheese', 'bún thịt nướng', 'cachena cattle', 'calv steak adlon', 'caravane', 'carp soup', 'chacha', 'chapea', "cheese from l'alt urgell y la cerdanya", 'chilean pisco', 'chistorramesta', 'cider confit', 'civet de lapin', 'collard liquor', 'corn steak', 'cracknel', 'croûte aux morilles', 'culinary speciality', 'dairy drink', 'deram deram', 'dondonyaki', 'east friesland tea blend', 'eba', 'ed', 'egg–anchovy salad', 'fried srhimp', 'gala', 'gaufre aux fruits', 'gerbeaud cake', 'groninger cake', 'gyulai kolbász', 'hibiscus sabdariffa costanera', 'hořické roll', 'ikinaridango', 'issen yōshoku', 'i̇slim kebap', 'jambon 

(58, 4)

Unnamed: 0,name,page,summary,image_ref
6,arjona tuberosa,https://es.wikipedia.org/wiki/Arjona_tuberosa,Arjona tuberosa es una especie de planta fa...,https://upload.wikimedia.org/wikipedia/commons...
7,arroz al horno,https://es.wikipedia.org/wiki/Arroz_al_horno,El arroz al horno (arròs al forn en valenciano...,https://upload.wikimedia.org/wikipedia/commons...
8,azumadon,https://es.wikipedia.org/wiki/Azulado,Azulado es un EP de la banda argentina Los 7 D...,https://upload.wikimedia.org/wikipedia/commons...
12,bear claw,https://es.wikipedia.org/wiki/Bear_claw,Un bear claw (‘garra de oso’) es un dulce de d...,https://upload.wikimedia.org/wikipedia/commons...
13,bee's knees,https://es.wikipedia.org/wiki/Bee%27s_Knees,"El Bee's Knees es un cóctel hecho con ginebra,...",https://upload.wikimedia.org/wikipedia/commons...


In [0]:
def get_wikipedia_page_summaries(input_issues_list, output_jsonl_file,lang='en'):
  wikipedia.set_lang(lang)
  output_list = []
  output_issues_list = []

  for item in input_issues_list:
    time.sleep(0.5)
    wiki_dict = {}
    try:
      wiki_dict['name'] = item
      wiki_dict['page'] = wikipedia.page(item).url
      wiki_dict['summary'] = wikipedia.summary(item)
      wiki_dict['image_ref'] = wikipedia.page(item).images[0]
    except:
      output_issues_list.append(item)
    output_list.append(wiki_dict)
  for item in output_list:
    dump_jsonl([item],output_jsonl_file, append=True)
  return output_issues_list



In [0]:
df_check_it.to_csv('df_check_it.csv')
df_check_es.to_csv('df_check_es.csv')

In [63]:
list_with_issues_fr = get_wikipedia_page_summaries(list_with_issues_es, 'taste_atlas_info_fr.jsonl',lang='fr')



  lis = BeautifulSoup(html).find_all('li')


Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 re

In [65]:
df_check_fr = pd.read_json('taste_atlas_info_fr.jsonl',lines=True)
df_check_fr.shape
df_check_fr.head()

df_check_fr[df_check_fr.duplicated(['name'],False)]

list_with_issues_fr = list(df_check_fr[df_check_fr['page'].isna()]['name'])
print(len(list_with_issues_fr),list_with_issues_fr)

list_with_issues_fr = [re.sub("q[0-9]+","",item.lower()) for item in list_with_issues_fr]
list_with_issues_fr = sorted([re.sub("\-", " ",item) for item in list_with_issues_fr])

df_check_fr.dropna(axis=0,how='all',subset=['page','summary','image_ref'],inplace=True)
df_check_fr.shape
df_check_fr.head()

df_check_fr.to_csv('df_check_fr.csv')

(250, 4)

Unnamed: 0,name,page,summary,image_ref
0,alcamo wine,,,
1,apohtin,https://fr.wikipedia.org/wiki/Charcuterie,Le terme charcuterie désigne couramment de nom...,https://upload.wikimedia.org/wikipedia/commons...
2,ardi gasna,https://fr.wikipedia.org/wiki/Ardi-Gasna,Ardi-Gasna est une marque commerciale déposée ...,https://upload.wikimedia.org/wikipedia/commons...
3,bavarian beers,,,
4,bey's soup,,,


Unnamed: 0,name,page,summary,image_ref
2,ardi gasna,https://fr.wikipedia.org/wiki/Ardi-Gasna,Ardi-Gasna est une marque commerciale déposée ...,https://upload.wikimedia.org/wikipedia/commons...
3,bavarian beers,,,
4,bey's soup,,,
5,beyran,,,
6,boil up,,,
...,...,...,...,...
236,tupí,https://fr.wikipedia.org/wiki/Tupis,Les Tupis sont les Amérindiens autochtones de ...,https://upload.wikimedia.org/wikipedia/commons...
237,turkish style semolina dessert,,,
238,tutmaç soup,,,
239,uyutma,,,


183 ['alcamo wine', 'bavarian beers', "bey's soup", 'beyran', 'boil up', 'boterkoek', 'budyń', 'burgossan cheese', 'carp soup', 'chacha', "cheese from l'alt urgell y la cerdanya", 'chistorramesta', 'cokodok', 'fried srhimp', 'gala', 'gerbeaud cake', 'groninger cake', 'guaiwei', 'gyulai kolbász', 'hibiscus sabdariffa costanera', 'ikinaridango', 'imbuljuta tal qastan', 'i̇slim kebap', "jeppson's malört", 'kaisen don', 'keshi yena', 'kibi dango', 'krûdkoeke', 'kuzu kapama', 'ladro di colonnata', 'lomnice biscuits', 'mercimek köftesi', 'minced meat patties', 'misoltin', 'nisa cheese', 'old sour', 'pear of jumilla', 'pumpkin tortellini', 'saketini', 'sansho pepper', 'semizotu with yogurt', 'semmelwrap', 'spiesebraten', 'ternera asturiana', 'ternera gallega', 'tokiwado kaminariokoshi honpo co.,ltd.', 'turkish style semolina dessert', 'tutmaç soup', 'uyutma', 'yuvalama soup', 'éisleker ham', 'ahtapot salata', 'air sirap', 'arabaşı soup', 'aranese cream', 'barley based dishes', 'bavarian beers

(67, 4)

Unnamed: 0,name,page,summary,image_ref
1,apohtin,https://fr.wikipedia.org/wiki/Charcuterie,Le terme charcuterie désigne couramment de nom...,https://upload.wikimedia.org/wikipedia/commons...
2,ardi gasna,https://fr.wikipedia.org/wiki/Ardi-Gasna,Ardi-Gasna est une marque commerciale déposée ...,https://upload.wikimedia.org/wikipedia/commons...
8,bouchon de sancerre,https://fr.wikipedia.org/wiki/Bouchon_de_Sancerre,Le bouchon de Sancerre ou bouchon de chèvre es...,https://upload.wikimedia.org/wikipedia/commons...
9,briquette de brebis,https://fr.wikipedia.org/wiki/Briquette_de_brebis,La briquette de brebis est un fromage au lait ...,https://upload.wikimedia.org/wikipedia/commons...
14,chambolle musigny wine,https://fr.wikipedia.org/wiki/Les_Gouttes_de_Dieu,"Les Gouttes de Dieu (神の雫, Kami no Shizuku) est...",https://upload.wikimedia.org/wikipedia/commons...


In [66]:
list_with_issues_fy = get_wikipedia_page_summaries(list_with_issues_fr, 'taste_atlas_info_fy.jsonl',lang='fy')

Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 re

In [0]:
def remove_duplicate_list_items(input_list):
  c = Counter(input_list)
  output_list = []
  for key in c.keys():
    if len(key) != 0:
      output_list.append(key)
  
  return output_list



In [72]:
df_check_fy = pd.read_json('taste_atlas_info_fy.jsonl',lines=True)
df_check_fy.shape
df_check_fy.head()

df_check_fy[df_check_fy.duplicated(['name'],False)]

list_with_issues_fy = remove_duplicate_list_items(list(df_check_fy[df_check_fy['page'].isna()]['name']))

print(len(list_with_issues_fy),list_with_issues_fy)

list_with_issues_fy = [re.sub("q[0-9]+","",item.lower()) for item in list_with_issues_fy]
list_with_issues_fy = sorted([re.sub("\-", " ",item) for item in list_with_issues_fy])

df_check_fy.dropna(axis=0,how='all',subset=['page','summary','image_ref'],inplace=True)
df_check_fy.shape
df_check_fy.head()

df_check_fy.to_csv('df_check_fy.csv')

(234, 4)

Unnamed: 0,name,page,summary,image_ref
0,alcamo wine,,,
1,bavarian beers,,,
2,bey's soup,,,
3,beyran,,,
4,boil up,,,


Unnamed: 0,name,page,summary,image_ref
0,alcamo wine,,,
1,bavarian beers,,,
2,bey's soup,,,
3,beyran,,,
4,boil up,,,
...,...,...,...,...
220,uyutma,,,
221,uyutma,,,
228,yuvalama soup,,,
229,yuvalama soup,,,


138 ['alcamo wine', 'bavarian beers', "bey's soup", 'beyran', 'boil up', 'boterkoek', 'budyń', 'burgossan cheese', 'carp soup', 'chacha', "cheese from l'alt urgell y la cerdanya", 'chistorramesta', 'cokodok', 'fried srhimp', 'gerbeaud cake', 'groninger cake', 'guaiwei', 'gyulai kolbász', 'hibiscus sabdariffa costanera', 'ikinaridango', 'imbuljuta tal qastan', 'i̇slim kebap', "jeppson's malört", 'kaisen don', 'keshi yena', 'kibi dango', 'kuzu kapama', 'ladro di colonnata', 'lomnice biscuits', 'mercimek köftesi', 'minced meat patties', 'misoltin', 'nisa cheese', 'old sour', 'pear of jumilla', 'pumpkin tortellini', 'saketini', 'sansho pepper', 'semizotu with yogurt', 'semmelwrap', 'spiesebraten', 'ternera asturiana', 'ternera gallega', 'tokiwado kaminariokoshi honpo co.,ltd.', 'turkish style semolina dessert', 'tutmaç soup', 'uyutma', 'yuvalama soup', 'éisleker ham', 'ahtapot salata', 'air sirap', 'arabaşı soup', 'aranese cream', 'barley based dishes', 'beyin salata', 'binyang dog leg dis

(8, 4)

Unnamed: 0,name,page,summary,image_ref
14,gala,https://fy.wikipedia.org/wiki/Frysk_Sjongers_Gala,It Frysk Sjongers Gala is in muzykfeest dêr't ...,
27,krûdkoeke,https://fy.wikipedia.org/wiki/Fryske_kr%C3%BBd...,Fryske krûdkoeke is in koeke dy't by de kofje ...,https://upload.wikimedia.org/wikipedia/fy/d/d2...
98,eba,https://fy.wikipedia.org/wiki/Slach_by_Laaksum,De Slach by Laaksum waard op 10 juny 1498 foch...,https://upload.wikimedia.org/wikipedia/commons...
99,ed,https://fy.wikipedia.org/wiki/Ed_Nijpels,Eduardes Hermannes Theresia Maria (Ed) Nijpels...,https://upload.wikimedia.org/wikipedia/commons...
103,gala,https://fy.wikipedia.org/wiki/Frysk_Sjongers_Gala,It Frysk Sjongers Gala is in muzykfeest dêr't ...,


In [73]:
list_with_issues_tr = get_wikipedia_page_summaries(list_with_issues_fy, 'taste_atlas_info_tr.jsonl',lang='tr')



  lis = BeautifulSoup(html).find_all('li')


Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 records to taste_atlas_info_tr.jsonl
Wrote 1 re

In [75]:
df_check_tr = pd.read_json('taste_atlas_info_tr.jsonl',lines=True)
df_check_tr.shape
df_check_tr.head()

df_check_tr[df_check_tr.duplicated(['name'],False)]

list_with_issues_tr = remove_duplicate_list_items(list(df_check_tr[df_check_tr['page'].isna()]['name']))
print(len(list_with_issues_tr),list_with_issues_tr)

list_with_issues_tr = [re.sub("q[0-9]+","",item.lower()) for item in list_with_issues_tr]
list_with_issues_tr = sorted([re.sub("\-", " ",item) for item in list_with_issues_tr])

df_check_tr.dropna(axis=0,how='all',subset=['page','summary','image_ref'],inplace=True)
df_check_tr.drop_duplicates(inplace=True)
df_check_tr.shape
df_check_tr.head()

df_check_tr.to_csv('df_check_tr.csv')

(364, 4)

Unnamed: 0,name,page,summary,image_ref
0,ahtapot salata,https://tr.wikipedia.org/wiki/Ahtapot_salatas%...,"Ahtapot salatası, Türk mutfağında (sh. 371) bi...",https://upload.wikimedia.org/wikipedia/commons...
1,air sirap,,,
2,alcamo wine,,,
3,alcamo wine,,,
4,arabaşı soup,,,


Unnamed: 0,name,page,summary,image_ref
0,ahtapot salata,https://tr.wikipedia.org/wiki/Ahtapot_salatas%...,"Ahtapot salatası, Türk mutfağında (sh. 371) bi...",https://upload.wikimedia.org/wikipedia/commons...
1,air sirap,,,
2,alcamo wine,,,
3,alcamo wine,,,
4,arabaşı soup,,,
...,...,...,...,...
359,yuvalama soup,,,
360,yüksük soup,,,
361,zambousies,,,
362,zeytinyağlı enginar,,,


129 ['air sirap', 'alcamo wine', 'arabaşı soup', 'aranese cream', 'barley based dishes', 'bavarian beers', "bey's soup", 'binyang dog leg dish', 'blood dishes', 'boil up', 'boterkoek', 'bouyiourdi', 'brassica dishes', 'breaded mushrooms', 'budyń', 'burgossan cheese', 'cachena cattle', 'calv steak adlon', 'carp soup', "cheese from l'alt urgell y la cerdanya", 'chilean pisco', 'chistorramesta', 'cider confit', 'cokodok', 'collard liquor', 'corn steak', 'culinary speciality', 'dairy drink', 'deram deram', 'dondonyaki', 'east friesland tea blend', 'egg–anchovy salad', 'fried srhimp', 'gerbeaud cake', 'groninger cake', 'guaiwei', 'gyulai kolbász', 'hibiscus sabdariffa costanera', 'ikinaridango', 'imbuljuta tal qastan', 'issen yōshoku', 'i̇slim kebap', "jeppson's malört", 'jugged dog meat', 'kabak kalye', 'kaisen don', 'keshi yena', 'kibi dango', 'kinugasadon', 'komanmelna', 'kurkkusalaatti', 'ladro di colonnata', 'lomnice biscuits', 'lëng viçi', 'malai laddu', 'mallorcan soup', 'marsala win

(10, 4)

Unnamed: 0,name,page,summary,image_ref
0,ahtapot salata,https://tr.wikipedia.org/wiki/Ahtapot_salatas%...,"Ahtapot salatası, Türk mutfağında (sh. 371) bi...",https://upload.wikimedia.org/wikipedia/commons...
13,beyin salata,https://tr.wikipedia.org/wiki/Sakatat,"Sakatat, kesimi yapılan hayvanların kasları dı...",https://upload.wikimedia.org/wikipedia/commons...
14,beyran,https://tr.wikipedia.org/wiki/Behram,"Behram (veya Bahram), Pers mitolojisinde gezeg...",https://upload.wikimedia.org/wikipedia/commons...
39,chacha,https://tr.wikipedia.org/wiki/Chacha_Saat_Kulesi,"Chacha Saat Kulesi, Batum'da bulunan eser. Mim...",https://upload.wikimedia.org/wikipedia/commons...
54,cracknel,https://tr.wikipedia.org/wiki/Peppermint_Crisp,"Peppermint Crisp, Nestlé tarafından üretilen ç...",https://upload.wikimedia.org/wikipedia/commons...


In [76]:
list_with_issues_id = get_wikipedia_page_summaries(list_with_issues_tr, 'taste_atlas_info_id.jsonl',lang='id')

Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 records to taste_atlas_info_id.jsonl
Wrote 1 re

In [77]:
df_check_id = pd.read_json('taste_atlas_info_id.jsonl',lines=True)
df_check_id.shape
df_check_id.head()

df_check_id[df_check_id.duplicated(['name'],False)]

list_with_issues_id = remove_duplicate_list_items(list(df_check_id[df_check_id['page'].isna()]['name']))
print(len(list_with_issues_id),list_with_issues_id)

list_with_issues_id = [re.sub("q[0-9]+","",item.lower()) for item in list_with_issues_id]
list_with_issues_id = sorted([re.sub("\-", " ",item) for item in list_with_issues_id])

df_check_id.dropna(axis=0,how='all',subset=['page','summary','image_ref'],inplace=True)
df_check_id.drop_duplicates(inplace=True)
df_check_id.shape
df_check_id.head()

df_check_id.to_csv('df_check_id.csv')

(129, 4)

Unnamed: 0,name,page,summary,image_ref
0,air sirap,https://id.wikipedia.org/wiki/Sirap_bandung,"Sirap bandung, atau air bandung adalah nama da...",https://upload.wikimedia.org/wikipedia/commons...
1,alcamo wine,,,
2,arabaşı soup,,,
3,aranese cream,,,
4,barley based dishes,,,


Unnamed: 0,name,page,summary,image_ref


118 ['alcamo wine', 'arabaşı soup', 'aranese cream', 'barley based dishes', 'bavarian beers', "bey's soup", 'binyang dog leg dish', 'blood dishes', 'boil up', 'boterkoek', 'bouyiourdi', 'brassica dishes', 'breaded mushrooms', 'burgossan cheese', 'cachena cattle', 'calv steak adlon', 'carp soup', "cheese from l'alt urgell y la cerdanya", 'chilean pisco', 'chistorramesta', 'cider confit', 'collard liquor', 'corn steak', 'culinary speciality', 'dairy drink', 'east friesland tea blend', 'egg–anchovy salad', 'fried srhimp', 'gerbeaud cake', 'groninger cake', 'guaiwei', 'gyulai kolbász', 'hibiscus sabdariffa costanera', 'ikinaridango', 'imbuljuta tal qastan', 'issen yōshoku', 'i̇slim kebap', "jeppson's malört", 'jugged dog meat', 'kabak kalye', 'kaisen don', 'keshi yena', 'kibi dango', 'kinugasadon', 'komanmelna', 'kurkkusalaatti', 'ladro di colonnata', 'lomnice biscuits', 'lëng viçi', 'malai laddu', 'mallorcan soup', 'marsala wine', 'masmouta salad', 'meal soup', 'minced meat patties', 'mis

(11, 4)

Unnamed: 0,name,page,summary,image_ref
0,air sirap,https://id.wikipedia.org/wiki/Sirap_bandung,"Sirap bandung, atau air bandung adalah nama da...",https://upload.wikimedia.org/wikipedia/commons...
14,budyń,https://id.wikipedia.org/wiki/Budaya,Budaya atau kebudayaan berasal dari bahasa San...,https://upload.wikimedia.org/wikipedia/commons...
23,cokodok,https://id.wikipedia.org/wiki/Cokodok,"Cokodok (terkadang dieja juga: Cekodok), juga ...",
28,deram deram,https://id.wikipedia.org/wiki/Deram-deram,Kue deram-deram merupakan kue khas dari Kepula...,https://upload.wikimedia.org/wikipedia/commons...
29,dondonyaki,https://id.wikipedia.org/wiki/Okonomiyaki,Okonomiyaki (お好み焼き) adalah makanan Jepang deng...,https://upload.wikimedia.org/wikipedia/commons...


In [78]:
list_with_issues_ms = get_wikipedia_page_summaries(list_with_issues_id, 'taste_atlas_info_ms.jsonl',lang='ms')

Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 records to taste_atlas_info_ms.jsonl
Wrote 1 re

In [0]:
def fix_dataframe(input_jsonl_file):
  df = pd.read_json(input_jsonl_file, lines=True)
  df.shape
  df.head()

  df[df.duplicated(['name'],False)]

  output_list = remove_duplicate_list_items(list(df[df['page'].isna()]['name']))
  print(len(output_list),output_list)

  output_list = [re.sub("q[0-9]+","",item.lower()) for item in output_list]
  output_list = sorted([re.sub("\-", " ",item) for item in output_list])

  df.dropna(axis=0,how='all',subset=['page','summary','image_ref'],inplace=True)
  df.drop_duplicates(inplace=True)
  df.shape
  df.head()

  return df




In [81]:
df_check_ms = fix_dataframe('taste_atlas_info_ms.jsonl')
df_check_ms.to_csv('df_check_ms.csv')

115 ['alcamo wine', 'arabaşı soup', 'aranese cream', 'barley based dishes', 'bavarian beers', "bey's soup", 'binyang dog leg dish', 'blood dishes', 'boil up', 'boterkoek', 'bouyiourdi', 'brassica dishes', 'breaded mushrooms', 'burgossan cheese', 'cachena cattle', 'calv steak adlon', 'carp soup', "cheese from l'alt urgell y la cerdanya", 'chistorramesta', 'cider confit', 'collard liquor', 'corn steak', 'culinary speciality', 'dairy drink', 'east friesland tea blend', 'egg–anchovy salad', 'fried srhimp', 'gerbeaud cake', 'groninger cake', 'gyulai kolbász', 'hibiscus sabdariffa costanera', 'ikinaridango', 'imbuljuta tal qastan', 'issen yōshoku', 'i̇slim kebap', "jeppson's malört", 'jugged dog meat', 'kabak kalye', 'kaisen don', 'keshi yena', 'kibi dango', 'kinugasadon', 'komanmelna', 'kurkkusalaatti', 'ladro di colonnata', 'lomnice biscuits', 'lëng viçi', 'malai laddu', 'mallorcan soup', 'marsala wine', 'masmouta salad', 'meal soup', 'minced meat patties', 'misoltin', 'miyan kuka', 'murci

In [82]:
df_check_ms.head()

Unnamed: 0,name,page,summary,image_ref
18,chilean pisco,https://ms.wikipedia.org/wiki/Senarai_bandar_d...,Ini adalah sebuah senarai bandar di Peru.\n\nA...,https://upload.wikimedia.org/wikipedia/commons...
30,guaiwei,https://ms.wikipedia.org/wiki/Masakan_Sichuan,"Masakan Sichuan, masakan Szechwan, atau masaka...",https://upload.wikimedia.org/wikipedia/commons...
100,thai dish,https://ms.wikipedia.org/wiki/Pad_Thai,"Pad thai, phat thai atau phad thai (Bahasa Tha...",https://upload.wikimedia.org/wikipedia/commons...


In [19]:
len(matched_list)

8025

In [115]:
len(list(df_consolidated['itemLabel']))

8026

In [120]:
cons_list = list(df_consolidated['itemLabel'])
full_list = sorted([re.sub("q[0-9]+","",item.lower()) for item in cons_list])
full_list = sorted([re.sub("\-", " ",item) for item in full_list])
full_list = remove_duplicate_list_items(full_list)
print(len(cons_list), len(full_list))

8026 6004


In [125]:
[item for item in full_list if item[0] == 'q']

['qamar al din',
 'qarta',
 'qatayef',
 'qedra',
 'qishr',
 'qottab',
 'quad city style pizza',
 'quarkkäulchen',
 'quartirolo lombardo',
 'quarts de chaume',
 'quatre épices',
 'quattro stagioni',
 'queen of puddings',
 'queens',
 'queijada',
 'queijo coalho',
 'queijo de azeitão',
 'queijo de cabra transmontano',
 'queijo do pico',
 'queijo mestiço de tolosa',
 'queijo rabaçal',
 'queijo terrincho',
 'quenelle',
 'quentão',
 'quesada pasiega',
 'quesadilla',
 'quesito',
 'queso camerano',
 'queso chihuahua',
 'queso flameado',
 'quetschentaart',
 'quibebe',
 'quiche',
 'quick fuck',
 'quince cheese',
 'quince dessert',
 'quincy aoc',
 'quindim',
 'qurut',
 'qurutob',
 'qutab',
 'quzi',
 'qvevri']

# pattern

In [50]:
full_list = list(itertools.chain.from_iterable(result_lists))
full_list.extend(taste_atlas_list)

full_list = sorted([item for item in full_list if not item[0].isdigit()])

full_counter = Counter(full_list)
full_list=[]
for key in full_counter.keys():
  if len(key) != 0:
    full_list.append(key)
print(len(full_list))

5996


In [126]:
major_list = []

for item in full_list:
  pattern_list = []
  pattern_dict = {}
  id_str = ''
  for word in item.split():
    token_dict = {}
    token_dict['LOWER'] = word
    pattern_list.append(token_dict)
    id_str += '_' + word 

  pattern_dict['label'] = 'dish'
  pattern_dict['pattern'] = pattern_list
  if id_str[0] == '_':
    id_str = id_str[1:]
  pattern_dict['id'] = id_str

  major_list.append(pattern_dict)

major_list


[{'id': "'mpanatigghi",
  'label': 'dish',
  'pattern': [{'LOWER': "'mpanatigghi"}]},
 {'id': "'nduja", 'label': 'dish', 'pattern': [{'LOWER': "'nduja"}]},
 {'id': "'ota_'ika",
  'label': 'dish',
  'pattern': [{'LOWER': "'ota"}, {'LOWER': "'ika"}]},
 {'id': '.amaro_ramazzotti',
  'label': 'dish',
  'pattern': [{'LOWER': '.amaro'}, {'LOWER': 'ramazzotti'}]},
 {'id': '15_bean_soup',
  'label': 'dish',
  'pattern': [{'LOWER': '15'}, {'LOWER': 'bean'}, {'LOWER': 'soup'}]},
 {'id': '1519_tequila',
  'label': 'dish',
  'pattern': [{'LOWER': '1519'}, {'LOWER': 'tequila'}]},
 {'id': '2007_vietnam_food_scare',
  'label': 'dish',
  'pattern': [{'LOWER': '2007'},
   {'LOWER': 'vietnam'},
   {'LOWER': 'food'},
   {'LOWER': 'scare'}]},
 {'id': '20th_century',
  'label': 'dish',
  'pattern': [{'LOWER': '20th'}, {'LOWER': 'century'}]},
 {'id': '3_a.m._vodka',
  'label': 'dish',
  'pattern': [{'LOWER': '3'}, {'LOWER': 'a.m.'}, {'LOWER': 'vodka'}]},
 {'id': '4_copas',
  'label': 'dish',
  'pattern': [{

In [0]:
dish_keyword_patterns = np.array(major_list)
np.savez("dish_keyword_patterns", dish_keyword_patterns)

In [129]:
ls

 CoherenceScore.csv          pyLDAvis_13.html
 datalist_corrected.csv      pyLDAvis_14.html
 df_check_de.csv             pyLDAvis_15.html
 df_check_en.csv             pyLDAvis_16.html
 df_check_es.csv             pyLDAvis_17.html
 df_check_fr.csv             pyLDAvis_18.html
 df_check_fy.csv             pyLDAvis_19.html
 df_check_id.csv             pyLDAvis_20.html
 df_check_it.csv             pyLDAvis_21.html
 df_check_ms.csv             pyLDAvis_22.html
 df_check_tr.csv             pyLDAvis_23.html
 df_coffee.csv               pyLDAvis_24.html
 df_consolidated.csv         pyLDAvis_25.html
 df_dish.csv                 pyLDAvis_26.html
 df_drink.csv                pyLDAvis_27.html
 df_hot_bev.csv              pyLDAvis_28.html
 df_noodle.csv               pyLDAvis_29.html
 df_rice.csv                 pyLDAvis_3.html
 df_salad.csv                pyLDAvis_4.html
 df_soup.csv                 pyLDAvis_5.html
 df_taste_atlas.csv          pyLDAvis_6.html
 dictionary                  pyLDAvis_

In [130]:
for item in major_list:
  dump_jsonl([item],'patterns.jsonl', append=True)

Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 records to patterns.jsonl
Wrote 1 record

In [60]:
df_consolidated.drop_duplicates(inplace=True)
df_consolidated.shape

(8025, 8)

In [0]:
non_duplicated_list = list(df_consolidated['itemLabel'])

In [63]:
wiki_info_list = []
items_with_issues_list = []
for item in non_duplicated_list:
  time.sleep(1)
  wiki_dict = {}
  try:
    wiki_dict['name'] = item
    wiki_dict['page'] = wikipedia.page(item).url
    wiki_dict['summary'] = wikipedia.summary(item)
    wiki_dict['image_ref'] = wikipedia.page(item).images[0]
  except:
    items_with_issues_list.append(item)
  wiki_info_list.append(wiki_dict)


    



  lis = BeautifulSoup(html).find_all('li')


In [65]:
wiki_info_list[0:10]

[{'image_ref': 'https://upload.wikimedia.org/wikipedia/commons/1/1d/%27Mpanatigghi.JPG',
  'name': "'Mpanatigghi",
  'page': 'https://en.wikipedia.org/wiki/Empanada',
  'summary': 'An empanada is a type of baked or fried turnover consisting of pastry and filling, common in Latin American and Filipino cultures. The name comes from the Spanish verb empanar, and literally translates as "enbreaded", that is, wrapped or coated in bread.  They are made by folding dough over a filling, which may consist of meat, cheese, corn, or other ingredients, and then cooking the resulting turnover, either by baking or frying.  \nThey resemble turnovers from many other cuisines and cultures, including the pasty from the British Isles, the samosa from the Central and South Asia, or the pirozhki from Russia.'},
 {'image_ref': 'https://upload.wikimedia.org/wikipedia/commons/0/04/%27Nduja.JPG',
  'name': "'Nduja",
  'page': 'https://en.wikipedia.org/wiki/%27Nduja',
  'summary': "'Nduja (Calabrian: [nˈduːja])

In [66]:
for item in wiki_info_list:
  dump_jsonl([item],'taste_atlas_info_en.jsonl', append=True)

Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 records to taste_atlas_info_en.jsonl
Wrote 1 re

In [68]:
len(items_with_issues_list)

1957

In [1]:
items_with_issues_list[-300:]

NameError: ignored

In [0]:
print(wikipedia.summary(taste_atlas_list[6]))
print(wikipedia.page(taste_atlas_list[6]).url)
print(wikipedia.page(taste_atlas_list[6]).content)


Aamras (or amras) is a sweet dish featuring in the cuisine of the Indian subcontinent and made from the pulp of the mango fruit. The pulp of a ripe mango is extracted, usually by hand, and is consumed together with pooris or chapati(Indian breads). Sometimes ghee and milk are added to the pulp to enhance its flavour. Sugar is also added to adjust the sweetness.It is often had at celebrations and weddings with cardamon and chopped fruits.
A regional version of aamras is a popular dessert in Rajasthani cuisine and Marwari, Marathi, and  Gujarati homes, especially during festivities.
Since the fruit is seasonal, being harvested at the end of summer, the need to preserve the fruit in the form of pulp has given rise to a moderately large mango-processing industry.
https://en.wikipedia.org/wiki/Aamras
Aamras (or amras) is a sweet dish featuring in the cuisine of the Indian subcontinent and made from the pulp of the mango fruit. The pulp of a ripe mango is extracted, usually by hand, and is c

In [0]:
df_taste_atlas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6079 entries, 0 to 6078
Data columns (total 3 columns):
subclass         6079 non-null object
subclassLabel    6079 non-null object
TasteAtlas_ID    6079 non-null object
dtypes: object(3)
memory usage: 142.6+ KB


In [0]:
df_taste_atlas.query('subclassLabel == "Bosna"')

Unnamed: 0,subclass,subclassLabel,TasteAtlas_ID
3323,http://www.wikidata.org/entity/Q358507,Bosna,bosna


In [0]:
wikipedia.set_lang("de")
wiki_info_list_de = []
items_with_issues_list_de = []
for item in items_with_issues_list:
  time.sleep(1)
  wiki_dict_de = {}
  try:
    wiki_dict_de['name'] = item
    wiki_dict_de['page'] = wikipedia.page(item).url
    wiki_dict_de['summary'] = wikipedia.summary(item)
    wiki_dict_de['image_ref'] = wikipedia.page(item).images[0]
  except:
    items_with_issues_list_de.append(item)
  wiki_info_list_de.append(wiki_dict_de)



  lis = BeautifulSoup(html).find_all('li')


In [0]:
for item in wiki_info_list_de:
  dump_jsonl([item],'taste_atlas_info_de.jsonl', append=True)

Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 records to taste_atlas_info_de.jsonl
Wrote 1 re

In [0]:
len(items_with_issues_list_de)

188

In [0]:
wikipedia.set_lang("it")
wiki_info_list_it = []
items_with_issues_list_it = []
for item in items_with_issues_list_de:
  time.sleep(1)
  wiki_dict_it = {}
  try:
    wiki_dict_it['name'] = item
    wiki_dict_it['page'] = wikipedia.page(item).url
    wiki_dict_it['summary'] = wikipedia.summary(item)
    wiki_dict_it['image_ref'] = wikipedia.page(item).images[0]
  except:
    items_with_issues_list_it.append(item)
  wiki_info_list_it.append(wiki_dict_it)



  lis = BeautifulSoup(html).find_all('li')


In [0]:
for item in wiki_info_list_it:
  dump_jsonl([item],'taste_atlas_info_it.jsonl', append=True)

Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 records to taste_atlas_info_it.jsonl
Wrote 1 re

In [0]:
len(items_with_issues_list_it)

117

In [0]:
wikipedia.set_lang("es")
wiki_info_list_es = []
items_with_issues_list_es = []
for item in items_with_issues_list_it:
  time.sleep(1)
  wiki_dict_es = {}
  try:
    wiki_dict_es['name'] = item
    wiki_dict_es['page'] = wikipedia.page(item).url
    wiki_dict_es['summary'] = wikipedia.summary(item)
    wiki_dict_es['image_ref'] = wikipedia.page(item).images[0]
  except:
    items_with_issues_list_es.append(item)
  wiki_info_list_es.append(wiki_dict_es)



  lis = BeautifulSoup(html).find_all('li')


In [0]:
for item in wiki_info_list_es:
  dump_jsonl([item],'taste_atlas_info_es.jsonl', append=True)

Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 records to taste_atlas_info_es.jsonl
Wrote 1 re

In [0]:
len(items_with_issues_list_es)

77

In [0]:
wikipedia.set_lang("fr")
wiki_info_list_fr = []
items_with_issues_list_fr = []
for item in items_with_issues_list_es:
  time.sleep(1)
  wiki_dict_fr = {}
  try:
    wiki_dict_fr['name'] = item
    wiki_dict_fr['page'] = wikipedia.page(item).url
    wiki_dict_fr['summary'] = wikipedia.summary(item)
    wiki_dict_fr['image_ref'] = wikipedia.page(item).images[0]
  except:
    items_with_issues_list_fr.append(item)
  wiki_info_list_fr.append(wiki_dict_fr)



  lis = BeautifulSoup(html).find_all('li')


In [0]:
for item in wiki_info_list_fr:
  dump_jsonl([item],'taste_atlas_info_fr.jsonl', append=True)

Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 records to taste_atlas_info_fr.jsonl
Wrote 1 re

In [0]:
len(items_with_issues_list_fr)

51

In [0]:
wikipedia.set_lang("fy")
wiki_info_list_fy = []
items_with_issues_list_fy = []
for item in items_with_issues_list_fr:
  time.sleep(1)
  wiki_dict_fy = {}
  try:
    wiki_dict_fy['name'] = item
    wiki_dict_fy['page'] = wikipedia.page(item).url
    wiki_dict_fy['summary'] = wikipedia.summary(item)
    wiki_dict_fy['image_ref'] = wikipedia.page(item).images[0]
  except:
    items_with_issues_list_fy.append(item)
  wiki_info_list_fy.append(wiki_dict_fy)

In [0]:
for item in wiki_info_list_fy:
  dump_jsonl([item],'taste_atlas_info_fy.jsonl', append=True)

Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 records to taste_atlas_info_fy.jsonl
Wrote 1 re

In [0]:
wiki_info_list_fy

[{'name': 'alcamo wine'},
 {'name': 'bavarian beers'},
 {'name': "bey's soup"},
 {'name': 'beyran'},
 {'name': 'boil up'},
 {'name': 'boterkoek'},
 {'name': 'budyń'},
 {'name': 'burgossan cheese'},
 {'name': 'carp soup'},
 {'name': 'chacha'},
 {'name': "cheese from l'alt urgell y la cerdanya"},
 {'name': 'chistorramesta'},
 {'name': 'cokodok'},
 {'name': 'fried srhimp'},
 {'name': 'gala',
  'page': 'https://fy.wikipedia.org/wiki/Frysk_Sjongers_Gala',
  'summary': "It Frysk Sjongers Gala is in muzykfeest dêr't it Fryske liet sintraal stiet. Dit muzykfestival wurdt oan de ein fan it jier holden yn De Lawei yn Drachten. It Gala, dat sûnt 2012 bestiet, wie in inisjatyf fan Piter Wilkens, Griet Wiersma, Anneke Douma en Gurbe Douwstra. It orkest bestiet út Fryske muzikanten, mei in eftergrûnkoar, in strykkwartet en in blazerssseksje. De muzikale lieding is yn hannen fan Peter van der Zwaag. Op it programma steane neist nije lieten ek medleys fan lieten út de ôfrûne desennia. It is in griemma

In [0]:
len(items_with_issues_list_fy)

50

In [0]:
df_json = pd.read_json('taste_atlas_info.jsonl', lines=True)
df_json

Unnamed: 0,name,page,summary,image_ref
0,'mpanatigghi,https://en.wikipedia.org/wiki/Empanada,An empanada is a type of baked or fried turnov...,https://upload.wikimedia.org/wikipedia/commons...
1,'nduja,https://en.wikipedia.org/wiki/%27Nduja,'Nduja (Calabrian: [nˈduːja]) is a particularl...,https://upload.wikimedia.org/wikipedia/commons...
2,'ota 'ika,https://en.wikipedia.org/wiki/%27Ota_%27ika,"'Ota ika is a Polynesian dish, similar to Lati...",https://upload.wikimedia.org/wikipedia/commons...
3,.amaro ramazzotti,https://en.wikipedia.org/wiki/Amaro_(liqueur),"Amaro (Italian for ""bitter"") is an Italian her...",https://upload.wikimedia.org/wikipedia/commons...
4,20th century,https://en.wikipedia.org/wiki/20th_Century_Fox,Twentieth Century Fox Film Corporation (colloq...,https://upload.wikimedia.org/wikipedia/commons...
...,...,...,...,...
5050,štramberk ears,https://en.wikipedia.org/wiki/%C5%A0tramberk_ears,Štramberk ears (Czech: Štramberské uši) is a M...,https://upload.wikimedia.org/wikipedia/commons...
5051,štruklji,https://en.wikipedia.org/wiki/%C5%A0truklji,"Štruklji are a traditional Slovene dish, compo...",https://upload.wikimedia.org/wikipedia/commons...
5052,żymlok,https://en.wikipedia.org/wiki/Kaszanka,Kaszanka is a traditional blood sausage in eas...,https://upload.wikimedia.org/wikipedia/commons...
5053,žemlovka,https://en.wikipedia.org/wiki/%C5%BDemlovka,Žemlovka (Moravian dialect: zemlbába) is a swe...,https://upload.wikimedia.org/wikipedia/commons...


In [0]:
df_json[['name','summary']].dropna().to_csv('test_summaries.csv')