<a href="https://colab.research.google.com/github/ravi-gopalan/DAND_Data_Wrangling/blob/master/city_list_wikidata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install modules

In [1]:
# Install sparqlwrapper for retrieving wikidata info, wikipedia and pymediawiki to retrieve data from wikipedia

!pip install sparqlwrapper
!pip install wikipedia
!pip install pymediawiki

Collecting sparqlwrapper
  Downloading https://files.pythonhosted.org/packages/00/9b/443fbe06996c080ee9c1f01b04e2f683b2b07e149905f33a2397ee3b80a2/SPARQLWrapper-1.8.5-py3-none-any.whl
Collecting rdflib>=4.0
[?25l  Downloading https://files.pythonhosted.org/packages/3c/fe/630bacb652680f6d481b9febbb3e2c3869194a1a5fc3401a4a41195a2f8f/rdflib-4.2.2-py3-none-any.whl (344kB)
[K     |████████████████████████████████| 348kB 5.7MB/s 
Collecting isodate
[?25l  Downloading https://files.pythonhosted.org/packages/9b/9f/b36f7774ff5ea8e428fdcfc4bb332c39ee5b9362ddd3d40d9516a55221b2/isodate-0.6.0-py2.py3-none-any.whl (45kB)
[K     |████████████████████████████████| 51kB 7.3MB/s 
Installing collected packages: isodate, rdflib, sparqlwrapper
Successfully installed isodate-0.6.0 rdflib-4.2.2 sparqlwrapper-1.8.5
Collecting wikipedia
  Downloading https://files.pythonhosted.org/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz
Building wheels for collected

## Import Libraries

In [0]:
# Import pandas, numpy, SPARQLWrapper, re, json, collections, time, itertools, requests, json_normalize, spacy, wikipedia, mediawiki and interactiveshell

import pandas as pd
import numpy as np
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import re
from collections import Counter
import time
import itertools

import requests
from pandas.io.json import json_normalize

import spacy

import wikipedia
from mediawiki import MediaWiki

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load helper functions

In [0]:
# get results from wikidata
def get_results(endpoint_url, query):
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()

    processed_results = json.load(result.response)
    cols = processed_results['head']['vars']

    out = []
    for row in processed_results['results']['bindings']:
      item = []
      for c in cols:
        item.append(row.get(c, {}).get('value'))
      out.append(item)
    return pd.DataFrame(out, columns = cols)


def dump_jsonl(data, output_path, append=False):
    """
    Write list of objects to a JSON lines file.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')
#    print('Wrote {} records to {}'.format(len(data), output_path))



def load_jsonl(input_path) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.rstrip('\n|\r')))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

def get_wikidata_id(col):
  rx = re.compile(r'(http\:\/\/www\.wikidata\.org\/entity\/)(Q[0-9]+)')
  m = rx.match(col)
  if m is not None:
    return m.group(2)

def get_wikidata_description(row):
  base_string = 'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&languages=en&format=json&props=descriptions&ids='
  query_string = base_string + row
#  print(query_string)
  r = requests.get(query_string)
  return json.loads(r.content.decode('utf-8'))['entities'][row]

def get_wikidata_label(row):
  base_string = 'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&languages=en&format=json&props=labels&ids='
  query_string = base_string + row
#  print(query_string)
  r = requests.get(query_string)
  return json.loads(r.content.decode('utf-8'))['entities'][row]

def check_Qcodes(x):
  step1 = re.sub("Q[0-9]+","UNKNOWN",x)
  return step1

def query_and_process_results(url, query_parameter):
  query = query_parameter
#  print(query)

  df_initial = get_results(url, query)
  print(df_initial.shape)
  print(df_initial.head())

  df_initial['city_check'] = df_initial['instance_ofLabel'].apply(lambda x: check_Qcodes(x))

  df = df_initial.query('city_check != "UNKNOWN"').reset_index(drop=True)
  print(df.shape)
  print(df.head())  

  res_list =  sorted([re.sub("q[0-9]+","",item.lower()) for item in df.groupby(['instance_ofLabel'])['instance_of'].agg('count').index.values.tolist()])
  res_list = sorted([re.sub("\-", " ",item) for item in res_list])
  res_list = [item for item in res_list if not re.findall("[0-9]+",item)]
  res_counter = Counter(res_list)
  res_list = []
  for key in res_counter.keys():
    if len(key) != 0:
      res_list.append(key)

  print(len(res_list), res_list)

  return df, res_list, res_counter



def clean_pat(x):
  step1 = re.sub("None\,","",x)
  step2 = re.sub("None","",step1)
  step2 = '{"label": "GPE_city", "pattern": ' + '['+ step2.strip()[:-1] + ']'
  return step2


def clean_id(x):
  step1 = re.sub("None","",x)
  step2 = ', "id": "' + re.sub("(\_)([\'\_a-zA-Z\u0080-\uFFFF]+)",r"\2",step1) + '"}'
  return step2

def get_city_summary(x):
  try:
    out = wiki.summary(x)
  except:
    out = 'error'
  return out

In [0]:

url = "https://query.wikidata.org/sparql"

query = '''
SELECT ?instance_of ?instance_ofLabel ?country ?countryLabel ?located_in_the_administrative_territorial_entity ?located_in_the_administrative_territorial_entityLabel WHERE 
{
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?instance_of wdt:P31 wd:Q515.
  OPTIONAL { ?instance_of wdt:P17 ?country. }
  OPTIONAL { ?instance_of wdt:P131 ?located_in_the_administrative_territorial_entity. }
}
'''
#  print(query)

df = get_results(url, query)

In [0]:
url = "https://query.wikidata.org/sparql"

query = '''
SELECT ?instance_of ?instance_ofLabel ?country ?countryLabel ?located_in_the_administrative_territorial_entity ?located_in_the_administrative_territorial_entityLabel WHERE 
{
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?instance_of wdt:P31 wd:Q515.
  OPTIONAL { ?instance_of wdt:P17 ?country. }
  OPTIONAL { ?instance_of wdt:P131 ?located_in_the_administrative_territorial_entity. }
}
'''

In [12]:
df, r_list, r_counter = query_and_process_results(url, query)

(10087, 6)
                            instance_of  ... located_in_the_administrative_territorial_entityLabel
0  http://www.wikidata.org/entity/Q1903  ...                       Metropolitan City of Catania   
1  http://www.wikidata.org/entity/Q1906  ...                                Province of Caserta   
2  http://www.wikidata.org/entity/Q1947  ...                                  Central Equatoria   
3  http://www.wikidata.org/entity/Q1960  ...                               Dar es Salaam Region   
4  http://www.wikidata.org/entity/Q1963  ...                                           Khartoum   

[5 rows x 6 columns]
(8778, 7)
                            instance_of  ...     city_check
0  http://www.wikidata.org/entity/Q1903  ...        Catania
1  http://www.wikidata.org/entity/Q1906  ...        Caserta
2  http://www.wikidata.org/entity/Q1947  ...           Juba
3  http://www.wikidata.org/entity/Q1960  ...  Dar es Salaam
4  http://www.wikidata.org/entity/Q1963  ...       Khartoum

[5

In [13]:
df.head()

Unnamed: 0,instance_of,instance_ofLabel,country,countryLabel,located_in_the_administrative_territorial_entity,located_in_the_administrative_territorial_entityLabel,city_check
0,http://www.wikidata.org/entity/Q1903,Catania,http://www.wikidata.org/entity/Q38,Italy,http://www.wikidata.org/entity/Q20991246,Metropolitan City of Catania,Catania
1,http://www.wikidata.org/entity/Q1906,Caserta,http://www.wikidata.org/entity/Q38,Italy,http://www.wikidata.org/entity/Q16153,Province of Caserta,Caserta
2,http://www.wikidata.org/entity/Q1947,Juba,http://www.wikidata.org/entity/Q958,South Sudan,http://www.wikidata.org/entity/Q487709,Central Equatoria,Juba
3,http://www.wikidata.org/entity/Q1960,Dar es Salaam,http://www.wikidata.org/entity/Q924,Tanzania,http://www.wikidata.org/entity/Q557539,Dar es Salaam Region,Dar es Salaam
4,http://www.wikidata.org/entity/Q1963,Khartoum,http://www.wikidata.org/entity/Q1049,Sudan,http://www.wikidata.org/entity/Q310385,Khartoum,Khartoum


In [14]:
df.columns = ['city_entity','city','country_entity','country','admin_entity','admin','city_check']
df.head()

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,city_check
0,http://www.wikidata.org/entity/Q1903,Catania,http://www.wikidata.org/entity/Q38,Italy,http://www.wikidata.org/entity/Q20991246,Metropolitan City of Catania,Catania
1,http://www.wikidata.org/entity/Q1906,Caserta,http://www.wikidata.org/entity/Q38,Italy,http://www.wikidata.org/entity/Q16153,Province of Caserta,Caserta
2,http://www.wikidata.org/entity/Q1947,Juba,http://www.wikidata.org/entity/Q958,South Sudan,http://www.wikidata.org/entity/Q487709,Central Equatoria,Juba
3,http://www.wikidata.org/entity/Q1960,Dar es Salaam,http://www.wikidata.org/entity/Q924,Tanzania,http://www.wikidata.org/entity/Q557539,Dar es Salaam Region,Dar es Salaam
4,http://www.wikidata.org/entity/Q1963,Khartoum,http://www.wikidata.org/entity/Q1049,Sudan,http://www.wikidata.org/entity/Q310385,Khartoum,Khartoum


In [0]:
df.to_csv('city_details.csv')

In [0]:
df['city'] = df['city'].apply(lambda x: x.lower())

In [18]:
df['admin'] = df['admin'].map(lambda x: x if type(x) != str else x.lower())
df.head()

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,city_check
0,http://www.wikidata.org/entity/Q1903,catania,http://www.wikidata.org/entity/Q38,Italy,http://www.wikidata.org/entity/Q20991246,metropolitan city of catania,Catania
1,http://www.wikidata.org/entity/Q1906,caserta,http://www.wikidata.org/entity/Q38,Italy,http://www.wikidata.org/entity/Q16153,province of caserta,Caserta
2,http://www.wikidata.org/entity/Q1947,juba,http://www.wikidata.org/entity/Q958,South Sudan,http://www.wikidata.org/entity/Q487709,central equatoria,Juba
3,http://www.wikidata.org/entity/Q1960,dar es salaam,http://www.wikidata.org/entity/Q924,Tanzania,http://www.wikidata.org/entity/Q557539,dar es salaam region,Dar es Salaam
4,http://www.wikidata.org/entity/Q1963,khartoum,http://www.wikidata.org/entity/Q1049,Sudan,http://www.wikidata.org/entity/Q310385,khartoum,Khartoum


In [19]:
df['country'] = df['country'].map(lambda x: x if type(x) != str else x.lower())
df.head()

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,city_check
0,http://www.wikidata.org/entity/Q1903,catania,http://www.wikidata.org/entity/Q38,italy,http://www.wikidata.org/entity/Q20991246,metropolitan city of catania,Catania
1,http://www.wikidata.org/entity/Q1906,caserta,http://www.wikidata.org/entity/Q38,italy,http://www.wikidata.org/entity/Q16153,province of caserta,Caserta
2,http://www.wikidata.org/entity/Q1947,juba,http://www.wikidata.org/entity/Q958,south sudan,http://www.wikidata.org/entity/Q487709,central equatoria,Juba
3,http://www.wikidata.org/entity/Q1960,dar es salaam,http://www.wikidata.org/entity/Q924,tanzania,http://www.wikidata.org/entity/Q557539,dar es salaam region,Dar es Salaam
4,http://www.wikidata.org/entity/Q1963,khartoum,http://www.wikidata.org/entity/Q1049,sudan,http://www.wikidata.org/entity/Q310385,khartoum,Khartoum


In [0]:
nlp = spacy.load('en_core_web_sm')

In [0]:
patterns = []
id_list = []
for item in list(df['city']):
  patterns.append(['{"LOWER": "'+ token.text + '"}' for token in nlp(item)])
  id_list.append(['_'+ token.text for token in nlp(item)])


In [23]:
print(len(patterns),len(id_list))

8778 8778


In [24]:
df_pats = pd.DataFrame(patterns)
df_pats.head()
df_ids = pd.DataFrame(id_list)
df_ids.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,"{""LOWER"": ""catania""}",,,,,,,
1,"{""LOWER"": ""caserta""}",,,,,,,
2,"{""LOWER"": ""juba""}",,,,,,,
3,"{""LOWER"": ""dar""}","{""LOWER"": ""es""}","{""LOWER"": ""salaam""}",,,,,
4,"{""LOWER"": ""khartoum""}",,,,,,,


Unnamed: 0,0,1,2,3,4,5,6,7
0,_catania,,,,,,,
1,_caserta,,,,,,,
2,_juba,,,,,,,
3,_dar,_es,_salaam,,,,,
4,_khartoum,,,,,,,


In [25]:
df_pats['combined'] = df_pats[0].astype('str') + ', ' + df_pats[1].astype('str') + ', ' + df_pats[2].astype('str') + ', ' + df_pats[3].astype('str')\
+ ', ' + df_pats[4].astype('str') + ', ' + df_pats[5].astype('str') + ', ' + df_pats[6].astype('str') + ', ' + df_pats[7].astype('str')
df_pats.drop(columns=[0,1,2,3,4,5,6,7],inplace=True)
df_pats['cleaned'] = df_pats['combined'].apply(lambda x: clean_pat(x))
df_pats.drop(columns=['combined'],inplace=True)
df_pats.columns=['cleaned_pattern']

df_pats.head()

df_ids['combined'] = df_ids[0].astype('str') + df_ids[1].astype('str') + df_ids[2].astype('str') + df_ids[3].astype('str')\
+ df_ids[4].astype('str') + df_ids[5].astype('str') + df_ids[6].astype('str') + df_ids[7].astype('str')
df_ids.drop(columns=[0,1,2,3,4,5,6,7],inplace=True)

df_ids['cleaned'] = df_ids['combined'].apply(lambda x: clean_id(x))
df_ids.drop(columns=['combined'],inplace=True)
df_ids.columns=['cleaned_id']
df_ids.head()

Unnamed: 0,cleaned_pattern
0,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c..."
1,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c..."
2,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""j..."
3,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""d..."
4,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k..."


Unnamed: 0,cleaned_id
0,", ""id"": ""catania""}"
1,", ""id"": ""caserta""}"
2,", ""id"": ""juba""}"
3,", ""id"": ""dar_es_salaam""}"
4,", ""id"": ""khartoum""}"


In [26]:
df_pat_ids = pd.concat([df_pats,df_ids],axis=1)
df_pat_ids['combined'] = df_pat_ids['cleaned_pattern'].astype('str') + df_pat_ids['cleaned_id'].astype('str')
df_pat_ids.info()
df_pat_ids.head()
df_pat_ids.tail()

df_pat_ids.drop(columns=['cleaned_pattern','cleaned_id'],inplace=True)
df_pat_ids.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8778 entries, 0 to 8777
Data columns (total 3 columns):
cleaned_pattern    8778 non-null object
cleaned_id         8778 non-null object
combined           8778 non-null object
dtypes: object(3)
memory usage: 205.9+ KB


Unnamed: 0,cleaned_pattern,cleaned_id,combined
0,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c...",", ""id"": ""catania""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c..."
1,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c...",", ""id"": ""caserta""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c..."
2,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""j...",", ""id"": ""juba""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""j..."
3,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""d...",", ""id"": ""dar_es_salaam""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""d..."
4,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k...",", ""id"": ""khartoum""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k..."


Unnamed: 0,cleaned_pattern,cleaned_id,combined
8773,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""r...",", ""id"": ""rumelange""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""r..."
8774,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a...",", ""id"": ""ayn_al_-basha""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a..."
8775,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""p...",", ""id"": ""pedro_vicente_maldonado""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""p..."
8776,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k...",", ""id"": ""kerkrade""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k..."
8777,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a...",", ""id"": ""al_-mashariqah""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a..."


Unnamed: 0,combined
0,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c..."
1,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c..."
2,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""j..."
3,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""d..."
4,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k..."


In [27]:
df = pd.concat([df,df_pat_ids],axis=1)
df.head()

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,city_check,combined
0,http://www.wikidata.org/entity/Q1903,catania,http://www.wikidata.org/entity/Q38,italy,http://www.wikidata.org/entity/Q20991246,metropolitan city of catania,Catania,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c..."
1,http://www.wikidata.org/entity/Q1906,caserta,http://www.wikidata.org/entity/Q38,italy,http://www.wikidata.org/entity/Q16153,province of caserta,Caserta,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c..."
2,http://www.wikidata.org/entity/Q1947,juba,http://www.wikidata.org/entity/Q958,south sudan,http://www.wikidata.org/entity/Q487709,central equatoria,Juba,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""j..."
3,http://www.wikidata.org/entity/Q1960,dar es salaam,http://www.wikidata.org/entity/Q924,tanzania,http://www.wikidata.org/entity/Q557539,dar es salaam region,Dar es Salaam,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""d..."
4,http://www.wikidata.org/entity/Q1963,khartoum,http://www.wikidata.org/entity/Q1049,sudan,http://www.wikidata.org/entity/Q310385,khartoum,Khartoum,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k..."


In [29]:
df['combined'][100]

'{"label": "GPE_city", "pattern": [{"LOWER": "new"}, {"LOWER": "york"}, {"LOWER": "city"}], "id": "new_york_city"}'

In [0]:
df.drop(columns=['city_check'],inplace=True)
df.to_csv('city_detail_w_pattern.csv',index=False)

In [0]:
dump_jsonl(list(df['combined']), 'city_patterns.jsonl')

In [0]:
df[['combined']].to_csv('city_patterns.csv',index=False)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8778 entries, 0 to 8777
Data columns (total 7 columns):
city_entity       8778 non-null object
city              8778 non-null object
country_entity    8694 non-null object
country           8694 non-null object
admin_entity      8171 non-null object
admin             8171 non-null object
combined          8778 non-null object
dtypes: object(7)
memory usage: 480.2+ KB


In [34]:
df.tail()

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,combined
8773,http://www.wikidata.org/entity/Q3917005,rumelange,http://www.wikidata.org/entity/Q32,luxembourg,,,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""r..."
8774,http://www.wikidata.org/entity/Q29014747,ayn al-basha,http://www.wikidata.org/entity/Q810,jordan,,,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a..."
8775,http://www.wikidata.org/entity/Q28883745,pedro vicente maldonado,http://www.wikidata.org/entity/Q736,ecuador,,,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""p..."
8776,http://www.wikidata.org/entity/Q28914824,kerkrade,http://www.wikidata.org/entity/Q55,netherlands,http://www.wikidata.org/entity/Q9796,kerkrade,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k..."
8777,http://www.wikidata.org/entity/Q29001009,al-mashariqah,http://www.wikidata.org/entity/Q810,jordan,,,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a..."


In [39]:
df.query('country == "afghanistan"')

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,combined
253,http://www.wikidata.org/entity/Q5838,kabul,http://www.wikidata.org/entity/Q889,afghanistan,http://www.wikidata.org/entity/Q6344428,kabul,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k..."
934,http://www.wikidata.org/entity/Q45313,herat,http://www.wikidata.org/entity/Q889,afghanistan,http://www.wikidata.org/entity/Q3696278,herat district,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""h..."
935,http://www.wikidata.org/entity/Q45604,kandahar,http://www.wikidata.org/entity/Q889,afghanistan,http://www.wikidata.org/entity/Q173808,kandahar,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k..."
1686,http://www.wikidata.org/entity/Q685808,asadabad,http://www.wikidata.org/entity/Q889,afghanistan,http://www.wikidata.org/entity/Q2663257,asadabad,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a..."
1798,http://www.wikidata.org/entity/Q214495,bamyan,http://www.wikidata.org/entity/Q889,afghanistan,http://www.wikidata.org/entity/Q171382,bamyan,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""b..."
2006,http://www.wikidata.org/entity/Q958621,chaghcharan,http://www.wikidata.org/entity/Q889,afghanistan,http://www.wikidata.org/entity/Q186392,ghōr,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c..."
2177,http://www.wikidata.org/entity/Q130469,mazar-i-sharif,http://www.wikidata.org/entity/Q889,afghanistan,http://www.wikidata.org/entity/Q121104,balkh,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""m..."
2178,http://www.wikidata.org/entity/Q130469,mazar-i-sharif,http://www.wikidata.org/entity/Q889,afghanistan,http://www.wikidata.org/entity/Q12497658,mazar-i-sharif,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""m..."
2394,http://www.wikidata.org/entity/Q732879,baghlan,http://www.wikidata.org/entity/Q889,afghanistan,http://www.wikidata.org/entity/Q170309,baghlan,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""b..."
3680,http://www.wikidata.org/entity/Q173731,ghazni,http://www.wikidata.org/entity/Q889,afghanistan,http://www.wikidata.org/entity/Q3694458,ghazni district,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""g..."


In [0]:
df.fillna(0,inplace=True)

In [36]:
df_admins = pd.read_csv('country_subdivisions_detail.csv')
df_admins.head()

Unnamed: 0,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code
0,AF,AF-BAL,balkh,Province,
1,AF,AF-BAM,bāmyān,Province,
2,AF,AF-BDG,bādghīs,Province,
3,AF,AF-BDS,badakhshān,Province,
4,AF,AF-BGL,baghlān,Province,


In [40]:
df_admins.head(40)

Unnamed: 0,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code
0,AF,AF-BAL,balkh,Province,
1,AF,AF-BAM,bāmyān,Province,
2,AF,AF-BDG,bādghīs,Province,
3,AF,AF-BDS,badakhshān,Province,
4,AF,AF-BGL,baghlān,Province,
5,AF,AF-DAY,dāykundī,Province,
6,AF,AF-FRA,farāh,Province,
7,AF,AF-FYB,fāryāb,Province,
8,AF,AF-GHA,ghaznī,Province,
9,AF,AF-GHO,ghōr,Province,


In [37]:
df_country = pd.read_csv('country_detail.csv')
df_country.head()

Unnamed: 0,country_alpha_2,country_alpha_3,country_name,country_numeric
0,AW,ABW,aruba,533
1,AF,AFG,afghanistan,4
2,AO,AGO,angola,24
3,AI,AIA,anguilla,660
4,AX,ALA,åland islands,248


In [134]:
df_country_admins = df_admins.merge(df_country,how='left',on='country_alpha_2')
df_country_admins.head()

Unnamed: 0,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code,country_alpha_3,country_name,country_numeric
0,AF,AF-BAL,balkh,Province,,AFG,afghanistan,4
1,AF,AF-BAM,bāmyān,Province,,AFG,afghanistan,4
2,AF,AF-BDG,bādghīs,Province,,AFG,afghanistan,4
3,AF,AF-BDS,badakhshān,Province,,AFG,afghanistan,4
4,AF,AF-BGL,baghlān,Province,,AFG,afghanistan,4


In [137]:
df_country_admins['sd_name'][4234] = "nan"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [138]:
df_country_admins[df_country_admins['country_alpha_3'] == "THA"][40:50]

Unnamed: 0,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code,country_alpha_3,country_name,country_numeric
4232,TH,TH-53,uttaradit,Province,,THA,thailand,764
4233,TH,TH-54,phrae,Province,,THA,thailand,764
4234,TH,TH-55,,Province,,THA,thailand,764
4235,TH,TH-56,phayao,Province,,THA,thailand,764
4236,TH,TH-57,chiang rai,Province,,THA,thailand,764
4237,TH,TH-58,mae hong son,Province,,THA,thailand,764
4238,TH,TH-60,nakhon sawan,Province,,THA,thailand,764
4239,TH,TH-61,uthai thani,Province,,THA,thailand,764
4240,TH,TH-62,kamphaeng phet,Province,,THA,thailand,764
4241,TH,TH-63,tak,Province,,THA,thailand,764


In [139]:
df_updated = df.merge(df_country_admins,how='left',left_on='admin',right_on='sd_name').reset_index(drop=True)
df_updated.head()

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,combined,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code,country_alpha_3,country_name,country_numeric
0,http://www.wikidata.org/entity/Q1903,catania,http://www.wikidata.org/entity/Q38,italy,http://www.wikidata.org/entity/Q20991246,metropolitan city of catania,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c...",,,,,,,,
1,http://www.wikidata.org/entity/Q1906,caserta,http://www.wikidata.org/entity/Q38,italy,http://www.wikidata.org/entity/Q16153,province of caserta,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c...",,,,,,,,
2,http://www.wikidata.org/entity/Q1947,juba,http://www.wikidata.org/entity/Q958,south sudan,http://www.wikidata.org/entity/Q487709,central equatoria,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""j...",SS,SS-EC,central equatoria,State,,SSD,south sudan,728.0
3,http://www.wikidata.org/entity/Q1960,dar es salaam,http://www.wikidata.org/entity/Q924,tanzania,http://www.wikidata.org/entity/Q557539,dar es salaam region,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""d...",,,,,,,,
4,http://www.wikidata.org/entity/Q1963,khartoum,http://www.wikidata.org/entity/Q1049,sudan,http://www.wikidata.org/entity/Q310385,khartoum,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k...",,,,,,,,


In [140]:
df_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8868 entries, 0 to 8867
Data columns (total 15 columns):
city_entity        8868 non-null object
city               8868 non-null object
country_entity     8868 non-null object
country            8868 non-null object
admin_entity       8868 non-null object
admin              8868 non-null object
combined           8868 non-null object
country_alpha_2    1412 non-null object
sd_code            1412 non-null object
sd_name            1412 non-null object
sd_type            1412 non-null object
sd_parent_code     121 non-null object
country_alpha_3    1412 non-null object
country_name       1412 non-null object
country_numeric    1412 non-null float64
dtypes: float64(1), object(14)
memory usage: 1.0+ MB


In [142]:
df_updated.query('city == "kerkrade"')

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,combined,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code,country_alpha_3,country_name,country_numeric
8866,http://www.wikidata.org/entity/Q28914824,kerkrade,http://www.wikidata.org/entity/Q55,netherlands,http://www.wikidata.org/entity/Q9796,kerkrade,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k...",,,,,,,,


In [44]:
grouped = df_updated.groupby(['country'])
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [66]:
r_list[:10]

["'s hertogenbosch",
 'a coruña',
 'aabenraa',
 'aasiaat',
 'aba',
 'abala',
 'abancay',
 'abaradira',
 'abasan al kabera',
 'abbeyleix']

In [63]:
list(grouped)[1][1]['city']

255              kabul
943              herat
944           kandahar
1706          asadabad
1818            bamyan
2026       chaghcharan
2199    mazar-i-sharif
2200    mazar-i-sharif
2416           baghlan
3719            ghazni
3887         sar-e pol
3945             qalat
4116            gardēz
4131            zaranj
4134       lashkar gah
4135      pul-e khomri
4140           taloqan
4373        sheberghan
5050         jalalabad
5150             farah
5712             kholm
6793          samangan
6962           andkhoy
7290         khan abad
7382             kishm
7540        euthydemia
7675     qalʻah-ye zāl
7747           sharana
7874           watapur
7944          shindand
Name: city, dtype: object

In [9]:
wikipedia.summary("Catania")

"Catania (UK: , US: , Sicilian and Italian: [kaˈtaːnja] (listen)) is the second largest city of Sicily after Palermo; it is located on the east coast facing the Ionian Sea. It is the capital of the Metropolitan City of Catania, one of the ten biggest cities in Italy, and the seventh largest metropolitan city in Italy. The population of the city proper is 311,584 while the population of the Metropolitan City of Catania is 1,107,702.Catania was destroyed by catastrophic earthquakes in 1169 and 1693, and by several volcanic eruptions from the neighbouring Mount Etna, the most violent of which was in 1669.Catania was founded in the 8th century BC by Chalcidians. In 1434, the first university in Sicily was founded in the city. In the 14th century and into the Renaissance period, Catania was one of Italy's most important cultural, artistic and political centres.The city is noted for its history, culture, architecture and gastronomy. Its old town, besides being one of the biggest examples of 

In [0]:
wiki = MediaWiki()

In [0]:
summaries = []
for item in r_list:
  summaries.append(get_city_summary(item))

In [70]:
len(r_list)

7602

In [69]:
len(summaries)

7602

In [71]:
summaries[:10]

["'s-Hertogenbosch (UK: , US: , Dutch: [ˌsɛrtoːɣə(m)ˈbɔs] (listen); French: Bois-le-Duc [bwa l(ə) dyk]), colloquially known as Den Bosch (IPA: [dɛm ˈbɔs] (listen)), is a city and municipality in the Netherlands with a population of 152,968. It is the capital of the province of North Brabant.",
 'A Coruña (Galician: [ɐ koˈɾuɲɐ]; historical English: Corunna) is a city and municipality of Galicia, Spain. It is the most populated city and the second most populated municipality in the autonomous community and seventeenth overall in the country. The city is the provincial capital of the province of the same name, having also served as political capital of the Kingdom of Galicia from the 16th to the 19th centuries, and as a regional administrative centre between 1833 and 1982, before being replaced by Santiago de Compostela.\nA Coruña is a busy port located on a promontory in the Golfo Ártabro, a large gulf on the Atlantic Ocean. It is the main industrial and financial centre of northern Gali

In [102]:
df_nlp = pd.DataFrame({'city_names':r_list,'wiki_summary_projected':summaries})
df_nlp.head()


Unnamed: 0,city_names,wiki_summary_projected
0,'s hertogenbosch,"'s-Hertogenbosch (UK: , US: , Dutch: [ˌsɛrtoːɣ..."
1,a coruña,A Coruña (Galician: [ɐ koˈɾuɲɐ]; historical En...
2,aabenraa,Aabenraa or Åbenrå (Danish pronunciation: [ɔːp...
3,aasiaat,"Aasiaat or Ausiait, formerly Egedesminde, is a..."
4,aba,ABBA (Swedish pronunciation: [ˇabːa]) is a Swe...


In [117]:
df_nlp.query('city_names == "kerkrade"')['wiki_summary_projected'][3295]

'Kerkrade (Kerkrade dialect: Kirchroa; German: Kerkrade or Kirchrath) is a town and a municipality in the southeast of Limburg, the southernmost province of the Netherlands. It forms part of the Parkstad Limburg agglomeration.\nKerkrade is the western half of a divided city; it was part of the German town of Herzogenrath until the Congress of Vienna in 1815 drew the current Dutch-German border and separated the towns. This means that the eastern end of the city marks the international border.\nThe two towns, including outlying suburban settlements, have a population approaching 100,000, of which nearly 47,000 are in Kerkrade.'

In [83]:
for item in nlp.pipe(df_nlp['wiki_summary_projected'][:10]):
  print([(ent.text.lower(), ent.label_) for ent in item.ents if ent.label_ == 'GPE'])



[('uk', 'GPE'), ('us', 'GPE'), ('netherlands', 'GPE'), ('north brabant', 'GPE')]
[('galicia', 'GPE'), ('spain', 'GPE'), ('the kingdom of galicia', 'GPE')]
[('southern denmark', 'GPE'), ('denmark', 'GPE'), ('germany', 'GPE'), ('flensburg', 'GPE'), ('south jutland county', 'GPE'), ('aabenraa municipality', 'GPE')]
[('ausiait', 'GPE'), ('greenland', 'GPE')]
[('stockholm', 'GPE'), ('sweden', 'GPE'), ('the united kingdom', 'GPE'), ('the united kingdom', 'GPE'), ('ireland', 'GPE'), ('canada', 'GPE'), ('australia', 'GPE'), ('new zealand', 'GPE'), ('south africa', 'GPE'), ('the united states', 'GPE'), ('uk', 'GPE')]
[]
[('quechua', 'GPE'), ('peru', 'GPE'), ('the abancay province', 'GPE')]
[('byzacena', 'GPE'), ('tunisia', 'GPE'), ('banja luka', 'GPE'), ('bosnia', 'GPE')]
[('gaza strip', 'GPE'), ('qudayh', 'GPE'), ('alshawaf', 'GPE'), ('al-daghmah', 'GPE')]
[('county laois', 'GPE'), ('ireland', 'GPE'), ('dublin', 'GPE')]


In [0]:
sum_entities = []
for item in nlp.pipe(df_nlp['wiki_summary_projected']):
  sum_entities.append([k  for k in Counter(sorted([(ent.text.lower(), ent.label_) for ent in item.ents if ent.label_ == 'GPE'])).keys()])


In [96]:
sum_entities[:20]

[[('netherlands', 'GPE'),
  ('north brabant', 'GPE'),
  ('uk', 'GPE'),
  ('us', 'GPE')],
 [('galicia', 'GPE'), ('spain', 'GPE'), ('the kingdom of galicia', 'GPE')],
 [('aabenraa municipality', 'GPE'),
  ('denmark', 'GPE'),
  ('flensburg', 'GPE'),
  ('germany', 'GPE'),
  ('south jutland county', 'GPE'),
  ('southern denmark', 'GPE')],
 [('ausiait', 'GPE'), ('greenland', 'GPE')],
 [('australia', 'GPE'),
  ('canada', 'GPE'),
  ('ireland', 'GPE'),
  ('new zealand', 'GPE'),
  ('south africa', 'GPE'),
  ('stockholm', 'GPE'),
  ('sweden', 'GPE'),
  ('the united kingdom', 'GPE'),
  ('the united states', 'GPE'),
  ('uk', 'GPE')],
 [],
 [('peru', 'GPE'), ('quechua', 'GPE'), ('the abancay province', 'GPE')],
 [('banja luka', 'GPE'),
  ('bosnia', 'GPE'),
  ('byzacena', 'GPE'),
  ('tunisia', 'GPE')],
 [('al-daghmah', 'GPE'),
  ('alshawaf', 'GPE'),
  ('gaza strip', 'GPE'),
  ('qudayh', 'GPE')],
 [('county laois', 'GPE'), ('dublin', 'GPE'), ('ireland', 'GPE')],
 [('abbotsford', 'GPE'),
  ('british co

In [103]:
df_nlp_entities = pd.concat([df_nlp,pd.DataFrame(sum_entities)],axis=1)
df_nlp_entities.head()

Unnamed: 0,city_names,wiki_summary_projected,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97
0,'s hertogenbosch,"'s-Hertogenbosch (UK: , US: , Dutch: [ˌsɛrtoːɣ...","(netherlands, GPE)","(north brabant, GPE)","(uk, GPE)","(us, GPE)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,a coruña,A Coruña (Galician: [ɐ koˈɾuɲɐ]; historical En...,"(galicia, GPE)","(spain, GPE)","(the kingdom of galicia, GPE)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,aabenraa,Aabenraa or Åbenrå (Danish pronunciation: [ɔːp...,"(aabenraa municipality, GPE)","(denmark, GPE)","(flensburg, GPE)","(germany, GPE)","(south jutland county, GPE)","(southern denmark, GPE)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,aasiaat,"Aasiaat or Ausiait, formerly Egedesminde, is a...","(ausiait, GPE)","(greenland, GPE)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,aba,ABBA (Swedish pronunciation: [ˇabːa]) is a Swe...,"(australia, GPE)","(canada, GPE)","(ireland, GPE)","(new zealand, GPE)","(south africa, GPE)","(stockholm, GPE)","(sweden, GPE)","(the united kingdom, GPE)","(the united states, GPE)","(uk, GPE)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [110]:
df_nlp_entities.tail()

Unnamed: 0,city_names,wiki_summary_projected,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97
7597,ștefan vodă,Ștefan Vodă is a city and the administrative c...,"(moldova, GPE)","(ștefan vodă district, GPE)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7598,ʿamrān,ʿAmrān (Arabic: عمران‎; Old South Arabian: 𐩲𐩣𐩧...,"(yemen, GPE)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7599,الجلفة,Djelfa (Arabic: ولاية الجلفة‎) is a province (...,"(algeria, GPE)","(djelfa, GPE)","(el khemis, GPE)","(tadmit, GPE)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7600,مأرب,"The Saudi-led intervention in Yemen, also call...","(bahrain, GPE)","(britain, GPE)","(djibouti, GPE)","(egypt, GPE)","(eritrea, GPE)","(iran, GPE)","(jordan, GPE)","(kuwait, GPE)","(morocco, GPE)","(qatar, GPE)","(saudi arabia, GPE)","(somalia, GPE)","(sudan, GPE)","(the united arab emirates, GPE)","(the united states, GPE)","(us, GPE)","(yemen, GPE)",,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7601,ḍera ismaīl k͟hān,Dera Ismail Khan (Urdu/Saraiki: ڈیرہ اسماعیل ...,"(khyber pakhtunkhwa province, GPE)","(multan, GPE)","(pakistan, GPE)","(peshawar, GPE)","(punjab, GPE)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [118]:
df_nlp_entities.query('city_names == "kerkrade"')

Unnamed: 0,city_names,wiki_summary_projected,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97
3295,kerkrade,Kerkrade (Kerkrade dialect: Kirchroa; German: ...,"(herzogenrath, GPE)","(kerkrade, GPE)","(limburg, GPE)","(netherlands, GPE)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [0]:
df_updated_2 = df_updated.merge(df_nlp_entities,how='left',left_on='city',right_on='city_names')

In [0]:
df_updated_2.to_csv('cities_updated_with_admin.csv',index=False)

In [144]:
df_updated_2.tail(4)

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,combined,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code,country_alpha_3,country_name,country_numeric,city_names,wiki_summary_projected,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,...,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97
8864,http://www.wikidata.org/entity/Q29014747,ayn al-basha,http://www.wikidata.org/entity/Q810,jordan,0,0,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8865,http://www.wikidata.org/entity/Q28883745,pedro vicente maldonado,http://www.wikidata.org/entity/Q736,ecuador,0,0,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""p...",,,,,,,,,pedro vicente maldonado,"Pedro Vicente Maldonado y Flores, (Riobamba, R...","(ecuador, GPE)","(england, GPE)","(london, GPE)",,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8866,http://www.wikidata.org/entity/Q28914824,kerkrade,http://www.wikidata.org/entity/Q55,netherlands,http://www.wikidata.org/entity/Q9796,kerkrade,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k...",,,,,,,,,kerkrade,Kerkrade (Kerkrade dialect: Kirchroa; German: ...,"(herzogenrath, GPE)","(kerkrade, GPE)","(limburg, GPE)","(netherlands, GPE)",,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8867,http://www.wikidata.org/entity/Q29001009,al-mashariqah,http://www.wikidata.org/entity/Q810,jordan,0,0,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [107]:
wiki.summary('franca')

"France (French: [fʁɑ̃s] (listen)), officially the French Republic (French: République française, pronounced [ʁepyblik fʁɑ̃sɛːz] (listen)), is a country whose territory consists of metropolitan France in Western Europe and several overseas regions and territories. The metropolitan area of France extends from the Mediterranean Sea to the English Channel and the North Sea, and from the Rhine to the Atlantic Ocean. It is bordered by Belgium, Luxembourg and Germany to the northeast, Switzerland and Italy to the east, and Andorra and Spain to the south. The overseas territories include French Guiana in South America and several islands in the Atlantic, Pacific and Indian oceans. The country's 18 integral regions (five of which are situated overseas) span a combined area of 643,801 square kilometres (248,573 sq mi) and a total population of 67.02 million (as of July 2019). France is a unitary semi-presidential republic with its capital in Paris, the country's largest city and main cultural a

In [108]:
sum_entities[:20]

[[('netherlands', 'GPE'),
  ('north brabant', 'GPE'),
  ('uk', 'GPE'),
  ('us', 'GPE')],
 [('galicia', 'GPE'), ('spain', 'GPE'), ('the kingdom of galicia', 'GPE')],
 [('aabenraa municipality', 'GPE'),
  ('denmark', 'GPE'),
  ('flensburg', 'GPE'),
  ('germany', 'GPE'),
  ('south jutland county', 'GPE'),
  ('southern denmark', 'GPE')],
 [('ausiait', 'GPE'), ('greenland', 'GPE')],
 [('australia', 'GPE'),
  ('canada', 'GPE'),
  ('ireland', 'GPE'),
  ('new zealand', 'GPE'),
  ('south africa', 'GPE'),
  ('stockholm', 'GPE'),
  ('sweden', 'GPE'),
  ('the united kingdom', 'GPE'),
  ('the united states', 'GPE'),
  ('uk', 'GPE')],
 [],
 [('peru', 'GPE'), ('quechua', 'GPE'), ('the abancay province', 'GPE')],
 [('banja luka', 'GPE'),
  ('bosnia', 'GPE'),
  ('byzacena', 'GPE'),
  ('tunisia', 'GPE')],
 [('al-daghmah', 'GPE'),
  ('alshawaf', 'GPE'),
  ('gaza strip', 'GPE'),
  ('qudayh', 'GPE')],
 [('county laois', 'GPE'), ('dublin', 'GPE'), ('ireland', 'GPE')],
 [('abbotsford', 'GPE'),
  ('british co

In [109]:
for item in sum_entities:
  if len(item) > 10:
    print(item)

[('abidjan', 'GPE'), ('cairo', 'GPE'), ('dar es salaam', 'GPE'), ('france', 'GPE'), ('ivory coast', 'GPE'), ('johannesburg', 'GPE'), ('kinshasa', 'GPE'), ('lagos', 'GPE'), ('the abidjan autonomous district', 'GPE'), ('west africa', 'GPE'), ('yamoussoukro', 'GPE')]
[('abong', 'GPE'), ('abong-mbang', 'GPE'), ('bakola', 'GPE'), ('bantu', 'GPE'), ('east province', 'GPE'), ("east province's", 'GPE'), ('ewondo', 'GPE'), ('germany', 'GPE'), ('great lakes', 'GPE'), ('mbang', 'GPE'), ('nyong', 'GPE'), ('the centre province', 'GPE'), ('ulmaceae', 'GPE')]
[('acapulco', 'GPE'), ('california', 'GPE'), ('guerrero', 'GPE'), ('hollywood', 'GPE'), ('manila', 'GPE'), ('mexico', 'GPE'), ('mexico city', 'GPE'), ('nahuatl', 'GPE'), ('panama', 'GPE'), ('philippines', 'GPE'), ('san francisco', 'GPE'), ('united states', 'GPE'), ('us', 'GPE'), ('zócalo', 'GPE')]
[('aguascalientes', 'GPE'), ('bajío', 'GPE'), ('flextronics', 'GPE'), ('jatco', 'GPE'), ('kansei', 'GPE'), ('mexico', 'GPE'), ('mexico city', 'GPE'), 