<a href="https://colab.research.google.com/github/ravi-gopalan/DAND_Data_Wrangling/blob/master/city_list_wikidata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install modules

In [1]:
# Install sparqlwrapper for retrieving wikidata info, wikipedia and pymediawiki to retrieve data from wikipedia

!pip install sparqlwrapper
!pip install wikipedia
!pip install pymediawiki

Collecting sparqlwrapper
  Downloading https://files.pythonhosted.org/packages/00/9b/443fbe06996c080ee9c1f01b04e2f683b2b07e149905f33a2397ee3b80a2/SPARQLWrapper-1.8.5-py3-none-any.whl
Collecting rdflib>=4.0
[?25l  Downloading https://files.pythonhosted.org/packages/3c/fe/630bacb652680f6d481b9febbb3e2c3869194a1a5fc3401a4a41195a2f8f/rdflib-4.2.2-py3-none-any.whl (344kB)
[K     |████████████████████████████████| 348kB 8.0MB/s 
Collecting isodate
[?25l  Downloading https://files.pythonhosted.org/packages/9b/9f/b36f7774ff5ea8e428fdcfc4bb332c39ee5b9362ddd3d40d9516a55221b2/isodate-0.6.0-py2.py3-none-any.whl (45kB)
[K     |████████████████████████████████| 51kB 8.0MB/s 
Installing collected packages: isodate, rdflib, sparqlwrapper
Successfully installed isodate-0.6.0 rdflib-4.2.2 sparqlwrapper-1.8.5
Collecting wikipedia
  Downloading https://files.pythonhosted.org/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz
Building wheels for collected

## Import Libraries

In [0]:
# Import pandas, numpy, SPARQLWrapper, re, json, collections, time, itertools, requests, json_normalize, spacy, wikipedia, mediawiki and interactiveshell

import pandas as pd
import numpy as np
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import re
from collections import Counter
import time
import itertools

import requests
from pandas.io.json import json_normalize

import spacy

import wikipedia
from mediawiki import MediaWiki

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load helper functions

In [0]:
# get results from wikidata
def get_results(endpoint_url, query):
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()

    processed_results = json.load(result.response)
    cols = processed_results['head']['vars']

    out = []
    for row in processed_results['results']['bindings']:
      item = []
      for c in cols:
        item.append(row.get(c, {}).get('value'))
      out.append(item)
    return pd.DataFrame(out, columns = cols)


def dump_jsonl(data, output_path, append=False):
    """
    Write list of objects to a JSON lines file.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')
#    print('Wrote {} records to {}'.format(len(data), output_path))



def load_jsonl(input_path) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.rstrip('\n|\r')))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

def get_wikidata_id(col):
  rx = re.compile(r'(http\:\/\/www\.wikidata\.org\/entity\/)(Q[0-9]+)')
  m = rx.match(col)
  if m is not None:
    return m.group(2)

def get_wikidata_description(row):
  base_string = 'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&languages=en&format=json&props=descriptions&ids='
  query_string = base_string + row
#  print(query_string)
  r = requests.get(query_string)
  return json.loads(r.content.decode('utf-8'))['entities'][row]

def get_wikidata_label(row):
  base_string = 'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&languages=en&format=json&props=labels&ids='
  query_string = base_string + row
#  print(query_string)
  r = requests.get(query_string)
  return json.loads(r.content.decode('utf-8'))['entities'][row]

def check_Qcodes(x):
  step1 = re.sub("Q[0-9]+","UNKNOWN",x)
  return step1

def query_and_process_results(url, query_parameter):
  query = query_parameter
#  print(query)

  df_initial = get_results(url, query)
  print(df_initial.shape)
  print(df_initial.head())

  df_initial['city_check'] = df_initial['instance_ofLabel'].apply(lambda x: check_Qcodes(x))

  df = df_initial.query('city_check != "UNKNOWN"').reset_index(drop=True)
  print(df.shape)
  print(df.head())  

  res_list =  sorted([re.sub("q[0-9]+","",item.lower()) for item in df.groupby(['instance_ofLabel'])['instance_of'].agg('count').index.values.tolist()])
  res_list = sorted([re.sub("\-", " ",item) for item in res_list])
  res_list = [item for item in res_list if not re.findall("[0-9]+",item)]
  res_counter = Counter(res_list)
  res_list = []
  for key in res_counter.keys():
    if len(key) != 0:
      res_list.append(key)

  print(len(res_list), res_list)

  return df, res_list, res_counter



def clean_pat(x):
  step1 = re.sub("None\,","",x)
  step2 = re.sub("None","",step1)
  step2 = '{"label": "GPE_city", "pattern": ' + '['+ step2.strip()[:-1] + ']'
  return step2


def clean_id(x):
  step1 = re.sub("None","",x)
  step2 = ', "id": "' + re.sub("(\_)([\'\_a-zA-Z\u0080-\uFFFF]+)",r"\2",step1) + '"}'
  return step2

def get_city_summary(x,n):
  try:
    out = wiki.summary(title=x,sentences=n)
  except:
    out = 'error'
  return out

In [0]:
url = "https://query.wikidata.org/sparql"

query = '''
SELECT ?instance_of ?instance_ofLabel ?country ?countryLabel ?located_in_the_administrative_territorial_entity ?located_in_the_administrative_territorial_entityLabel WHERE 
{
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?instance_of wdt:P31 wd:Q1637706.
  OPTIONAL { ?instance_of wdt:P17 ?country. }
  OPTIONAL { ?instance_of wdt:P131 ?located_in_the_administrative_territorial_entity. }
}
'''

In [62]:
df, r_list, r_counter = query_and_process_results(url, query)

(673, 6)
                          instance_of  ... located_in_the_administrative_territorial_entityLabel
0  http://www.wikidata.org/entity/Q60  ...                                           New York   
1  http://www.wikidata.org/entity/Q64  ...                                            Germany   
2  http://www.wikidata.org/entity/Q65  ...                                 Los Angeles County   
3  http://www.wikidata.org/entity/Q84  ...                                     Greater London   
4  http://www.wikidata.org/entity/Q87  ...                             Alexandria Governorate   

[5 rows x 6 columns]
(673, 7)
                          instance_of  ...     city_check
0  http://www.wikidata.org/entity/Q60  ...  New York City
1  http://www.wikidata.org/entity/Q64  ...         Berlin
2  http://www.wikidata.org/entity/Q65  ...    Los Angeles
3  http://www.wikidata.org/entity/Q84  ...         London
4  http://www.wikidata.org/entity/Q87  ...     Alexandria

[5 rows x 7 columns]
604 ['ab

In [63]:
df.head()

Unnamed: 0,instance_of,instance_ofLabel,country,countryLabel,located_in_the_administrative_territorial_entity,located_in_the_administrative_territorial_entityLabel,city_check
0,http://www.wikidata.org/entity/Q60,New York City,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q1384,New York,New York City
1,http://www.wikidata.org/entity/Q64,Berlin,http://www.wikidata.org/entity/Q183,Germany,http://www.wikidata.org/entity/Q183,Germany,Berlin
2,http://www.wikidata.org/entity/Q65,Los Angeles,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q104994,Los Angeles County,Los Angeles
3,http://www.wikidata.org/entity/Q84,London,http://www.wikidata.org/entity/Q145,United Kingdom,http://www.wikidata.org/entity/Q23306,Greater London,London
4,http://www.wikidata.org/entity/Q87,Alexandria,http://www.wikidata.org/entity/Q79,Egypt,http://www.wikidata.org/entity/Q29943,Alexandria Governorate,Alexandria


In [64]:
df.columns = ['city_entity','city','country_entity','country','admin_entity','admin','city_check']
df.head()

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,city_check
0,http://www.wikidata.org/entity/Q60,New York City,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q1384,New York,New York City
1,http://www.wikidata.org/entity/Q64,Berlin,http://www.wikidata.org/entity/Q183,Germany,http://www.wikidata.org/entity/Q183,Germany,Berlin
2,http://www.wikidata.org/entity/Q65,Los Angeles,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q104994,Los Angeles County,Los Angeles
3,http://www.wikidata.org/entity/Q84,London,http://www.wikidata.org/entity/Q145,United Kingdom,http://www.wikidata.org/entity/Q23306,Greater London,London
4,http://www.wikidata.org/entity/Q87,Alexandria,http://www.wikidata.org/entity/Q79,Egypt,http://www.wikidata.org/entity/Q29943,Alexandria Governorate,Alexandria


In [0]:
df.to_csv('city_details.csv')

In [0]:
df['city'] = df['city'].apply(lambda x: x.lower())

In [67]:
df['admin'] = df['admin'].map(lambda x: x if type(x) != str else x.lower())
df.head()

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,city_check
0,http://www.wikidata.org/entity/Q60,new york city,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q1384,new york,New York City
1,http://www.wikidata.org/entity/Q64,berlin,http://www.wikidata.org/entity/Q183,Germany,http://www.wikidata.org/entity/Q183,germany,Berlin
2,http://www.wikidata.org/entity/Q65,los angeles,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q104994,los angeles county,Los Angeles
3,http://www.wikidata.org/entity/Q84,london,http://www.wikidata.org/entity/Q145,United Kingdom,http://www.wikidata.org/entity/Q23306,greater london,London
4,http://www.wikidata.org/entity/Q87,alexandria,http://www.wikidata.org/entity/Q79,Egypt,http://www.wikidata.org/entity/Q29943,alexandria governorate,Alexandria


In [68]:
df['country'] = df['country'].map(lambda x: x if type(x) != str else x.lower())
df.head()

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,city_check
0,http://www.wikidata.org/entity/Q60,new york city,http://www.wikidata.org/entity/Q30,united states of america,http://www.wikidata.org/entity/Q1384,new york,New York City
1,http://www.wikidata.org/entity/Q64,berlin,http://www.wikidata.org/entity/Q183,germany,http://www.wikidata.org/entity/Q183,germany,Berlin
2,http://www.wikidata.org/entity/Q65,los angeles,http://www.wikidata.org/entity/Q30,united states of america,http://www.wikidata.org/entity/Q104994,los angeles county,Los Angeles
3,http://www.wikidata.org/entity/Q84,london,http://www.wikidata.org/entity/Q145,united kingdom,http://www.wikidata.org/entity/Q23306,greater london,London
4,http://www.wikidata.org/entity/Q87,alexandria,http://www.wikidata.org/entity/Q79,egypt,http://www.wikidata.org/entity/Q29943,alexandria governorate,Alexandria


In [0]:
nlp = spacy.load('en_core_web_sm')

In [0]:
patterns = []
id_list = []
for item in list(df['city']):
  patterns.append(['{"LOWER": "'+ token.text + '"}' for token in nlp(item)])
  id_list.append(['_'+ token.text for token in nlp(item)])


In [71]:
print(len(patterns),len(id_list))

673 673


In [72]:
df_pats = pd.DataFrame(patterns)
df_pats.head()
df_ids = pd.DataFrame(id_list)
df_ids.head()

Unnamed: 0,0,1,2,3,4
0,"{""LOWER"": ""new""}","{""LOWER"": ""york""}","{""LOWER"": ""city""}",,
1,"{""LOWER"": ""berlin""}",,,,
2,"{""LOWER"": ""los""}","{""LOWER"": ""angeles""}",,,
3,"{""LOWER"": ""london""}",,,,
4,"{""LOWER"": ""alexandria""}",,,,


Unnamed: 0,0,1,2,3,4
0,_new,_york,_city,,
1,_berlin,,,,
2,_los,_angeles,,,
3,_london,,,,
4,_alexandria,,,,


In [73]:
df_pats['combined'] = df_pats[0].astype('str') + ', ' + df_pats[1].astype('str') + ', ' + df_pats[2].astype('str') + ', ' + df_pats[3].astype('str')\
+ ', ' + df_pats[4].astype('str') #+ ', ' + df_pats[5].astype('str') + ', ' + df_pats[6].astype('str') + ', ' + df_pats[7].astype('str')
df_pats.drop(columns=[0,1,2,3,4],inplace=True)
df_pats['cleaned'] = df_pats['combined'].apply(lambda x: clean_pat(x))
df_pats.drop(columns=['combined'],inplace=True)
df_pats.columns=['cleaned_pattern']

df_pats.head()

df_ids['combined'] = df_ids[0].astype('str') + df_ids[1].astype('str') + df_ids[2].astype('str') + df_ids[3].astype('str')\
+ df_ids[4].astype('str') #+ df_ids[5].astype('str') + df_ids[6].astype('str') + df_ids[7].astype('str') + df_ids[8].astype('str')
df_ids.drop(columns=[0,1,2,3,4],inplace=True)

df_ids['cleaned'] = df_ids['combined'].apply(lambda x: clean_id(x))
df_ids.drop(columns=['combined'],inplace=True)
df_ids.columns=['cleaned_id']
df_ids.head()

Unnamed: 0,cleaned_pattern
0,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""n..."
1,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""b..."
2,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l..."
3,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l..."
4,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a..."


Unnamed: 0,cleaned_id
0,", ""id"": ""new_york_city""}"
1,", ""id"": ""berlin""}"
2,", ""id"": ""los_angeles""}"
3,", ""id"": ""london""}"
4,", ""id"": ""alexandria""}"


In [74]:
df_pat_ids = pd.concat([df_pats,df_ids],axis=1)
df_pat_ids['combined'] = df_pat_ids['cleaned_pattern'].astype('str') + df_pat_ids['cleaned_id'].astype('str')
df_pat_ids.info()
df_pat_ids.head()
df_pat_ids.tail()

df_pat_ids.drop(columns=['cleaned_pattern','cleaned_id'],inplace=True)
df_pat_ids.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 673 entries, 0 to 672
Data columns (total 3 columns):
cleaned_pattern    673 non-null object
cleaned_id         673 non-null object
combined           673 non-null object
dtypes: object(3)
memory usage: 15.9+ KB


Unnamed: 0,cleaned_pattern,cleaned_id,combined
0,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""n...",", ""id"": ""new_york_city""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""n..."
1,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""b...",", ""id"": ""berlin""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""b..."
2,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l...",", ""id"": ""los_angeles""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l..."
3,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l...",", ""id"": ""london""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l..."
4,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a...",", ""id"": ""alexandria""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a..."


Unnamed: 0,cleaned_pattern,cleaned_id,combined
668,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""b...",", ""id"": ""blantyre""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""b..."
669,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l...",", ""id"": ""león""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l..."
670,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""g...",", ""id"": ""guilin""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""g..."
671,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""h...",", ""id"": ""haikou""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""h..."
672,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""g...",", ""id"": ""guiyang""}","{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""g..."


Unnamed: 0,combined
0,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""n..."
1,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""b..."
2,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l..."
3,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l..."
4,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a..."


In [75]:
df = pd.concat([df,df_pat_ids],axis=1)
df.head()

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,city_check,combined
0,http://www.wikidata.org/entity/Q60,new york city,http://www.wikidata.org/entity/Q30,united states of america,http://www.wikidata.org/entity/Q1384,new york,New York City,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""n..."
1,http://www.wikidata.org/entity/Q64,berlin,http://www.wikidata.org/entity/Q183,germany,http://www.wikidata.org/entity/Q183,germany,Berlin,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""b..."
2,http://www.wikidata.org/entity/Q65,los angeles,http://www.wikidata.org/entity/Q30,united states of america,http://www.wikidata.org/entity/Q104994,los angeles county,Los Angeles,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l..."
3,http://www.wikidata.org/entity/Q84,london,http://www.wikidata.org/entity/Q145,united kingdom,http://www.wikidata.org/entity/Q23306,greater london,London,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l..."
4,http://www.wikidata.org/entity/Q87,alexandria,http://www.wikidata.org/entity/Q79,egypt,http://www.wikidata.org/entity/Q29943,alexandria governorate,Alexandria,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a..."


In [76]:
df['combined'][100]

'{"label": "GPE_city", "pattern": [{"LOWER": "brazzaville"}], "id": "brazzaville"}'

In [0]:
df.drop(columns=['city_check'],inplace=True)
df.to_csv('city_detail_w_pattern.csv',index=False)

In [0]:
dump_jsonl(list(df['combined']), 'city_patterns.jsonl')

In [0]:
df[['combined']].to_csv('city_patterns.csv',index=False)

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3719 entries, 0 to 3718
Data columns (total 7 columns):
city_entity       3719 non-null object
city              3719 non-null object
country_entity    3719 non-null object
country           3719 non-null object
admin_entity      3698 non-null object
admin             3698 non-null object
combined          3719 non-null object
dtypes: object(7)
memory usage: 203.5+ KB


In [80]:
df.tail()

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,combined
668,http://www.wikidata.org/entity/Q188693,blantyre,http://www.wikidata.org/entity/Q1020,malawi,http://www.wikidata.org/entity/Q1059262,blantyre district,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""b..."
669,http://www.wikidata.org/entity/Q189128,león,http://www.wikidata.org/entity/Q96,mexico,http://www.wikidata.org/entity/Q9022143,león municipality,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l..."
670,http://www.wikidata.org/entity/Q189633,guilin,http://www.wikidata.org/entity/Q148,people's republic of china,http://www.wikidata.org/entity/Q15176,guangxi zhuang autonomous region,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""g..."
671,http://www.wikidata.org/entity/Q189823,haikou,http://www.wikidata.org/entity/Q148,people's republic of china,http://www.wikidata.org/entity/Q42200,hainan,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""h..."
672,http://www.wikidata.org/entity/Q192271,guiyang,http://www.wikidata.org/entity/Q148,people's republic of china,http://www.wikidata.org/entity/Q47097,guizhou,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""g..."


In [81]:
df.query('country == "afghanistan"')

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,combined
118,http://www.wikidata.org/entity/Q5838,kabul,http://www.wikidata.org/entity/Q889,afghanistan,http://www.wikidata.org/entity/Q6344428,kabul,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""k..."


In [0]:
df.fillna(0,inplace=True)

In [83]:
df_admins = pd.read_csv('country_subdivisions_detail.csv')
df_admins.head()

Unnamed: 0,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code
0,AF,AF-BAL,balkh,Province,
1,AF,AF-BAM,bāmyān,Province,
2,AF,AF-BDG,bādghīs,Province,
3,AF,AF-BDS,badakhshān,Province,
4,AF,AF-BGL,baghlān,Province,


In [84]:
df_country = pd.read_csv('country_detail.csv')
df_country.head()

Unnamed: 0,country_alpha_2,country_alpha_3,country_name,country_numeric
0,AW,ABW,aruba,533
1,AF,AFG,afghanistan,4
2,AO,AGO,angola,24
3,AI,AIA,anguilla,660
4,AX,ALA,åland islands,248


In [85]:
df_country_admins = df_admins.merge(df_country,how='left',on='country_alpha_2')
df_country_admins.head()

Unnamed: 0,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code,country_alpha_3,country_name,country_numeric
0,AF,AF-BAL,balkh,Province,,AFG,afghanistan,4
1,AF,AF-BAM,bāmyān,Province,,AFG,afghanistan,4
2,AF,AF-BDG,bādghīs,Province,,AFG,afghanistan,4
3,AF,AF-BDS,badakhshān,Province,,AFG,afghanistan,4
4,AF,AF-BGL,baghlān,Province,,AFG,afghanistan,4


In [86]:
df_country_admins.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4844 entries, 0 to 4843
Data columns (total 8 columns):
country_alpha_2    4831 non-null object
sd_code            4844 non-null object
sd_name            4843 non-null object
sd_type            4844 non-null object
sd_parent_code     1315 non-null object
country_alpha_3    4844 non-null object
country_name       4844 non-null object
country_numeric    4844 non-null int64
dtypes: int64(1), object(7)
memory usage: 340.6+ KB


In [87]:
df_country_admins['sd_name'][4234] = "nan"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [38]:
df_country_admins[df_country_admins['country_alpha_3'] == "THA"][40:50]

Unnamed: 0,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code,country_alpha_3,country_name,country_numeric
4232,TH,TH-53,uttaradit,Province,,THA,thailand,764
4233,TH,TH-54,phrae,Province,,THA,thailand,764
4234,TH,TH-55,,Province,,THA,thailand,764
4235,TH,TH-56,phayao,Province,,THA,thailand,764
4236,TH,TH-57,chiang rai,Province,,THA,thailand,764
4237,TH,TH-58,mae hong son,Province,,THA,thailand,764
4238,TH,TH-60,nakhon sawan,Province,,THA,thailand,764
4239,TH,TH-61,uthai thani,Province,,THA,thailand,764
4240,TH,TH-62,kamphaeng phet,Province,,THA,thailand,764
4241,TH,TH-63,tak,Province,,THA,thailand,764


In [88]:
df_updated = df.merge(df_country_admins,how='left',left_on='admin',right_on='sd_name').reset_index(drop=True)
df_updated.head()

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,combined,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code,country_alpha_3,country_name,country_numeric
0,http://www.wikidata.org/entity/Q60,new york city,http://www.wikidata.org/entity/Q30,united states of america,http://www.wikidata.org/entity/Q1384,new york,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""n...",US,US-NY,new york,State,,USA,united states,840.0
1,http://www.wikidata.org/entity/Q64,berlin,http://www.wikidata.org/entity/Q183,germany,http://www.wikidata.org/entity/Q183,germany,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""b...",,,,,,,,
2,http://www.wikidata.org/entity/Q65,los angeles,http://www.wikidata.org/entity/Q30,united states of america,http://www.wikidata.org/entity/Q104994,los angeles county,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l...",,,,,,,,
3,http://www.wikidata.org/entity/Q84,london,http://www.wikidata.org/entity/Q145,united kingdom,http://www.wikidata.org/entity/Q23306,greater london,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l...",,,,,,,,
4,http://www.wikidata.org/entity/Q87,alexandria,http://www.wikidata.org/entity/Q79,egypt,http://www.wikidata.org/entity/Q29943,alexandria governorate,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""a...",,,,,,,,


In [89]:
df_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 678 entries, 0 to 677
Data columns (total 15 columns):
city_entity        678 non-null object
city               678 non-null object
country_entity     678 non-null object
country            678 non-null object
admin_entity       678 non-null object
admin              678 non-null object
combined           678 non-null object
country_alpha_2    59 non-null object
sd_code            59 non-null object
sd_name            59 non-null object
sd_type            59 non-null object
sd_parent_code     5 non-null object
country_alpha_3    59 non-null object
country_name       59 non-null object
country_numeric    59 non-null float64
dtypes: float64(1), object(14)
memory usage: 79.6+ KB


In [42]:
df_updated.query('city == "coquimbo"')

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,combined,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code,country_alpha_3,country_name,country_numeric
4,http://www.wikidata.org/entity/Q3871,coquimbo,http://www.wikidata.org/entity/Q298,chile,http://www.wikidata.org/entity/Q23660214,coquimbo,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""c...",CL,CL-CO,coquimbo,Region,,CHL,chile,152.0


In [44]:
grouped = df_updated.groupby(['country'])
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [43]:
r_list[:10]

["'s hertogenbosch",
 'a coruña',
 'aachen',
 'aalborg',
 'aarhus',
 'aba',
 'abadan',
 'abakan',
 'abbotsford',
 'abbottabad']

In [63]:
list(grouped)[1][1]['city']

255              kabul
943              herat
944           kandahar
1706          asadabad
1818            bamyan
2026       chaghcharan
2199    mazar-i-sharif
2200    mazar-i-sharif
2416           baghlan
3719            ghazni
3887         sar-e pol
3945             qalat
4116            gardēz
4131            zaranj
4134       lashkar gah
4135      pul-e khomri
4140           taloqan
4373        sheberghan
5050         jalalabad
5150             farah
5712             kholm
6793          samangan
6962           andkhoy
7290         khan abad
7382             kishm
7540        euthydemia
7675     qalʻah-ye zāl
7747           sharana
7874           watapur
7944          shindand
Name: city, dtype: object

In [0]:
wiki = MediaWiki()

In [45]:
wiki.summary(title='Catania',sentences=3)

'Catania (UK: , US: , Sicilian and Italian: [kaˈtaːnja] (listen)) is the second largest city of Sicily after Palermo; it is located on the east coast facing the Ionian Sea. It is the capital of the Metropolitan City of Catania, one of the ten biggest cities in Italy, and the seventh largest metropolitan city in Italy. The population of the city proper is 311,584 while the population of the Metropolitan City of Catania is 1,107,702.Catania was destroyed by catastrophic earthquakes in 1169 and 1693, and by several volcanic eruptions from the neighbouring Mount Etna, the most violent of which was in 1669.Catania was founded in the 8th century BC by Chalcidians.'

In [46]:
wikipedia.summary("Catania")

"Catania (UK: , US: , Sicilian and Italian: [kaˈtaːnja] (listen)) is the second largest city of Sicily after Palermo; it is located on the east coast facing the Ionian Sea. It is the capital of the Metropolitan City of Catania, one of the ten biggest cities in Italy, and the seventh largest metropolitan city in Italy. The population of the city proper is 311,584 while the population of the Metropolitan City of Catania is 1,107,702.Catania was destroyed by catastrophic earthquakes in 1169 and 1693, and by several volcanic eruptions from the neighbouring Mount Etna, the most violent of which was in 1669.Catania was founded in the 8th century BC by Chalcidians. In 1434, the first university in Sicily was founded in the city. In the 14th century and into the Renaissance period, Catania was one of Italy's most important cultural, artistic and political centres.The city is noted for its history, culture, architecture and gastronomy. Its old town, besides being one of the biggest examples of 

In [0]:
summaries = []
for item in r_list:
  summaries.append(get_city_summary(item,3))

In [92]:
len(r_list)

604

In [93]:
len(summaries)

604

In [94]:
summaries[:10]

["Abidjan ( AB-ih-JAHN, French: [abidʒɑ̃]) is the economic capital of Ivory Coast and one of the most populous French-speaking cities in Africa. According to the 2014 census, Abidjan's population was 4.7 million, which is 20 percent of the overall population of the country, and this also makes it the sixth most populous city proper in Africa, after Lagos, Cairo, Kinshasa, Dar es Salaam, and Johannesburg. A cultural crossroads of West Africa, Abidjan is characterised by a high level of industrialisation and urbanisation.",
 'Accra  is the capital of Ghana covering an area of 225.67 km2 (87.13 sq mi) with an estimated urban population of 2.27 million as of 2012. It is organized into 12 local government districts – 11 municipal districts and the Accra Metropolitan District, which is the only district within the capital to be granted city status. "Accra" usually refers to the Accra Metropolitan Area, which serves as the capital of Ghana, while the district which is within the jurisdiction 

In [95]:
df_nlp = pd.DataFrame({'city_names':r_list,'wiki_summary_projected':summaries})
df_nlp.head()


Unnamed: 0,city_names,wiki_summary_projected
0,abidjan,"Abidjan ( AB-ih-JAHN, French: [abidʒɑ̃]) is th..."
1,accra,Accra is the capital of Ghana covering an are...
2,adana,Adana (pronounced [aˈda.na]; Armenian: Ադանա; ...
3,addis ababa,"Addis Ababa (Amharic: አዲስ አበባ, Addis Abäba IPA..."
4,agra,Agra ( (listen)) is a city on the banks of the...


In [117]:
df_nlp.query('city_names == "kerkrade"')['wiki_summary_projected'][3295]

'Kerkrade (Kerkrade dialect: Kirchroa; German: Kerkrade or Kirchrath) is a town and a municipality in the southeast of Limburg, the southernmost province of the Netherlands. It forms part of the Parkstad Limburg agglomeration.\nKerkrade is the western half of a divided city; it was part of the German town of Herzogenrath until the Congress of Vienna in 1815 drew the current Dutch-German border and separated the towns. This means that the eastern end of the city marks the international border.\nThe two towns, including outlying suburban settlements, have a population approaching 100,000, of which nearly 47,000 are in Kerkrade.'

In [96]:
for item in nlp.pipe(df_nlp['wiki_summary_projected'][:10]):
  print([(ent.text.lower(), ent.label_) for ent in item.ents if ent.label_ == 'GPE'])



[('ivory coast', 'GPE'), ('abidjan', 'GPE'), ('lagos', 'GPE'), ('cairo', 'GPE'), ('kinshasa', 'GPE'), ('dar es salaam', 'GPE'), ('johannesburg', 'GPE'), ('west africa', 'GPE'), ('abidjan', 'GPE')]
[('ghana', 'GPE'), ('ghana', 'GPE')]
[('turkey', 'GPE'), ('anatolia', 'GPE'), ('the adana province', 'GPE'), ('turkey', 'GPE')]
[('addis ababa', 'GPE'), ('ethiopia', 'GPE'), ('addis ababa', 'GPE'), ('oromia', 'GPE')]
[('new delhi', 'GPE'), ('india', 'GPE')]
[('gujarati', 'GPE'), ('gujarat', 'GPE'), ('ahmadabad', 'GPE'), ('india', 'GPE'), ('india', 'GPE')]
[('iran', 'GPE'), ('khuzestan', 'GPE'), ('bakhtiaris', 'GPE'), ('dezfulis', 'GPE'), ('shushtaris', 'GPE')]
[("saudi arabia's", 'GPE'), ('eastern province', 'GPE'), ('hofuf', 'GPE'), ('classical arabic', 'GPE')]
[('beeʼeldííl dahsinil', 'GPE'), ('u.s.', 'GPE'), ('new mexico', 'GPE'), ('the united states', 'GPE'), ('the duke city', 'GPE'), ('la villa de alburquerque', 'GPE'), ('pueblo', 'GPE'), ('los ranchos', 'GPE')]
[('syria', 'GPE'), ('syri

In [0]:
sum_entities = []
for item in nlp.pipe(df_nlp['wiki_summary_projected']):
  sum_entities.append([k  for k in Counter(sorted([(ent.text.lower(), ent.label_) for ent in item.ents if ent.label_ in ['GPE','NORP']])).keys()])


In [98]:
sum_entities[:20]

[[('abidjan', 'GPE'),
  ('cairo', 'GPE'),
  ('dar es salaam', 'GPE'),
  ('french', 'NORP'),
  ('ivory coast', 'GPE'),
  ('johannesburg', 'GPE'),
  ('kinshasa', 'GPE'),
  ('lagos', 'GPE'),
  ('west africa', 'GPE')],
 [('ghana', 'GPE')],
 [('anatolia', 'GPE'),
  ('armenian', 'NORP'),
  ('the adana province', 'GPE'),
  ('turkey', 'GPE')],
 [('addis ababa', 'GPE'), ('ethiopia', 'GPE'), ('oromia', 'GPE')],
 [('india', 'GPE'),
  ('indian', 'NORP'),
  ('new delhi', 'GPE'),
  ('uttar pradesh', 'NORP')],
 [('ahmadabad', 'GPE'),
  ('gujarat', 'GPE'),
  ('gujarati', 'GPE'),
  ('india', 'GPE'),
  ('indian', 'NORP')],
 [('arabic', 'NORP'),
  ('arabs', 'NORP'),
  ('bakhtiaris', 'GPE'),
  ('dezfulis', 'GPE'),
  ('iran', 'GPE'),
  ('khuzestan', 'GPE'),
  ('persian', 'NORP'),
  ('persians', 'NORP'),
  ('sheybani', 'NORP'),
  ('shushtaris', 'GPE')],
 [('arabic', 'NORP'),
  ('classical arabic', 'GPE'),
  ('eastern province', 'GPE'),
  ('hofuf', 'GPE'),
  ("saudi arabia's", 'GPE')],
 [('beeʼeldííl dahsini

In [99]:
df_nlp_entities = pd.concat([df_nlp,pd.DataFrame(sum_entities)],axis=1)
df_nlp_entities.head()

Unnamed: 0,city_names,wiki_summary_projected,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,abidjan,"Abidjan ( AB-ih-JAHN, French: [abidʒɑ̃]) is th...","(abidjan, GPE)","(cairo, GPE)","(dar es salaam, GPE)","(french, NORP)","(ivory coast, GPE)","(johannesburg, GPE)","(kinshasa, GPE)","(lagos, GPE)","(west africa, GPE)",,,,,,
1,accra,Accra is the capital of Ghana covering an are...,"(ghana, GPE)",,,,,,,,,,,,,,
2,adana,Adana (pronounced [aˈda.na]; Armenian: Ադանա; ...,"(anatolia, GPE)","(armenian, NORP)","(the adana province, GPE)","(turkey, GPE)",,,,,,,,,,,
3,addis ababa,"Addis Ababa (Amharic: አዲስ አበባ, Addis Abäba IPA...","(addis ababa, GPE)","(ethiopia, GPE)","(oromia, GPE)",,,,,,,,,,,,
4,agra,Agra ( (listen)) is a city on the banks of the...,"(india, GPE)","(indian, NORP)","(new delhi, GPE)","(uttar pradesh, NORP)",,,,,,,,,,,


In [100]:
df_nlp_entities.tail()

Unnamed: 0,city_names,wiki_summary_projected,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
599,zoucheng,Zoucheng (simplified Chinese: 邹城; traditional ...,"(china, GPE)","(chinese, NORP)","(jining, GPE)","(shandong province, GPE)","(zou county, GPE)","(zouxian, NORP)",,,,,,,,,
600,zunyi,Zunyi (simplified Chinese: 遵义; traditional Chi...,"(bozhou, GPE)","(chinese, NORP)","(chongqing, GPE)","(guiyang, GPE)","(guizhou, GPE)","(honghuagang, GPE)","(huichuan, GPE)","(people's republic of china, GPE)","(sichuan, GPE)",,,,,,
601,ürümqi,"Ürümqi (UK: , US: ; Uyghur pronunciation: [ʏrʏ...","(china, GPE)","(chinese, NORP)","(the people's republic of china, GPE)","(the xinjiang uygur autonomous region, GPE)","(uk, GPE)","(us, GPE)","(ürümqi, GPE)",,,,,,,,
602,ōsaka,"Osaka (Japanese: 大阪市, Hepburn: Ōsaka-shi, pron...","(japan, GPE)","(japanese, NORP)","(osaka, GPE)","(osaka prefecture, GPE)",,,,,,,,,,,
603,şanlıurfa,Şanlıurfa Province (Turkish: Şanlıurfa ili) or...,"(turkey, GPE)","(turkish, NORP)","(urfa province, GPE)","(şanlıurfa, GPE)","(şanlıurfa province, GPE)",,,,,,,,,,


In [118]:
df_nlp_entities.query('city_names == "kerkrade"')

Unnamed: 0,city_names,wiki_summary_projected,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97
3295,kerkrade,Kerkrade (Kerkrade dialect: Kirchroa; German: ...,"(herzogenrath, GPE)","(kerkrade, GPE)","(limburg, GPE)","(netherlands, GPE)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [0]:
df_updated_2 = df_updated.merge(df_nlp_entities,how='left',left_on='city',right_on='city_names')

In [0]:
df_updated_2.to_csv('cities_updated_with_admin.csv',index=False)

In [103]:
df_updated_2.tail(4)

Unnamed: 0,city_entity,city,country_entity,country,admin_entity,admin,combined,country_alpha_2,sd_code,sd_name,sd_type,sd_parent_code,country_alpha_3,country_name,country_numeric,city_names,wiki_summary_projected,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
674,http://www.wikidata.org/entity/Q189128,león,http://www.wikidata.org/entity/Q96,mexico,http://www.wikidata.org/entity/Q9022143,león municipality,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""l...",,,,,,,,,león,error,,,,,,,,,,,,,,,
675,http://www.wikidata.org/entity/Q189633,guilin,http://www.wikidata.org/entity/Q148,people's republic of china,http://www.wikidata.org/entity/Q15176,guangxi zhuang autonomous region,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""g...",,,,,,,,,guilin,Guilin (Standard Zhuang: Gveilinz; alternately...,"(china, GPE)","(guangxi zhuang autonomous region, GPE)","(hunan, GPE)",,,,,,,,,,,,
676,http://www.wikidata.org/entity/Q189823,haikou,http://www.wikidata.org/entity/Q148,people's republic of china,http://www.wikidata.org/entity/Q42200,hainan,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""h...",,,,,,,,,haikou,Haikou (Chinese: 海口; pinyin: Hǎikǒu) is the ca...,"(chinese, NORP)","(haikou, GPE)","(hainan, GPE)",,,,,,,,,,,,
677,http://www.wikidata.org/entity/Q192271,guiyang,http://www.wikidata.org/entity/Q148,people's republic of china,http://www.wikidata.org/entity/Q47097,guizhou,"{""label"": ""GPE_city"", ""pattern"": [{""LOWER"": ""g...",,,,,,,,,guiyang,Guiyang is the capital of Guizhou province of ...,"(guizhou province, GPE)",,,,,,,,,,,,,,


In [107]:
wiki.summary('franca')

"France (French: [fʁɑ̃s] (listen)), officially the French Republic (French: République française, pronounced [ʁepyblik fʁɑ̃sɛːz] (listen)), is a country whose territory consists of metropolitan France in Western Europe and several overseas regions and territories. The metropolitan area of France extends from the Mediterranean Sea to the English Channel and the North Sea, and from the Rhine to the Atlantic Ocean. It is bordered by Belgium, Luxembourg and Germany to the northeast, Switzerland and Italy to the east, and Andorra and Spain to the south. The overseas territories include French Guiana in South America and several islands in the Atlantic, Pacific and Indian oceans. The country's 18 integral regions (five of which are situated overseas) span a combined area of 643,801 square kilometres (248,573 sq mi) and a total population of 67.02 million (as of July 2019). France is a unitary semi-presidential republic with its capital in Paris, the country's largest city and main cultural a

In [108]:
sum_entities[:20]

[[('netherlands', 'GPE'),
  ('north brabant', 'GPE'),
  ('uk', 'GPE'),
  ('us', 'GPE')],
 [('galicia', 'GPE'), ('spain', 'GPE'), ('the kingdom of galicia', 'GPE')],
 [('aabenraa municipality', 'GPE'),
  ('denmark', 'GPE'),
  ('flensburg', 'GPE'),
  ('germany', 'GPE'),
  ('south jutland county', 'GPE'),
  ('southern denmark', 'GPE')],
 [('ausiait', 'GPE'), ('greenland', 'GPE')],
 [('australia', 'GPE'),
  ('canada', 'GPE'),
  ('ireland', 'GPE'),
  ('new zealand', 'GPE'),
  ('south africa', 'GPE'),
  ('stockholm', 'GPE'),
  ('sweden', 'GPE'),
  ('the united kingdom', 'GPE'),
  ('the united states', 'GPE'),
  ('uk', 'GPE')],
 [],
 [('peru', 'GPE'), ('quechua', 'GPE'), ('the abancay province', 'GPE')],
 [('banja luka', 'GPE'),
  ('bosnia', 'GPE'),
  ('byzacena', 'GPE'),
  ('tunisia', 'GPE')],
 [('al-daghmah', 'GPE'),
  ('alshawaf', 'GPE'),
  ('gaza strip', 'GPE'),
  ('qudayh', 'GPE')],
 [('county laois', 'GPE'), ('dublin', 'GPE'), ('ireland', 'GPE')],
 [('abbotsford', 'GPE'),
  ('british co

In [60]:
for item in sum_entities:
  if len(item) > 10:
    print(item)

[('alans', 'NORP'), ('aryan', 'NORP'), ('caucasian', 'NORP'), ('chinese', 'NORP'), ('iranian', 'NORP'), ('latin', 'NORP'), ('massagetae', 'GPE'), ('roman', 'NORP'), ('sarmatians', 'NORP'), ('the central asian', 'NORP'), ('the parthian empire', 'GPE'), ('the roman empire', 'GPE')]
[('arabs', 'NORP'), ('assyrians', 'NORP'), ('babylonians', 'NORP'), ('greek', 'NORP'), ('hasmoneans', 'NORP'), ('hebrew', 'NORP'), ('israel', 'GPE'), ('mamluks', 'GPE'), ('persians', 'NORP'), ('phoenicians', 'NORP'), ('romans', 'NORP'), ('tel aviv', 'GPE'), ('the ancient egyptians', 'NORP'), ('the gaza strip', 'GPE'), ('אַשְׁקְלוֹן', 'GPE')]
[('banya', 'GPE'), ('bucharest', 'GPE'), ('german', 'NORP'), ('hungarian', 'NORP'), ('hungary', 'GPE'), ('igniș', 'NORP'), ('latin', 'NORP'), ('maramureș county', 'GPE'), ('romania', 'GPE'), ('romanian', 'NORP'), ('ukraine', 'GPE'), ('us', 'GPE'), ('yiddish', 'NORP'), ('באניע\u200e', 'NORP')]
[('bolognese', 'NORP'), ('bononia', 'GPE'), ('bulåggna', 'NORP'), ('emilia-romagn