## World Cities

source: [世界の百万都市の位置データ Location Data of Megacities - ASTI アマノ技研](https://amano-tec.com/data/megacities.html)

In [1]:
import json
import re
from collections import defaultdict

import pandas as pd
import jaconv

In [2]:
# The export is tab-separated even though the file name ends in .csv.
df = pd.read_csv(
    "./asti-dats59r03wm/s59r03megacities_utf8.csv",
    delimiter="\t",
)
# Transpose the preview so that each column is shown as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
country_code,AZ,AF,US,US,US
name_jp,„Ç¢„Çº„É´„Éê„Ç§„Ç∏„É£„É≥ÂÖ±ÂíåÂõΩ,„Ç¢„Éï„Ç¨„Éã„Çπ„Çø„É≥„Éª„Ç§„Çπ„É©„É†ÂÖ±ÂíåÂõΩ,„Ç¢„É°„É™„Ç´ÂêàË°ÜÂõΩ,„Ç¢„É°„É™„Ç´ÂêàË°ÜÂõΩ,„Ç¢„É°„É™„Ç´ÂêàË°ÜÂõΩ
name_jps,„Ç¢„Çº„É´„Éê„Ç§„Ç∏„É£„É≥,„Ç¢„Éï„Ç¨„Éã„Çπ„Çø„É≥,„Ç¢„É°„É™„Ç´ÔºàÁ±≥ÂõΩÔºâ,„Ç¢„É°„É™„Ç´ÔºàÁ±≥ÂõΩÔºâ,„Ç¢„É°„É™„Ç´ÔºàÁ±≥ÂõΩÔºâ
name_en,Republic of Azerbaijan,Islamic Republic of Afghanistan,United States of America,United States of America,United States of America
name_ens,Azerbaijan,Afghanistan,United States,United States,United States
capital_jp,„Éê„ÇØ„Éº,„Ç´„Éñ„Éº„É´,„Çµ„É≥„Ç¢„É≥„Éà„Éã„Ç™,„Çµ„É≥„Éá„Ç£„Ç®„Ç¥,„Ç∑„Ç´„Ç¥
capital_en,Baku,Kabul,San Antonio,San Diego,Chicago
iscapital,1,1,0,0,0
lat,40.36704,34.521096,29.424587,32.721944,41.883823
lon,49.832039,69.173672,-98.495145,-117.171918,-87.632078


In [3]:
def remove_bracket(name):
    """Return ``name`` without its trailing full-width parenthetical.

    A name that contains no full-width opening bracket is returned unchanged.
    Otherwise the name must have the form ``<base>(<extra>)`` written with
    full-width brackets, and ``<base>`` is returned.

    Raises:
        AssertionError: if the name contains more than one opening bracket or
            does not end with the closing bracket. The offending name is the
            assertion message.
    """
    # Spelled as escapes so the check does not depend on the file's encoding.
    open_bracket, close_bracket = "\uff08", "\uff09"  # full-width ( and )

    if open_bracket not in name:
        return name

    parts = name.split(open_bracket)
    assert len(parts) == 2, name
    # endswith() also covers an empty tail, where indexing [-1] would raise
    # IndexError instead of the intended assertion.
    assert parts[1].endswith(close_bracket), name
    return parts[0]

In [4]:
# Regexes used by name2reading to classify a name. Each matches a string made
# up entirely of characters from one Unicode range:
# U+3040-U+309F (Hiragana block) and U+30A0-U+30FF (Katakana block).
# The strings are raw, so the \u escapes are resolved by the `re` module.
hiragana_pattern = r'^[\u3040-\u309F]+$'
katakana_pattern = r'^[\u30A0-\u30FF]+$'

# Hand-maintained name -> reading table. name2reading consults it as the last
# resort, for names that are neither all-hiragana nor all-katakana and carry no
# parenthesised reading, and asserts that every such name has an entry here.
reading_dict = {
    "Êù±‰∫¨": "„Å®„ÅÜ„Åç„Çá„ÅÜ",
    "Â§ßÈò™": "„Åä„Åä„Åï„Åã",
    "Â∑ùÂ¥é": "„Åã„Çè„Åï„Åç",
    "‰∫¨ÈÉΩ": "„Åç„Çá„ÅÜ„Å®",
    "Á•ûÊà∏": "„Åì„ÅÜ„Åπ",
    "Êú≠Âπå": "„Åï„Å£„ÅΩ„Çç",
    "‰ªôÂè∞": "„Åõ„Çì„Å†„ÅÑ",
    "ÂêçÂè§Â±ã": "„Å™„Åî„ÇÑ",
    "Â∫ÉÂ≥∂": "„Å≤„Çç„Åó„Åæ",
    "Á¶èÂ≤°": "„Åµ„Åè„Åä„Åã",
    "Ê®™Êµú": "„Çà„Åì„ÅØ„Åæ"
}

def name2reading(name):
    """Return the reading for a city name.

    Resolution order:
      1. a name matching ``hiragana_pattern`` is returned unchanged;
      2. a name matching ``katakana_pattern`` is converted with
         ``jaconv.kata2hira``;
      3. a name of the form ``<name>(<reading>)`` written with full-width
         brackets uses the bracketed part, passed through ``jaconv.kata2hira``;
      4. any other name is looked up in ``reading_dict``.

    Raises:
        AssertionError: if a bracketed name is malformed (more than one
            opening bracket, or no closing bracket at the end), or if the name
            falls through to step 4 and has no ``reading_dict`` entry. The
            offending name is the assertion message.
    """
    if re.match(hiragana_pattern, name):
        return name
    if re.match(katakana_pattern, name):
        return jaconv.kata2hira(name)

    # Neither all-hiragana nor all-katakana from here on.
    # Brackets are spelled as escapes so the check does not depend on the
    # file's encoding.
    open_bracket, close_bracket = "\uff08", "\uff09"  # full-width ( and )
    if open_bracket in name:
        parts = name.split(open_bracket)
        assert len(parts) == 2, name
        # endswith() also covers an empty tail, where indexing [-1] would
        # raise IndexError instead of the intended assertion.
        assert parts[1].endswith(close_bracket), name
        bracketed = parts[1][:-1]  # drop the closing bracket
        return jaconv.kata2hira(bracketed)

    assert name in reading_dict, name
    return reading_dict[name]

In [5]:
# Ignore trailing long-vowel marks and map a final small kana to its full-size form
def get_last_char(reading):
    """Return the character stored as the ``last`` shiritori key of a reading.

    Trailing long-vowel marks (U+30FC) are ignored. If the remaining final
    character is a small kana it is replaced by its full-size counterpart;
    any other final character is returned as is.

    Raises:
        ValueError: if nothing is left once trailing long-vowel marks are
            removed (empty reading, or a reading made only of such marks).
    """
    # Written as \u escapes, like the kana ranges of the regex patterns in this
    # notebook, so the table does not depend on the file's encoding.
    long_vowel_mark = "\u30fc"
    small_to_full = {
        "\u3041": "\u3042",  # small a   -> a
        "\u3043": "\u3044",  # small i   -> i
        "\u3045": "\u3046",  # small u   -> u
        "\u3047": "\u3048",  # small e   -> e
        "\u3049": "\u304a",  # small o   -> o
        "\u3063": "\u3064",  # small tsu -> tsu
        "\u3083": "\u3084",  # small ya  -> ya
        "\u3085": "\u3086",  # small yu  -> yu
        "\u3087": "\u3088",  # small yo  -> yo
        "\u308e": "\u308f",  # small wa  -> wa
    }

    stem = reading.rstrip(long_vowel_mark)
    if not stem:
        raise ValueError("no kana left in reading: %r" % (reading,))
    return small_to_full.get(stem[-1], stem[-1])

In [6]:
cities = []

for _, row in df.iterrows():
    record = row.to_dict()
    city_name = record["capital_jp"]

    # Skip this special case; the plain "Damascus" row is included separately.
    if city_name == "„ÉÄ„Éû„Çπ„Ç´„ÇπÈÉäÂ§ñ":
        continue

    # Use the spelling without the leading character for this one city.
    if city_name == "„Ç¶„É≥„Ç∏„É£„É°„Éä":
        city_name = "„É≥„Ç∏„É£„É°„Éä"

    reading = name2reading(city_name)

    cities.append(
        {
            "name": remove_bracket(city_name),
            "reading": reading,
            "country": record["name_jp"],
            "population": record["pop"],
            # GeoJSON-style order: longitude first, then latitude.
            "coordinates": [record["lon"], record["lat"]],
            "shiritori": {
                "first": reading[0],
                "last": get_last_char(reading),
            },
        }
    )

# Show the entry whose spelling was replaced above.
[entry for entry in cities if entry["name"] == "„É≥„Ç∏„É£„É°„Éä"]

[{'name': '„É≥„Ç∏„É£„É°„Éä',
  'reading': '„Çì„Åò„ÇÉ„ÇÅ„Å™',
  'country': '„ÉÅ„É£„ÉâÂÖ±ÂíåÂõΩ',
  'population': 1521882,
  'coordinates': [15.0448322, 12.1052915],
  'shiritori': {'first': '„Çì', 'last': '„Å™'}}]

### dead ends

In [7]:
# Index every city twice: by the kana its reading starts with and by the kana
# it ends on.
shiritori_first_dict, shiritori_last_dict = defaultdict(list), defaultdict(list)

for city in cities:
    edges = city["shiritori"]
    shiritori_first_dict[edges["first"]].append(city)
    shiritori_last_dict[edges["last"]].append(city)

In [8]:
# Kana that end some city's reading but start none.
shiritori_last_dict.keys() - shiritori_first_dict.keys()

{'„Å•'}

In [9]:
# Cities whose reading ends on the dead-end kana reported by the previous cell.
shiritori_last_dict["„Å•"]

[{'name': 'ËèèÊ≤¢',
  'reading': '„Åª„Éº„Å•„Éº',
  'country': '‰∏≠ËèØ‰∫∫Ê∞ëÂÖ±ÂíåÂõΩ',
  'population': 1280031,
  'coordinates': [115.4738228, 35.2348208],
  'shiritori': {'first': '„Åª', 'last': '„Å•'}}]

In [10]:
# Cities already filed under the kana that a later cell assigns to the dead-end city.
shiritori_last_dict["„Åö"]

[{'name': '„Ç¢„Éï„É¥„Ç°„Éº„Ç∫',
  'reading': '„ÅÇ„Åµ„Çî„ÅÅ„Éº„Åö',
  'country': '„Ç§„É©„É≥„Éª„Ç§„Çπ„É©„É†ÂÖ±ÂíåÂõΩ',
  'population': 1184788,
  'coordinates': [48.6629412, 31.3204333],
  'shiritori': {'first': '„ÅÇ', 'last': '„Åö'}},
 {'name': '„Ç∑„Éº„É©„Éº„Ç∫',
  'reading': '„Åó„Éº„Çâ„Éº„Åö',
  'country': '„Ç§„É©„É≥„Éª„Ç§„Çπ„É©„É†ÂÖ±ÂíåÂõΩ',
  'population': 1565572,
  'coordinates': [52.5429486, 29.6182001],
  'shiritori': {'first': '„Åó', 'last': '„Åö'}},
 {'name': '„Çø„Éñ„É™„Éº„Ç∫',
  'reading': '„Åü„Å∂„Çä„Éº„Åö',
  'country': '„Ç§„É©„É≥„Éª„Ç§„Çπ„É©„É†ÂÖ±ÂíåÂõΩ',
  'population': 1558693,
  'coordinates': [46.2956786, 38.0736806],
  'shiritori': {'first': '„Åü', 'last': '„Åö'}},
 {'name': '„Éï„Çß„Ç∫',
  'reading': '„Åµ„Åá„Åö',
  'country': '„É¢„É≠„ÉÉ„Ç≥ÁéãÂõΩ',
  'population': 1249416,
  'coordinates': [-5.016249, 34.034446],
  'shiritori': {'first': '„Åµ', 'last': '„Åö'}}]

In [11]:
# Cities whose reading starts with this kana, i.e. the ones that can follow a
# city filed under it in shiritori_last_dict.
shiritori_first_dict["„Åö"]

[{'name': 'Ê∑ÑÂçö',
  'reading': '„Åö„Éº„Åº„Éº',
  'country': '‰∏≠ËèØ‰∫∫Ê∞ëÂÖ±ÂíåÂõΩ',
  'population': 2817479,
  'coordinates': [118.0488091, 36.813085],
  'shiritori': {'first': '„Åö', 'last': '„Åº'}},
 {'name': 'Ë≥áÈôΩ',
  'reading': '„Åö„Éº„ÇÑ„Çì',
  'country': '‰∏≠ËèØ‰∫∫Ê∞ëÂÖ±ÂíåÂõΩ',
  'population': 1016034,
  'coordinates': [104.6251845, 30.1316754],
  'shiritori': {'first': '„Åö', 'last': '„Çì'}},
 {'name': 'ÈÑ≠Â∑û',
  'reading': '„Åö„Åá„Çì„Å¢„Çá„ÅÜ',
  'country': '‰∏≠ËèØ‰∫∫Ê∞ëÂÖ±ÂíåÂõΩ',
  'population': 2589387,
  'coordinates': [113.6193223, 34.7477857],
  'shiritori': {'first': '„Åö', 'last': '„ÅÜ'}},
 {'name': 'ÈÑíÂüé',
  'reading': '„Åö„Åâ„ÅÜ„Å°„Çá„Çì',
  'country': '‰∏≠ËèØ‰∫∫Ê∞ëÂÖ±ÂíåÂõΩ',
  'population': 1101003,
  'coordinates': [117.0016364, 35.4046683],
  'shiritori': {'first': '„Åö', 'last': '„Çì'}},
 {'name': '‰∏≠Â±±',
  'reading': '„Åö„Åâ„Çì„Åó„ÇÉ„Çì',
  'country': '‰∏≠ËèØ‰∫∫Ê∞ëÂÖ±ÂíåÂõΩ',
  'population': 2363322,
  'coordinates': [113.3881505, 22.5196006],
  'shir

In [12]:
# Locate the dead-end city together with its position in `cities`.
[(pos, entry) for pos, entry in enumerate(cities) if entry["shiritori"]["last"] == "„Å•"]

[(340,
  {'name': 'ËèèÊ≤¢',
   'reading': '„Åª„Éº„Å•„Éº',
   'country': '‰∏≠ËèØ‰∫∫Ê∞ëÂÖ±ÂíåÂõΩ',
   'population': 1280031,
   'coordinates': [115.4738228, 35.2348208],
   'shiritori': {'first': '„Åª', 'last': '„Å•'}})]

In [13]:
# No city starts with the kana U+3065 ("du"), so a city ending on it is a dead
# end. File such cities under U+305A ("zu") instead.
# Matching on the kana rather than on list position 340 keeps this correct if
# the source table gains, loses or reorders rows, and re-running is harmless.
for city in cities:
    if city["shiritori"]["last"] == "\u3065":
        city["shiritori"]["last"] = "\u305a"

In [14]:
# Re-display the entry at index 340 (the position reported by the search above)
# to confirm that its "last" key has changed.
cities[340]

{'name': 'ËèèÊ≤¢',
 'reading': '„Åª„Éº„Å•„Éº',
 'country': '‰∏≠ËèØ‰∫∫Ê∞ëÂÖ±ÂíåÂõΩ',
 'population': 1280031,
 'coordinates': [115.4738228, 35.2348208],
 'shiritori': {'first': '„Åª', 'last': '„Åö'}}

### output

In [15]:
# Pin the encoding: ensure_ascii=False writes the non-ASCII text as-is, and
# without an explicit encoding open() uses the platform locale, which is not
# UTF-8 everywhere.
with open("../static/cities.json", "w", encoding="utf-8") as fp:
    json.dump(cities, fp, ensure_ascii=False, indent=2)

In [16]:
# Check that the file was written and how large it is.
!ls -lh ../static/cities.json

-rw-r--r--  1 sorami  staff   136K Jul 16 23:58 ../static/cities.json


In [17]:
# Preview the first 20 lines of the written JSON.
!head -20 ../static/cities.json

[
  {
    "name": "„Éê„ÇØ„Éº",
    "reading": "„Å∞„Åè„Éº",
    "country": "„Ç¢„Çº„É´„Éê„Ç§„Ç∏„É£„É≥ÂÖ±ÂíåÂõΩ",
    "population": 2285273,
    "coordinates": [
      49.8320385,
      40.3670397
    ],
    "shiritori": {
      "first": "„Å∞",
      "last": "„Åè"
    }
  },
  {
    "name": "„Ç´„Éñ„Éº„É´",
    "reading": "„Åã„Å∂„Éº„Çã",
    "country": "„Ç¢„Éï„Ç¨„Éã„Çπ„Çø„É≥„Éª„Ç§„Çπ„É©„É†ÂÖ±ÂíåÂõΩ",
    "population": 4775074,


## Stats

### long names

In [18]:
# The ten longest city names, longest first.
sorted((city["name"] for city in cities), key=len, reverse=True)[:10]

['„Éä„Ç≥„Éº„É≥„Ç∑„Éº„Çø„É≥„Éû„É©„Éº„Éà',
 '„Éä„Ç≥„Éº„É≥„É©„Éº„ÉÅ„É£„Ç∑„Éº„Éû„Éº',
 '„Éî„É≥„Éó„É™„Éª„ÉÅ„É≥„ÉÅ„ÉØ„ÉÉ„Éâ',
 '„Ç¶„Éú„É≥„É©„Éº„ÉÅ„É£„Çø„Éº„Éã„Éº',
 '„É¥„Ç£„Ç∑„É£„Éº„Ç´„Éë„Éà„Éä„É†',
 '„Çµ„É≥„Éà„Éâ„Éü„É≥„Ç¥„Ç®„Çπ„ÉÜ',
 '„Ç¢„Ç∞„Ç¢„Çπ„Ç´„É™„Ç®„É≥„ÉÜ„Çπ',
 '„Çµ„É≥„ÇØ„Éà„Éö„ÉÜ„É´„Éñ„É´„ÇØ',
 '„Ç¢„Ç¶„É©„É≥„Ç¨„Éº„Éê„Éº„Éâ',
 '„Ç¨„Éº„Ç∫„Ç£„É§„Éº„Éê„Éº„Éâ']

### counts

In [19]:
# Rebuild both indexes from the current contents of `cities`, so that changes
# made to the entries after the first build are reflected in the counts.
shiritori_first_dict = defaultdict(list)
shiritori_last_dict = defaultdict(list)

for city in cities:
    for index, edge in ((shiritori_first_dict, "first"), (shiritori_last_dict, "last")):
        index[city["shiritori"][edge]].append(city)

In [20]:
# Ten most common starting kana with the number of cities for each.
(
    pd.DataFrame(
        [(kana, len(group)) for kana, group in shiritori_first_dict.items()],
        columns=["first", "count"],
    )
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
    .head(10)
)

Unnamed: 0,first,count
0,„ÅÇ,29
1,„Åó,28
2,„Å°,23
3,„Åã,21
4,„Åï,20
5,„ÅØ,19
6,„Åµ,18
7,„Å∞,17
8,„Åæ,16
9,„Å™,15


In [21]:
# Ten most common ending kana with the number of cities for each.
(
    pd.DataFrame(
        [(kana, len(group)) for kana, group in shiritori_last_dict.items()],
        columns=["last", "count"],
    )
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
    .head(10)
)

Unnamed: 0,last,count
0,„Çì,149
1,„ÅÜ,44
2,„Çã,37
3,„ÅÑ,27
4,„Å©,21
5,„Çâ,17
6,„Åô,17
7,„Å®,16
8,„Åè,13
9,„Çä,12
