# 地名集日本（Gazetteer of Japan）

[sorami/gazetter-of-japan](https://github.com/sorami/gazetter-of-japan)

- データソース` 000238259.pdf`: 国土地理院 [地名集日本（2021年更新）　(3082KB PDF)](https://www.gsi.go.jp/common/000238259.pdf) (2022-12-22取得)
- 上記データを加工（PDFをパースし、CSV, JSON形式へ変換）
- [国土地理院コンテンツ利用規約 | 国土地理院](https://www.gsi.go.jp/kikakuchousei/kikakuchousei40182.html)

In [1]:
from collections import defaultdict
from decimal import Decimal, ROUND_HALF_EVEN

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTRect
import pandas as pd

## PDFの表ページを取得

In [2]:
%%time
pdf_pages = [p for p in extract_pages("original/000238259.pdf")]

# PDF: 計112ページ
assert len(pdf_pages) == 112

# 地名表: PDFのページ8 - 111
table_pages = pdf_pages[7:111]

CPU times: user 14.6 s, sys: 220 ms, total: 14.9 s
Wall time: 15 s


## 表ページから各行を取得

In [3]:
def extract_column(element):
    return [d.get_text().strip() for d in element]

In [4]:
all_page_columns = {}
error_page_columns = {}

for page_no, page in enumerate(table_pages, start=1):
    elements = [e for e in page]
    
    # Grid列は、分割されているページが多く対応が困難なため、今回は対象外
    page_columns = {
        "Japanese(Kanji)": [],
        "Japanese(Kana)": [],
        "Romanized Japanese": [],
        "Latitude": [],
        "Longitude": [],
         "Classification": []
    }

    # 各列を取得
    for elm in elements:
        if type(elm) != LTTextBoxHorizontal:
            continue

        col = extract_column(elm)
        if col[0] in page_columns:
            page_columns[col[0]] = col[1:]
        elif col[0] == "Latitude Longitude":
            page_columns["Longitude"] = col[1::2]
            page_columns["Latitude"] = col[2::2]
            

    # 各列は、40行からなると想定（最終ページを除く）
    # 上手く列を取得できていないと想定されるページを判定
    # 別途、後段で対応
    if not(
        40 == 
        len(page_columns["Japanese(Kanji)"]) == 
        len(page_columns["Japanese(Kana)"]) == 
        len(page_columns["Romanized Japanese"]) == 
        len(page_columns["Classification"]) == 
        len(page_columns["Latitude"]) == 
        len(page_columns["Longitude"])
    ) and page_no < len(table_pages):
        print(page_no)
        error_page_columns[page_no] = page_columns
    else:
        all_page_columns[page_no] = page_columns

5
35
40
52
75
77


## 正しく取得できなかったページの対処

一部の行では、漢字が画像になっているなどのために、正しくパースできない。これらは以下、半手動で対応する。

In [5]:
print(error_page_columns.keys())

dict_keys([5, 35, 40, 52, 75, 77])


In [6]:
def check_column_counts(page_column):
    return pd.DataFrame([(k, len(v)) for k, v in page_column.items()], columns=["column", "count"])

### ページ5

In [7]:
check_column_counts(error_page_columns[5])

Unnamed: 0,column,count
0,Japanese(Kanji),38
1,Japanese(Kana),40
2,Romanized Japanese,40
3,Latitude,40
4,Longitude,40
5,Classification,40


In [8]:
error_page_columns[5]["Japanese(Kanji)"]

['有福島',
 '有馬',
 '有田町',
 '厚狭川',
 '阿佐山',
 'あさぎり町',
 '朝来市',
 '朝日町',
 '旭岳',
 '朝日岳',
 '朝日岳',
 '旭川',
 '朝日町',
 '朝日町',
 '朝日村',
 '旭市',
 '旭川市',
 '朝霞市',
 '浅川町',
 '浅口市',
 '朝倉市',
 '浅草',
 '浅草岳',
 '浅間山',
 '朝熊ヶ岳',
 '浅間隠山',
 '浅虫',
 '浅瀬石川',
 '芦辺',
 '芦別岳',
 '芦別川',
 '芦別市',
 '芦田川',
 '足利市',
 '芦北町',
 '芦ノ湖',
 '足尾山地',
 '愛鷹山']

[芦屋町](https://www.town.ashiya.lg.jp/)

[芦屋市／芦屋市の「芦」の字の表記について](https://www.city.ashiya.lg.jp/bunsho/documents/ashinojinohyouki.html)

> 質問
> ホームページや市の刊行物等で芦屋市の「芦」の字が草冠の下の戸の字の一画目が斜め下にはねて表記されるものと、「戸」と表記されているものとで混在していますが、正式にはどちらが正しいのでしょうか。
> 
> 回答
> 芦屋市の「芦」の表記の違いについては、デザイン上の差であって、字体の差ではありませんので、公文書等においてはどちらも使用できることとしています。

In [9]:
assert len(error_page_columns[5]["Japanese(Kanji)"]) == 38
error_page_columns[5]["Japanese(Kanji)"] += ["芦屋町", "芦屋市"]
assert len(error_page_columns[5]["Japanese(Kanji)"]) == 40

In [10]:
pd.DataFrame(error_page_columns[5])

Unnamed: 0,Japanese(Kanji),Japanese(Kana),Romanized Japanese,Latitude,Longitude,Classification
0,有福島,ありふくじま,Arifuku Jima,32°55',128°57',Island
1,有馬,ありま,Arima,34°48',135°15',Populated area
2,有田町,ありたちょう,Arita Cho,33°13',129°51',Municipality
3,厚狭川,あさがわ,Asa Gawa,34°00',131°09',River
4,阿佐山,あさやま,Asa Yama,34°47',132°23',Mountain
5,あさぎり町,あさぎりちょう,Asagiri Cho,32°14',130°54',Municipality
6,朝来市,あさごし,Asago Shi,35°20',134°51',Municipality
7,朝日町,あさひちょう,Asahi Cho,35°02',136°40',Municipality
8,旭岳,あさひだけ,Asahi Dake,43°40',142°51',Mountain
9,朝日岳,あさひだけ,Asahi Dake,38°16',139°55',Mountain


In [11]:
assert len(error_page_columns[5]["Japanese(Kanji)"]) == 40
all_page_columns[5] = error_page_columns[5]

### ページ35

In [12]:
check_column_counts(error_page_columns[35])

Unnamed: 0,column,count
0,Japanese(Kanji),15
1,Japanese(Kana),40
2,Romanized Japanese,40
3,Latitude,40
4,Longitude,40
5,Classification,40


In [13]:
error_page_columns[35]["Japanese(Kanji)"]

['かすみがうら市',
 '粕屋町',
 '加太',
 '潟上市',
 '傾山',
 '交野市',
 '片品川',
 '片品村',
 '片山津',
 '加東市',
 '香取海山',
 '香取市',
 '勝本',
 'かつらぎ町',
 '葛城山']

[葛の字の取り扱い／葛城市](https://www.city.katsuragi.nara.jp/soshiki/kikakuseisakuka/20/3215.html)

In [14]:
assert len(error_page_columns[35]["Japanese(Kanji)"]) == 15
error_page_columns[35]["Japanese(Kanji)"] += ["葛城市"]

In [15]:
page = table_pages[35-1]
elements = [e for e in page]
for elm in elements:
    if type(elm) != LTTextBoxHorizontal:
            continue
    col = extract_column(elm)
    if col[0] == "葛尾村":
        print(col)

['葛尾村', '葛飾区', '勝浦町', '勝浦川', '勝浦海盆', '勝浦海底谷', '勝浦市', '勝山市', '川場村', '川辺町', '川辺川', '河内岳', '河内町', '河内長野市', '川越町', '川越市', '河口湖', '川口市', '河合町', '河地', '川島町', '川尻', '川上村', '川上村']


In [16]:
assert len(error_page_columns[35]["Japanese(Kanji)"]) == 16
error_page_columns[35]["Japanese(Kanji)"] += \
    ['葛尾村', '葛飾区', '勝浦町', '勝浦川', '勝浦海盆', '勝浦海底谷', '勝浦市', '勝山市', 
     '川場村', '川辺町', '川辺川', '河内岳', '河内町', '河内長野市', 
     '川越町', '川越市', '河口湖', '川口市', '河合町', '河地', '川島町', '川尻', '川上村', '川上村']
assert len(error_page_columns[35]["Japanese(Kanji)"]) == 40

In [17]:
pd.DataFrame(error_page_columns[35])

Unnamed: 0,Japanese(Kanji),Japanese(Kana),Romanized Japanese,Latitude,Longitude,Classification
0,かすみがうら市,かすみがうらし,Kasumigaura Shi,36°09',140°14',Municipality
1,粕屋町,かすやまち,Kasuya Machi,33°37',130°29',Municipality
2,加太,かた,Kata,34°16',135°05',Populated area
3,潟上市,かたがみし,Katagami Shi,39°51',140°01',Municipality
4,傾山,かたむきやま,Katamuki Yama,32°50',131°29',Mountain
5,交野市,かたのし,Katano Shi,34°47',135°41',Municipality
6,片品川,かたしながわ,Katashina Gawa,36°37',139°03',River
7,片品村,かたしなむら,Katashina Mura,36°46',139°14',Municipality
8,片山津,かたやまづ,Katayamazu,36°21',136°22',Populated area
9,加東市,かとうし,Kato Shi,34°55',134°58',Municipality


In [18]:
assert len(error_page_columns[35]["Japanese(Kanji)"]) == 40
all_page_columns[35] = error_page_columns[35]

### ページ40

In [19]:
check_column_counts(error_page_columns[40])

Unnamed: 0,column,count
0,Japanese(Kanji),0
1,Japanese(Kana),33
2,Romanized Japanese,33
3,Latitude,60
4,Longitude,60
5,Classification,40


In [20]:
pd.DataFrame(
    zip(error_page_columns[40]["Japanese(Kana)"], error_page_columns[40]["Romanized Japanese"]),
    columns=["Japanese(Kana)", "Romanized Japanese"]
)

Unnamed: 0,Japanese(Kana),Romanized Japanese
0,きつきし,Kitsuki Shi
1,きっとやさん,Kittoya San
2,きやまちょう,Kiyama Cho
3,きよかわむら,Kiyokawa Mura
4,きよさとちょう,Kiyosato Cho
5,きよせし,Kiyose Shi
6,きよすし,Kiyosu Shi
7,きよすみやま,Kiyosumi Yama
8,きよつがわ,Kiyotsu Gawa
9,きづがわ,Kizu Gawa


In [21]:
page = table_pages[40-1]
elements = [e for e in page]
for elm in elements:
    if type(elm) != LTTextBoxHorizontal:
            continue
    col = extract_column(elm)
    print(col)

['Gazetteer of Japan']
['2021']
['1561', '1562', '1563', '1564', '1565', '1566', '1567', '1568', '1569', '1570', '1571', '1572', '1573', '1574', '1575', '1576', '1577', '1578', '1579', '1580', '1581', '1582', '1583', '1584', '1585', '1586', '1587', '1588', '1589', '1590', '1591', '1592', '1593', '1594', '1595', '1596', '1597', '1598', '1599', '1600']
['Grid', '5031', '5540', '5030', '5339', '6544', '5339', '5236', '5240', '5538', '5235', '5235', '5129', '4830', '5235', '5338', '5033', '5033', '5232', '5339', '4329', '6140', '6841', '5233', '5338', '5439', '5030', '4429', '5339', '5031', '4929', '5130', '3623']
['4930', '5339', '5133', '6239']
['Japanese(Kana)', 'きつきし', 'きっとやさん', 'きやまちょう', 'きよかわむら', 'きよさとちょう', 'きよせし', 'きよすし', 'きよすみやま', 'きよつがわ', 'きづがわ', 'きづがわし', 'こうざき', 'こばやしし', 'こうべし', 'こぶしがたけ', 'こうちけん', 'こうちし', 'こうだ', 'こだいらし', 'こだからじま', 'こどまりみさき', 'こえといがわ', 'こうふちょう', 'こうふし', 'こがし', 'こがし', 'こがじゃじま', 'こがねいし', 'こうげまち', 'こうござき', 'こぐし', 'こはまじま', 'コヒトデかいざん']
['Latitude Longitude', 'Japanese(

In [22]:
assert len(error_page_columns[40]["Japanese(Kana)"]) == 33
error_page_columns[40]["Japanese(Kana)"] += \
        ["こひよしかいざん（はちじゅうはちやかいざん）"] + \
        ['こうほうかいけつ', 'こうほうかいれい', 'こうほくまち', 'こいとがわ', 'こじま', 'こじま']
assert len(error_page_columns[40]["Japanese(Kana)"]) == 40

In [23]:
assert len(error_page_columns[40]["Romanized Japanese"]) == 33
error_page_columns[40]["Romanized Japanese"] += \
        ["Ko-Hiyoshi Kaizan（Hachijuhachiya Kaizan）"] + \
        ['Koho Kaiketsu', 'Koho Kairei', 'Kohoku Machi', 'Koito Gawa', 'Kojima', 'Kojima']
assert len(error_page_columns[40]["Japanese(Kana)"]) == 40

In [24]:
pd.DataFrame(
    zip(
        error_page_columns[40]["Japanese(Kana)"],
        error_page_columns[40]["Romanized Japanese"],
        error_page_columns[40]["Classification"]
    ),
    columns=["Japanese(Kana)", "Romanized Japanese", "Classfication"]
)

Unnamed: 0,Japanese(Kana),Romanized Japanese,Classfication
0,きつきし,Kitsuki Shi,Municipality
1,きっとやさん,Kittoya San,Mountain
2,きやまちょう,Kiyama Cho,Municipality
3,きよかわむら,Kiyokawa Mura,Municipality
4,きよさとちょう,Kiyosato Cho,Municipality
5,きよせし,Kiyose Shi,Municipality
6,きよすし,Kiyosu Shi,Municipality
7,きよすみやま,Kiyosumi Yama,Mountain
8,きよつがわ,Kiyotsu Gawa,River
9,きづがわ,Kizu Gawa,River


In [25]:
page = table_pages[40-1]
elements = [e for e in page]
for elm in elements:
    if type(elm) != LTTextBoxHorizontal:
            continue
    col = extract_column(elm)
    if col[0] == "Latitude Longitude":
        print(col)

['Latitude Longitude', 'Japanese(Kanji)', "131°37'", "33°25'", '杵築市', "140°52'", "37°13'", '屹兎屋山', "130°31'", "33°26'", '基山町', "139°17'", "35°29'", '清川村', "144°36'", "43°50'", '清里町', "139°32'", "35°47'", '清瀬市', "136°51'", "35°12'", '清須市', "140°09'", "35°10'", '清澄山', "138°41'", "37°03'", '清津川', "135°41'", "34°53'", '木津川', "135°49'", "34°44'", '木津川市', "129°13'", "34°05'", '神崎', "130°58'", "32°00'", '小林市', "135°12'", "34°41'", '神戸市', "138°44'", "35°55'", '甲武信ヶ岳', "133°32'", "33°34'", '高知県', "133°32'", "33°34'", '高知市', "132°45'", "34°42'", '甲田', "139°29'", "35°44'", '小平市', "129°20'", "29°13'", '小宝島', "140°15'", "41°07'", '小泊岬', "141°45'", "45°24'", '声問川', "133°29'", "35°17'", '江府町', "138°34'", "35°40'", '甲府市', "139°45'", "36°11'", '古河市', "130°28'", "33°44'", '古賀市', "129°37'", "29°53'", '小臥蛇島', "139°30'", "35°42'", '小金井市', "131°10'", "33°35'", '上毛町', "129°40'", "33°06'", '高後崎', "130°56'", "34°10'", '小串', "123°59'", "24°20'", '小浜島', "154°36'", 'コヒトデ海山', "22°09'", "141°58'", "小日吉海山（八十八夜海山） こひ

In [26]:
page_40_lat_lon_col = [
                    'Latitude Longitude', 'Japanese(Kanji)',

                    "131°37'", "33°25'", '杵築市',
                    "140°52'", "37°13'", '屹兎屋山',
                    "130°31'", "33°26'", '基山町',
                    "139°17'", "35°29'", '清川村',
                    "144°36'", "43°50'", '清里町',
                    "139°32'", "35°47'", '清瀬市',
                    "136°51'", "35°12'", '清須市',
                    "140°09'", "35°10'", '清澄山',
                    "138°41'", "37°03'", '清津川',
                    "135°41'", "34°53'", '木津川',
                    "135°49'", "34°44'", '木津川市',
                    "129°13'", "34°05'", '神崎',
                    "130°58'", "32°00'", '小林市',
                    "135°12'", "34°41'", '神戸市',
                    "138°44'", "35°55'", '甲武信ヶ岳',
                    "133°32'", "33°34'", '高知県',
                    "133°32'", "33°34'", '高知市',
                    "132°45'", "34°42'", '甲田',
                    "139°29'", "35°44'", '小平市',
                    "129°20'", "29°13'", '小宝島',
                    "140°15'", "41°07'", '小泊岬',
                    "141°45'", "45°24'", '声問川',
                    "133°29'", "35°17'", '江府町',
                    "138°34'", "35°40'", '甲府市',
                    "139°45'", "36°11'", '古河市',
                    "130°28'", "33°44'", '古賀市',
                    "129°37'", "29°53'", '小臥蛇島',
                    "139°30'", "35°42'", '小金井市',
                    "131°10'", "33°35'", '上毛町',
                    "129°40'", "33°06'", '高後崎',
                    "130°56'", "34°10'", '小串',
                    "123°59'", "24°20'", '小浜島',
    
                    # ズレている
                    "154°36'", 'コヒトデ海山', "22°09'",
    
                    # ズレている、別の列が混入している
                    "141°58'", "小日吉海山（八十八夜海山） こひよしかいざん（はちじゅうはちやかいざん） Ko-Hiyoshi Kaizan（Hachijuhachiya Kaizan） 23°21'",

                    "135°33'", "26°30'", '高鵬海穴',
                    "135°05'", "26°43'", '高鵬海嶺',
                    "130°09'", "33°13'", '江北町',
                    "139°51'", "35°21'", '小糸川',
                    "133°48'", "34°28'", '児島',
                    "139°48'", "41°22'", '小島']

In [27]:
# 人手修正
page_40_lat_lon_col_mod = [
                    # ヘッダー行を人手削除
    
                    "131°37'", "33°25'", '杵築市',
                    "140°52'", "37°13'", '屹兎屋山',
                    "130°31'", "33°26'", '基山町',
                    "139°17'", "35°29'", '清川村',
                    "144°36'", "43°50'", '清里町',
                    "139°32'", "35°47'", '清瀬市',
                    "136°51'", "35°12'", '清須市',
                    "140°09'", "35°10'", '清澄山',
                    "138°41'", "37°03'", '清津川',
                    "135°41'", "34°53'", '木津川',
                    "135°49'", "34°44'", '木津川市',
                    "129°13'", "34°05'", '神崎',
                    "130°58'", "32°00'", '小林市',
                    "135°12'", "34°41'", '神戸市',
                    "138°44'", "35°55'", '甲武信ヶ岳',
                    "133°32'", "33°34'", '高知県',
                    "133°32'", "33°34'", '高知市',
                    "132°45'", "34°42'", '甲田',
                    "139°29'", "35°44'", '小平市',
                    "129°20'", "29°13'", '小宝島',
                    "140°15'", "41°07'", '小泊岬',
                    "141°45'", "45°24'", '声問川',
                    "133°29'", "35°17'", '江府町',
                    "138°34'", "35°40'", '甲府市',
                    "139°45'", "36°11'", '古河市',
                    "130°28'", "33°44'", '古賀市',
                    "129°37'", "29°53'", '小臥蛇島',
                    "139°30'", "35°42'", '小金井市',
                    "131°10'", "33°35'", '上毛町',
                    "129°40'", "33°06'", '高後崎',
                    "130°56'", "34°10'", '小串',
                    "123°59'", "24°20'", '小浜島',
                    "154°36'", "22°09'",  'コヒトデ海山', # 人手修正
                    "141°58'", "23°21'", "小日吉海山（八十八夜海山） ", # 人手修正
                    "135°33'", "26°30'", '高鵬海穴',
                    "135°05'", "26°43'", '高鵬海嶺',
                    "130°09'", "33°13'", '江北町',
                    "139°51'", "35°21'", '小糸川',
                    "133°48'", "34°28'", '児島',
                    "139°48'", "41°22'", '小島']

page_40_lon = page_40_lat_lon_col_mod[0::3]
page_40_lat = page_40_lat_lon_col_mod[1::3]
page_40_kanji = page_40_lat_lon_col_mod[2::3]
assert len(page_40_lon) == len(page_40_lat) == len(page_40_kanji)

error_page_columns[40]["Longitude"] = page_40_lon
error_page_columns[40]["Latitude"] = page_40_lat
error_page_columns[40]["Japanese(Kanji)"] = page_40_kanji

In [28]:
pd.DataFrame(error_page_columns[40])

Unnamed: 0,Japanese(Kanji),Japanese(Kana),Romanized Japanese,Latitude,Longitude,Classification
0,杵築市,きつきし,Kitsuki Shi,33°25',131°37',Municipality
1,屹兎屋山,きっとやさん,Kittoya San,37°13',140°52',Mountain
2,基山町,きやまちょう,Kiyama Cho,33°26',130°31',Municipality
3,清川村,きよかわむら,Kiyokawa Mura,35°29',139°17',Municipality
4,清里町,きよさとちょう,Kiyosato Cho,43°50',144°36',Municipality
5,清瀬市,きよせし,Kiyose Shi,35°47',139°32',Municipality
6,清須市,きよすし,Kiyosu Shi,35°12',136°51',Municipality
7,清澄山,きよすみやま,Kiyosumi Yama,35°10',140°09',Mountain
8,清津川,きよつがわ,Kiyotsu Gawa,37°03',138°41',River
9,木津川,きづがわ,Kizu Gawa,34°53',135°41',River


In [29]:
assert len(error_page_columns[40]["Japanese(Kanji)"]) == 40
all_page_columns[40] = error_page_columns[40]

### ページ52

In [30]:
check_column_counts(error_page_columns[52])

Unnamed: 0,column,count
0,Japanese(Kanji),4
1,Japanese(Kana),40
2,Romanized Japanese,40
3,Latitude,40
4,Longitude,40
5,Classification,40


In [31]:
error_page_columns[52]["Japanese(Kanji)"]

['三種町', '三頭山', '水戸市', '三豊市']

[ホーム／御杖村](https://www.vill.mitsue.nara.jp/)

In [32]:
assert len(error_page_columns[52]["Japanese(Kanji)"]) == 4
error_page_columns[52]["Japanese(Kanji)"] += ["御杖村"]

In [33]:
page = table_pages[52-1]
elements = [e for e in page]
for elm in elements:
    if type(elm) != LTTextBoxHorizontal:
            continue
    col = extract_column(elm)
    if col[0] == "三石":
        print(col)

['三石', '見附市', '三俣蓮華岳', '三ッ島', '美津島', '三ツ峠山', '三浦半島', '三浦市', '宮川', '宮川', '宮田村', '宮城島', '宮城県', '三宅町', '三宅島', '三宅村', 'みやき町', '宮古島', 'みやこ町', '宮古列島', '宮古市', '宮古島市', '都城市', 'みやま市', '宮之浦岳', '宮良海山', '宮代町', '宮塚山', '宮若市', '宮崎県', '宮崎市', '宮崎ノ鼻', '宮津市', '三芳町', 'みよし市']


In [34]:
assert len(error_page_columns[52]["Japanese(Kanji)"]) == 5
error_page_columns[52]["Japanese(Kanji)"] += \
    ['三石', '見附市', '三俣蓮華岳', '三ッ島', '美津島', '三ツ峠山', '三浦半島', '三浦市',
     '宮川', '宮川', '宮田村', '宮城島', '宮城県', '三宅町', '三宅島', '三宅村', 'みやき町',
     '宮古島', 'みやこ町', '宮古列島', '宮古市', '宮古島市', '都城市', 'みやま市', '宮之浦岳',
     '宮良海山', '宮代町', '宮塚山', '宮若市', '宮崎県', '宮崎市', '宮崎ノ鼻', '宮津市', '三芳町', 'みよし市']
assert len(error_page_columns[52]["Japanese(Kanji)"]) == 40

In [35]:
pd.DataFrame(error_page_columns[52])

Unnamed: 0,Japanese(Kanji),Japanese(Kana),Romanized Japanese,Latitude,Longitude,Classification
0,三種町,みたねちょう,Mitane Cho,40°06',140°00',Municipality
1,三頭山,みとうさん,Mito San,35°44',139°01',Mountain
2,水戸市,みとし,Mito Shi,36°22',140°28',Municipality
3,三豊市,みとよし,Mitoyo Shi,34°11',133°43',Municipality
4,御杖村,みつえむら,Mitsue Mura,34°29',136°10',Municipality
5,三石,みついし,Mitsuishi,34°48',134°16',Populated area
6,見附市,みつけし,Mitsuke Shi,37°32',138°55',Municipality
7,三俣蓮華岳,みつまたれんげだけ,Mitsumatarenge Dake,36°23',137°35',Mountain
8,三ッ島,みつしま,Mitsushima,34°43',129°27',Island
9,美津島,みつしま,Mitsushima,34°16',129°19',Populated area


In [36]:
assert len(error_page_columns[52]["Japanese(Kanji)"]) == 40
all_page_columns[52] = error_page_columns[52]

### ページ75

In [37]:
check_column_counts(error_page_columns[75])

Unnamed: 0,column,count
0,Japanese(Kanji),39
1,Japanese(Kana),40
2,Romanized Japanese,40
3,Latitude,40
4,Longitude,40
5,Classification,40


In [38]:
error_page_columns[75]["Japanese(Kanji)"]

['三条市',
 '山上ヶ岳',
 '傘寿海山',
 '山武市',
 '三戸町',
 '佐野市',
 '三福海山',
 '三嶺',
 '讃岐山脈',
 'さぬき市',
 '三和',
 '山陽小野田市',
 '棹崎',
 '札幌岳',
 '札幌市',
 '更別村',
 'サロベツ川',
 '佐呂間町',
 'サロマ湖',
 '佐呂間別川',
 '沙流川',
 '猿払川',
 '猿払村',
 '猿ヶ石川',
 '猿山岬',
 '笹山',
 '篠山',
 '笹ヶ峰',
 '笹子峠',
 '篠栗町',
 '笹谷峠',
 '佐世保市',
 '指臼山',
 '佐多岬',
 '里庄町',
 '皐月海山',
 'さつま町',
 '薩摩半島',
 '薩摩海山']

[薩摩川内市](https://www.city.satsumasendai.lg.jp/www/index.html)

In [39]:
assert len(error_page_columns[75]["Japanese(Kanji)"]) == 39
error_page_columns[75]["Japanese(Kanji)"] += ["薩摩川内市"]
assert len(error_page_columns[75]["Japanese(Kanji)"]) == 40

In [40]:
pd.DataFrame(error_page_columns[75])

Unnamed: 0,Japanese(Kanji),Japanese(Kana),Romanized Japanese,Latitude,Longitude,Classification
0,三条市,さんじょうし,Sanjo Shi,37°38',138°58',Municipality
1,山上ヶ岳,さんじょうがたけ,Sanjogatake,34°15',135°56',Mountain
2,傘寿海山,さんじゅかいざん,Sanju Kaizan,25°00',134°05',Undersea feature
3,山武市,さんむし,Sanmu Shi,35°36',140°25',Municipality
4,三戸町,さんのへまち,Sannohe Machi,40°23',141°16',Municipality
5,佐野市,さのし,Sano Shi,36°19',139°35',Municipality
6,三福海山,さんぷくかいざん,Sanpuku Kaizan,22°49',142°42',Undersea feature
7,三嶺,さんれい,Sanrei,33°50',133°59',Mountain
8,讃岐山脈,さぬきさんみゃく,Sanuki Sanmyaku,34°10',134°08',Extensive natural feature
9,さぬき市,さぬきし,Sanuki Shi,34°20',134°10',Municipality


In [41]:
assert len(error_page_columns[75]["Japanese(Kanji)"]) == 40
all_page_columns[75] = error_page_columns[75]

### ページ77

In [42]:
check_column_counts(error_page_columns[77])

Unnamed: 0,column,count
0,Japanese(Kanji),10
1,Japanese(Kana),40
2,Romanized Japanese,40
3,Latitude,40
4,Longitude,40
5,Classification,40


In [43]:
error_page_columns[77]["Japanese(Kanji)"]

['瀬戸内町', '瀬戸内市', '摂津市', '雪裡川', '釈迦岳', '釈迦ヶ岳', '釈迦ヶ鼻', '積丹町', '積丹半島', '積丹岬']

- [北海道紗那郡紗那村 (01699A1968) | 歴史的行政区域データセットβ版](https://geoshape.ex.nii.ac.jp/city/resource/01699A1968.html)
- [紗那村 - Wikipedia](https://ja.wikipedia.org/wiki/%E7%B4%97%E9%82%A3%E6%9D%91)

In [44]:
assert len(error_page_columns[77]["Japanese(Kanji)"]) == 10
error_page_columns[77]["Japanese(Kanji)"] += ["紗那村"]

In [45]:
page = table_pages[77-1]
elements = [e for e in page]
for elm in elements:
    if type(elm) != LTTextBoxHorizontal:
            continue
    col = extract_column(elm)
    if col[0] == "社日海山":
        print(col)

['社日海山', '斜里町', '斜里岳', '斜里川', '柴田町', '新発田市', '芝山町', '標茶町', '蘂取岳', '蘂取岬', '蘂取村', '標津町', '標津岳', '標津川', '士別市', '士別峠', '紫尾山', '志発島', '渋峠', '渋川市', '渋海川', '志布志市', '志布志湾', '至仏山', '渋谷', '渋谷区', '七ヶ浜町', '七ヶ宿町', '七面山']


In [46]:
assert len(error_page_columns[77]["Japanese(Kanji)"]) == 11
error_page_columns[77]["Japanese(Kanji)"] += \
    ['社日海山', '斜里町', '斜里岳', '斜里川', '柴田町', '新発田市', '芝山町',
     '標茶町', '蘂取岳', '蘂取岬', '蘂取村', '標津町', '標津岳', '標津川',
     '士別市', '士別峠', '紫尾山', '志発島', '渋峠', '渋川市', '渋海川',
     '志布志市', '志布志湾', '至仏山', '渋谷', '渋谷区', '七ヶ浜町', '七ヶ宿町', '七面山']
assert len(error_page_columns[77]["Japanese(Kanji)"]) == 40

In [47]:
pd.DataFrame(error_page_columns[77])

Unnamed: 0,Japanese(Kanji),Japanese(Kana),Romanized Japanese,Latitude,Longitude,Classification
0,瀬戸内町,せとうちちょう,Setouchi Cho,28°09',129°19',Municipality
1,瀬戸内市,せとうちし,Setouchi Shi,34°40',134°06',Municipality
2,摂津市,せっつし,Settsu Shi,34°47',135°34',Municipality
3,雪裡川,せっつりがわ,Settsuri Gawa,43°04',144°24',River
4,釈迦岳,しゃかだけ,Shaka Dake,33°11',130°53',Mountain
5,釈迦ヶ岳,しゃかがだけ,Shakagadake,34°07',135°54',Mountain
6,釈迦ヶ鼻,しゃかがはな,Shakagahana,34°25',134°14',Cape
7,積丹町,しゃこたんちょう,Shakotan Cho,43°18',140°36',Municipality
8,積丹半島,しゃこたんはんとう,Shakotan Hanto,43°11',140°32',Extensive natural feature
9,積丹岬,しゃこたんみさき,Shakotan Misaki,43°22',140°28',Cape


In [48]:
assert len(error_page_columns[77]["Japanese(Kanji)"]) == 40
all_page_columns[77] = error_page_columns[77]

## 取得情報の集約

In [49]:
d4df = defaultdict(list)
for page_no, page_columns in sorted(all_page_columns.items()):
    for k, v in page_columns.items():
        d4df[k] += v

df_raw = pd.DataFrame(d4df)
df_raw

Unnamed: 0,Japanese(Kanji),Japanese(Kana),Romanized Japanese,Latitude,Longitude,Classification
0,網走川,あばしりがわ,Abashiri Gawa,44°01',144°17',River
1,網走湖,あばしりこ,Abashiri Ko,43°58',144°11',Lake
2,網走市,あばしりし,Abashiri Shi,44°01',144°16',Municipality
3,安倍川,あべかわ,Abe Kawa,34°56',138°24',River
4,我孫子市,あびこし,Abiko Shi,35°52',140°02',Municipality
...,...,...,...,...,...,...
4118,銭洲,ぜにす,Zeni Su,33°57',138°50',Island
4119,銭洲海嶺,ぜにすかいれい,Zenisu Kairei,33°42',138°30',Undersea feature
4120,銭洲沖海山,ぜにすおきかいざん,Zenisu-Oki Kaizan,33°26',138°25',Undersea feature
4121,善通寺市,ぜんつうじし,Zentsuji Shi,34°14',133°47',Municipality


In [50]:
def degree_minute_to_decimal(text):
    """
    緯度経度を、度分（60進法）から10進法へ変換
    データは「分」までのため、小数第3位精度へ丸め
    （"1/60 = 0.01666666..." のため、小数第3位程度が妥当と判断）
    """
    assert "°" in text
    assert text.endswith("'")
    text = text[:-1]
    parts = text.split("°")
    assert len(parts) == 2

    degree = Decimal(parts[0])
    minute = Decimal(parts[1])
    return (degree + minute / 60).quantize(Decimal(".001"), ROUND_HALF_EVEN)

In [51]:
df_mod = df_raw.copy()
df_mod["Latitude"] = df_mod["Latitude"].apply(degree_minute_to_decimal)
df_mod["Longitude"] = df_mod["Longitude"].apply(degree_minute_to_decimal)
df_mod.columns = ["kanji", "kana", "roman", "lat", "lng", "class"]
df_mod

Unnamed: 0,kanji,kana,roman,lat,lng,class
0,網走川,あばしりがわ,Abashiri Gawa,44.017,144.283,River
1,網走湖,あばしりこ,Abashiri Ko,43.967,144.183,Lake
2,網走市,あばしりし,Abashiri Shi,44.017,144.267,Municipality
3,安倍川,あべかわ,Abe Kawa,34.933,138.400,River
4,我孫子市,あびこし,Abiko Shi,35.867,140.033,Municipality
...,...,...,...,...,...,...
4118,銭洲,ぜにす,Zeni Su,33.950,138.833,Island
4119,銭洲海嶺,ぜにすかいれい,Zenisu Kairei,33.700,138.500,Undersea feature
4120,銭洲沖海山,ぜにすおきかいざん,Zenisu-Oki Kaizan,33.433,138.417,Undersea feature
4121,善通寺市,ぜんつうじし,Zentsuji Shi,34.233,133.783,Municipality


In [52]:
# 計4,123行を想定
# = (40行　*103ページ) + 最終ページ3行
# PDFページ111、表の最終行でのカウントも"4123"
assert df_mod.shape[0] == 4123, df_mod.shape[0]

# 別称（括弧書き）の分割

- 部分的に別表記のあるケースもある
  - `火山（硫黄）列島	かざん（いおう）れっとう`
  - → `火山列島`, `硫黄列島`
  - [火山列島 - Wikipedia](https://ja.wikipedia.org/wiki/%E7%81%AB%E5%B1%B1%E5%88%97%E5%B3%B6)
- 漢字表記は複数あるが、読みは一つ、というケースもある
    - `沖の島（沖ノ島）	おきのしま`
- 漢字表記は一つだが、読みが複数、というケースもある
    - `蔵王山	ざおうざん（ざおうさん）`

In [53]:
def detect_bracket_rows(row):
    for d in (row["kanji"], row["kana"], row["roman"]):
        if "(" in d:
            return True
        elif "（" in d:
            return True
        elif ")" in d:
            return True
        elif "）" in d:
            return True
    return False

In [54]:
df_mod[df_mod.apply(detect_bracket_rows, axis=1)].reset_index(drop=True)

Unnamed: 0,kanji,kana,roman,lat,lng,class
0,烏帽子岳（乳頭山）,えぼしだけ（にゅうとうざん）,Eboshi Dake (Nyuto Zan),39.8,140.833,Mountain
1,冨崎（観音埼）,ふさき（かんのんさき）,Fu Saki (Kannon Saki),24.367,124.117,Cape
2,江の川（可愛川）,ごうのかわ（えのかわ）,Gonokawa (Eno Kawa),34.8,132.833,River
3,階上岳（種市岳）,はしかみだけ（たねいちだけ）,Hashikami Dake (Taneichi Dake),40.4,141.583,Mountain
4,速日の峰（二子山）,はやひのみね（ふたごやま）,Hayahinomine (Futago Yama),32.567,131.467,Mountain
5,速吸瀬戸（豊予海峡）,はやすいせと（ほうよかいきょう）,Hayasui Seto (Hoyo Kaikyo),33.317,131.983,Sea Area
6,氷ノ山（須賀ノ山）,ひょうのせん（すがのせん）,Hyonosen (Suganosen),35.35,134.517,Mountain
7,厳島（宮島）,いつくしま（みやじま）,Itsukushima (Miya Jima),34.267,132.3,Island
8,冠着山（姨捨山）,かむりきやま（おばすてやま）,Kamuriki Yama (Obasute Yama),36.467,138.1,Mountain
9,火山（硫黄）列島,かざん（いおう）れっとう,Kazan (Io) Retto,24.883,141.467,Extensive natural feature


In [55]:
def has_bracket(d):
    for name in (d["kanji"], d["kana"], d["roman"]):
        if "(" in name:
            return True
        elif "（" in name:
            return True
        elif ")" in name:
            return True
        elif "）" in name:
            return True
    return False

In [56]:
l4df = []

for index, row in df_mod.iterrows():
    d = row.to_dict()
    if not has_bracket(d):
        l4df.append(d)
        continue

    elif d["kanji"].endswith("）") and d["kana"].endswith("）") and d["roman"].endswith(")"):
            assert d["kanji"].count("（") == 1
            assert d["kana"].count("（") == 1
            assert d["roman"].count("(") == 1
            kanjis = d["kanji"][:-1].split("（")
            kanas = d["kana"][:-1].split("（")
            romans = d["roman"][:-1].split("(")
            for x, y, z in zip(kanjis, kanas, romans):
                item = {
                    "kanji": x,
                    "kana": y,
                    "roman": z,
                    "lat": d["lat"],
                    "lng": d["lng"],
                    "class": d["class"]
                }
                print(item)
                l4df.append(item)
    elif d["kanji"] == "火山（硫黄）列島":
        kanjis = ["火山列島", "硫黄列島"]
        kanas = ["かざんれっとう", "いおうれっとう"]
        romans = ["Kazan Retto", "Io Retto"]
        for x, y, z in zip(kanjis, kanas, romans):
                item = {
                    "kanji": x,
                    "kana": y,
                    "roman": z,
                    "lat": d["lat"],
                    "lng": d["lng"],
                    "class": d["class"]
                }
                print(item)
                l4df.append(item)
    elif d["kanji"] == "小日吉海山（八十八夜海山） ":
        assert d["roman"].endswith("）") # 全角括弧
        kanjis = ["小日吉海山", "八十八夜海山"]
        kanas = ["こひよしかいざん", "はちじゅうはちやかいざん"]
        romans = ["Ko-Hiyoshi Kaizan", "Hachijuhachiya Kaizan"]
        for x, y, z in zip(kanjis, kanas, romans):
                item = {
                    "kanji": x,
                    "kana": y,
                    "roman": z,
                    "lat": d["lat"],
                    "lng": d["lng"],
                    "class": d["class"]
                }
                print(item)
                l4df.append(item)
    elif d["kanji"] == "沖の島（沖ノ島）":
        kanjis = ["沖の島", "沖ノ島"]
        kanas = ["おきのしま", "おきのしま"]
        romans = ["Okinoshima", "Okinoshima"]
        for x, y, z in zip(kanjis, kanas, romans):
                item = {
                    "kanji": x,
                    "kana": y,
                    "roman": z,
                    "lat": d["lat"],
                    "lng": d["lng"],
                    "class": d["class"]
                }
                print(item)
                l4df.append(item)
    elif d["kanji"] == "蔵王山":
        kanjis = ["蔵王山", "蔵王山"]
        kanas = ["ざおうざん", "ざおうさん"]
        romans = ["Zao Zan", "Zao San"]
        for x, y, z in zip(kanjis, kanas, romans):
                item = {
                    "kanji": x,
                    "kana": y,
                    "roman": z,
                    "lat": d["lat"],
                    "lng": d["lng"],
                    "class": d["class"]
                }
                print(item)
                l4df.append(item)
    else:
        raise Exception(d)

{'kanji': '烏帽子岳', 'kana': 'えぼしだけ', 'roman': 'Eboshi Dake ', 'lat': Decimal('39.800'), 'lng': Decimal('140.833'), 'class': 'Mountain'}
{'kanji': '乳頭山', 'kana': 'にゅうとうざん', 'roman': 'Nyuto Zan', 'lat': Decimal('39.800'), 'lng': Decimal('140.833'), 'class': 'Mountain'}
{'kanji': '冨崎', 'kana': 'ふさき', 'roman': 'Fu Saki ', 'lat': Decimal('24.367'), 'lng': Decimal('124.117'), 'class': 'Cape'}
{'kanji': '観音埼', 'kana': 'かんのんさき', 'roman': 'Kannon Saki', 'lat': Decimal('24.367'), 'lng': Decimal('124.117'), 'class': 'Cape'}
{'kanji': '江の川', 'kana': 'ごうのかわ', 'roman': 'Gonokawa ', 'lat': Decimal('34.800'), 'lng': Decimal('132.833'), 'class': 'River'}
{'kanji': '可愛川', 'kana': 'えのかわ', 'roman': 'Eno Kawa', 'lat': Decimal('34.800'), 'lng': Decimal('132.833'), 'class': 'River'}
{'kanji': '階上岳', 'kana': 'はしかみだけ', 'roman': 'Hashikami Dake ', 'lat': Decimal('40.400'), 'lng': Decimal('141.583'), 'class': 'Mountain'}
{'kanji': '種市岳', 'kana': 'たねいちだけ', 'roman': 'Taneichi Dake', 'lat': Decimal('40.400'), 'lng': 

In [57]:
df_mod_split = pd.DataFrame(l4df)
assert df_mod_split.shape[0] == 4149 # 4123行+26の分割された行
df_mod_split

Unnamed: 0,kanji,kana,roman,lat,lng,class
0,網走川,あばしりがわ,Abashiri Gawa,44.017,144.283,River
1,網走湖,あばしりこ,Abashiri Ko,43.967,144.183,Lake
2,網走市,あばしりし,Abashiri Shi,44.017,144.267,Municipality
3,安倍川,あべかわ,Abe Kawa,34.933,138.400,River
4,我孫子市,あびこし,Abiko Shi,35.867,140.033,Municipality
...,...,...,...,...,...,...
4144,銭洲,ぜにす,Zeni Su,33.950,138.833,Island
4145,銭洲海嶺,ぜにすかいれい,Zenisu Kairei,33.700,138.500,Undersea feature
4146,銭洲沖海山,ぜにすおきかいざん,Zenisu-Oki Kaizan,33.433,138.417,Undersea feature
4147,善通寺市,ぜんつうじし,Zentsuji Shi,34.233,133.783,Municipality


## 保存

In [58]:
df_mod_split.to_csv(
    "gazetteer-of-japan.csv",
    index=False
)

In [59]:
!wc -l gazetteer-of-japan.csv

    4150 gazetteer-of-japan.csv


In [60]:
!head gazetteer-of-japan.csv

kanji,kana,roman,lat,lng,class
網走川,あばしりがわ,Abashiri Gawa,44.017,144.283,River
網走湖,あばしりこ,Abashiri Ko,43.967,144.183,Lake
網走市,あばしりし,Abashiri Shi,44.017,144.267,Municipality
安倍川,あべかわ,Abe Kawa,34.933,138.400,River
我孫子市,あびこし,Abiko Shi,35.867,140.033,Municipality
安平町,あびらちょう,Abira Cho,42.767,141.817,Municipality
安平川,あびらがわ,Abira Gawa,42.617,141.733,River
安房峠,あぼうとうげ,Abo Toge,36.200,137.583,Pass
阿武町,あぶちょう,Abu Cho,34.500,131.467,Municipality


In [61]:
df_mod_split.to_json(
    "gazetteer-of-japan.json",
    orient="records",
    double_precision=3,
    force_ascii=False,
    indent=2,
)

In [62]:
!head -20 ./gazetteer-of-japan.json

[
  {
    "kanji":"網走川",
    "kana":"あばしりがわ",
    "roman":"Abashiri Gawa",
    "lat":44.017,
    "lng":144.283,
    "class":"River"
  },
  {
    "kanji":"網走湖",
    "kana":"あばしりこ",
    "roman":"Abashiri Ko",
    "lat":43.967,
    "lng":144.183,
    "class":"Lake"
  },
  {
    "kanji":"網走市",
    "kana":"あばしりし",
