# Geonames.org Reference Data

In [None]:
from datetime import datetime
from os import environ
from pathlib import Path
import re
from tempfile import TemporaryDirectory
import urllib
from zipfile import ZipFile

import numpy as np
import pandas as pd
import requests
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [None]:
%%bash

# List the installed Python packages and their versions
# (presumably to document the environment this notebook ran in).
pip3 list

## Places (Geonames.org)

In [None]:
%%time

# Download the zipped US places dump from Geonames.org.
places_uri = 'https://download.geonames.org/export/dump/US.zip'

session = requests.Session()
# requests applies no timeout by default, so a stalled connection would hang
# the notebook forever. The tuple is (connect timeout, read timeout) in
# seconds; the read timeout bounds the gap between received bytes, not the
# total duration of the download.
r = session.get(places_uri, timeout=(10, 60))
# Fail loudly on 4xx/5xx rather than saving an error page as a zip archive.
r.raise_for_status()
r.status_code

In [None]:
# Persist the downloaded archive under data/ and unpack US.txt next to it.
data_dir = Path('data')
# Create the target directory if it is missing; otherwise open() below raises
# FileNotFoundError.
data_dir.mkdir(parents=True, exist_ok=True)

archive_path = data_dir / 'US-places.zip'
with open(archive_path, 'wb') as fd:
    # Write in 1 MiB chunks; 128-byte chunks cause needlessly many tiny writes.
    for chunk in r.iter_content(chunk_size=1024 * 1024):
        fd.write(chunk)

with ZipFile(archive_path) as z:
    z.extract('US.txt', path=data_dir)

In [None]:
%%time

# Column names for the Geonames dump. They are passed via `names=`, so pandas
# reads the file as header-less and labels the columns in this order.
header = [
    'geonameid', 'name', 'ascii_name',
    'alternatenames', 'latitude', 'longitude',
    'feature_class', 'feature_code', 'country_code',
    'cc2', 'admin_code1', 'admin_code2',
    'admin_code3', 'admin_code4', 'population',
    'elevation', 'dem', 'timezone', 'modification_date',
]

# Force every admin code column to be read as a string (presumably to preserve
# leading zeros and non-numeric codes -- confirm against the data).
# The original dict listed 'admin_code3' twice, so 'admin_code4' was left to
# dtype inference; each of the four columns is now listed exactly once.
# NOTE(review): read_table gives '"' its default quoting meaning; if the dump
# contains unbalanced quotes, consider quoting=csv.QUOTE_NONE -- confirm.
places = pd.read_table(
    'data/US.txt',
    names=header,
    dtype={
        'admin_code1': str,
        'admin_code2': str,
        'admin_code3': str,
        'admin_code4': str,
    },
)
# These two columns are not used in the rest of the visible notebook.
places.drop(['admin_code4', 'ascii_name'], axis=1, inplace=True)
places.shape

In [None]:
# Display the dtype pandas assigned to each column of `places`.
places.dtypes

## Transpose Place Synonyms into Rows
- Uses the built-in pandas [explode](https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html#exploding-a-list-like-column) function

In [None]:
%%time

# Keep only the rows that actually carry alternate names.
has_synonyms = places['alternatenames'].notna()
pivot = places.loc[has_synonyms, ['alternatenames']]  #.head(1000)

# Turn each comma-separated string into a list, still as a one-column frame
# that shares pivot's index.
to_columns2 = pivot['alternatenames'].str.split(',').to_frame()

# One row per synonym; each exploded row keeps the index label of its source
# row, which is what the join below matches on.
to_rows2 = to_columns2.explode('alternatenames')

# Re-attach the remaining place attributes by index. The exploded column
# clashes with places.alternatenames, so it picks up the '_tmp' suffix.
places2 = to_rows2.join(places.drop(columns=['name']), lsuffix='_tmp')

# Promote the synonym to the `name` column, removing the temporary column.
places2['name'] = places2.pop('alternatenames_tmp')

places2.shape

In [None]:
# Stack the original places on top of the one-row-per-synonym frame and drop
# rows that are exact duplicates across all columns.
# DataFrame.append was deprecated and then removed in pandas 2.0. pd.concat
# with its defaults does the same here: columns are aligned by name in the
# order of the first frame, and the original index labels are kept.
unity = pd.concat([places, places2]).drop_duplicates()
unity.shape

In [None]:
# Strip the '-' separators from modification_date and store the result as a
# 32-bit integer (e.g. '2011-05-14' -> 20110514; presumably the source values
# are YYYY-MM-DD strings -- confirm against the data).
unity['modification_date'] = (
    unity['modification_date']
    .str.replace('-', '', regex=False)
    .astype(np.int32)
)

In [None]:
# Display the dtype of each column of `unity`.
unity.dtypes