# Geonames.org Reference Data

In [24]:
from datetime import datetime
from os import environ
import re
from tempfile import TemporaryDirectory
import urllib
from urllib.parse import quote_plus
from zipfile import ZipFile

import numpy as np
import pandas as pd
import requests
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [None]:
%%bash

# List the Python packages that pip3 reports as installed.
pip3 list

## Places (Geonames.org)

In [3]:
%%time

places_uri = 'https://download.geonames.org/export/dump/US.zip'

session = requests.Session()
r = session.get(places_uri)
r.raise_for_status()
r.status_code

CPU times: user 1.4 s, sys: 655 ms, total: 2.05 s
Wall time: 14.6 s


200

In [4]:
# The response was requested without stream=True, so its body is already
# fully buffered in memory; write it in a single call rather than slicing it
# into 128-byte pieces.
with open('data/US-places.zip', 'wb') as fd:
    fd.write(r.content)

# Unpack only the data file; it is written to data/US.txt.
with ZipFile('data/US-places.zip') as z:
    z.extract('US.txt', path='data')

In [6]:
%%time

header = ['geonameid', 'name', 'ascii_name'
          , 'alternatenames', 'latitude', 'longitude'
          , 'feature_class', 'feature_code', 'country_code'
          , 'cc2', 'admin_code1', 'admin_code2'
          , 'admin_code3', 'admin_code4', 'population'
          , 'elevation', 'dem', 'timezone', 'modification_date'
         ]

places = pd.read_table('data/US.txt'
                       , names=header
                       , dtype={
                           'admin_code1': str, 'admin_code2': str,
                           'admin_code3': str, 'admin_code3': str}
                      )
places.drop(['admin_code4', 'ascii_name'], axis=1, inplace=True)
places.shape

  call = lambda f, *a, **k: f(*a, **k)


CPU times: user 7.53 s, sys: 464 ms, total: 8 s
Wall time: 7.99 s


(2237919, 17)

In [None]:
# Display the dtype pandas assigned to each column of `places`.
places.dtypes

## Transpose Place Synonyms into Rows
- Uses the pandas built-in [explode](https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html#exploding-a-list-like-column) function to turn each comma-separated synonym into its own row.

In [7]:
%%time

# 347_209
pivot = places.loc[~places.alternatenames.isna(), ['alternatenames']]  #.head(1000)
pivot.shape

# Split csv strings to iterable
to_columns2 = pd.DataFrame(data=pivot.alternatenames.str.split(pat=',', expand=False)
                          ,index=pivot.index)

# Use built in explode (may be more efficient)
to_rows2 = to_columns2.explode('alternatenames')

places2 = to_rows2.join(places.drop(['name'], axis=1)
                        , lsuffix='_tmp')

places2['name'] = places2.alternatenames_tmp
places2.drop(['alternatenames_tmp'], axis=1, inplace=True)

places2.shape

CPU times: user 2.95 s, sys: 320 ms, total: 3.27 s
Wall time: 3.27 s


(986417, 17)

In [32]:
# Stack the original places on top of the one-row-per-synonym frame and drop
# exact duplicate rows. pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
unity = pd.concat([places, places2]).drop_duplicates()

# 2_912_531
unity.shape

(2908037, 17)

In [33]:
# Strip the dashes from the date strings and store the result as a 32-bit
# integer.
compact_dates = unity['modification_date'].str.replace(pat='-', repl='', regex=False)
unity['modification_date'] = compact_dates.astype(np.int32)

In [34]:
# Display the dtype of each column of `unity`.
unity.dtypes

geonameid              int64
name                  object
alternatenames        object
latitude             float64
longitude            float64
feature_class         object
feature_code          object
country_code          object
cc2                   object
admin_code1           object
admin_code2           object
admin_code3           object
population             int64
elevation            float64
dem                    int64
timezone              object
modification_date      int32
dtype: object

## Postal Codes (Geonames.org)
- Geonames' [zip codes](http://download.geonames.org/export/zip/US.zip)

In [None]:
%%time

zip_uri = 'http://download.geonames.org/export/zip/US.zip'

session = requests.Session()

r = session.get(zip_uri)
r.raise_for_status()

r.status_code

In [None]:
# The response was requested without stream=True, so its body is already
# fully buffered in memory; write it in a single call rather than slicing it
# into 128-byte pieces.
with open('data/US.zip', 'wb') as fd:
    fd.write(r.content)

# NOTE: this writes data/US.txt, the same path the places archive was
# extracted to, so the places file on disk is replaced by the postal-code
# file from here on.
with ZipFile('data/US.zip') as z:
    z.extract('US.txt', path='data')

In [None]:
%%time

header = ['country_code', 'postal_code', 'place_name',
          'admin_name1', 'admin_code1', 'admin_name2',
          'admin_code2', 'admin_name3', 'admin_code3',
          'latitude', 'longitude', 'accuracy']

zips = pd.read_table('data/US.txt', names=header, 
                     dtype={'postal_code': str, 'admin_code2': str, 
                            'admin_code3': str, 'accuracy': str})
zips.shape

In [None]:
# Display the dtype of each column of `zips`.
zips.dtypes

### Use RegEx to Cleanse Pattern `Mc *`

In [None]:
# Close the gap after a stand-alone "Mc" so that e.g. "Mc Donald" becomes
# "McDonald": cleanse_1 handles "Mc" in the middle of a name, cleanse_2
# handles "Mc" at the very start.
cleanse_1 = re.compile(pattern=r'\sMc\s')
cleanse_2 = re.compile(pattern=r'^Mc\s')

# The replacement is applied to the whole column: names that do not match
# pass through unchanged, so no boolean pre-filter is needed (and missing
# names no longer break the filter). regex=True is spelled out because
# pandas >= 2.0 defaults to literal replacement and rejects a compiled
# pattern in that mode.
zips['place_name'] = (
    zips['place_name']
    .str.replace(pat=cleanse_1, repl=' Mc', regex=True)
    .str.replace(pat=cleanse_2, repl='Mc', regex=True)
)

In [None]:
# Show every row whose place name begins with "Mc".
starts_with_mc = zips['place_name'].str.startswith('Mc')
zips[starts_with_mc]

In [None]:
# Show the rows that have a value in admin_code3.
has_admin3 = zips['admin_code3'].notna()
zips[has_admin3]

## Write to SQL DB

In [None]:
# Connection settings are read from the environment, each with a fallback.
# NOTE(review): the fallback password is committed here in plain text;
# consider requiring SQL_PASSWORD to be set instead of shipping a default.
driver = environ.get('SQL_DRIVER', '{ODBC Driver 17 for SQL Server}')
host = environ.get('SQL_HOST', 'sql-geonames')
db = environ.get('SQL_DB', 'ScratchDB')
user = environ.get('SQL_USER', 'sa')
pw = environ.get('SQL_PASSWORD', 'HelloWorld1')
con_str = f'DRIVER={driver};SERVER={host};DATABASE={db};UID={user};PWD={pw}'

# The raw ODBC string is URL-encoded so it can ride in the odbc_connect query
# parameter. quote_plus is imported by name from urllib.parse because a bare
# `import urllib` does not guarantee the `parse` submodule is loaded.
params = quote_plus(con_str)
sql_engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}",
                           echo=True)  # echo logs the emitted SQL

In [None]:
# Stamp both frames with the same creation time and creator name.
created = datetime.now()
createdby = 'bshGeonamesToADLS'

for frame in (zips, unity):
    frame['CreatedDateTime'] = created
    frame['RecCreatedBy'] = createdby

In [None]:
%%time

with sql_engine.connect() as c:
    zips.to_sql(name='ZipCode', con=c, schema='Staging', if_exists='append', index=False)
    unity.to_sql(name='GeoPlace', con=c, schema='Staging', if_exists='append', index=False)

## Save to parquet format with snappy compression

In [None]:
%%time

zips.to_parquet(path='data/zipcodes.parquet', engine='fastparquet'
                , compression='snappy')
places.to_parquet(path='data/geoplaces.parquet', engine='fastparquet'
                  , compression='snappy')