**This notebook exists purely as a convenience for importing GeoNames data. It is not meant to be part of the actual TA-D evaluation.**

In [None]:
![ -e MZ.txt ] || (wget https://download.geonames.org/export/dump/MZ.zip && unzip -o MZ.zip)
![ -e ZW.txt ] || (wget https://download.geonames.org/export/dump/ZW.zip && unzip -o ZW.zip)
![ -e TZ.txt ] || (wget https://download.geonames.org/export/dump/TZ.zip && unzip -o TZ.zip)

In [None]:
import csv
import pandas as pd
from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher


def read_geonames(inputfile):
    """Load a GeoNames country dump into a DataFrame with space-free column names.

    Parameters
    ----------
    inputfile : str, path object or file-like
        Header-less, tab-separated GeoNames dump (e.g. ``'MZ.txt'``). It is
        handed directly to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per input record. The ``modification date`` and ``dem``
        columns are dropped, ``geonameid`` is renamed to ``id``, and the
        multi-word columns are renamed to their space-free forms
        (``countrycode``, ``admin1``..``admin4``, ``featureclass``,
        ``featurecode``). Empty fields are returned as NaN.
    """
    # column names found here: http://download.geonames.org/export/dump/readme.txt
    column_names = [
        'geonameid', 'name', 'asciiname', 'alternatenames', 'latitude',
        'longitude', 'feature class', 'feature code', 'country code', 'cc2',
        'admin1 code', 'admin2 code', 'admin3 code', 'admin4 code',
        'population', 'elevation', 'dem', 'timezone', 'modification date'
    ]
    # Admin codes are identifiers, not numbers: reading them as strings keeps
    # leading zeros such as '01' intact.
    column_types = {'admin1 code': str, 'admin2 code': str, 'admin3 code': str, 'admin4 code': str}
    data = pd.read_csv(
        inputfile,
        sep='\t',
        names=column_names,
        dtype=column_types,
        # The dump is plain TSV with no quoting convention; a literal '"' in a
        # name must not be interpreted as the start of a quoted field, or it
        # swallows the following tabs/newlines and corrupts or aborts the parse.
        quoting=csv.QUOTE_NONE,
        # Only an empty field means "missing". Pandas' default NA list would
        # also turn legitimate values such as 'NA' into NaN.
        keep_default_na=False,
        na_values=[''],
    )
    data = data.drop(columns=['modification date', 'dem'])
    data = data.rename(columns={'geonameid': 'id',
                                'country code': 'countrycode',
                                'admin1 code': 'admin1',
                                'admin2 code': 'admin2',
                                'admin3 code': 'admin3',
                                'admin4 code': 'admin4',
                                'feature class': 'featureclass',
                                'feature code': 'featurecode'})
    return data


# Parse the three per-country dumps (read in the order MZ, ZW, TZ).
mz_data, zw_data, tz_data = map(read_geonames, ('MZ.txt', 'ZW.txt', 'TZ.txt'))

# Stack the frames into one table, Zimbabwe first, then run the combined
# table through the KGTK `normalize-nodes` command.
data = pd.concat([zw_data, mz_data, tz_data])
normalized_kg = kgtk(data, """
    normalize-nodes
""")

# Write the result as tab-separated text without the index column.
# NOTE(review): presumably `normalized_kg` is a DataFrame and pandas infers
# gzip compression from the '.gz' suffix - confirm.
normalized_kg.to_csv('geonames_sample.tsv.gz', sep='\t', index=False)