In [157]:
import csv

import pandas as pd
from IPython.display import HTML, display

# File paths of the GeoNames dump files (relative to the working directory)
global_cities_path = 'allCountries.txt'
alternate_names_path = 'alternateNamesV2.txt'
admin1_codes_path = 'admin1CodesASCII.txt'

# Column headers for the global cities file (the files are read with header=None,
# so these names are assigned positionally)
global_cities_headers = [
    'geoname_id', 'name', 'ascii_name', 'alternate_names', 'latitude', 'longitude',
    'feature_class', 'feature_code', 'country_code', 'cc2', 'admin1_code',
    'admin2_code', 'admin3_code', 'admin4_code', 'population', 'elevation',
    'dem', 'timezone', 'modification_date'
]

# Data types for the columns in the global cities file.
# The keys must match global_cities_headers exactly: a key that matches no
# column name is not applied, which is why 'ascii_name' / 'alternate_names'
# are spelled with underscores here.
global_cities_dtype = {
    'geoname_id': 'Int64', 'name': str, 'ascii_name': str, 'alternate_names': str,
    'latitude': float, 'longitude': float, 'feature_class': str, 'feature_code': str,
    'country_code': str, 'cc2': str, 'admin1_code': str, 'admin2_code': str,
    'admin3_code': str, 'admin4_code': str, 'population': float, 'elevation': float,
    'dem': float, 'timezone': str, 'modification_date': str
}

# Column headers for the alternate names file
alternate_names_headers = [
    'alternate_name_id', 'geoname_id', 'iso_language', 'alternate_name',
    'is_preferred_name', 'is_short_name', 'is_colloquial', 'is_historic',
    'from', 'to'
]

# Data types for the columns in the alternate names file.
# The nullable 'Int64' / 'boolean' dtypes allow missing values in id and flag columns.
alternate_names_dtype = {
    'alternate_name_id': 'Int64', 'geoname_id': 'Int64', 'iso_language': str, 'alternate_name': str,
    'is_preferred_name': 'boolean', 'is_short_name': 'boolean', 'is_colloquial': 'boolean', 'is_historic': 'boolean',
    'from': str, 'to': str
}

# Column headers for the admin1 codes file
admin1_codes_headers = [
    'code', 'name', 'name_ascii', 'geoname_id_admin1'
]

# Data types for the columns in the admin1 codes file
admin1_codes_dtype = {
    'code': str, 'name': str, 'name_ascii': str, 'geoname_id_admin1': 'Int64'
}

def _read_geonames_table(path, columns, dtypes, **read_csv_kwargs):
    """Read one headerless, tab-separated GeoNames dump file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the dump file.
    columns : list of str
        Column names, assigned positionally (the files carry no header row).
    dtypes : dict
        Mapping of column name to dtype, forwarded to ``pd.read_csv``.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pd.read_csv`` (e.g. ``usecols``).

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=columns,
        dtype=dtypes,
        # The dumps are plain tab-delimited text without field quoting, so a
        # literal double quote inside a name must not start a quoted field
        # (which could merge several lines into one value).
        quoting=csv.QUOTE_NONE,
        low_memory=False,
        # Only an empty field counts as missing; strings such as 'NA' stay text.
        keep_default_na=False,
        na_values='',
        **read_csv_kwargs,
    )


# Read the files into pandas DataFrames
alternate_names_df = _read_geonames_table(alternate_names_path, alternate_names_headers, alternate_names_dtype)
# The inline 'alternate_names' column is never used (the dedicated alternate
# names file is used instead), so it is skipped at parse time rather than
# loaded and dropped afterwards.
cities_df = _read_geonames_table(
    global_cities_path,
    global_cities_headers,
    global_cities_dtype,
    usecols=[column for column in global_cities_headers if column != 'alternate_names'],
)
admin1_codes_df = _read_geonames_table(admin1_codes_path, admin1_codes_headers, admin1_codes_dtype)

In [158]:
# Preview the first rows of the admin1 codes table loaded above
admin1_codes_df.head()

Unnamed: 0,code,name,name_ascii,geoname_id_admin1
0,AD.06,Sant Julià de Loria,Sant Julia de Loria,3039162
1,AD.05,Ordino,Ordino,3039676
2,AD.04,La Massana,La Massana,3040131
3,AD.03,Encamp,Encamp,3040684
4,AD.02,Canillo,Canillo,3041203


In [159]:
# Replace missing (<NA>) values with False in the four boolean flag columns
flag_columns = ['is_preferred_name', 'is_short_name', 'is_colloquial', 'is_historic']
alternate_names_df[flag_columns] = alternate_names_df[flag_columns].fillna(False)

alternate_names_df.head()

Unnamed: 0,alternate_name_id,geoname_id,iso_language,alternate_name,is_preferred_name,is_short_name,is_colloquial,is_historic,from,to
0,1284819,2994701,,Roc Mélé,False,False,False,False,,
1,1284820,2994701,,Roc Meler,False,False,False,False,,
2,1291197,3017832,,Pic de les Abelletes,False,False,False,False,,
3,4290387,3017832,,Pic de la Font-Nègre,False,False,False,False,,
4,1291198,3017833,,Estany de les Abelletes,False,False,False,False,,


In [160]:
# Build the countries dataset: keep the rows whose feature code is one of the
# country-level codes below and expose their name as 'name_country'
country_feature_codes = ['PCLI', 'PCLS', 'PCLIX', 'TERR', 'PCLD', 'PCL', 'PCLF']
is_country_row = cities_df['feature_code'].isin(country_feature_codes)
countries_df = cities_df.loc[is_country_row].rename(columns={'name': 'name_country'})

countries_df.head()

Unnamed: 0,geoname_id,name_country,ascii_name,latitude,longitude,feature_class,feature_code,country_code,cc2,admin1_code,admin2_code,admin3_code,admin4_code,population,elevation,dem,timezone,modification_date
2725,3041565,Principality of Andorra,Principality of Andorra,42.55,1.58333,A,PCLI,AD,,0,,,,77006.0,,1802.0,Europe/Andorra,2021-08-21
3481,290557,United Arab Emirates,United Arab Emirates,23.75,54.5,A,PCLI,AE,,0,,,,9630959.0,,96.0,Asia/Dubai,2024-09-05
40413,1149361,Islamic Republic of Afghanistan,Islamic Republic of Afghanistan,33.0,66.0,A,PCLI,AF,,0,,,,37172386.0,,2260.0,Asia/Kabul,2024-07-23
87954,3576396,Antigua and Barbuda,Antigua and Barbuda,17.05,-61.8,A,PCLI,AG,,0,,,,96286.0,,65.0,America/Antigua,2024-09-05
88413,3573511,Anguilla,Anguilla,18.21667,-63.05,A,PCLD,AI,,0,,,,13254.0,,19.0,America/Anguilla,2021-08-16


In [161]:
# Feature codes of the place types to keep
feature_codes = [
    'PPLA', 'PPLC', 'PPL', 'PPLW', 'PPLG',
    'PPLL', 'PPLS', 'PPLF', 'PPLR',
]

# Keep only rows with a wanted feature code and a population of at least 15000
# (rows with a missing population fail the comparison and are excluded)
has_wanted_code = cities_df['feature_code'].isin(feature_codes)
is_large_enough = cities_df['population'].ge(15000)
filtered_cities_df = cities_df[has_wanted_code & is_large_enough]

In [162]:
# Build a lookup with exactly one row per country code.
# countries_df is selected by feature code only, so nothing guarantees that a
# country code appears once; a duplicated code would make the left merge below
# emit the same city several times. When a code is duplicated, the most
# populous entity is kept (NOTE(review): heuristic - confirm it picks the
# intended entity for every duplicated code).
country_lookup = (
    countries_df
    .sort_values('population', ascending=False, kind='stable')
    .drop_duplicates(subset='country_code', keep='first')
    [['geoname_id', 'name_country', 'country_code']]
)

# Merge the DataFrames on the country code; validate= makes pandas raise if the
# lookup ever stops being unique instead of silently multiplying city rows.
cities_with_country = pd.merge(
    filtered_cities_df,
    country_lookup,
    on='country_code',
    how='left',
    suffixes=('_city', '_country'),
    validate='many_to_one',
)

cities_with_country.head()

Unnamed: 0,geoname_id_city,name,ascii_name,latitude,longitude,feature_class,feature_code,country_code,cc2,admin1_code,admin2_code,admin3_code,admin4_code,population,elevation,dem,timezone,modification_date,geoname_id_country,name_country
0,3040051,les Escaldes,les Escaldes,42.50729,1.53414,P,PPLA,AD,,8,,,,15853.0,,1033.0,Europe/Andorra,2024-06-20,3041565,Principality of Andorra
1,3041563,Andorra la Vella,Andorra la Vella,42.50779,1.52109,P,PPLC,AD,,7,,,,20430.0,,1037.0,Europe/Andorra,2020-03-03,3041565,Principality of Andorra
2,290503,Warīsān,Warisan,25.16744,55.40708,P,PPL,AE,,3,,,,108759.0,,12.0,Asia/Dubai,2024-06-11,290557,United Arab Emirates
3,290594,Umm Al Quwain City,Umm Al Quwain City,25.56473,55.55517,P,PPLA,AE,,7,,,,62747.0,,2.0,Asia/Dubai,2019-10-24,290557,United Arab Emirates
4,291074,Ras Al Khaimah City,Ras Al Khaimah City,25.78953,55.9432,P,PPLA,AE,,5,,,,351943.0,,2.0,Asia/Dubai,2019-09-09,290557,United Arab Emirates


In [163]:
# Include the first-order administrative division in the cities table.
# The admin1 codes table is keyed by '<country_code>.<admin1_code>', so that key
# is derived first. .assign() returns a new frame, leaving cities_with_country
# (displayed by an earlier cell) unmodified.
admin1_lookup = admin1_codes_df[['code', 'name', 'geoname_id_admin1']]

cities_with_country_admin1_geocodes = (
    cities_with_country
    .assign(admin1_geocode=lambda frame: frame['country_code'] + '.' + frame['admin1_code'])
    # 'name' exists on both sides, so the suffixes turn it into name_city / name_admin1
    .merge(admin1_lookup, left_on='admin1_geocode', right_on='code',
           how='left', suffixes=('_city', '_admin1'))
    .drop(columns='code')
)

# Render the first 100 rows as an HTML table
display(HTML(cities_with_country_admin1_geocodes.head(100).to_html()))

Unnamed: 0,geoname_id_city,name_city,ascii_name,latitude,longitude,feature_class,feature_code,country_code,cc2,admin1_code,admin2_code,admin3_code,admin4_code,population,elevation,dem,timezone,modification_date,geoname_id_country,name_country,admin1_geocode,name_admin1,geoname_id_admin1
0,3040051,les Escaldes,les Escaldes,42.50729,1.53414,P,PPLA,AD,,8,,,,15853.0,,1033.0,Europe/Andorra,2024-06-20,3041565,Principality of Andorra,AD.08,Escaldes-Engordany,3338529
1,3041563,Andorra la Vella,Andorra la Vella,42.50779,1.52109,P,PPLC,AD,,7,,,,20430.0,,1037.0,Europe/Andorra,2020-03-03,3041565,Principality of Andorra,AD.07,Andorra la Vella,3041566
2,290503,Warīsān,Warisan,25.16744,55.40708,P,PPL,AE,,3,,,,108759.0,,12.0,Asia/Dubai,2024-06-11,290557,United Arab Emirates,AE.03,Dubai,292224
3,290594,Umm Al Quwain City,Umm Al Quwain City,25.56473,55.55517,P,PPLA,AE,,7,,,,62747.0,,2.0,Asia/Dubai,2019-10-24,290557,United Arab Emirates,AE.07,Imārat Umm al Qaywayn,290595
4,291074,Ras Al Khaimah City,Ras Al Khaimah City,25.78953,55.9432,P,PPLA,AE,,5,,,,351943.0,,2.0,Asia/Dubai,2019-09-09,290557,United Arab Emirates,AE.05,Raʼs al Khaymah,291075
5,291580,Zayed City,Zayed City,23.65416,53.70522,P,PPL,AE,,1,103.0,12748055.0,,63482.0,,118.0,Asia/Dubai,2024-03-14,290557,United Arab Emirates,AE.01,Abu Dhabi,292969
6,291696,Khawr Fakkān,Khawr Fakkan,25.33132,56.34199,P,PPL,AE,,6,,,,40677.0,,20.0,Asia/Dubai,2024-09-12,290557,United Arab Emirates,AE.06,Sharjah,292673
7,291763,Kalbā,Kalba,25.07462,56.35545,P,PPL,AE,,6,,,,37545.0,,1.0,Asia/Dubai,2024-04-03,290557,United Arab Emirates,AE.06,Sharjah,292673
8,292223,Dubai,Dubai,25.07725,55.30927,P,PPLA,AE,,3,,,,3478300.0,,24.0,Asia/Dubai,2024-06-19,290557,United Arab Emirates,AE.03,Dubai,292224
9,292231,Dibba Al-Fujairah,Dibba Al-Fujairah,25.59246,56.26176,P,PPL,AE,,4,,,,30000.0,,16.0,Asia/Dubai,2014-08-12,290557,United Arab Emirates,AE.04,Fujairah,292879


In [168]:
# ISO language code of the alternate names to use ('pl' = Polish)
TARGET_LANGUAGE = 'pl'

# Keep only the alternate names in the target language; .copy() makes the
# result an independent frame so columns can be added to it safely
filtered_alternate_names = alternate_names_df[alternate_names_df['iso_language'] == TARGET_LANGUAGE].copy()

# Priority for each (preferred, short, colloquial, historic) flag combination;
# every combination not listed here gets the lowest priority, 4.
_PRIORITY_BY_FLAGS = {
    (True, False, False, False): 1,   # preferred name only
    (False, False, False, False): 2,  # no flag set
    (False, True, False, False): 3,   # short name only
}


def determine_priority(row):
    """Rank an alternate-name record; a lower number is a better candidate.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide 'is_preferred_name', 'is_short_name', 'is_colloquial'
        and 'is_historic'. A missing value (None, NaN or <NA>) is treated as
        "flag not set", so the ranking does not depend on the flags having
        been filled beforehand.

    Returns
    -------
    int
        1 for a preferred-only name, 2 when no flag is set, 3 for a
        short-only name and 4 for any other combination.
    """
    def flag_is_set(column):
        value = row[column]
        return False if pd.isna(value) else bool(value)

    flags = (
        flag_is_set('is_preferred_name'),
        flag_is_set('is_short_name'),
        flag_is_set('is_colloquial'),
        flag_is_set('is_historic'),
    )
    return _PRIORITY_BY_FLAGS.get(flags, 4)

# Add a priority column to the filtered DataFrame
filtered_alternate_names['priority'] = filtered_alternate_names.apply(determine_priority, axis=1)

# Keep exactly one complete row per geoname_id: the one with the best (lowest)
# priority. drop_duplicates keeps whole rows, whereas groupby(...).first()
# takes the first non-null value of each column separately and can therefore
# combine values from different rows. Rows without an id or a name are dropped
# first, as they cannot provide a usable name.
filtered_alternate_names = (
    filtered_alternate_names
    .dropna(subset=['geoname_id', 'alternate_name'])
    .sort_values(by=['geoname_id', 'priority'])
    .drop_duplicates(subset='geoname_id', keep='first')
    .reset_index(drop=True)
)

# Put geoname_id first, matching the column layout this cell produced before
filtered_alternate_names = filtered_alternate_names[
    ['geoname_id'] + [column for column in filtered_alternate_names.columns if column != 'geoname_id']
]

filtered_alternate_names.head()


Unnamed: 0,geoname_id,alternate_name_id,iso_language,alternate_name,is_preferred_name,is_short_name,is_colloquial,is_historic,from,to,priority
0,677,17967561,pl,Stadion Azadi,False,False,False,False,,,2
1,2106,17967737,pl,Port lotniczy Raszt,False,False,False,False,,,2
2,3370,17967837,pl,Aghbal,False,False,False,False,,,2
3,3456,17967873,pl,Nader Goli,False,False,False,False,,,2
4,4867,17967966,pl,Chirokitia,False,False,False,False,,,2


In [171]:
def _attach_alternate_name(frame, names, id_column, fallback_column, target_column):
    """Add the alternate name of the entity referenced by ``id_column``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing ``id_column`` and ``fallback_column``.
    names : pandas.DataFrame
        Lookup with the columns 'geoname_id' and 'alternate_name'.
    id_column : str
        Column of ``frame`` holding the geoname id to look up.
    fallback_column : str
        Column of ``frame`` whose value is used when no alternate name exists.
    target_column : str
        Name of the column that receives the resulting name.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``target_column`` appended; ``frame`` is not modified.
    """
    # Renaming the lookup columns up front avoids relying on merge suffixes
    lookup = names[['geoname_id', 'alternate_name']].rename(
        columns={'geoname_id': id_column, 'alternate_name': target_column}
    )
    merged = frame.merge(lookup, how='left', on=id_column)
    merged[target_column] = merged[target_column].fillna(merged[fallback_column])
    return merged


# Add alternate city, admin1 and country names, each falling back to the
# original name when no alternate name is available
cities_with_country_admin1_alternates = (
    cities_with_country_admin1_geocodes
    .pipe(_attach_alternate_name, filtered_alternate_names,
          'geoname_id_city', 'name_city', 'alternate_name_city')
    .pipe(_attach_alternate_name, filtered_alternate_names,
          'geoname_id_admin1', 'name_admin1', 'alternate_name_admin1')
    .pipe(_attach_alternate_name, filtered_alternate_names,
          'geoname_id_country', 'name_country', 'alternate_name_country')
)

# Display the rows that still have no admin1 name after the fallback
rows_without_admin1_name = cities_with_country_admin1_alternates[
    cities_with_country_admin1_alternates['alternate_name_admin1'].isna()
]
display(HTML(rows_without_admin1_name.head(100).to_html()))

Unnamed: 0,geoname_id_city,name_city,ascii_name,latitude,longitude,feature_class,feature_code,country_code,cc2,admin1_code,admin2_code,admin3_code,admin4_code,population,elevation,dem,timezone,modification_date,geoname_id_country,name_country,admin1_geocode,name_admin1,geoname_id_admin1,alternate_name_city,alternate_name_admin1,alternate_name_country
502,3577072,Tanki Leendert,Tanki Leendert,12.53914,-70.02004,P,PPL,AW,,00,,,,21500.0,,26.0,America/Aruba,2018-12-04,3577279,Aruba,AW.00,,,Tanki Leendert,,Aruba
503,3577089,San Nicolas,San Nicolas,12.43624,-69.90713,P,PPL,AW,,00,,,,15200.0,,14.0,America/Aruba,2018-12-04,3577279,Aruba,AW.00,,,San Nicolas,,Aruba
504,3577154,Oranjestad,Oranjestad,12.52398,-70.02703,P,PPLC,AW,,00,,,,29998.0,5.0,22.0,America/Aruba,2024-01-10,3577279,Aruba,AW.00,,,Oranjestad,,Aruba
505,3577159,Noord,Noord,12.56596,-70.03198,P,PPL,AW,,00,,,,24193.0,,11.0,America/Aruba,2024-04-21,3577279,Aruba,AW.00,,,Noord,,Aruba
3664,3513090,Willemstad,Willemstad,12.12246,-68.88641,P,PPLC,CW,,,,,,125000.0,,1.0,America/Curacao,2024-01-10,7626836,Country of Curaçao,,,,Willemstad,,Curaçao
4937,2462881,Laayoune,Laayoune,27.1418,-13.18797,P,PPLA,EH,MA,,,,,196331.0,,68.0,Africa/El_Aaiun,2024-09-05,2461445,Western Sahara,,,,Al-Ujun,,Sahara Zachodnia
4938,2463029,Boujdour,Boujdour,26.13073,-14.48513,P,PPL,EH,,,,,,42651.0,,28.0,Africa/El_Aaiun,2022-12-20,2461445,Western Sahara,,,,Boujdour,,Sahara Zachodnia
4939,2463447,Dakhla,Dakhla,23.68477,-15.95798,P,PPLA,EH,MA,CE,,,,75000.0,,6.0,Africa/El_Aaiun,2020-11-01,2461445,Western Sahara,EH.CE,,,Dakhla,,Sahara Zachodnia
5311,11048860,Chagni,Chagni,10.95627,36.50456,P,PPLL,ET,,28,,,,52300.0,0.0,1637.0,Africa/Addis_Ababa,2023-11-30,337996,Federal Democratic Republic of Ethiopia,ET.28,,,Chagni,,Etiopia
5312,11048861,Shendi,Shendi,10.64061,36.94684,P,PPLL,ET,,28,,,,23900.0,2063.0,2105.0,Africa/Addis_Ababa,2023-11-30,337996,Federal Democratic Republic of Ethiopia,ET.28,,,Shendi,,Etiopia


In [None]:
# TODO: Find a way to handle rows where the admin1 name and the country name are identical, to avoid output such as "Hong Kong, Hong Kong".