In [205]:
import csv

import numpy as np
import pandas as pd

# Input file paths, given relative to the working directory the notebook runs in.
# NOTE(review): the file names match the GeoNames dump files — confirm the expected download location.
global_cities_path = 'allCountries.txt'
alternate_names_path = 'alternateNamesV2.txt'
admin1_codes_path = 'admin1CodesASCII.txt'

# Column names for the global cities file, in file order (the file is read
# with header=None, so these names are assigned positionally).
global_cities_headers = [
    'geoname_id', 'name', 'ascii_name', 'alternate_names', 'latitude', 'longitude',
    'feature_class', 'feature_code', 'country_code', 'cc2', 'admin1_code',
    'admin2_code', 'admin3_code', 'admin4_code', 'population', 'elevation',
    'dem', 'timezone', 'modification_date'
]

# Data types for the columns of the global cities file.
# Every key must be spelled exactly like the corresponding entry in
# global_cities_headers: a key that matches no column is not applied, which
# leaves that column to type inference instead of the declared dtype.
global_cities_dtype = {
    'geoname_id': 'Int64', 'name': str, 'ascii_name': str, 'alternate_names': str,
    'latitude': float, 'longitude': float, 'feature_class': str, 'feature_code': str,
    'country_code': str, 'cc2': str, 'admin1_code': str, 'admin2_code': str,
    'admin3_code': str, 'admin4_code': str, 'population': float, 'elevation': float,
    'dem': float, 'timezone': str, 'modification_date': str
}

# Column names for the alternate names file, in file order
alternate_names_headers = [
    'alternate_name_id',
    'geoname_id',
    'iso_language',
    'alternate_name',
    'is_preferred_name',
    'is_short_name',
    'is_colloquial',
    'is_historic',
    'from',
    'to',
]

# Data types for the alternate names file: nullable integers for the ids,
# nullable booleans for the four flag columns, plain strings for the rest
alternate_names_dtype = {
    'alternate_name_id': 'Int64',
    'geoname_id': 'Int64',
    'iso_language': str,
    'alternate_name': str,
    **dict.fromkeys(
        ['is_preferred_name', 'is_short_name', 'is_colloquial', 'is_historic'],
        'boolean',
    ),
    'from': str,
    'to': str,
}

# Column names for the admin1 codes file, in file order
admin1_codes_headers = ['code', 'name', 'name_ascii', 'geoname_id_admin1']

# Data types for the admin1 codes file, paired positionally with the headers:
# three string columns followed by a nullable integer id
admin1_codes_dtype = dict(zip(admin1_codes_headers, (str, str, str, 'Int64')))

# Read the files into pandas DataFrames.
# Options shared by all three reads:
#   - tab-separated, no header row (column names are supplied explicitly)
#   - keep_default_na=False with na_values='' so that only empty fields become
#     missing; literal strings such as "NA" are kept as data
#   - quoting=csv.QUOTE_NONE so a double quote inside a field is treated as an
#     ordinary character; with the default quoting, a field starting with '"'
#     opens a quoted section that can strip the quotes or merge following rows
tsv_read_options = dict(
    sep='\t',
    header=None,
    low_memory=False,
    keep_default_na=False,
    na_values='',
    quoting=csv.QUOTE_NONE,
)

alternate_names_df = pd.read_csv(
    alternate_names_path,
    names=alternate_names_headers,
    dtype=alternate_names_dtype,
    **tsv_read_options,
)

# The comma-joined 'alternate_names' column is dropped right after loading
cities_df = pd.read_csv(
    global_cities_path,
    names=global_cities_headers,
    dtype=global_cities_dtype,
    **tsv_read_options,
).drop('alternate_names', axis=1)

admin1_codes_df = pd.read_csv(
    admin1_codes_path,
    names=admin1_codes_headers,
    dtype=admin1_codes_dtype,
    **tsv_read_options,
)

In [157]:
# The four flag columns are nullable booleans; treat a missing flag as False
flag_columns = ['is_preferred_name', 'is_short_name', 'is_colloquial', 'is_historic']
alternate_names_df[flag_columns] = alternate_names_df[flag_columns].fillna(False)

In [158]:
# Build the countries dataset: rows whose feature_code is one of the codes
# below, with 'name' renamed to 'name_country'
# (presumably the GeoNames political-entity feature codes — confirm)
country_feature_codes = ['PCLI', 'PCLS', 'PCLIX', 'TERR', 'PCLD', 'PCL', 'PCLF']
is_country_row = cities_df['feature_code'].isin(country_feature_codes)
countries_df = cities_df[is_country_row].rename(columns={'name': 'name_country'})

In [159]:
# Feature codes a row must carry to be kept as a city
# (presumably the GeoNames populated-place codes — confirm)
feature_codes = [
    'PPLA', 'PPLC', 'PPL', 'PPLW',
    'PPLG', 'PPLL', 'PPLS', 'PPLF', 'PPLR',
]

# Keep rows with one of the codes above and a population of at least 15000
has_city_feature_code = cities_df['feature_code'].isin(feature_codes)
has_min_population = cities_df['population'] >= 15000
filtered_cities_df = cities_df[has_city_feature_code & has_min_population]

In [160]:
# Attach the country id and name to every city via the shared country code.
# 'geoname_id' exists on both sides, so the suffixes turn it into
# 'geoname_id_city' and 'geoname_id_country'.
# NOTE(review): if countries_df holds more than one row for a country code,
# this left join repeats the city once per matching row — confirm the codes are unique.
country_lookup = countries_df[['geoname_id', 'name_country', 'country_code']]
cities_with_country = filtered_cities_df.merge(
    country_lookup,
    on='country_code',
    how='left',
    suffixes=('_city', '_country'),
)

In [161]:
# Add the first-order administrative division to the city table.
# The admin1 file is keyed by '<country_code>.<admin1_code>', so build that key first.
cities_with_country['admin1_geocode'] = cities_with_country['country_code'] + '.' + cities_with_country['admin1_code']

# 'name' exists on both sides and becomes 'name_city' / 'name_admin1';
# the right-hand join key 'code' is dropped once the join is done
admin1_lookup = admin1_codes_df[['code', 'name', 'geoname_id_admin1']]
cities_with_country_admin1_geocodes = cities_with_country.merge(
    admin1_lookup,
    left_on='admin1_geocode',
    right_on='code',
    how='left',
    suffixes=('_city', '_admin1'),
).drop('code', axis=1)

In [148]:
# Keep the admin1 id only when it is needed to tell cities apart: clear it when
# the city name is unique within its country, keep it when several cities in
# the same country share the name.
cities_with_country_admin1_geocodes["city_count"] = (
    cities_with_country_admin1_geocodes
    .groupby(["geoname_id_country", "name_city"])["geoname_id_city"]
    .transform("count")
)

# Vectorised mask instead of a row-wise apply: values where the condition is
# False become missing, and the column keeps its existing dtype rather than
# being rebuilt from per-row Python objects. A missing count compares as
# False, so such rows are cleared as well.
cities_with_country_admin1_geocodes["geoname_id_admin1"] = (
    cities_with_country_admin1_geocodes["geoname_id_admin1"]
    .where(cities_with_country_admin1_geocodes["city_count"] > 1)
)

In [198]:
# Keep only the French alternate names; copy so that the columns added later
# do not write into alternate_names_df
is_french = alternate_names_df['iso_language'] == 'fr'
filtered_alternate_names = alternate_names_df.loc[is_french].copy()

def determine_priority(row):
    """Rank an alternate-name row by its flag columns; lower is better.

    Args:
        row: mapping-like object exposing 'is_preferred_name',
            'is_short_name', 'is_colloquial' and 'is_historic'.

    Returns:
        1 for a preferred name with no other flag set,
        2 for a name with no flag set at all,
        3 for a short name with no other flag set,
        4 for every other combination.
    """
    flag_columns = ('is_preferred_name', 'is_short_name', 'is_colloquial', 'is_historic')
    # (rank, expected flag values in flag_columns order), tried top to bottom
    ranked_patterns = (
        (1, (True, False, False, False)),
        (2, (False, False, False, False)),
        (3, (False, True, False, False)),
    )
    for rank, expected_flags in ranked_patterns:
        if all(row[column] == expected for column, expected in zip(flag_columns, expected_flags)):
            return rank
    return 4

# Add a priority column to the filtered DataFrame (1 = best, 4 = worst)
filtered_alternate_names['priority'] = filtered_alternate_names.apply(determine_priority, axis=1)

# Keep exactly one row per geoname_id: the one with the best priority.
# drop_duplicates(keep='first') keeps the whole first row of each id, whereas
# groupby(...).first() takes the first non-null value of every column
# separately and can therefore combine values from different rows.
# Rows without a geoname_id are dropped so that a missing key can never be
# matched against missing keys in the merges that use this table.
filtered_alternate_names = (
    filtered_alternate_names
    .dropna(subset=['geoname_id'])
    .sort_values(by=['priority', 'geoname_id'])
    .drop_duplicates(subset='geoname_id', keep='first')
    .sort_values(by='geoname_id', ignore_index=True)
)

In [199]:
# French names keyed by geoname_id, reused for the three lookups below
french_names = filtered_alternate_names[['geoname_id', 'alternate_name']]

# City level: attach the French city name
cities_with_country_admin1_alternates = (
    cities_with_country_admin1_geocodes
    .merge(french_names, how='left', left_on='geoname_id_city', right_on='geoname_id')
    .drop('geoname_id', axis=1)
)

# Cities without a French name fall back to their original name
cities_with_country_admin1_alternates['alternate_name'] = (
    cities_with_country_admin1_alternates['alternate_name']
    .fillna(cities_with_country_admin1_alternates['name_city'])
)

# Admin1 level: 'alternate_name' now exists on both sides, so the suffixes
# produce 'alternate_name_city' and 'alternate_name_admin1'
cities_with_country_admin1_alternates = (
    cities_with_country_admin1_alternates
    .merge(
        french_names,
        how='left',
        left_on='geoname_id_admin1',
        right_on='geoname_id',
        suffixes=('_city', '_admin1'),
    )
    .drop('geoname_id', axis=1)
)

# Country level: no name clash remains, so rename the new column explicitly
cities_with_country_admin1_alternates = (
    cities_with_country_admin1_alternates
    .merge(french_names, how='left', left_on='geoname_id_country', right_on='geoname_id')
    .drop('geoname_id', axis=1)
    .rename(columns={'alternate_name': 'alternate_name_country'})
)

# Countries without a French name fall back to their original name
cities_with_country_admin1_alternates['alternate_name_country'] = (
    cities_with_country_admin1_alternates['alternate_name_country']
    .fillna(cities_with_country_admin1_alternates['name_country'])
)

In [200]:
# TODO: Find an approach to filter out admin1 and country names that are identical. For example, a name like "Hong Kong, Hong Kong" should be avoided.

In [201]:
def check_names_city_country(row):
    """Tell whether the country name is already contained in the city name.

    The comparison is case-insensitive and ignores surrounding whitespace.

    Args:
        row: mapping-like object exposing 'alternate_name_city' and
            'alternate_name_country'.

    Returns:
        True when the country name is a substring of the city name,
        False otherwise, including when either value is missing or empty.
    """
    city_value = row['alternate_name_city']
    country_value = row['alternate_name_country']

    # A missing value would be stringified to 'nan' and then spuriously match
    # any city whose name contains those letters (e.g. "Nantes").
    if pd.isna(city_value) or pd.isna(country_value):
        return False

    name = str(city_value).lower().strip()
    country = str(country_value).lower().strip()

    # The empty string is a substring of everything; it is not a real match.
    if not country:
        return False

    return country in name

# Flag the rows whose city name already contains the country name
city_contains_country = cities_with_country_admin1_alternates.apply(check_names_city_country, axis=1)
indices_to_remove = cities_with_country_admin1_alternates.loc[city_contains_country].index

# Clear the 'alternate_name_country' values on those rows of the original DataFrame
cities_with_country_admin1_alternates.loc[indices_to_remove, 'alternate_name_country'] = np.nan

In [202]:
def check_names_city_admin1(row):
    """Tell whether the city name and the admin1 name contain one another.

    The comparison is case-insensitive and ignores surrounding whitespace.

    Args:
        row: mapping-like object exposing 'alternate_name_city' and
            'alternate_name_admin1'.

    Returns:
        True when either name is a substring of the other, False otherwise,
        including when either value is missing or empty.
    """
    city_value = row['alternate_name_city']
    admin1_value = row['alternate_name_admin1']

    # A missing value would be stringified to 'nan' and then spuriously match
    # any name containing those letters (e.g. "Nanjing").
    if pd.isna(city_value) or pd.isna(admin1_value):
        return False

    name = str(city_value).lower().strip()
    admin1 = str(admin1_value).lower().strip()

    # The empty string is a substring of everything; it is not a real match.
    if not name or not admin1:
        return False

    return name in admin1 or admin1 in name

# Flag the rows where the city name and the admin1 name contain one another
city_overlaps_admin1 = cities_with_country_admin1_alternates.apply(check_names_city_admin1, axis=1)
indices_to_remove = cities_with_country_admin1_alternates.loc[city_overlaps_admin1].index

# Clear the 'alternate_name_admin1' values on those rows of the original DataFrame
cities_with_country_admin1_alternates.loc[indices_to_remove, 'alternate_name_admin1'] = np.nan

In [204]:
# html_table = cities_with_country_admin1_alternates[cities_with_country_admin1_alternates['alternate_name_admin1'].notna()].head(1000).to_html()
# display(HTML(html_table))