In [287]:
import pandas as pd
import numpy as np

# Language codes for which localized names are exported
languages = ['pl', 'lt', 'ru', 'hu', 'en', 'fr']

# Input files (read further down as tab-separated, header-less tables)
global_cities_path = 'allCountries.txt'
alternate_names_path = 'alternateNamesV2.txt'
admin1_codes_path = 'admin1CodesASCII.txt'

# Minimum population a place needs in order to be kept by the city filter
population_threshold = 5000

# Column headers for the global cities file, in file order
global_cities_headers = [
    'geoname_id', 'name', 'ascii_name', 'alternate_names', 'latitude', 'longitude',
    'feature_class', 'feature_code', 'country_code', 'cc2', 'admin1_code',
    'admin2_code', 'admin3_code', 'admin4_code', 'population', 'elevation',
    'dem', 'timezone', 'modification_date'
]

# Subset of the global cities columns that is actually loaded
global_cities_headers_usecols = [
    'geoname_id', 'name', 'ascii_name', 'latitude', 'longitude',
    'feature_code', 'country_code', 'admin1_code', 'population'
]

# Define the data types for the columns in the global cities file.
# Keys must match global_cities_headers exactly, otherwise the dtype is never applied.
global_cities_dtype = {
    'geoname_id': 'Int64', 'name': str, 'ascii_name': str, 'alternate_names': str,
    'latitude': float, 'longitude': float, 'feature_class': str, 'feature_code': str,
    'country_code': str, 'cc2': str, 'admin1_code': str, 'admin2_code': str,
    'admin3_code': str, 'admin4_code': str, 'population': 'Int64', 'elevation': float,
    'dem': float, 'timezone': str, 'modification_date': str
}

# Define the column headers for the alternate names file
alternate_names_headers = [
    'alternate_name_id', 'geoname_id', 'iso_language', 'alternate_name',
    'is_preferred_name', 'is_short_name', 'is_colloquial', 'is_historic',
    'from', 'to'
]

# Subset of the alternate names columns that is actually loaded
alternate_names_headers_usecols = [
    'geoname_id', 'iso_language', 'alternate_name',
    'is_preferred_name', 'is_short_name', 'is_colloquial', 'is_historic'
]

# Nullable dtypes ('Int64', 'boolean') so that empty fields can be read as <NA>
alternate_names_dtype = {
    'alternate_name_id': 'Int64', 'geoname_id': 'Int64', 'iso_language': str, 'alternate_name': str,
    'is_preferred_name': 'boolean', 'is_short_name': 'boolean', 'is_colloquial': 'boolean', 'is_historic': 'boolean',
    'from': str, 'to': str
}

# Column headers for the first-order administrative division file
admin1_codes_headers = [
    'code', 'name', 'name_ascii', 'geoname_id_admin1'
]

# All admin1 columns are loaded
admin1_codes_usecols = ['code', 'name', 'geoname_id_admin1', 'name_ascii']

admin1_codes_dtype = {
    'code': str, 'name': str, 'name_ascii': str, 'geoname_id_admin1': 'Int64'
}

# Options shared by all three tab-separated, header-less input files.
# keep_default_na=False together with na_values='' means only empty fields are read as missing.
tsv_read_options = dict(
    sep='\t',
    header=None,
    low_memory=False,
    keep_default_na=False,
    na_values='',
    encoding='utf-8',
)

alternate_names_df = pd.read_csv(
    alternate_names_path,
    names=alternate_names_headers,
    dtype=alternate_names_dtype,
    usecols=alternate_names_headers_usecols,
    **tsv_read_options,
)

cities_df = pd.read_csv(
    global_cities_path,
    names=global_cities_headers,
    dtype=global_cities_dtype,
    usecols=global_cities_headers_usecols,
    **tsv_read_options,
)

# The admin1 ASCII name is renamed so it cannot clash with other name columns after merging
admin1_codes_df = pd.read_csv(
    admin1_codes_path,
    names=admin1_codes_headers,
    dtype=admin1_codes_dtype,
    usecols=admin1_codes_usecols,
    **tsv_read_options,
).rename(columns={"name_ascii": "admin1_ascii_name"})

In [288]:
# Flags that were empty in the file were read as missing; treat them as False
flag_columns = ['is_preferred_name', 'is_short_name', 'is_colloquial', 'is_historic']
alternate_names_df[flag_columns] = alternate_names_df[flag_columns].fillna(False)

In [289]:
# Generate countries dataset: rows whose feature_code is one of the listed codes,
# with 'name' renamed so it stays distinguishable from the city name after merging
country_feature_codes = ['PCLI', 'PCLS', 'PCLIX', 'TERR', 'PCLD', 'PCL', 'PCLF']
is_country_row = cities_df['feature_code'].isin(country_feature_codes)
countries_df = cities_df[is_country_row].rename(columns={'name': 'name_country'})

In [290]:
# Feature codes that qualify a row as a city for this export
feature_codes = [
    'PPLA', 'PPLA2', 'PPLA3', 'PPLC', 'PPL', 'PPLW',
    'PPLG', 'PPLL', 'PPLS', 'PPLF', 'PPLR'
]

# A row is kept when it has a wanted feature code AND reaches the population threshold
has_wanted_feature = cities_df['feature_code'].isin(feature_codes)
reaches_threshold = cities_df['population'] >= population_threshold
filtered_cities_df = cities_df[has_wanted_feature & reaches_threshold]

# Sanity check: non-null counts per column for the PPLA3 rows that survived the filter
filtered_cities_df[filtered_cities_df['feature_code'] == 'PPLA3'].count()

geoname_id      5589
name            5589
ascii_name      5589
latitude        5589
longitude       5589
feature_code    5589
country_code    5589
admin1_code     5589
population      5589
dtype: int64

In [291]:
# Merge the DataFrames on the country code.
# Columns present on both sides (geoname_id, ascii_name) get the '_city' / '_country' suffixes.
# NOTE(review): if countries_df holds more than one row for a country_code, this left join
# repeats the city row once per match — confirm country codes are unique there.
country_columns = ['geoname_id', 'name_country', 'country_code', 'ascii_name']
cities_with_country = filtered_cities_df.merge(
    countries_df[country_columns],
    on='country_code',
    how='left',
    suffixes=('_city', '_country'),
)

In [292]:
# Include first-order administrative division in the cities_with_country table.
# The join key is "<country_code>.<admin1_code>", matched against admin1_codes_df['code'].
cities_with_country['admin1_geocode'] = cities_with_country['country_code'] + '.' + cities_with_country['admin1_code']

cities_with_country_admin1_geocodes = cities_with_country.merge(
    admin1_codes_df,
    left_on='admin1_geocode',
    right_on='code',
    how='left',
    suffixes=('_city', '_admin1'),
).drop(columns='code')

In [302]:
import geohash

def add_geohash(row):
    """Return the geohash (precision 12) of a row's coordinates.

    Args:
      row: Mapping-like object exposing 'latitude' and 'longitude'.

    Returns:
      The value produced by geohash.encode for that position.
    """
    lat, lon = row['latitude'], row['longitude']
    return geohash.encode(lat, lon, precision=12)

# Compute one geohash per row (row-wise apply) and store it as a new column
cities_with_country_admin1_geocodes['geohash'] = (
    cities_with_country_admin1_geocodes.apply(add_geohash, axis=1)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [303]:
def calculate_radius(population):
    """Map a population figure to an estimated radius.

    Bands are lower-bound inclusive and upper-bound exclusive:
      [0, 50000) -> 400, [50000, 100000) -> 800, [100000, 500000) -> 2000,
      [500000, 1000000) -> 4000, [1000000, 5000000) -> 12000,
      [5000000, 10000000) -> 15000.

    Any value that falls in no band gets 18000. That covers 10000000 and above,
    but also negative or NaN input, because every comparison fails for those.

    Args:
      population: Number of inhabitants.

    Returns:
      The radius assigned to the matching band, as an int.
    """
    # (exclusive upper bound, radius), ordered from smallest to largest band
    radius_by_upper_bound = (
        (50000, 400),
        (100000, 800),
        (500000, 2000),
        (1000000, 4000),
        (5000000, 12000),
        (10000000, 15000),
    )
    for upper_bound, radius in radius_by_upper_bound:
        # The 0 <= guard keeps negative values out of every band
        if 0 <= population < upper_bound:
            return radius
    return 18000

# Derive each city's estimated radius from its population
cities_with_country_admin1_geocodes['estimated_radius'] = (
    cities_with_country_admin1_geocodes['population'].apply(calculate_radius)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [297]:
import geopandas as gpd
from pyproj import CRS, Transformer
from shapely.geometry import Point
from shapely.ops import transform


def geodesic_point_buffer(lat, lon, distance):
    """Build a circular polygon of the given radius around a position.

    A planar disc is drawn at the origin of an azimuthal equidistant
    projection centred on (lat, lon) and then transformed into that
    projection's geodetic CRS.

    Args:
      lat: Latitude of the centre, interpolated into the projection string.
      lon: Longitude of the centre, interpolated into the projection string.
      distance: Buffer radius (in metres).

    Returns:
      The transformed shapely geometry.
    """
    # Azimuthal equidistant projection centred on the requested position
    local_crs = CRS.from_proj4(
        f"+proj=aeqd +lat_0={lat} +lon_0={lon} +x_0=0 +y_0=0")
    to_geodetic = Transformer.from_proj(local_crs, local_crs.geodetic_crs)
    # Disc around the projection origin, i.e. around (lat, lon)
    planar_disc = Point(0, 0).buffer(distance)
    return transform(to_geodetic.transform, planar_disc)

points_df = cities_with_country_admin1_geocodes

# Cities at or above this population are never removed, even when their buffer
# touches the buffer of a more populous city
max_removable_population = 2000000

# Convert the points to circles by buffering them
points_buffer_gdf = gpd.GeoDataFrame(
    points_df,
    geometry=points_df.apply(
        lambda row: geodesic_point_buffer(row.latitude, row.longitude, row.estimated_radius), axis=1
    ),
    crs=4326,
)

# Determine the intersecting city buffers (result includes self-intersections).
# Columns that exist on both sides come back with '_left' / '_right' suffixes.
intersecting_gdf = points_buffer_gdf.sjoin(points_buffer_gdf)

# Pairs where the left city is strictly less populous than the city it overlaps.
# Self-intersections never qualify because the two populations are equal.
intersecting_larger_population_df = intersecting_gdf.loc[
    (intersecting_gdf.population_left < intersecting_gdf.population_right)
    & (intersecting_gdf.population_left < max_removable_population)
]

# Remove the city buffers that intersect with a larger population city buffer.
# .copy() makes the result an independent frame: without it, adding or overwriting
# columns on it later (e.g. re-running the geohash / radius cells) triggers
# pandas' SettingWithCopyWarning because the result is a slice of points_buffer_gdf.
cities_with_country_admin1_geocodes = points_buffer_gdf[
    ~points_buffer_gdf.index.isin(intersecting_larger_population_df.index)
].copy()

In [299]:
# Build localized city / admin1 / country names for every configured language and export them as JSON
import json

def determine_priority(row):
    """Rank an alternate-name row; a lower number is a better candidate.

    Ranking by the (preferred, short, colloquial, historic) flags:
      1 - preferred only
      2 - no flag set
      3 - short only
      4 - every other combination

    Args:
      row: Mapping-like object exposing the four 'is_*' flags.

    Returns:
      An int between 1 and 4.
    """
    flags = (
        row['is_preferred_name'],
        row['is_short_name'],
        row['is_colloquial'],
        row['is_historic'],
    )
    if flags == (True, False, False, False):
        return 1
    if flags == (False, False, False, False):
        return 2
    if flags == (False, True, False, False):
        return 3
    return 4
    
def check_names_city_country(row):
    """Tell whether the city's and the country's ASCII names contain one another.

    The comparison is case-insensitive and ignores surrounding whitespace.
    A missing (NaN / None / <NA>) or empty name never matches: str(NaN) is
    'nan', which would otherwise match any name containing "nan", and an
    empty string is a substring of every string.

    Args:
      row: Mapping-like object exposing 'ascii_name_city' and 'ascii_name_country'.

    Returns:
      True if either name is a substring of the other, else False.
    """
    raw_city = row['ascii_name_city']
    raw_country = row['ascii_name_country']
    if pd.isna(raw_city) or pd.isna(raw_country):
        return False

    name = str(raw_city).lower().strip()
    country = str(raw_country).lower().strip()
    if not name or not country:
        return False

    return country in name or name in country

def check_names_city_admin1(row):
    """Tell whether the city's and the admin1 division's ASCII names contain one another.

    The comparison is case-insensitive and ignores surrounding whitespace.
    A missing (NaN / None / <NA>) or empty name never matches: str(NaN) is
    'nan', which would otherwise match any name containing "nan", and an
    empty string is a substring of every string.

    Args:
      row: Mapping-like object exposing 'ascii_name_city' and 'admin1_ascii_name'.

    Returns:
      True if either name is a substring of the other, else False.
    """
    raw_city = row['ascii_name_city']
    raw_admin1 = row['admin1_ascii_name']
    if pd.isna(raw_city) or pd.isna(raw_admin1):
        return False

    name = str(raw_city).lower().strip()
    admin1 = str(raw_admin1).lower().strip()
    if not name or not admin1:
        return False

    return name in admin1 or admin1 in name

def check_names_admin1_country(row):
    """Tell whether the admin1 division's and the country's ASCII names contain one another.

    The comparison is case-insensitive and ignores surrounding whitespace.
    A missing (NaN / None / <NA>) or empty name never matches: str(NaN) is
    'nan', which would otherwise match any name containing "nan", and an
    empty string is a substring of every string.

    Args:
      row: Mapping-like object exposing 'ascii_name_country' and 'admin1_ascii_name'.

    Returns:
      True if either name is a substring of the other, else False.
    """
    raw_country = row['ascii_name_country']
    raw_admin1 = row['admin1_ascii_name']
    if pd.isna(raw_country) or pd.isna(raw_admin1):
        return False

    country = str(raw_country).lower().strip()
    admin1 = str(raw_admin1).lower().strip()
    if not country or not admin1:
        return False

    return country in admin1 or admin1 in country

def remove_redundant_admin1(df):
    """
    Blanks the admin1 names (alternate_name_admin1 and admin1_ascii_name)
    of cities whose name is unique within their country.

    Uniqueness is checked twice, first on ascii_name_city and then on
    name_city; a row is blanked as soon as either check finds it unique.
    The helper column city_count stays in the result and holds the counts
    from the second (name_city) check.

    Args:
      df: The GeoDataFrame containing city information. It is not modified.

    Returns:
      A modified copy of the GeoDataFrame.
    """
    result = df.copy()  # Work on a copy so the caller's frame stays untouched
    admin1_columns = ["alternate_name_admin1", "admin1_ascii_name"]

    # Same procedure for both name variants; the order matters only for the
    # final content of the city_count helper column
    for name_column in ("ascii_name_city", "name_city"):
        # Number of cities sharing this name within the same country
        same_name_counts = result.groupby(["geoname_id_country", name_column])["geoname_id_city"].transform("count")
        result["city_count"] = same_name_counts
        # A unique name needs no admin1 qualifier
        is_unique = result["city_count"] == 1
        result.loc[is_unique, admin1_columns] = np.nan

    return result

# Initialize an empty dictionary to store the combined data.
# It is keyed by the city's geoname id; each value is one city record whose
# 'name' entry maps a language code (plus the key 'ascii') to a dict of names.
combined_data = {}

for language in languages:
    # Keep only this language's alternate names and rank every row via determine_priority
    filtered_alternate_names = alternate_names_df[alternate_names_df['iso_language'] == language].copy()
    filtered_alternate_names['priority'] = filtered_alternate_names.apply(determine_priority, axis=1)
    # Lowest priority number first, then collapse to a single row per geoname_id
    filtered_alternate_names.sort_values(by=['priority', 'geoname_id'], inplace=True)
    filtered_alternate_names = filtered_alternate_names.groupby('geoname_id').first().reset_index()

    # 1) City name: join on the city's geoname id; the right-hand 'geoname_id' is only
    #    the join key and is dropped again. The new column is called 'alternate_name' for now.
    cities_with_country_admin1_alternates = pd.merge(cities_with_country_admin1_geocodes, filtered_alternate_names[['geoname_id', 'alternate_name']], 
                                                     how='left', left_on='geoname_id_city', right_on='geoname_id').drop('geoname_id', axis=1)
    
    # Cities without a name in this language fall back to their ASCII name
    cities_with_country_admin1_alternates['alternate_name'] = cities_with_country_admin1_alternates['alternate_name'].fillna(
        cities_with_country_admin1_alternates['ascii_name_city']
    )

    # 2) Admin1 name: both sides now carry an 'alternate_name' column, so the suffixes turn
    #    them into 'alternate_name_city' (left) and 'alternate_name_admin1' (right).
    cities_with_country_admin1_alternates = pd.merge(cities_with_country_admin1_alternates, filtered_alternate_names[['geoname_id', 'alternate_name']], 
                                                     how='left', left_on='geoname_id_admin1', right_on='geoname_id', suffixes=('_city','_admin1')).drop('geoname_id', axis=1)
    
    # Admin1 divisions without a name in this language fall back to their ASCII name
    cities_with_country_admin1_alternates['alternate_name_admin1'] = cities_with_country_admin1_alternates['alternate_name_admin1'].fillna(
        cities_with_country_admin1_alternates['admin1_ascii_name']
    )

    # Blank the admin1 names of cities whose name is unique within their country
    cities_with_country_admin1_alternates = remove_redundant_admin1(cities_with_country_admin1_alternates)

    # 3) Country name: no column clash this time, so the joined 'alternate_name' is
    #    renamed explicitly to 'alternate_name_country'.
    cities_with_country_admin1_alternates = pd.merge(cities_with_country_admin1_alternates, filtered_alternate_names[['geoname_id', 'alternate_name']], 
                                                     how='left', left_on='geoname_id_country', right_on='geoname_id').drop('geoname_id', axis=1).rename(columns={'alternate_name': 'alternate_name_country'})
                                                     
    # Countries without a name in this language fall back to 'name_country'
    cities_with_country_admin1_alternates['alternate_name_country'] = cities_with_country_admin1_alternates['alternate_name_country'].fillna(
        cities_with_country_admin1_alternates['name_country']
    )

    # Drop the country name where the city and country ASCII names contain one another
    country_names_indices_to_remove = cities_with_country_admin1_alternates[
        cities_with_country_admin1_alternates.apply(check_names_city_country, axis=1)
    ].index
    cities_with_country_admin1_alternates.loc[country_names_indices_to_remove, 'alternate_name_country'] = np.nan

    # Drop the admin1 name where the city and admin1 ASCII names contain one another
    admin1_names_indices_to_remove = cities_with_country_admin1_alternates[
        cities_with_country_admin1_alternates.apply(check_names_city_admin1, axis=1)
    ].index 

    cities_with_country_admin1_alternates.loc[admin1_names_indices_to_remove, 'alternate_name_admin1'] = np.nan

    # Drop the admin1 name where the admin1 and country ASCII names contain one another
    admin1_names_vs_country_indices_to_remove = cities_with_country_admin1_alternates[
        cities_with_country_admin1_alternates.apply(check_names_admin1_country, axis=1)
    ].index 
    cities_with_country_admin1_alternates.loc[admin1_names_vs_country_indices_to_remove, 'alternate_name_admin1'] = np.nan

    # Iterate through the rows and update the combined_data dictionary
    for _, row in cities_with_country_admin1_alternates.iterrows():
        geoname_id_city = row['geoname_id_city']
        # The language-independent attributes are stored once, the first time a city is seen
        if geoname_id_city not in combined_data:
            combined_data[geoname_id_city] = {
                'geoname_id_city': geoname_id_city,
                'latitude': row['latitude'],
                'longitude': row['longitude'],
                'geohash': row['geohash'],
                'country_code': row['country_code'],
                'population': row['population'],
                'estimated_radius': row['estimated_radius'],
                'feature_code': row['feature_code'],
                'name': {}
            }
        # Names for the current language; missing values are stored as None
        combined_data[geoname_id_city]['name'][language] = {
            'city': row['alternate_name_city'] if pd.notna(row['alternate_name_city']) else None,
            'admin1': row['alternate_name_admin1'] if pd.notna(row['alternate_name_admin1']) else None,
            'country': row['alternate_name_country'] if pd.notna(row['alternate_name_country']) else None
        }
        # Lower-cased ASCII names with spaces replaced by hyphens; this entry is
        # rewritten on every language pass and its 'country' is always None
        combined_data[geoname_id_city]['name']['ascii'] = {
            'city': row['ascii_name_city'].lower().replace(" ", "-") if pd.notna(row['ascii_name_city']) else None,
            'admin1': row['admin1_ascii_name'].lower().replace(" ", "-") if pd.notna(row['admin1_ascii_name']) else None,
            'country': None
        }

# Flatten the per-city mapping into a plain list of records
nested_json_list = [city_record for city_record in combined_data.values()]

# Write the records as indented UTF-8 JSON; ensure_ascii=False keeps non-ASCII names readable
with open('combined_data.json', mode='w', encoding='utf-8') as output_file:
    json.dump(nested_json_list, output_file, ensure_ascii=False, indent=4)

print("Data saved to combined_data.json")

Data saved to combined_data.json


In [300]:
# Preview the first five rows of the frame built for the last processed language
cities_with_country_admin1_alternates.head(n=5)

Unnamed: 0,geoname_id_city,name_city,ascii_name_city,latitude,longitude,feature_code,country_code,admin1_code,population,geoname_id_country,...,name_admin1,admin1_ascii_name,geoname_id_admin1,geohash,estimated_radius,geometry,alternate_name_city,alternate_name_admin1,city_count,alternate_name_country
0,3039163,Sant Julià de Lòria,Sant Julia de Loria,42.46372,1.49129,PPLA,AD,6,8022,3041565,...,Sant Julià de Loria,,3039162,sp919fmcje4k,400,"POLYGON ((1.49615 42.46372, 1.49613 42.46337, ...",Sant Julià de Lòria,,1,Andorre
1,3040051,les Escaldes,les Escaldes,42.50729,1.53414,PPLA,AD,8,15853,3041565,...,Escaldes-Engordany,,3338529,sp91ffjnuj1t,400,"POLYGON ((1.53901 42.50729, 1.53898 42.50694, ...",Escaldes-Engordany,,1,Andorre
2,3040132,la Massana,la Massana,42.54499,1.51483,PPLA,AD,4,7211,3041565,...,La Massana,,3040131,sp9443p4d0gn,400,"POLYGON ((1.5197 42.54499, 1.51968 42.54464, 1...",La Massana,,1,Andorre
3,3040686,Encamp,Encamp,42.53474,1.58014,PPLA,AD,3,11223,3041565,...,Encamp,,3040684,sp91gznwgfjz,400,"POLYGON ((1.58501 42.53474, 1.58499 42.53439, ...",Encamp,,1,Andorre
4,3041563,Andorra la Vella,Andorra la Vella,42.50779,1.52109,PPLC,AD,7,20430,3041565,...,Andorra la Vella,,3041566,sp91fd79gfqu,400,"POLYGON ((1.52596 42.50779, 1.52593 42.50744, ...",Andorre-la-Vieille,,1,Andorre


In [301]:
# import folium

# def create_popup_content(row):
#     popup_content = ""
#     if not pd.isna(row['name_city']):
#         popup_content += f"{row['name_city']}, "
#     # if not pd.isna(row['alternate_name_admin1']):
#     #     popup_content += f"{row['alternate_name_admin1']}, "
#     # if not pd.isna(row['alternate_name_country']):
#     #     popup_content += f"{row['alternate_name_country']}, "
#     # if not pd.isna(row['feature_code']):
#     #     popup_content += f"{row['feature_code']}, " 
    
#     # Format population with commas
#     popup_content += f"Population: {int(row['population']):,}, "  

#     # Add radius
#     radius = calculate_radius(row['population'])
#     # Format radius with commas
#     popup_content += f"Radius: {radius:,} meters"  

#     return popup_content.rstrip(", ")  # Remove trailing comma and space

# # Create a map centered on a specific location
# m = folium.Map(location=[47.4979, 19.0402], zoom_start=10)  # Centered on Budapest

# # Add markers with circles for each city
# for index, row in cities_with_country_admin1_geocodes.iterrows():
#     folium.Circle(
#         location=[row['latitude'], row['longitude']],
#         radius=row['estimated_radius'],  # Example radius in meters
#         popup=create_popup_content(row),
#         color="blue",
#         fill=True,
#         fill_color="blue"
#     ).add_to(m)

# # Save the map as an HTML file
# m.save("cities_map.html")