In [173]:
import pandas as pd

# Input file paths (relative to the notebook's working directory).
# NOTE(review): names and columns look like the GeoNames export dumps — confirm provenance.
global_cities_path = 'allCountries.txt'
alternate_names_path = 'alternateNamesV2.txt'
admin1_codes_path = 'admin1CodesASCII.txt'

# Column headers for the global cities file (the file itself is read with header=None).
global_cities_headers = [
    'geoname_id', 'name', 'ascii_name', 'alternate_names', 'latitude', 'longitude',
    'feature_class', 'feature_code', 'country_code', 'cc2', 'admin1_code',
    'admin2_code', 'admin3_code', 'admin4_code', 'population', 'elevation',
    'dem', 'timezone', 'modification_date'
]

# Data types for the columns in the global cities file.
# The keys must match global_cities_headers exactly: a key that matches no
# column name leaves that column's dtype to inference instead of forcing it.
global_cities_dtype = {
    'geoname_id': int, 'name': str, 'ascii_name': str, 'alternate_names': str,
    'latitude': float, 'longitude': float, 'feature_class': str, 'feature_code': str,
    'country_code': str, 'cc2': str, 'admin1_code': str, 'admin2_code': str,
    'admin3_code': str, 'admin4_code': str, 'population': float, 'elevation': float,
    'dem': float, 'timezone': str, 'modification_date': str
}

# Column headers for the alternate names file.
alternate_names_headers = [
    'alternate_name_id', 'geoname_id', 'iso_language', 'alternate_name',
    'is_preferred_name', 'is_short_name', 'is_colloquial', 'is_historic',
    'from', 'to'
]

# Data types for the columns in the alternate names file.
# The four flag columns use pandas' nullable 'boolean' dtype so that empty
# fields can be represented as <NA> rather than forcing an object column.
alternate_names_dtype = {
    'alternate_name_id': int, 'geoname_id': int, 'iso_language': str, 'alternate_name': str,
    'is_preferred_name': 'boolean', 'is_short_name': 'boolean', 'is_colloquial': 'boolean', 'is_historic': 'boolean',
    'from': str, 'to': str
}

# Column headers for the admin1 codes file.
admin1_codes_headers = [
    'code', 'name', 'name_ascii', 'geoname_id'
]

# Data types for the columns in the admin1 codes file.
admin1_codes_dtype = {
    'code': str, 'name': str, 'name_ascii': str, 'geoname_id': int
}

# Load the three tab-separated, header-less files into pandas DataFrames.
# Parser options shared by every file: with keep_default_na=False and
# na_values='' only empty fields are treated as missing, so literal strings
# such as 'NA' stay as text.
read_options = dict(
    sep='\t',
    header=None,
    low_memory=False,
    keep_default_na=False,
    na_values='',
)

alternate_names_df = pd.read_csv(
    alternate_names_path,
    names=alternate_names_headers,
    dtype=alternate_names_dtype,
    **read_options,
)
cities_df = pd.read_csv(
    global_cities_path,
    names=global_cities_headers,
    dtype=global_cities_dtype,
    **read_options,
)
admin1_codes_df = pd.read_csv(
    admin1_codes_path,
    names=admin1_codes_headers,
    dtype=admin1_codes_dtype,
    **read_options,
)

In [199]:
# Treat a missing flag as "not set": replace <NA> with False in the four
# boolean flag columns of the alternate names table.
flag_columns = ['is_preferred_name', 'is_short_name', 'is_colloquial', 'is_historic']
alternate_names_df[flag_columns] = alternate_names_df[flag_columns].fillna(False)

In [200]:
# Generate countries dataset: keep the rows of cities_df whose feature code
# is one of the listed codes.
# NOTE(review): presumably these are the country/territory-level feature codes — confirm.
country_feature_codes = ['PCLI', 'PCLS', 'PCLIX', 'TERR', 'PCLD', 'PCL', 'PCLF']
is_country_row = cities_df['feature_code'].isin(country_feature_codes)
countries_df = cities_df[is_country_row]

In [201]:
# Feature codes accepted as "city" rows.
# NOTE(review): presumably populated-place codes — confirm the list is complete for the use case.
feature_codes = [
    'PPLA',
    'PPLC',
    'PPL',
    'PPLW',
    'PPLG',
    'PPLL',
    'PPLS',
    'PPLF',
    'PPLR',
]

# Keep rows that have an accepted feature code AND a population of at least
# 15000; rows with a missing population compare as False and are dropped.
has_city_code = cities_df['feature_code'].isin(feature_codes)
is_large_enough = cities_df['population'] >= 15000
filtered_cities_df = cities_df[has_city_code & is_large_enough]

In [218]:
# Attach to every city the geoname_id of its country row, matched on country_code.
# The lookup column is renamed before merging so the result does not depend on
# pandas' automatic '_x'/'_y' suffixes for the two clashing 'geoname_id' columns:
# the city's own id stays 'geoname_id', the country's id is 'country_geoname_id'.
country_lookup = countries_df[['geoname_id', 'country_code']].rename(
    columns={'geoname_id': 'country_geoname_id'}
)

# An inner join keeps only the cities whose country_code has a match, which is
# what a left join followed by dropping unmatched rows produces, but without
# introducing NaN (and a float column) along the way.
# NOTE(review): if countries_df holds more than one row for a country_code,
# each matching city is repeated once per match — confirm the codes are unique.
cities_with_country = pd.merge(
    filtered_cities_df,
    country_lookup,
    on='country_code',
    how='inner',
)

# Keep the country id integer-typed (a no-op when the column is already int).
cities_with_country['country_geoname_id'] = cities_with_country['country_geoname_id'].astype(int)

In [239]:
# Keep only the alternate names whose language tag is French ('fr').
# The explicit copy lets later cells add columns without touching alternate_names_df.
is_french = alternate_names_df['iso_language'] == 'fr'
filtered_alternate_names = alternate_names_df.loc[is_french].copy()

def determine_priority(row):
    """Rank an alternate-name row; a lower number means a better candidate.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the flags
            'is_preferred_name', 'is_short_name', 'is_colloquial' and
            'is_historic'.

    Returns:
        1 for a preferred name with no other flag set,
        2 for a name with no flag set at all,
        3 for a short name with no other flag set,
        4 for every other combination.
    """
    # Any colloquial or historic name lands in the lowest tier.
    if not (row['is_colloquial'] == False and row['is_historic'] == False):
        return 4

    preferred = row['is_preferred_name']
    short = row['is_short_name']

    if short == False:
        if preferred == True:
            return 1
        if preferred == False:
            return 2
    elif short == True and preferred == False:
        return 3

    # Remaining combinations (e.g. preferred and short together).
    return 4

# Add a priority column to the filtered DataFrame (1 = best candidate, 4 = worst).
filtered_alternate_names['priority'] = filtered_alternate_names.apply(determine_priority, axis=1)

# Keep exactly one whole row per geoname_id: the one with the lowest priority number.
# groupby(...).first() is deliberately NOT used here: it returns the first
# non-null value of each column independently, so a nullable field (such as
# 'from' or 'to') could be taken from a different, lower-priority row than
# the chosen name. Sorting and dropping duplicates keeps rows intact.
filtered_alternate_names = (
    filtered_alternate_names
    .sort_values(by=['geoname_id', 'priority'])
    .drop_duplicates(subset='geoname_id', keep='first')
    .reset_index(drop=True)
)

# Keep 'geoname_id' as the leading column, matching the layout a
# groupby('geoname_id') + reset_index() would have produced.
filtered_alternate_names = filtered_alternate_names[
    ['geoname_id'] + [col for col in filtered_alternate_names.columns if col != 'geoname_id']
]