In [1]:
import uuid
from xml.sax.saxutils import escape

import numpy as np
import pandas as pd

def get_uid():
    """Return a freshly generated random UUID, upper-cased and wrapped in curly braces."""
    return f"{{{str(uuid.uuid4()).upper()}}}"

def make_city_dict(_df):
    """Group the coordinates in ``_df`` by city name.

    Reads the ``city_ascii``, ``lng`` and ``lat`` columns and returns a dict
    mapping each city name to the list of ``(lng, lat)`` pairs recorded for it,
    in row order. Several rows may share one name.
    """
    records = _df[['city_ascii', 'lng', 'lat']].to_records(index=False)
    grouped = {}
    for city, lng, lat in records:
        grouped.setdefault(city, []).append((lng, lat))
    return grouped

def coord_equal(row, cds, tol=0.06):
    """Tell whether ``row`` lies close enough to ``cds`` to count as the same place.

    Parameters
    ----------
    row : object exposing ``Longitude`` and ``Latitude`` attributes.
    cds : sequence whose first two items are ``(lng, lat)``.
    tol : upper bound (exclusive) on the summed absolute longitude and latitude
        differences. Defaults to 0.06, the value previously hard-coded here.

    Returns
    -------
    A truthy value when ``|dlng| + |dlat| < tol``.
    """
    lng, lat = cds[0], cds[1]
    return np.abs(row.Longitude - lng) + np.abs(row.Latitude - lat) < tol

def in_city_set(row, _city_dict):
    """Return True when ``row`` matches a known city by name and by position.

    ``row.AccentCity`` is looked up in ``_city_dict`` (name -> list of
    coordinate pairs); the row matches if any coordinate stored under that
    name satisfies ``coord_equal``. Unknown names never match.
    """
    candidates = _city_dict.get(row.AccentCity, ())
    return any(coord_equal(row, cds) for cds in candidates)
    
def output_xml(_df, name):
    """Write the cities in ``_df`` to the file ``name`` as ``FSData`` XML.

    One ``<LandmarkLocation>`` element is emitted per row, ordered by
    ``city_ascii``, each with a fresh ``instanceId`` from ``get_uid()``.

    Parameters
    ----------
    _df : DataFrame with ``city_ascii``, ``lat`` and ``lng`` columns and an
        optional ``elevation`` column. A missing column or missing values are
        treated as 0.0. The caller's frame is not modified.
    name : path of the XML file to write (UTF-8).

    Prints the number of lines written, including the two header lines and
    the closing tag.
    """
    lines = ['<?xml version="1.0" encoding="utf-8"?>\n',
             '<FSData version="9.0">\n']

    _df = _df.sort_values('city_ascii')

    # Frames assembled from name/lat/lng records carry no elevation column,
    # and appended rows may hold NaN; both fall back to 0.0.
    if 'elevation' in _df.columns:
        elevations = _df['elevation'].fillna(0.0)
    else:
        elevations = [0.0] * len(_df)

    for city, lat, lng, elevation in zip(_df['city_ascii'], _df['lat'], _df['lng'], elevations):
        # Escape &, <, > and the double quote so the name cannot break the attribute.
        city_attr = escape(str(city), {'"': '&quot;'})
        lines.append(
            f'<LandmarkLocation instanceId="{get_uid()}" type="City" name="{city_attr}" '
            f'lat="{lat}" lon="{lng}" alt="{elevation * 3.0}" offset="0.0000" />\n'
        )

    lines.append('</FSData>')
    with open(name, "w", encoding="utf-8") as f:
        f.writelines(lines)
    print(f"OK, written {len(lines)} lines")

In [4]:
# Base city table: name and coordinates from worldcities.csv, sorted by name,
# with a constant elevation of 0.0.
df1 = (
    pd.read_csv("worldcities.csv")
    .sort_values("city_ascii")
    .loc[:, ['city_ascii', 'lat', 'lng']]
    .assign(elevation=0.0)
)
df1.head()

Unnamed: 0,city_ascii,lat,lng,elevation
13574,'Araba,32.8511,35.3386,0.0
3659,'s-Hertogenbosch,51.6833,5.3167,0.0
1695,A Coruna,43.3713,-8.4188,0.0
2313,Aachen,50.7762,6.0838,0.0
3852,Aalborg,57.0337,9.9166,0.0


In [35]:
# Second source of cities; coordinates are rounded to four decimals.
df2 = pd.read_csv("worldcitiespop1.csv", encoding="utf-8")
df2[['Latitude', 'Longitude']] = df2[['Latitude', 'Longitude']].round(4)
df2.head()

Unnamed: 0.1,Unnamed: 0,AccentCity,Latitude,Longitude
0,855176,A Coruna,43.3666,-8.4068
1,2295214,A dos Cunhados,39.1524,-9.2972
2,792982,Aabenraa,55.0362,9.418
3,792989,Aabybro,57.15,9.75
4,722416,Aachen,50.7708,6.1053


In [36]:
# Build the name -> [(lng, lat), ...] lookup from the base table df1;
# the displayed value is the number of distinct city names.
city_dict = make_city_dict(df1)
len(city_dict)

23893

In [37]:
# Keep only the df2 cities that have no same-named entry in city_dict
# within coord_equal's tolerance.
l = [
    {"city_ascii": row.AccentCity, "lat": row.Latitude, "lng": row.Longitude}
    for row in df2.itertuples(index=False)
    if not in_city_set(row, city_dict)
]

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# As with append, the new rows keep their own 0..n-1 index.
df3 = pd.concat([df1, pd.DataFrame(l)])
len(df3)

60034

In [38]:
# Sort the merged table by city name and preview the first rows.
df3 = df3.sort_values('city_ascii')
df3.head(5)

Unnamed: 0,city_ascii,lat,lng
13574,'Araba,32.8511,35.3386
3659,'s-Hertogenbosch,51.6833,5.3167
33372,A,63.9667,10.2167
33373,A,63.9667,10.2
1695,A Coruna,43.3713,-8.4188


In [39]:
# Add one city by hand. The original row had lat and lng swapped:
# Zheleznodorozhnyy lies at about 55.75 N, 37.99 E.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# NOTE: re-running this cell adds the row again.
df3 = pd.concat(
    [df3, pd.DataFrame([{'city_ascii': 'Zheleznodorozhnyy', 'lat': 55.7482, 'lng': 37.9927}])],
    ignore_index=True,
).sort_values('city_ascii')
len(df3)

60035

In [40]:
# Save the merged table (without its index) so later work can start from cities3.csv.
df3.to_csv("cities3.csv", index=False)

#### Начинаем сразу с cities3.csv

In [41]:
# Reload the merged table from cities3.csv, replacing the in-memory df3.
df3 = pd.read_csv('cities3.csv')
df3.head()

Unnamed: 0,city_ascii,lat,lng
0,'Araba,32.8511,35.3386
1,'s-Hertogenbosch,51.6833,5.3167
2,A,63.9667,10.2167
3,A,63.9667,10.2
4,A Coruna,43.3713,-8.4188


#### Что делать с двумя миллионами других

In [54]:
# low_memory=False makes pandas infer each column's dtype from the whole file;
# the previous run emitted a warning on this read (presumably mixed dtypes - confirm).
df4 = pd.read_csv("worldcitiespop.csv", low_memory=False)

# Keep the French and Italian places that have no population figure.
df4 = df4[df4['Population'].isna() & df4['Country'].isin(['fr', 'it'])]
print(len(df4))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


84987


In [None]:
# One row per distinct (Longitude, Latitude): keep the first name listed for each
# coordinate, strip surrounding whitespace, round coordinates to four decimals
# and sort by name.
df5 = (
    df4.groupby(['Longitude', 'Latitude'])
    .agg({'AccentCity': lambda names: names.iloc[0]})
    .reset_index()
)
df5['AccentCity'] = df5['AccentCity'].apply(lambda city: city.strip())
df5[['Latitude', 'Longitude']] = df5[['Latitude', 'Longitude']].round(4)
df5 = df5.sort_values('AccentCity')
print(len(df5))
df5.head()

In [51]:
# Rebuild the name -> [(lng, lat), ...] lookup, this time from the merged table df3.
city_dict = make_city_dict(df3)
len(city_dict)

52727

In [56]:
# Keep only the df5 places that have no same-named entry in city_dict
# within coord_equal's tolerance.
l = [
    {"city_ascii": row.AccentCity, "lat": row.Latitude, "lng": row.Longitude}
    for row in df5.itertuples(index=False)
    if not in_city_set(row, city_dict)
]

# output_xml reads an elevation value for every row; give these places the same
# 0.0 default that df1 uses so the export below does not fail on a missing column.
df7 = pd.DataFrame(l).assign(elevation=0.0)
len(df7)

69198

In [57]:
# Export the unmatched places collected in df7.
output_xml(df7, "cities_fr_it.xml")

OK, written 69201 lines


#### Geonames

In [None]:
# cities1000.tsv has no header row; name its 19 columns, keeping only those used below.
gn = pd.read_csv("cities1000.tsv", sep="\t", header=None)
gn.columns = ["geonameid", "name", "city_ascii", "alternatenames", "lat", "lng", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "population", "elevation", "f11", "f12", "f13"]

# Add one city by hand. The original row had lat and lng swapped:
# Zheleznodorozhnyy lies at about 55.75 N, 37.99 E.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
gn = pd.concat(
    [
        gn[['city_ascii', 'lat', 'lng', 'population', 'elevation']],
        pd.DataFrame([{'city_ascii': 'Zheleznodorozhnyy', 'lat': 55.7482, 'lng': 37.9927}]),
    ],
    ignore_index=True,
).sort_values('city_ascii')

# Missing elevations (including the hand-added row) become 0.0.
gn['elevation'] = gn['elevation'].fillna(0.0)
gn[['lat', 'lng']] = gn[['lat', 'lng']].round(4)
gn.head(10)

In [25]:
# Collect, for every distinct (lat, lng) pair, the list of city names located there.
gn1 = (
    gn.groupby(['lat', 'lng'])
    .agg({'city_ascii': list})
    .reset_index()
)
gn1

Unnamed: 0,lat,lng,city_ascii
0,-77.8460,166.6760,[McMurdo Station]
1,-54.9336,-67.6096,[Puerto Williams]
2,-54.8108,-68.3159,[Ushuaia]
3,-54.5108,-67.1955,[Tolhuin]
4,-54.2811,-36.5092,[Grytviken]
...,...,...,...
137478,71.9635,114.0920,[Saskylakh]
137479,71.9800,102.4711,[Khatanga]
137480,72.7872,-56.1444,[Upernavik]
137481,73.5082,80.5292,[Dikson]


In [26]:
# Show only the coordinates shared by the most names. Sorting with reverse=True is
# stable, so ties keep the same order as before; the slice avoids dumping every group.
sorted(gn1['city_ascii'].values, key=len, reverse=True)[:30]

[['Koani', 'Koani Ndogo'],
 ['Nkoaranga', 'Poli'],
 ['Prachamtakham', 'Prachantakham'],
 ['Pa Kham', 'Pakham'],
 ['San Antonio de la Cuesta', 'San Jeronimo'],
 ['Benchalak', 'Siao'],
 ['Pueblo Nuevo', 'San Nicolas'],
 ['Pimienta Vieja', 'Potrerillos'],
 ['Mojiman', 'Nueva Esperanza'],
 ['La Huesa', 'Monterrey'],
 ['El Rancho', 'Quebrada Seca'],
 ['Khuean Ubonrat', 'Ubonratana'],
 ['Sam Chai', 'Samran'],
 ['Suwanna Khuha', 'Suwannakhuha'],
 ['Lap Lae', 'Laplae'],
 ['Phon Phisai', 'Phonphisai'],
 ['Santa Clara', 'Sumidero'],
 ['San Carlos', 'Villa Consuelo'],
 ['Cristo Rey', 'La Agustina'],
 ['Choshi', 'Hasaki'],
 ['Faja de Baixo', 'Rosto de Cao'],
 ['Almisera', 'Llocnou de Sant Jeroni'],
 ['Benirredra', 'Gandia'],
 ['Alcantera de Xuquer', 'Beneixida'],
 ['Montalvinho', 'Montalvo'],
 ['Albalat dels Sorells', 'Foios'],
 ['Benifairo de les Valls', 'Quart de les Valls'],
 ['Furano', 'Shimo-furano'],
 ['Kastel Novi', 'Kastel Stafilic'],
 ['Greater Napanee', 'Napanee'],
 ['Batarasti', 'Glavil

In [16]:
# Export the first 70,000 rows of gn; the remaining rows go to geo_cities2.xml.
output_xml(gn[:70000], "geo_cities1.xml")

OK, written 70003 lines


In [27]:
# Export the rows of gn after the first 70,000 (those are written to geo_cities1.xml).
output_xml(gn[70000:], "geo_cities2.xml")

OK, written 67537 lines


In [5]:
# Export the base city table df1.
output_xml(df1, "cities1.xml")

OK, written 26572 lines
