In [1]:
import openai
import pandas as pd
from tqdm import tqdm
from pulemet import Pulemet
import asyncio

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import nest_asyncio
nest_asyncio.apply()

In [4]:
client = openai.AsyncOpenAI(
    base_url="https://api.llm7.io/v1",
    api_key="unused"
)
pulemet = Pulemet(rps = 2, pbar=tqdm)

Total: 0it [00:00, ?it/s]
[A

In [5]:
df = pd.read_csv('worldcities.csv')
df['description'] = ['']*len(df)
df.head(5)

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id,description
0,Tokyo,Tokyo,35.687,139.7495,Japan,JP,JPN,Tōkyō,primary,37785000.0,1392685764,
1,Jakarta,Jakarta,-6.175,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077,
2,Delhi,Delhi,28.61,77.23,India,IN,IND,Delhi,admin,32226000.0,1356872604,
3,Guangzhou,Guangzhou,23.13,113.26,China,CN,CHN,Guangdong,admin,26940000.0,1156237133,
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629,


In [20]:
async def ask_gpt(client, data, model = "gpt-4.1"):
    try:
        city_list = "; ".join([f"{name}, {latitude}, {longitude}" for name, latitude, longitude in data])
        response = await client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"You are creating riddles. If you get the name of city/town/village, you try to describe it without naming it or its coordinates. \
                    You usually use a description of the appearance of a city, famous facts about it, or the nearest iconic places. Never name the city/town/village.\
                    Use hints that distinguish the place described from other similar places in the world. Your description contains only a couple sentences. Use only english language in answer." },
                {"role": "user", "content": f"You will get a list of names,latitude,longitude of {len(list(data))} cities/towns/villages in <list> tokens, you should answer to them separately, separate them by <answer> token. <list>{city_list}<list>"}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Ошибка: {str(e)}"

In [7]:
batch = 10
tasks = pulemet.process([ask_gpt(client, zip(df[i:i+batch]['city_ascii'].values, df[i:i+batch]['lat'].values, df[i:i+batch]['lng'].values)) for i in range(0, len(df), batch)])
responses = await asyncio.gather(*tasks, return_exceptions=True)

Total: 100%|██████████| 4806/4806 [42:12<00:00,  2.00it/s]  

In [99]:
failed = []
responses_fixed = []
for i in range(0, len(df), batch):
    if len(('<answer>'+responses[i//batch]+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1]) != batch:
        responses_fixed.extend(['']*min(batch, len(df)-i))
        failed.append(i)
    else:
        responses_fixed.extend(('<answer>'+responses[i//batch]+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1])
failed

[3160,
 5300,
 6650,
 8200,
 8570,
 17110,
 17940,
 19140,
 20460,
 20480,
 20580,
 20610,
 29500,
 31340,
 31420,
 31430,
 31440,
 35040,
 35220,
 39840,
 42740,
 47030,
 48050]

In [None]:
df['description']= responses_fixed
df[df['description']!=''].to_csv('data/gpt41_markup.csv')
df.head(20)

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id,description
0,Tokyo,Tokyo,35.687,139.7495,Japan,JP,JPN,Tōkyō,primary,37785000.0,1392685764,This metropolis is renowned for its bright neo...
1,Jakarta,Jakarta,-6.175,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077,"This city sits on an archipelago, is known for..."
2,Delhi,Delhi,28.61,77.23,India,IN,IND,Delhi,admin,32226000.0,1356872604,This urban giant houses centuries-old forts an...
3,Guangzhou,Guangzhou,23.13,113.26,China,CN,CHN,Guangdong,admin,26940000.0,1156237133,"This city lies along the Pearl River, is celeb..."
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629,This coastal megacity is recognized by its col...
5,Manila,Manila,14.5958,120.9772,Philippines,PH,PHL,Manila,primary,24922000.0,1608618140,"This is a densely populated capital on a bay, ..."
6,Shanghai,Shanghai,31.2286,121.4747,China,CN,CHN,Shanghai,admin,24073000.0,1156073548,This city is recognized by its towering Orient...
7,São Paulo,Sao Paulo,-23.5504,-46.6339,Brazil,BR,BRA,São Paulo,admin,23086000.0,1076532519,This sprawling urban center is Brazil’s econom...
8,Seoul,Seoul,37.5667,126.9833,"Korea, South",KR,KOR,Seoul,primary,23016000.0,1410836482,"This place is famous for its K-pop stars, perf..."
9,Mexico City,Mexico City,19.4333,-99.1333,Mexico,MX,MEX,Ciudad de México,primary,21804000.0,1484247881,This high-altitude city was built on an ancien...


In [7]:
batch = 10
tasks = pulemet.process([ask_gpt(client, zip(df[i:i+batch]['city_ascii'].values, df[i:i+batch]['lat'].values, df[i:i+batch]['lng'].values), model='grok') for i in range(0, len(df), batch)])
responses_grok = await asyncio.gather(*tasks, return_exceptions=True)

Total: 100%|██████████| 4806/4806 [53:12<00:00,  1.97it/s]  

In [9]:
failed = []
responses_fixed_grok = []
for i in range(0, len(df), batch):
    if len(('<answer>'+responses_grok[i//batch]+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1]) != batch:
        responses_fixed_grok.extend(['']*min(batch, len(df)-i))
        failed.append(i)
    else:
        responses_fixed_grok.extend(('<answer>'+responses_grok[i//batch]+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1])
failed

[70,
 210,
 640,
 1580,
 2010,
 2530,
 2650,
 2830,
 2910,
 2930,
 3130,
 3140,
 3170,
 4090,
 4140,
 4150,
 4310,
 4400,
 4680,
 4880,
 4940,
 5220,
 5400,
 5700,
 5910,
 5920,
 5930,
 6290,
 6470,
 6940,
 7210,
 7240,
 7370,
 7490,
 7910,
 9110,
 9500,
 9590,
 9890,
 9940,
 10090,
 10730,
 10880,
 10910,
 11050,
 11410,
 11620,
 12340,
 12430,
 12580,
 12730,
 12970,
 13750,
 14700,
 14880,
 15360,
 15840,
 16220,
 16560,
 16930,
 16940,
 17460,
 17780,
 17820,
 18100,
 18260,
 18500,
 18820,
 19120,
 19130,
 19330,
 20290,
 20850,
 21260,
 21350,
 22100,
 23260,
 23320,
 23630,
 24180,
 24400,
 25070,
 26080,
 26110,
 27310,
 27390,
 27620,
 27760,
 27770,
 27890,
 27940,
 28730,
 28960,
 29070,
 29250,
 29390,
 29760,
 30920,
 31720,
 31730,
 32450,
 32510,
 32750,
 33210,
 33450,
 33890,
 34060,
 34600,
 34890,
 34960,
 35120,
 35250,
 35670,
 35900,
 36100,
 37200,
 37340,
 37990,
 38000,
 38380,
 38750,
 38990,
 39270,
 39980,
 40140,
 40450,
 41260,
 41300,
 41770,
 41920,
 420

In [None]:
df['description_grok']= responses_fixed_grok
df[df['description_grok']!=''].drop(['description'], axis=1).rename({'description_grok':'description'}, axis=1).to_csv('data/grok_markup.csv')


In [28]:
async def top5_landmarks(client, data, model = "gpt-4.1"):
    try:
        city_list = "; ".join([f"{name}, {latitude}, {longitude}" for name, latitude, longitude in data])
        response = await client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"You are a tour guide and your task is to advise the most famous and interesting places of the settlement. Don't name coordinates of this places. \
                    You can name this landmarks or describe it in couple words. Don't name settlement in description. You should give exactly a five landmarks for each settlement.\
                    Try to make these sights show the uniqueness of the settlement together. Use only english language in answer.\
                    Answer format: <answer><landmark>Name of landmark of 1st city<text>Description of landmark of 1st city<landmark>Next name of landmark of 1st city<text>Description of that landmark of 1st city<landmark><answer><landmark>Name of landmark of 2nd city<text>Description of landmark of 2nd city<landmark>Next name of landmark of 2nd city<text>Description of that landmark of 2nd city<landmark><answer>" },
                {"role": "user", "content": f"You will get a list of names,latitude,longitude of {len(list(data))} cities/towns/villages in <list> tokens. You should answer to them separately. <list>{city_list}<list>"}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Ошибка: {str(e)}"

In [51]:
batch = 10
tasks = pulemet.process([top5_landmarks(client, zip(df[i:i+batch]['city_ascii'].values, df[i:i+batch]['lat'].values, df[i:i+batch]['lng'].values)) for i in range(0, len(df), batch)])
responses_landmarks = await asyncio.gather(*tasks, return_exceptions=True)

Total: 100%|██████████| 9621/9621 [3:23:04<00:00,  1.47it/s]     

In [82]:
failed = []
df_name_landmarks = []
df_description_landmarks = []
for i in range(0, len(df), batch):
    cities = ('<answer>'+responses_landmarks[i//batch]+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1]
    if len(cities) != batch:
        failed.append(i)
    else:
        for j, city in enumerate(cities):
            landmarks = ('<landmark>'+city+'<landmark>').replace('<landmark><landmark>', '<landmark>').replace('<landmark><landmark>', '<landmark>').split('<landmark>')[1:-1]
            if len(landmarks) != 5:
                print(i, j, 'Error landmarks')
            else:
                for k, landmark in enumerate(landmarks):
                    try:
                        place, description = landmark.split('<text>')
                        df_name_landmarks.append(dict(list(df.iloc[i+j].drop(['description','description_grok']).to_dict().items()) + list({'description':place}.items())))
                        df_description_landmarks.append(dict(list(df.iloc[i+j].drop(['description','description_grok']).to_dict().items()) + list({'description':description}.items())))
                    except:
                        print(i,j,k, 'no description')
failed


270 4 Error landmarks
350 7 Error landmarks
620 7 Error landmarks
850 4 Error landmarks
1800 9 Error landmarks
1990 4 Error landmarks
4630 2 Error landmarks
7620 8 Error landmarks
9230 1 Error landmarks
9410 8 Error landmarks
9500 9 Error landmarks
10540 1 0 no description
10540 2 0 no description
10540 3 0 no description
10540 4 0 no description
10540 5 0 no description
10540 6 0 no description
10540 7 0 no description
10540 8 0 no description
10540 9 0 no description
11250 5 Error landmarks
11590 3 Error landmarks
11770 1 Error landmarks
13940 2 Error landmarks
16340 2 Error landmarks
16680 2 Error landmarks
17100 8 Error landmarks
17460 9 Error landmarks
18320 4 Error landmarks
19340 6 Error landmarks
20000 3 Error landmarks
21030 7 Error landmarks
22790 9 Error landmarks
24300 2 Error landmarks
25610 9 Error landmarks
26460 4 Error landmarks
29390 6 4 no description
29490 0 1 no description
29490 0 2 no description
29490 0 3 no description
29490 0 4 no description
29490 1 1 no desc

[50,
 430,
 3410,
 6840,
 8550,
 8730,
 10430,
 10530,
 10550,
 10560,
 10570,
 10600,
 10630,
 10650,
 10690,
 10700,
 11150,
 11780,
 13530,
 14670,
 16140,
 16440,
 19110,
 19390,
 22130,
 22650,
 22710,
 22940,
 23580,
 23820,
 23900,
 24030,
 25130,
 25420,
 25480,
 25780,
 26860,
 28230,
 28340,
 29520,
 29620,
 29630,
 29640,
 29650,
 29660,
 29670,
 29730,
 29740,
 30120,
 30170,
 32180,
 32430,
 34180,
 34240,
 34370,
 34810,
 34940,
 35250,
 37260,
 37370,
 37540,
 37970,
 38600,
 38610,
 39110,
 39200,
 39360,
 39630,
 41100,
 41340,
 42420,
 44330,
 44680,
 44690,
 44970,
 46790,
 47130,
 47780,
 48050]

In [None]:
pd.DataFrame.from_dict(df_name_landmarks).to_csv('data/landmarks_name_markup.csv')
pd.DataFrame.from_dict(df_description_landmarks).to_csv('data/landmarks_description_markup.csv')

In [None]:
df_cities = df.drop(['description','description_grok'], axis=1)
df_cities['description'] = df_cities['city_ascii']
df_cities.to_csv('data/cities_markup.csv')
df_cities

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id,description
0,Tokyo,Tokyo,35.6870,139.7495,Japan,JP,JPN,Tōkyō,primary,37785000.0,1392685764,Tokyo
1,Jakarta,Jakarta,-6.1750,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077,Jakarta
2,Delhi,Delhi,28.6100,77.2300,India,IN,IND,Delhi,admin,32226000.0,1356872604,Delhi
3,Guangzhou,Guangzhou,23.1300,113.2600,China,CN,CHN,Guangdong,admin,26940000.0,1156237133,Guangzhou
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629,Mumbai
...,...,...,...,...,...,...,...,...,...,...,...,...
48054,Al Jabīn,Al Jabin,14.7040,43.5990,Yemen,YE,YEM,Raymah,admin,,1887910100,Al Jabin
48055,Nelspruit,Nelspruit,-25.4745,30.9703,South Africa,ZA,ZAF,Mpumalanga,admin,,1710114438,Nelspruit
48056,Gqeberha,Gqeberha,-33.9681,25.5981,South Africa,ZA,ZAF,Eastern Cape,,,1710000082,Gqeberha
48057,Lupane,Lupane,-18.9315,27.8070,Zimbabwe,ZW,ZWE,Matabeleland North,admin,,1716206606,Lupane


In [21]:
async def ask_gpt_geographic(client, data, model = "gpt-4.1"):
    try:
        city_list = "; ".join([f"{name}, {latitude}, {longitude}" for name, latitude, longitude in data])
        response = await client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"You are creating riddles. If you get the name of city/town/village, you try to describe it without naming it or its coordinates. \
                    Use nearest geographic places to describe settlement. Never name the city/town/village.\
                    Use hints that distinguish the place described from other similar places in the world. Your description contains only a couple sentences. Use only english language in answer." },
                {"role": "user", "content": f"You will get a list of names,latitude,longitude of {len(list(data))} cities/towns/villages in <list> tokens, you should answer to them separately, separate them by <answer> token. <list>{city_list}<list>"}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Ошибка: {str(e)}"

In [24]:
batch = 10
tasks = pulemet.process([ask_gpt_geographic(client, zip(df[i:i+batch]['city_ascii'].values, df[i:i+batch]['lat'].values, df[i:i+batch]['lng'].values)) for i in range(0, len(df), batch)])
responses_geograpic = await asyncio.gather(*tasks, return_exceptions=True)

Total: 100%|██████████| 4811/4811 [41:49<00:00,  1.99it/s]  

In [25]:
failed = []
responses_fixed = []
for i in range(0, len(df), batch):
    if len(('<answer>'+responses_geograpic[i//batch]+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1]) != batch:
        responses_fixed.extend(['']*min(batch, len(df)-i))
        failed.append(i)
    else:
        responses_fixed.extend(('<answer>'+responses_geograpic[i//batch]+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1])
failed

[19490, 19580, 19810, 19860, 24010, 36660, 48050]

In [None]:
df['description']= responses_fixed
df[df['description']!=''].to_csv('data/gpt41_geographic_markup.csv')
df.head(20)

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id,description
0,Tokyo,Tokyo,35.687,139.7495,Japan,JP,JPN,Tōkyō,primary,37785000.0,1392685764,This sprawling metropolis lies on the eastern ...
1,Jakarta,Jakarta,-6.175,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077,Situated just south of the equator on the isla...
2,Delhi,Delhi,28.61,77.23,India,IN,IND,Delhi,admin,32226000.0,1356872604,"Resting along the banks of the Yamuna River, t..."
3,Guangzhou,Guangzhou,23.13,113.26,China,CN,CHN,Guangdong,admin,26940000.0,1156237133,"Found on the Pearl River delta, this city is l..."
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629,Located on the west coast of a peninsular nati...
5,Manila,Manila,14.5958,120.9772,Philippines,PH,PHL,Manila,primary,24922000.0,1608618140,This densely populated capital sits on the isl...
6,Shanghai,Shanghai,31.2286,121.4747,China,CN,CHN,Shanghai,admin,24073000.0,1156073548,Positioned on the east coast at the mouth of t...
7,São Paulo,Sao Paulo,-23.5504,-46.6339,Brazil,BR,BRA,São Paulo,admin,23086000.0,1076532519,Nestled on a plateau inland from the Atlantic ...
8,Seoul,Seoul,37.5667,126.9833,"Korea, South",KR,KOR,Seoul,primary,23016000.0,1410836482,"Not far from the border with North Korea, this..."
9,Mexico City,Mexico City,19.4333,-99.1333,Mexico,MX,MEX,Ciudad de México,primary,21804000.0,1484247881,"Resting in a high valley in central Mexico, th..."


In [27]:
async def ask_gpt_facts(client, data, model = "gpt-4.1-nano"):
    try:
        city_list = "; ".join([f"{name}, {latitude}, {longitude}" for name, latitude, longitude in data])
        response = await client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"You are creating short historical article. If you get the name of city/town/village, you try to describe it without naming it or its coordinates. \
                    Use historical facts or interesting facts to describe settlement. Never name the city/town/village.\
                    Use hints that distinguish the place described from other similar places in the world. Your description contains only a couple sentences. Use only english language in answer." },
                {"role": "user", "content": f"You will get a list of names,latitude,longitude of {len(list(data))} cities/towns/villages in <list> tokens, you should answer to them separately, separate them by <answer> token. <list>{city_list}<list>"}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Ошибка: {str(e)}"

In [28]:
batch = 10
tasks = pulemet.process([ask_gpt_facts(client, zip(df[i:i+batch]['city_ascii'].values, df[i:i+batch]['lat'].values, df[i:i+batch]['lng'].values)) for i in range(0, len(df), batch)])
responses_facts = await asyncio.gather(*tasks, return_exceptions=True)

Total: 100%|██████████| 9617/9617 [1:22:38<00:00,  2.00it/s]

In [29]:
failed = []
responses_fixed = []
for i in range(0, len(df), batch):
    if len(('<answer>'+responses_facts[i//batch]+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1]) != batch:
        responses_fixed.extend(['']*min(batch, len(df)-i))
        failed.append(i)
    else:
        responses_fixed.extend(('<answer>'+responses_facts[i//batch]+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1])
failed

[160,
 300,
 460,
 570,
 610,
 720,
 960,
 1030,
 1190,
 1200,
 1280,
 1370,
 1540,
 1570,
 1720,
 1750,
 1790,
 1880,
 1890,
 2030,
 2320,
 2450,
 2460,
 2550,
 2640,
 2830,
 3110,
 3230,
 3240,
 3280,
 3340,
 3390,
 3400,
 3420,
 3500,
 3630,
 3650,
 3820,
 3870,
 4000,
 4050,
 4140,
 4200,
 4470,
 4670,
 4760,
 5140,
 5190,
 5310,
 5340,
 5380,
 5450,
 5610,
 5620,
 5690,
 6000,
 6250,
 6380,
 6410,
 6550,
 6620,
 6740,
 6810,
 6850,
 6870,
 7020,
 7050,
 7090,
 7160,
 7180,
 7410,
 7440,
 7560,
 7740,
 7950,
 7990,
 8050,
 8190,
 8240,
 8410,
 8480,
 8520,
 8570,
 8600,
 8770,
 8860,
 8930,
 8960,
 9020,
 9170,
 9260,
 9280,
 9360,
 9570,
 9610,
 9690,
 9800,
 9870,
 9880,
 9930,
 10310,
 10480,
 10560,
 10640,
 10720,
 10860,
 10920,
 11040,
 11060,
 11250,
 11390,
 11430,
 11470,
 11740,
 11770,
 11800,
 11870,
 11920,
 12000,
 12080,
 12220,
 12310,
 12420,
 12590,
 12730,
 12760,
 12790,
 12820,
 12860,
 13000,
 13030,
 13040,
 13060,
 13120,
 13560,
 13620,
 13640,
 13700,
 13

In [None]:
df['description']= responses_fixed
df[df['description']!=''].to_csv('data/gpt41nano_facts_markup.csv')
df.head(20)

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id,description
0,Tokyo,Tokyo,35.687,139.7495,Japan,JP,JPN,Tōkyō,primary,37785000.0,1392685764,This metropolis is renowned for its towering m...
1,Jakarta,Jakarta,-6.175,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077,This capital city is situated along a major ri...
2,Delhi,Delhi,28.61,77.23,India,IN,IND,Delhi,admin,32226000.0,1356872604,"As the capital of a vast northern plain, this ..."
3,Guangzhou,Guangzhou,23.13,113.26,China,CN,CHN,Guangdong,admin,26940000.0,1156237133,"This south Chinese city, once a major trading ..."
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629,This coastal city is India's financial capital...
5,Manila,Manila,14.5958,120.9772,Philippines,PH,PHL,Manila,primary,24922000.0,1608618140,"This insular city, located on an archipelago i..."
6,Shanghai,Shanghai,31.2286,121.4747,China,CN,CHN,Shanghai,admin,24073000.0,1156073548,This global financial hub on China's eastern c...
7,São Paulo,Sao Paulo,-23.5504,-46.6339,Brazil,BR,BRA,São Paulo,admin,23086000.0,1076532519,"Located on a high-altitude plateau, this South..."
8,Seoul,Seoul,37.5667,126.9833,"Korea, South",KR,KOR,Seoul,primary,23016000.0,1410836482,This mainly Seoul-based metropolis is a histor...
9,Mexico City,Mexico City,19.4333,-99.1333,Mexico,MX,MEX,Ciudad de México,primary,21804000.0,1484247881,"This historic city, situated in the heart of a..."


In [34]:
async def ask_deepseek(client, data, model = "deepseek-r1"):
    try:
        city_list = "; ".join([f"{name}, {latitude}, {longitude}" for name, latitude, longitude in data])
        response = await client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"You are creating preview of article in journal. If you get the name of city/town/village, you try to describe it without naming it or its coordinates. \
                    Use nearest geographic objects or famous landmarks to describe settlement. Use simple phases. Never name the city/town/village.\
                    Use hints that distinguish the place described from other similar places in the world. Your description contains only a couple sentences. Use only english language in answer." },
                {"role": "user", "content": f"You will get a list of names,latitude,longitude of {len(list(data))} cities/towns/villages in <list> tokens, you should answer to them separately, separate them by <answer> token. <list>{city_list}<list>"}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Ошибка: {str(e)}"

In [44]:
batch = 10
tasks = pulemet.process([ask_deepseek(client, zip(df[i:i+batch]['city_ascii'].values, df[i:i+batch]['lat'].values, df[i:i+batch]['lng'].values)) for i in range(0, len(df), batch)])
responses_deepseek = await asyncio.gather(*tasks, return_exceptions=True)

Total: 100%|██████████| 14433/14433 [5:24:36<00:00,  1.18s/it]    

In [45]:
failed = []
responses_fixed = []
for i in range(0, len(df), batch):
    if len(('<answer>'+responses_deepseek[i//batch]+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1]) != batch:
        responses_fixed.extend(['']*min(batch, len(df)-i))
        failed.append(i)
    else:
        responses_fixed.extend(('<answer>'+responses_deepseek[i//batch].replace('</answer>', '')+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1])
failed

[1300,
 1540,
 3070,
 3110,
 3230,
 3440,
 3450,
 3490,
 3500,
 3550,
 3560,
 3570,
 3580,
 3600,
 3670,
 3690,
 3710,
 3720,
 3860,
 4100,
 5610,
 7070,
 7510,
 8140,
 8410,
 9130,
 9160,
 9180,
 9240,
 9380,
 9420,
 10730,
 11460,
 11470,
 11560,
 11580,
 11590,
 11640,
 11650,
 12450,
 16620,
 18420,
 20020,
 20180,
 23790,
 24430,
 24780,
 29870,
 30050,
 30100,
 31350,
 32140,
 37460,
 38520,
 39320,
 39520,
 40200,
 41050,
 41610,
 43290,
 44330,
 46680,
 48050]

In [None]:
df['description']= responses_fixed
df[df['description']!=''].to_csv('data/deepseekr1_markup.csv')
df.head(20)

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id,description
0,Tokyo,Tokyo,35.687,139.7495,Japan,JP,JPN,Tōkyō,primary,37785000.0,1392685764,A metropolis near a snow-capped stratovolcano ...
1,Jakarta,Jakarta,-6.175,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077,A coastal capital flanked by a strait notoriou...
2,Delhi,Delhi,28.61,77.23,India,IN,IND,Delhi,admin,32226000.0,1356872604,"Built along a sacred river, this city lies clo..."
3,Guangzhou,Guangzhou,23.13,113.26,China,CN,CHN,Guangdong,admin,26940000.0,1156237133,Positioned in a fertile delta where a major ri...
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629,A peninsula city facing an ancient maritime tr...
5,Manila,Manila,14.5958,120.9772,Philippines,PH,PHL,Manila,primary,24922000.0,1608618140,Overlooking a natural harbor and a bay once ra...
6,Shanghai,Shanghai,31.2286,121.4747,China,CN,CHN,Shanghai,admin,24073000.0,1156073548,"At the estuary of a continent’s longest river,..."
7,São Paulo,Sao Paulo,-23.5504,-46.6339,Brazil,BR,BRA,São Paulo,admin,23086000.0,1076532519,Nestled below a mountain range draped in Atlan...
8,Seoul,Seoul,37.5667,126.9833,"Korea, South",KR,KOR,Seoul,primary,23016000.0,1410836482,Encircled by fortress walls and bisected by a ...
9,Mexico City,Mexico City,19.4333,-99.1333,Mexico,MX,MEX,Ciudad de México,primary,21804000.0,1484247881,Set in a highland basin surrounded by active v...


In [47]:
async def ask_deepseek_geographic(client, data, model = "deepseek-r1"):
    try:
        city_list = "; ".join([f"{name}, {latitude}, {longitude}" for name, latitude, longitude in data])
        response = await client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"You are helping to find settlement on the map. If you get the name of city/town/village, you try to describe it without naming it or its coordinates. \
                    Use only nearest natural geographic objects and name them to describe settlement. Use simple phases. Never name the city/town/village.\
                    Use hints that distinguish the place described from other similar places in the world. Your description contains only a couple sentences. Use only english language in answer." },
                {"role": "user", "content": f"You will get a list of names,latitude,longitude of {len(list(data))} cities/towns/villages in <list> tokens, you should answer to them separately, separate them by <answer> token. <list>{city_list}<list>"}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Ошибка: {str(e)}"

In [48]:
batch = 10
tasks = pulemet.process([ask_deepseek_geographic(client, zip(df[i:i+batch]['city_ascii'].values, df[i:i+batch]['lat'].values, df[i:i+batch]['lng'].values)) for i in range(0, len(df), batch)])
responses_deepseek_geographic = await asyncio.gather(*tasks, return_exceptions=True)

Total: 100%|██████████| 19239/19239 [6:51:16<00:00,  1.37s/it]   

In [49]:
failed = []
responses_fixed = []
for i in range(0, len(df), batch):
    if len(('<answer>'+responses_deepseek_geographic[i//batch].replace('</answer>', '')+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1]) != batch:
        responses_fixed.extend(['']*min(batch, len(df)-i))
        failed.append(i)
    else:
        responses_fixed.extend(('<answer>'+responses_deepseek_geographic[i//batch].replace('</answer>', '')+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1])
failed

[1780,
 4260,
 5990,
 10340,
 10830,
 12560,
 14990,
 15010,
 15100,
 15110,
 18940,
 20410,
 21740,
 26540,
 26820,
 29190,
 31730,
 31790,
 31810,
 37690,
 46150,
 48050]

In [None]:
df['description']= responses_fixed
df[df['description']!=''].to_csv('data/deepseekr1_geographic_markup.csv')
df.head(20)

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id,description
0,Tokyo,Tokyo,35.687,139.7495,Japan,JP,JPN,Tōkyō,primary,37785000.0,1392685764,"A coastal metropolis near a volcanic peak, nes..."
1,Jakarta,Jakarta,-6.175,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077,A tropical lowland port surrounded by dense ra...
2,Delhi,Delhi,28.61,77.23,India,IN,IND,Delhi,admin,32226000.0,1356872604,"A sprawling inland hub along a historic river,..."
3,Guangzhou,Guangzhou,23.13,113.26,China,CN,CHN,Guangdong,admin,26940000.0,1156237133,A delta city where a mighty river meets the se...
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629,A peninsula settlement flanked by tidal creeks...
5,Manila,Manila,14.5958,120.9772,Philippines,PH,PHL,Manila,primary,24922000.0,1608618140,A bay-facing urban center straddling a river t...
6,Shanghai,Shanghai,31.2286,121.4747,China,CN,CHN,Shanghai,admin,24073000.0,1156073548,"A river-mouth megacity on alluvial plains, whe..."
7,São Paulo,Sao Paulo,-23.5504,-46.6339,Brazil,BR,BRA,São Paulo,admin,23086000.0,1076532519,A highland plateau city near the escarpment of...
8,Seoul,Seoul,37.5667,126.9833,"Korea, South",KR,KOR,Seoul,primary,23016000.0,1410836482,A river-valley capital surrounded by forested ...
9,Mexico City,Mexico City,19.4333,-99.1333,Mexico,MX,MEX,Ciudad de México,primary,21804000.0,1484247881,A high-altitude basin settlement encircled by ...


In [None]:
pd.concat([
    pd.read_csv('landmarks_name_markup.csv', index_col=0),
    pd.read_csv('landmarks_description_markup.csv', index_col=0),
    pd.read_csv('gpt41_markup.csv', index_col=0),
    pd.read_csv('grok_markup.csv', index_col=0),
    pd.read_csv('cities_markup.csv', index_col=0),
    pd.read_csv('gpt41_geographic_markup.csv', index_col=0),
    pd.read_csv('gpt41nano_facts_markup.csv', index_col=0),
    pd.read_csv('deepseekr1_markup.csv', index_col=0),
    pd.read_csv('deepseekr1_geographic_markup.csv', index_col=0),
    ]).to_csv('data/full_markup.csv')

In [6]:
async def ask_test_qwen(client, data, model = "qwen2.5-coder-32b-instruct"):
    try:
        city_list = "; ".join([f"{name}, {latitude}, {longitude}" for name, latitude, longitude in data])
        response = await client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"You are creating riddles. If you get the name of city/town/village, you try to describe it without naming it or its coordinates. \
                    You usually use a description of the appearance of a city, famous facts about it, or the nearest iconic places. Never name the city/town/village.\
                    Use hints that distinguish the place described from other similar places in the world. Your description contains only a couple sentences. Use only english language in answer." },
                {"role": "user", "content": f"You will get a list of names,latitude,longitude of {len(list(data))} cities/towns/villages in <list> tokens, you should answer to them separately, separate them by <answer> token. <list>{city_list}<list>"}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Ошибка: {str(e)}"

In [7]:
test_sample = df[:20000].sample(2000)
test_sample

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id,description
9027,Brentwood,Brentwood,37.9356,-121.7190,United States,US,USA,California,,64609.0,1840018904,
15237,Guacarí,Guacari,3.7667,-76.3333,Colombia,CO,COL,Valle del Cauca,minor,35513.0,1170383272,
3961,Zhanlicun,Zhanlicun,23.2881,116.2594,China,CN,CHN,Guangdong,,161439.0,1156184947,
7113,Arujá,Aruja,-23.3967,-46.3211,Brazil,BR,BRA,São Paulo,minor,83939.0,1076436872,
7937,Novotroitsk,Novotroitsk,51.2039,58.3114,Russia,RU,RUS,Orenburgskaya Oblast’,minor,74469.0,1643186193,
...,...,...,...,...,...,...,...,...,...,...,...,...
538,Kharkiv,Kharkiv,49.9925,36.2311,Ukraine,UA,UKR,Kharkivska Oblast,admin,1421125.0,1804588111,
2673,Bata,Bata,1.8650,9.7700,Equatorial Guinea,GQ,GNQ,Litoral,admin,250770.0,1226528087,
10636,Bisceglie,Bisceglie,41.2409,16.5021,Italy,IT,ITA,Puglia,,53534.0,1380547143,
1782,Nangong,Nangong,37.3590,115.4090,China,CN,CHN,Hebei,minor,396718.0,1156426803,


In [8]:
batch = 10
tasks = pulemet.process([ask_test_qwen(client, zip(test_sample[i:i+batch]['city_ascii'].values, test_sample[i:i+batch]['lat'].values, test_sample[i:i+batch]['lng'].values)) for i in range(0, len(test_sample), batch)])
responses_test = await asyncio.gather(*tasks, return_exceptions=True)

Total: 100%|██████████| 200/200 [03:57<00:00,  1.85it/s] 

In [9]:
failed = []
responses_fixed = []
for i in range(0, len(test_sample), batch):
    if len(('<answer>'+responses_test[i//batch].replace('</answer>', '')+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1]) != batch:
        responses_fixed.extend(['']*min(batch, len(test_sample)-i))
        failed.append(i)
    else:
        responses_fixed.extend(('<answer>'+responses_test[i//batch].replace('</answer>', '')+'<answer>').replace('\n', '').replace('<answer><answer>', '<answer>').replace('<answer><answer>', '<answer>').split('<answer>')[1:-1])
failed

[140, 730, 1430, 1680]

In [None]:
test_sample['description']= responses_fixed
test_sample[test_sample['description']!=''].to_csv('data/test_markup.csv')
test_sample.head(20)

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id,description
9027,Brentwood,Brentwood,37.9356,-121.719,United States,US,USA,California,,64609.0,1840018904,A suburban area known for its affluent lifesty...
15237,Guacarí,Guacari,3.7667,-76.3333,Colombia,CO,COL,Valle del Cauca,minor,35513.0,1170383272,"Nestled in the lush Colombian Andes, this smal..."
3961,Zhanlicun,Zhanlicun,23.2881,116.2594,China,CN,CHN,Guangdong,,161439.0,1156184947,"Located in southern China, this village is kno..."
7113,Arujá,Aruja,-23.3967,-46.3211,Brazil,BR,BRA,São Paulo,minor,83939.0,1076436872,"Situated in southeastern Brazil, this city is ..."
7937,Novotroitsk,Novotroitsk,51.2039,58.3114,Russia,RU,RUS,Orenburgskaya Oblast’,minor,74469.0,1643186193,"In the heart of Russia, this city is an import..."
2629,Tokushima,Tokushima,34.0667,134.55,Japan,JP,JPN,Tokushima,admin,254510.0,1392795984,This city on the island of Shikoku is renowned...
12387,Timbó,Timbo,-26.8233,-49.2717,Brazil,BR,BRA,Santa Catarina,minor,44977.0,1076339334,"Located in southern Brazil, this city is known..."
4858,Santo Tomas,Santo Tomas,7.53,125.62,Philippines,PH,PHL,Davao del Norte,,128667.0,1608792041,Nestled on the eastern coast of the Philippine...
4314,Jequié,Jequie,-13.8578,-40.0839,Brazil,BR,BRA,Bahia,minor,147202.0,1076558785,"In the northeastern part of Brazil, this city ..."
4135,Andong,Andong,36.5592,128.7289,"Korea, South",KR,KOR,Gyeongbuk,admin,153306.0,1410095935,"Located in southeastern Korea, this city is kn..."
