works with https://tech.yandex.ru/maps/geosearch/doc/concepts/request-docpage/

In [9]:
import pandas as pd
import requests
import os

### importing data

In [10]:
# reading api keys
api_keys = pd.read_excel('../api_keys.xlsx')
api_keys.set_index('key_name', inplace=True)

# API Yandex organization search
search_api_key = api_keys.loc['yandex_search']['key']

In [11]:
# coords table
coords = pd.read_excel('./input_params.xlsx', sheet_name = 'coords')
coords.set_index('point_name', inplace=True)

def extract_lon_lat(location):
    loc = coords.loc[location]
    return loc['lon'], loc['lat']

# south-west bounding box coordinates
sw_lon, sw_lat = extract_lon_lat('southwest')

# north-east bounding box coordinates
ne_lon, ne_lat = extract_lon_lat('northeast')

In [12]:
# table of items to request
request_list = pd.read_excel('./input_params.xlsx', sheet_name = 'request_example')


In [13]:
cat_list = list(request_list.columns)
cat_list

['education',
 'healthcare',
 'sport',
 'culture',
 'shops',
 'services',
 'food',
 'leisure',
 'parks',
 'government',
 'transport']

In [14]:
# lists of items without missing values
for item_type in cat_list:
    list_of_items = request_list[item_type].dropna().to_list()
    print(list_of_items)

['общеобразовательная школа', 'гимназия', 'лицей', 'частная школа', 'школа-интернат', 'детский сад', 'вуз', 'колледж', 'училище', 'художественная школа', 'музыкальная школа']
['поликлиника', 'больница', 'пункт скорой помощи', 'фельдшерско-амбулаторный пункт', 'лаборатория анализов', 'аптека', 'женская консультация', 'стоматологическая клиника', 'травмпункт']
['стадион', 'бассейн', 'каток', 'спортивная школа', 'спортивный клуб', 'спортплощадка', 'спортивная секция', 'воркаут', 'футбольное поле', 'теннисный корт', 'хоккейная коробка']
['кинотеатр', 'концертный зал', 'музей', 'выставка', 'галерея', 'дом культуры', 'библиотека', 'театр', 'цирк', 'зоопарк']
['супермаркет', 'магазин продуктов', 'гипермаркет', 'магазин одежды', 'магазин обуви', 'промтовары', 'хозтовары', 'пекарня', 'магазин спортивных товаров', 'магазин электроники', 'салон связи', 'товары для дома', 'алкогольный магазин', 'ювелирный магазин', 'рынок', 'рыбный магазин', 'цветы', 'магазин табака', 'книжный магазин', 'магазин к

### request

In [15]:
# in case we go out of limit with skip counts

print(sw_lon, sw_lat, ne_lon, ne_lat)

def split_into_quadrants(sw_lon, sw_lat, ne_lon, ne_lat):
    for slon in [sw_lon, (sw_lon + ne_lon) / 2]:
        for slat in [sw_lat, (sw_lat + ne_lat) / 2]:
            yield slon, slat, slon + (ne_lon - sw_lon) / 2, slat + (ne_lat - sw_lat) / 2

print("\n".join(map(str, list(split_into_quadrants(sw_lon, sw_lat, ne_lon, ne_lat)))))

47.300052 42.846594 47.704613 43.070313
(47.300052, 42.846594, 47.5023325, 42.958453500000005)
(47.300052, 42.958453500000005, 47.5023325, 43.070313)
(47.5023325, 42.846594, 47.704613, 42.958453500000005)
(47.5023325, 42.958453500000005, 47.704613, 43.070313)


In [16]:
def split_single_query_request(search_query, sw_lon, sw_lat, ne_lon, ne_lat):
    print(f"Splitting {search_query} by 4 at", sw_lon, sw_lat, ne_lon, ne_lat)
    results = []
    for (lon1, lat1, lon2, lat2) in split_into_quadrants(sw_lon, sw_lat, ne_lon, ne_lat):
        results += single_query_request(search_query, lon1, lat1, lon2, lat2)
    return results


def single_query_request(search_query, sw_lon, sw_lat, ne_lon, ne_lat):
    URL = 'https://search-maps.yandex.ru/v1/'
    bbox = f"{sw_lon:.6f},{sw_lat:.6f}~{ne_lon:.6f},{ne_lat:.6f}"
    results = []
    for skip_cnt in [0, 500, 1000]:
        params = { 
            'text' : search_query, 
            'apikey': search_api_key,
            'lang': 'ru_RU',
            'type':'biz',
            'bbox': bbox,
            'rspn': 1,
            'results': 500,
            'skip': skip_cnt
        }
        response = requests.get(URL, params=params)
        response_json = response.json()
        results += response_json['features']
        feature_count = len(response_json['features'])
        if feature_count < 500:
            break
    else:
        return split_single_query_request(search_query, sw_lon, sw_lat, ne_lon, ne_lat)
    return results

In [17]:
def organization_request(item_type):
    # list of queries from column
    list_of_items = request_list[item_type].dropna().to_list()
    
    # request parameters - URL and bounding box
        
    results = []
    
    # request
    for search_query in list_of_items:
        results += single_query_request(search_query, sw_lon, sw_lat, ne_lon, ne_lat)

    # take only unique values is results

    result_dict = {}
    for result in results:
        result_dict[result['properties']['CompanyMetaData']['id']] = result
    results_unique = list(result_dict.values())

    # transform json, delete exceed columns, create category columns
    for res in results_unique:
        res.update(res['properties'])
        res['lon'], res['lat'] = res['geometry']['coordinates']
        res.update(res['CompanyMetaData'])
        if 'class' not in res['Categories'][0]:
            print("No 'class' in category: ", res['Categories'][0])
        res['category'] = res['Categories'][0].get('class', 'UNKNOWN')
        res['category_name'] = res['Categories'][0].get('name', 'UNKNOWN')
        for cat in res['Categories']:
            res['cat_' + cat.get('class', 'UNKNOWN').replace(' ', '_')] = True
        for key in ['properties', 'CompanyMetaData', 'boundedBy', 'type', 'geometry', 'url', 'Phones', 'Hours', 'Categories']:
            if key in res:
                del res[key]

    # create dataframe out of transformed results
    df = pd.DataFrame(results_unique)
    
    return df

### saving result

In [18]:
# create dir for output files
os.makedirs("output", exist_ok=True)

In [19]:
# create csv files for every category in request

for item_type in cat_list:
    df = organization_request(item_type)
    df.to_csv("./output/{}.csv".format(str(item_type))) 

Splitting магазин продуктов by 4 at 47.300052 42.846594 47.704613 43.070313
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in category:  {'name': 'Сквер'}
No 'class' in categor

### joining result files

In [21]:
df = pd.DataFrame()
        
for item_type in cat_list:
    tab = pd.read_csv("./output/{}.csv".format(str(item_type)))
    print (item_type, len(tab))
    df = df.append(tab)
        
    print ('total with duplicates', len(df))

    df.set_index('Unnamed: 0', inplace=True)

    df.drop_duplicates(inplace=True)
    print ('total without duplicates',len(df))

    df.to_csv('./output/yandex_total.csv')

education 581
total with duplicates 581
total without duplicates 581
healthcare 1090
total with duplicates 1671
total without duplicates 1671
sport 278
total with duplicates 1949
total without duplicates 1947
culture 82
total with duplicates 2029
total without duplicates 2029
shops 5303
total with duplicates 7332
total without duplicates 7328
services 4264
total with duplicates 11592
total without duplicates 11467
food 2071
total with duplicates 13538
total without duplicates 13419
leisure 406
total with duplicates 13825
total without duplicates 13750
parks 44
total with duplicates 13794
total without duplicates 13766
government 238
total with duplicates 14004
total without duplicates 14003
transport 837
total with duplicates 14840
total without duplicates 14835
