In [1]:
import json
import pandas as pd
import numpy as np

from jellyfish import jaro_winkler_similarity

import pyperclip

In [155]:
from tqdm.auto import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [276]:
df_places = pd.read_parquet('data/places_SK.parquet')

In [277]:
df_multisport = pd.read_parquet('data/multisport.parquet')

In [278]:
len(df_places)

3725

In [26]:
len(df_multisport)

881

# Parsovanie vlastností

## Places

In [286]:
# Flatten the nested place records into plain columns: the primary display
# name, plus the free-form street and locality of the first listed address.
df_places['name'] = df_places['names'].apply(lambda names: names['primary'])
df_places['address'] = df_places['addresses'].apply(lambda addrs: addrs[0]['freeform'])
df_places['city'] = df_places['addresses'].apply(lambda addrs: addrs[0]['locality'])

In [287]:
import re

def extract_address_parts(addr):
    """Split a two-line address into street, postal code and city.

    The input is expected to hold the street on the first line and
    "<postal code> <city>" on the second, where the postal code is five
    digits, optionally written as "NNN NN".

    Args:
        addr: The raw address string (any other value is passed through).

    Returns:
        A ``(freeform, postal_code, city)`` tuple. ``postal_code`` has its
        inner space removed, and is ``None`` when the second line does not
        start with a five-digit code. Input that is not a string, or that
        does not consist of exactly two lines, is returned unparsed as
        ``(addr, None, None)``.
    """
    # Best-effort parser: anything that is not exactly "street\ncity line"
    # is handed back untouched instead of raising.
    if not isinstance(addr, str):
        return addr, None, None
    parts = addr.split('\n')
    if len(parts) != 2:
        return addr, None, None

    freeform, city = parts
    match = re.match(r'^(\d{3}\s?\d{2})\s+(.+)$', city)
    if match:
        postal_code = match.group(1).replace(' ', '')
        city = match.group(2)
    else:
        # No recognisable postal code: only strip stray leading digits from
        # the city line. (This previously referenced an undefined name, so
        # the broad except turned every such address into an unparsed one.)
        city = re.sub(r'^\d+\s*', '', city)
        postal_code = None
    return freeform, postal_code, city

## Multisport

In [288]:
# Parse every raw address once and fan the resulting
# (freeform, postcode, locality) tuples out into three new columns.
(
    df_multisport['address_freeform'],
    df_multisport['address_postcode'],
    df_multisport['address_locality'],
) = zip(*df_multisport['address'].apply(extract_address_parts))

# Merging

In [289]:
from collections import defaultdict

PLACES_TO_MULTISPORT_MAP = defaultdict(list)

In [290]:
# Load the curated pairing of Multisport URL -> place id.
# The same path is written by the manual-pairing cells at the end of this
# notebook, so those cells must have been run at least once beforehand.
with open('data/multisport_to_places_map.json', 'r', encoding='utf-8') as f:
    MULTISPORT_TO_PLACES_MAP = json.load(f)

In [291]:
# Invert the pairing: one place id may collect several Multisport URLs.
for multisport_url, place_id in MULTISPORT_TO_PLACES_MAP.items():
    PLACES_TO_MULTISPORT_MAP[place_id].append(multisport_url)

In [292]:
len(PLACES_TO_MULTISPORT_MAP)

444

In [293]:
len(MULTISPORT_TO_PLACES_MAP)

445

In [294]:
df_multisport.columns

Index(['name', 'url', 'content', 'address', 'tags', 'phone', 'email',
       'facebook', 'web', 'description', 'logo', 'pictures', 'coordinates',
       'address_freeform', 'address_postcode', 'address_locality'],
      dtype='object')

In [295]:
df_places.columns

Index(['id', 'geometry', 'bbox', 'version', 'update_time', 'sources', 'names',
       'categories', 'confidence', 'websites', 'socials', 'emails', 'phones',
       'brand', 'addresses', 'main_category', 'coordinates', 'facebook_url',
       'name', 'address', 'city'],
      dtype='object')

In [296]:
df_places.phones[0]

array(['+421911384485'], dtype=object)

In [297]:
import tldextract
from urllib.parse import urlparse, urlunparse, parse_qs

def normalise_url(url):
    """Rewrite ``url`` into a canonical form so equivalent links compare equal.

    The result is always ``https://`` followed by the registered domain
    reported by ``tldextract``, then the path with trailing slashes removed,
    then ``?<query>`` when the URL has a query string. Nothing else from the
    original URL is carried over.
    """
    registered = tldextract.extract(url).registered_domain
    pieces = urlparse(url)
    tail = pieces.path.rstrip('/')
    if pieces.query:
        tail = f'{tail}?{pieces.query}'
    return f'https://{registered}{tail}'

In [350]:
# Column order of the merged venue table.
MERGED_COLUMNS = [
    'overture_maps_id',
    'multisport_urls',
    'names',
    'websites',
    'facebooks',
    'phones',
    'description',
    'logo',
    'pictures',
    'address_freeform',
    'address_locality',
    'address_postcode',
    'address_region',
    'coordinates',
    'main_category',
    'tags',
]


def _as_list(value):
    """Return ``value`` as a plain list, treating ``None``/scalar NaN as empty.

    ``pd.isna`` on an array-like returns an array whose truth value is
    ambiguous as soon as it holds more than one element, so the missing-value
    test is only applied to scalars.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return []
    return list(value)


def _unique(values):
    """Drop duplicates while keeping first-seen order (stable across runs)."""
    return list(dict.fromkeys(values))


# One record per place, enriched with the data of every Multisport venue
# paired to it. Records are collected in a list and turned into a DataFrame
# once, instead of re-copying the whole frame with pd.concat on every row.
merged_records = []
for i, row in tqdm(df_places.iterrows(), total=len(df_places)):
    # .get() rather than indexing: indexing a defaultdict would insert an
    # empty list for every place that has no Multisport match.
    multisport_urls = PLACES_TO_MULTISPORT_MAP.get(row['id'], [])
    df_I = df_multisport[df_multisport['url'].isin(multisport_urls)]
    # Description, logo and pictures are taken from the first paired venue only.
    first_match = df_I.iloc[0] if len(df_I) else None
    primary_address = row['addresses'][0]

    websites = _as_list(row['websites'])
    facebooks = [] if pd.isna(row['facebook_url']) else [row['facebook_url']]
    phones = _as_list(row['phones'])

    tags = _as_list(row['categories']['alternate'])
    for multisport_tags in df_I['tags']:
        tags.extend(_as_list(multisport_tags))

    new_row = {
        'overture_maps_id': row['id'],
        'multisport_urls': df_I['url'].tolist(),
        'names': _unique([row['name']] + df_I['name'].tolist()),
        'websites': _unique(websites + df_I['web'].dropna().tolist()),
        'facebooks': _unique(facebooks + df_I['facebook'].dropna().tolist()),
        'phones': _unique(phones + df_I['phone'].dropna().tolist()),
        'description': None if first_match is None else first_match['description'],
        'logo': None if first_match is None else first_match['logo'],
        'pictures': (
            None
            if (first_match is None or first_match['pictures'] is None)
            else list(first_match['pictures'])
        ),
        'address_freeform': _unique([primary_address['freeform']] + df_I['address_freeform'].tolist()),
        'address_locality': _unique([primary_address['locality']] + df_I['address_locality'].tolist()),
        'address_postcode': _unique([primary_address['postcode']] + df_I['address_postcode'].tolist()),
        'address_region': primary_address['region'],
        'coordinates': list(row['coordinates']),
        'main_category': row['categories']['main'],
        'tags': tags,
    }

    # Remove None, normalise urls (and de-duplicate again, since different
    # spellings may collapse to the same normalised URL).
    for key in ['websites', 'facebooks']:
        new_row[key] = _unique(normalise_url(v) for v in new_row[key] if v is not None)

    merged_records.append(new_row)

df_merged = pd.DataFrame(merged_records, columns=MERGED_COLUMNS)


  0%|                                               | 0/3725 [00:00<?, ?it/s][A
  2%|▋                                    | 70/3725 [00:00<00:05, 672.83it/s][A
  4%|█▎                                  | 141/3725 [00:00<00:05, 678.48it/s][A
  6%|██                                  | 214/3725 [00:00<00:05, 689.10it/s][A
  8%|██▊                                 | 287/3725 [00:00<00:04, 694.03it/s][A
 10%|███▌                                | 363/3725 [00:00<00:04, 707.24it/s][A
 12%|████▏                               | 437/3725 [00:00<00:04, 708.65it/s][A
 14%|████▉                               | 510/3725 [00:00<00:04, 706.35it/s][A
 16%|█████▌                              | 581/3725 [00:00<00:04, 698.74it/s][A
 17%|██████▎                             | 651/3725 [00:00<00:04, 690.66it/s][A
 19%|██████▉                             | 724/3725 [00:01<00:04, 694.07it/s][A
 21%|███████▋                            | 797/3725 [00:01<00:04, 696.42it/s][A
 23%|████████▍             

In [351]:
# Multisport venues that were not paired with any place are appended as
# stand-alone rows (no place id, region or main category).
df_m = df_multisport[~df_multisport.url.isin(MULTISPORT_TO_PLACES_MAP.keys())]

unmatched_records = []
for i, row in tqdm(df_m.iterrows(), total=len(df_m)):
    unmatched_records.append({
        'overture_maps_id': None,
        'multisport_urls': [row['url']],
        'names': [row['name']],
        'websites': [row['web']] if not pd.isna(row['web']) else [],
        'facebooks': [row['facebook']] if not pd.isna(row['facebook']) else [],
        'phones': [row['phone']] if not pd.isna(row['phone']) else [],
        'description': row['description'],
        'logo': row['logo'],
        'pictures': None if row['pictures'] is None else list(row['pictures']),
        'address_freeform': [row['address_freeform']],
        'address_locality': [row['address_locality']],
        'address_postcode': [row['address_postcode']],
        'address_region': None,
        'coordinates': [float(x) for x in row['coordinates']],
        'main_category': None,
        'tags': row['tags'],
    })

# Append the whole batch with a single concat; concatenating inside the loop
# re-copies the growing frame on every iteration.
# NOTE: this cell is not idempotent - re-running it appends the rows again.
if unmatched_records:
    df_merged = pd.concat([df_merged, pd.DataFrame(unmatched_records)], ignore_index=True)


  0%|                                                | 0/436 [00:00<?, ?it/s][A
 27%|█████████▋                          | 117/436 [00:00<00:00, 1124.58it/s][A
 55%|███████████████████▉                | 241/436 [00:00<00:00, 1163.83it/s][A
100%|████████████████████████████████████| 436/436 [00:00<00:00, 1158.95it/s][A


In [354]:
def remove_none(val):
    """Strip ``None`` entries from a list; return any non-list value unchanged."""
    if not isinstance(val, list):
        return val
    return list(filter(lambda item: item is not None, val))

In [355]:
df_merged = df_merged.map(remove_none).copy()

In [356]:
df_merged.sample(20)

Unnamed: 0,overture_maps_id,multisport_urls,names,websites,facebooks,phones,description,logo,pictures,address_freeform,address_locality,address_postcode,address_region,coordinates,main_category,tags
3512,08f1e0c10016eb2e0306c0a65cd60406,[],[Ski Zamutov],[],[https://facebook.com/people/Ski-Zamutov/10005...,[+421915927497],,,,[Zamutov 696],[Zámutov],[09415],,"[48.90267, 21.54092]",sports_and_recreation_venue,"[active_life, bar]"
2813,08f1e0163379a40103fc4e1e380de686,[],[Wing Tsun Kids RS],[https://emaa.sk?index.html],[https://facebook.com/wtkidsrs],[+421907344996],,,,[Malohontská 1931],[Rimavská Sobota],[979 01],,"[48.381905, 20.0165989]",martial_arts_club,"[school, education]"
2907,08f1e01ba09493a80311e6bd8ab36ef8,[],[Skatešopa Gerlachov],[https://studioblp.sk],[https://facebook.com/skatesopa],[+421948891145],,,,[Hlavná 121/36],[Gerlachov],[059 42],,"[49.09715, 20.20789]",gym,"[stadium_arena, arts_and_entertainment]"
1659,08f1e00c6b41108a0335365e78a18619,[],[VOLT racing center],[https://elektrickemotokary.sk],[https://facebook.com/VOLTracingcenter],[+421326401589],,,,[Zlatovská 2763],[Trenčín],[911 05],,"[48.8929146, 18.0190668]",go_kart_club,"[race_track, sports_and_recreation_venue]"
283,08f1e02d6da6ec36031f2113c146e775,[https://multi-sport.sk/aktivity/partner/lotus...,"[Lotus joga, Lotus Joga]",[https://lotusjoga.sk],[https://facebook.com/lotusjogabratislava],"[0948029480, +421948029480]",,https://prod0mobileapp.blob.core.windows.net/f...,,"[Kuzmányho 4, Kuzmányho 4]","[Bratislava I, Bratislava]",[81106],BL,"[48.1498704, 17.1037692]",gym,"[yoga_studio, public_health_clinic, Joga]"
1787,08f1e006ce6e8c58030a1a229f480b56,[],[AQUASPORT Levice],[https://aquasport-levice.sk],[https://facebook.com/aquasportlevice],[+421905351668],,,,[Turecký rad 5],[Levice],[934 01],,"[48.2216956, 18.5984266]",swimming_instructor,"[sports_club_and_league, amateur_sports_team]"
551,08f1e0202992016903add2cc7554283d,[],[Bősi Kempo MMA & BJJ Klub],[],[https://facebook.com/bosi.kempo.mma.bjj],[+421905221887],,,,[Námestie Svätej Trojice 1044/2],[Gabčíkovo],[930 05],,"[47.8913703, 17.5765237]",martial_arts_club,"[sports_club_and_league, active_life]"
1472,08f1e06aca0c396403937c30552ca51a,[],[Kúpalisko Zlatnícka dolina],[https://zlatnickadolina.sk],[https://facebook.com/pages/K%C3%BApalisko%20Z...,[+421346646833],,,,[Zlatnícka Dolina],[],[909 01],,"[48.8240558, 17.3002316]",swimming_pool,[]
879,08f1e02803d9e08403e3a3855d63713a,[],[Veľkobielske Jazero],[],[https://facebook.com/pages/Ve%C4%BEkobielske%...,[],,,,[],[Velky Biel],[],,"[48.2073867, 17.3576683]",diving_center,[]
993,08f1e0296b29a160035cf9e7a93ca322,[],[The Feel],[https://thefeel.sk],[https://facebook.com/TheFeel.sk],[],,,,[Ludvika Van Beethovena 5985/29],[Trnava],[917 08],,"[48.3699071, 17.5684074]",sports_and_fitness_instruction,[]


In [357]:
df_merged.main_category.value_counts()

main_category
gym                            1159
dance_club                      343
sports_and_recreation_venue     320
ski_resort                      229
dance_school                    218
                               ... 
kiteboarding                      1
cricket_ground                    1
golf_instructor                   1
rafting_kayaking_area             1
adventure_sports_center           1
Name: count, Length: 64, dtype: int64

# Tags

In [358]:
# Distinct tags that occur anywhere in the merged table.
all_tags = list({tag for venue_tags in df_merged.tags for tag in venue_tags})

In [359]:
# Load the category grouping used below to pick each venue's main category.
# NOTE(review): presumably a dict of target category -> list of tags that
# belong to it (later cells test `tag in val`); confirm against the JSON file.
with open('data/categories_map.json', 'r', encoding='utf-8') as f:
    CATEGORIES_MAP = json.load(f)

In [360]:
def is_in_obj(tag):
    """Return True when ``tag`` is a key of CATEGORIES_MAP or is contained in one of its values."""
    return any(
        tag == category or tag in members
        for category, members in CATEGORIES_MAP.items()
    )

# Print every tag that CATEGORIES_MAP does not cover; no output means full coverage.
for tag in (candidate for candidate in all_tags if not is_in_obj(candidate)):
    print(tag)

In [361]:
translations = pd.read_excel('data/OM_tags_translated.xlsx')

In [362]:
# Two-column sheet -> lookup of original tag to its translation.
translations_dict = dict(translations.itertuples(index=False))

In [363]:
sum(df_merged.tags.isnull())

0

In [364]:
from collections import Counter

# For every merged venue: translate its tags and derive the main category.
# NOTE: this rewrites df_merged in place, so re-running the cell operates on
# already-translated tags and already-mapped categories.
for i, row in df_merged.iterrows():
    # pd.isna covers both None and NaN for a missing source category.
    has_source_category = not pd.isna(row['main_category'])
    tags = ([row['main_category']] if has_source_category else []) + list(row['tags'])

    # Tags without a dictionary entry are kept as they are; tags whose
    # translation cell is empty are dropped; the rest are translated.
    tags_clean = []
    for tag in tags:
        if tag not in translations_dict:
            tags_clean.append(tag)
        elif not pd.isna(translations_dict[tag]):
            tags_clean.append(translations_dict[tag])

    main_category = None
    if has_source_category:
        # First CATEGORIES_MAP entry whose value contains the source category.
        for key, val in CATEGORIES_MAP.items():
            if row['main_category'] in val:
                main_category = key
                break
    else:
        # No source category: vote with the cleaned tags and take the
        # category that most of them fall under.
        c = Counter()
        for tag in tags_clean:
            for key, val in CATEGORIES_MAP.items():
                if tag in val:
                    c.update([key])
        # An empty counter means no tag matched any category; leave the
        # category unset so the diagnostic below reports it instead of
        # failing with an IndexError.
        if c:
            main_category = c.most_common(1)[0][0]

    if pd.isna(main_category):
        print('row main', row['main_category'])
        print('tags', tags_clean)
        print('main', main_category)
    df_merged.at[i, 'main_category'] = main_category
    # De-duplicate while keeping first-seen order so exports are stable across runs.
    df_merged.at[i, 'tags'] = list(dict.fromkeys(tag for tag in tags_clean if tag is not None))
        

In [365]:
df_merged.to_csv('data/venues_OM_MS.csv', index=False)

In [366]:
sum(df_merged.main_category.isnull())

0

In [344]:
venues = []
# Eyeball the first few merged rows as plain dicts before exporting them.
for record in df_merged.head().to_dict(orient="records"):
    print(record)

{'overture_maps_id': '08f030d6ae8b06d103553c1f08349087', 'multisport_urls': ['https://multi-sport.sk/aktivity/partner/kryta-plavaren-komarno/'], 'names': ['Krytá plaváreň Komárno'], 'websites': ['https://comorraservis.sk', 'https://comorraservis.sk/strediska/kryta-plavaren'], 'facebooks': ['https://facebook.com/people/Kryt%C3%A1-plav%C3%A1re%C5%88-Kom%C3%A1rno/100086381752580', 'https://facebook.com/profile.php?id=100086381752580'], 'phones': ['0911384485', '+421911384485'], 'description': 'Krytá plaváreň\n    v\xa0Komárne je prevádzkovaná celoročne na\xa0základe uvedených\n\n        otváracích hodín\n\n    . Plaváreň je dostupná pre účely plaveckých cvikov, tréningy športových\n    klubov ako aj pre tréningy jednotlivcov alebo pohybuchtivých návštevníkov.\nAtraktívne sú večerné otváracie hodiny a\xa0vnútorné priestory, ktoré    prešli v\xa0roku 2015 rekonštrukciou.', 'logo': 'https://prod0mobileapp.blob.core.windows.net/facilities/facilities/sk/5057/images/2_5057_1_FacilityPhoto.jpg',

In [369]:
# Export the merged venues as JSON, one object per row under a top-level
# "venues" key.
# NOTE(review): json.dump keeps its default ensure_ascii=True here, so
# non-ASCII text is written as \uXXXX escapes, and any float NaN left in the
# frame would be emitted as the non-standard token NaN - confirm that the
# consumer of this file accepts both.
with open('data/venues_OM_MS_clean.json', 'w', encoding='utf-8') as f:
    json.dump({'venues': df_merged.to_dict(orient="records")}, f, indent=4)

# OM tags translations

In [209]:
df_places = pd.read_parquet('data/places_SK.parquet')

In [214]:
# Distinct category names used by the places: every main category plus all
# alternate categories where present.
all_categories = list({
    category
    for cats in df_places.categories
    for category in [cats['main'], *(cats['alternate'] if cats['alternate'] is not None else [])]
})

In [216]:
len(all_categories)

300

In [217]:
df = pd.DataFrame({'category': all_categories})

In [219]:
df['translation'] = None

In [220]:
df.to_excel('data/OM_tags.xlsx', index=False)

# Manuálne napárovanie Multisport <--> Overture Maps

In [55]:
multisport_to_places_map = {
    3: 3573,
    4: 265,
    8: 1334,
    9: 1505,
    10: 1183,
    11: 1600,
    12: 1191,
    13: 1953,
    14: 529,
    18: 2592,
    22: 2776,
    28: 3252,
    33: 1144,
    35: 1661,
    37: 1119,
    40: 1986,
    44: 2291,
    46: 960,
    67: 2071,
    69: 2191,
    72: 202,
    74: 2580,
    75: 2234,
    76: 2506,
    77: 2227,
    80: 3236,
    81: 1487,
    85: 3347,
    86: 2353,
    91: 404,
    96: 1443,
    97: 189,
    99: 3297,
    100: 2229,
    102: 1856,
    106: 833,
    107: 3375,
    108: 3612,
    109: 2273,
    110: 431,
    112: 2659,
    113: 1240,
    114: 2814,
    115: 3348,
    116: 1590,
    118: 1339,
    119: 2896,
    120: 1004,
    121: 2131,
    123: 3202,
    125: 3465,
    126: 2243,
    127: 892,
    129: 2283,
    131: 428,
    134: 1591,
    137: 62,
    138: 3485,
    139: 3363,
    140: 423,
    141: 884,
    143: 2717,
    146: 994,
    148: 1105,
    149: 2954,
    150: 199,
    153: 1692,
    156: 74,
    157: 2164,
    158: 2705,
    161: 1333,
    162: 3388,
    164: 2933,
    165: 2386,
    168: 3089,
    169: 1730,
    172: 2784,
    175: 1729,
    178: 2716,
    179: 3614,
    181: 1663,
    182: 1270,
    183: 163,
    184: 517,
    185: 1761,
    186: 2943,
    188: 2772,
    190: 1871,
    191: 2210,
    192: 2178,
    193: 1905,
    195: 3340,
    196: 336,
    197: 3281,
    201: 1409,
    202: 870,
    203: 1611,
    208: 3460,
    209: 2310,
    210: 3179,
    211: 246,
    216: 102,
    218: 377,
    219: 2219,
    220: 101,
    225: 1437,
    226: 477,
    227: 3298,
    230: 3263,
    231: 655,
    232: 120,
    233: 1731,
    234: 3056,
    235: 3445,
    237: 3561,
    238: 859,
    240: 2200,
    242: 1939,
    244: 499,
    245: 3129,
    246: 2809,
    247: 907,
    248: 1033,
    249: 895,
    251: 1173,
    252: 2960,
    254: 68,
    255: 1490,
    257: 3700,
    258: 3464,
    259: 3439,
    261: 1632,
    263: 3156,
    264: 1655,
    265: 678,
    266: 3237,
    267: 1482,
    268: 894,
    269: 1956,
    270: 122,
    271: 433,
    272: 117,
    275: 2393,
    277: 1412,
    278: 2962,
    281: 695,
    282: 494,
    284: 3715,
    287: 2218,
    288: 987,
    289: 1311,
    291: 342,
    292: 3683,
    294: 3127,
    298: 1601,
    300: 3247,
    304: 442,
    306: 1109,
    310: 2773,
    312: 953,
    313: 2362,
    314: 2621,
    319: 1277,
    322: 168,
    324: 1628,
    325: 1283,
    326: 3012,
    327: 689,
    328: 621,
    332: 3158,
    333: 1646,
    352: 194,
    353: 446,
    356: 3066,
    358: 1685,
    359: 3580,
    360: 1580,
    361: 2688,
    363: 3260,
    364: 390,
    365: 3210,
    366: 154,
    367: 3412,
    368: 1386,
    369: 699,
    370: 1306,
    373: 622,
    374: 2016,
    375: 3566,
    376: 2638,
    377: 2743,
    380: 914,
    382: 1552,
    383: 814,
    387: 2392,
    388: 3453,
    389: 921,
    392: 2635,
    393: 1068,
    394: 82,
    395: 1006,
    397: 2340,
    403: 3265,
    404: 1309,
    405: 1263,
    406: 3455,
    409: 2278,
    415: 1796,
    416: 1881,
    417: 2810,
    418: 1622,
    419: 0,
    420: 2581,
    421: 904,
    423: 1938,
    424: 3050,
    426: 2215,
    431: 2719,
    432: 3313,
    435: 143,
    436: 1715,
    437: 2076,
    439: 2241,
    445: 3704,
    447: 874,
    449: 2153,
    450: 2152,
    451: 1971,
    452: 3617,
    453: 1442,
    455: 711,
    458: 2428,
    459: 85,
    461: 1247,
    463: 1531,
    464: 983,
    465: 803,
    466: 660,
    467: 858,
    468: 3308,
    469: 182,
    470: 936,
    474: 637,
    475: 389,
    476: 1604,
    479: 3341,
    481: 1662,
    484: 2151,
    485: 651,
    486: 536,
    490: 2702,
    492: 1120,
    494: 2605,
    496: 283,
    497: 3067,
    498: 2858,
    499: 2829,
    500: 2779,
    503: 87,
    504: 1455,
    505: 1821,
    507: 1562,
    508: 1872,
    509: 2499,
    510: 367,
    511: 2505,
    513: 3642,
    515: 2374,
    516: 2324,
    517: 2168,
    518: 3691,
    519: 3310,
    521: 2774,
    522: 1252,
    523: 2425,
    526: 3531,
    527: 2898,
    528: 2818,
    531: 923,
    535: 2368,
    537: 2272,
    538: 1915,
    539: 197,
    543: 639,
    544: 2117,
    545: 2128,
    547: 3632,
    554: 1577,
    555: 2708,
    559: 1923,
    560: 3152,
    561: 3214,
    562: 3188,
    563: 1558,
    565: 2002,
    566: 1056,
    571: 1927,
    572: 1005,
    573: 1869,
    574: 1733,
    575: 2107,
    581: 121,
    583: 816,
    585: 2453,
    589: 1508,
    591: 2053,
    592: 2061,
    593: 1718,
    595: 242,
    598: 2719,
    599: 1788,
    600: 66,
    603: 399,
    605: 2328,
    607: 2268,
    608: 1483,
    609: 1516,
    612: 256,
    614: 1319,
    616: 1509,
    617: 1833,
    618: 1076,
    622: 3403,
    623: 3451,
    624: 2720,
    625: 2068,
    627: 2315,
    629: 2660,
    630: 2751,
    631: 257,
    633: 2106,
    634: 707,
    635: 2255,
    636: 1365,
    637: 2266,
    641: 1021,
    642: 2322,
    645: 574,
    651: 439,
    655: 2690,
    656: 146,
    657: 1937,
    659: 1863,
    661: 2819,
    662: 1242,
    667: 1446,
    669: 564,
    673: 3204,
    676: 409,
    678: 522,
    680: 1160,
    682: 2936,
    683: 1954,
    684: 1123,
    687: 400,
    689: 3482,
    690: 3408,
    691: 688,
    692: 440,
    695: 1296,
    701: 1229,
    702: 2781,
    703: 845,
    704: 3087,
    706: 2141,
    707: 903,
    708: 1202,
    711: 1770,
    715: 909,
    717: 3602,
    718: 1107,
    719: 729,
    723: 1634,
    724: 636,
    730: 3444,
    731: 1190,
    733: 2048,
    734: 640,
    735: 848,
    740: 2226,
    743: 2806,
    748: 1734,
    751: 248,
    752: 99,
    754: 725,
    757: 2912,
    759: 1230,
    761: 3180,
    764: 3418,
    766: 2399,
    774: 659,
    775: 2689,
    777: 1374,
    779: 2185,
    780: 1920,
    781: 25,
    782: 45,
    783: 1142,
    784: 1028,
    785: 1150,
    786: 537,
    787: 1153,
    789: 628,
    790: 1094,
    792: 586,
    795: 617,
    796: 1332,
    798: 381,
    799: 3193,
    800: 2254,
    802: 3186,
    804: 2370,
    805: 3273,
    807: 488,
    808: 582,
    812: 1110,
    815: 3074,
    816: 3484,
    818: 1155,
    819: 1070,
    825: 2212,
    827: 3367,
    829: 3357,
    830: 819,
    831: 258,
    834: 1132,
    841: 2489,
    842: 2600,
    843: 1652,
    844: 1807,
    845: 2644,
    846: 1308,
    848: 1347,
    849: 3038,
    850: 3072,
    851: 314,
    852: 935,
    854: 334,
    856: 253,
    857: 2518,
    861: 3426,
    862: 2039,
    863: 2343,
    867: 1782,
    871: 1855,
}

In [8]:
columns = ['name', 'address', 'city', 'websites', 'categories', 'socials']

In [13]:
# Row position in df_multisport to review next in the cell below.
# Only the last assignment takes effect; the earlier values are left-over
# starting points from previous review passes.
# ix = 0
ix = 431
ix = 598

In [14]:
# Manual review step: show one Multisport venue and the places whose names
# are most similar to it.
row = df_multisport.iloc[ix]

venue_name = row['name']
print('INDEX', ix)
print(venue_name)
print(row['address'])
print(row['web'])

# Rank all places by case-insensitive Jaro-Winkler similarity of their name
# to the venue name, best candidates first.
df_sim = df_places.sort_values(
    by='name',
    key=lambda names: names.apply(
        lambda candidate: jaro_winkler_similarity(candidate.lower(), venue_name.lower())
    ),
    ascending=False,
)
# Put a ready-to-paste "<multisport position>: <best place index>," entry on
# the clipboard, then advance so that re-running the cell shows the next venue.
pyperclip.copy(f'{ix}: {int(df_sim.index[0])},')
ix += 1
df_sim.head(20)[columns]

INDEX 598
Plaváreň Čadca
Športovcov 2
022 01 Čadca
http://www.plavarencadca.sk/


Unnamed: 0,name,address,city,websites,categories,socials
12,Plaváreň,41 Eötvösa,Komárno,,"{'alternate': ['bar', 'lounge'], 'main': 'swim...",[https://www.facebook.com/2473125816065429]
904,Plaváreň,Komenského 2874/43,Pezinok,[http://www.plavaren-pk.sk],"{'alternate': ['active_life', 'sports_and_recr...",[https://www.facebook.com/200871417143690]
704,Plaváreň Gaudeamus,Mokrohájska cesta 3392/3,Bratislava,,"{'alternate': ['gym', 'pub'], 'main': 'swimmin...",[https://www.facebook.com/683511038698281]
848,Plaváreň Malina,Sasinkova 901/2,Malacky,[http://www.adhocmalacky.sk/sportova-hala-mali...,"{'alternate': ['active_life', 'swimming_pool']...",[https://www.facebook.com/105481216236268]
1290,Plaváreň Duslo,Nám. Sv.Trojice 7,,[http://www.aquasport.sk],"{'alternate': ['active_life', 'sports_club_and...",[https://www.facebook.com/237575873013394]
699,Plaváreň Iuventa,Karloveská 2951/64,Bratislava,[http://www.iuventa.sk],"{'alternate': ['active_life', 'event_planning'...",[https://www.facebook.com/622461251112989]
398,Plaváreň Vlčie Hrdlo,,,,"{'alternate': None, 'main': 'swimming_pool'}",[https://www.facebook.com/736946026652400]
2904,Plaváreň Svit,Jilemnického 307,Svit,[http://www.bpsvit.sk/plavaren/plavaren],"{'alternate': ['topic_concert_venue'], 'main':...",[https://www.facebook.com/112136917048214]
1039,Plaváreň SPTS,,,,"{'alternate': None, 'main': 'swimming_pool'}",[https://www.facebook.com/593605191071627]
1938,Plaváreň Prievidza,,,,"{'alternate': None, 'main': 'swimming_pool'}",[https://www.facebook.com/2050775215015798]


In [15]:
df_places[df_places['name'].str.lower().str.contains('yogo city')][columns]

Unnamed: 0,name,address,city,websites,categories,socials


In [16]:
df_multisport[df_multisport['name'].str.lower().str.contains('yogo city')]

Unnamed: 0,name,url,content,address,tags,phone,email,facebook,web,description,logo,pictures,coordinates
854,Yogo City studio,https://multi-sport.sk/aktivity/partner/yogo-c...,"<!DOCTYPE html>\n<html lang=""sk-SK"">\n\n<head>...",Gajova 4\n81109 Bratislava,[Joga],421902489812,,https://www.facebook.com/yogocitystudio,https://www.yogocity.sk/,,,[https://prod0mobileapp.blob.core.windows.net/...,"[48.1441910, 17.1184560]"


In [18]:
# df_places[(df_places['name'].str.lower().str.contains('gym')) & (df_places['city'].str.lower().str.contains('krupina'))][columns]

In [17]:
# df_places[df_places['address'].apply(lambda x: False if not isinstance(x, str) else 'Hlboká' in x)][columns]

## Dict s jednoznačnými identifikátormi

In [56]:
# Turn the hand-collected row positions into stable identifiers:
# Multisport URL -> place id.
# NOTE(review): both lookups are positional (.iloc), so they rely on the two
# frames having the same row order as when the positions were collected - confirm.
MULTISPORT_TO_PLACES_MAP = {
    df_multisport.iloc[multisport_pos]['url']: df_places.iloc[places_pos]['id']
    for multisport_pos, places_pos in multisport_to_places_map.items()
}

In [57]:
# Persist the URL -> place id pairing; the merging section earlier in this
# notebook reads it back from the same path.
with open('data/multisport_to_places_map.json', 'w', encoding='utf-8') as f:
    json.dump(MULTISPORT_TO_PLACES_MAP, f)