In [87]:
import os
import requests
from bs4 import BeautifulSoup, Tag
import time
import shutil
from urllib.parse import urlparse, parse_qs, unquote

import pandas as pd
import json

In [2]:
from tqdm import tqdm
# Register tqdm's pandas integration so the `progress_apply` calls used in the
# cells below are available on Series/DataFrame objects.
tqdm.pandas()

In [3]:
# Root of the paginated venue listing crawled by `get_venue_urls`;
# follow-up pages are requested as `<BASE_URL>/page/<n>`.
BASE_URL = 'https://multi-sport.sk/aktivity'

In [4]:
def _parse_venue_cards(soup):
    """Return one ``{'name', 'url'}`` dict per facility card found in ``soup``."""
    venues = []
    for card in soup.find_all('div', {'class': 'facilityCard'}):
        link = card.find('a')
        # A card without a usable link cannot be followed later; skip it
        # instead of aborting the whole crawl.
        if link is None or not link.get('href'):
            continue
        venues.append({
            'name': link.text,
            'url': link['href']
        })
    return venues


def get_venue_urls(timeout=30):
    """Crawl every page of the listing at ``BASE_URL`` and collect venue links.

    The first page is fetched to read the page count from the
    ``<nav class="pagination">`` element; pages ``2..n`` are then requested as
    ``{BASE_URL}/page/{i}``.

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        list[dict]: one ``{'name': <link text>, 'url': <href>}`` entry per
        facility card, in listing order.

    Raises:
        requests.HTTPError: if a listing page answers with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    # One session for the whole crawl so the connection is reused across pages.
    with requests.Session() as session:

        def fetch_soup(url):
            print(f'Processing url: {url}')
            response = session.get(url, timeout=timeout)
            # Fail loudly rather than silently parsing an error page into
            # "zero venues".
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')

        soup = fetch_soup(BASE_URL)

        # The page count is the largest number shown in the pagination nav.
        # A listing without pagination is treated as a single page.
        pagination = soup.find('nav', {'class': 'pagination'})
        page_numbers = []
        if pagination is not None:
            page_numbers = [
                int(token) for token in pagination.text.split() if token.isdecimal()
            ]
        n_pages = max(page_numbers, default=1)

        venues = _parse_venue_cards(soup)
        for i in range(2, n_pages + 1):
            venues.extend(_parse_venue_cards(fetch_soup(f'{BASE_URL}/page/{i}')))

    return venues

In [5]:
# Crawl the listing (one HTTP request per listing page) and keep the result
# for the cells below.
venue_urls = get_venue_urls()

Processing url: https://multi-sport.sk/aktivity
Processing url: https://multi-sport.sk/aktivity/page/2
Processing url: https://multi-sport.sk/aktivity/page/3
Processing url: https://multi-sport.sk/aktivity/page/4
Processing url: https://multi-sport.sk/aktivity/page/5
Processing url: https://multi-sport.sk/aktivity/page/6
Processing url: https://multi-sport.sk/aktivity/page/7
Processing url: https://multi-sport.sk/aktivity/page/8
Processing url: https://multi-sport.sk/aktivity/page/9
Processing url: https://multi-sport.sk/aktivity/page/10
Processing url: https://multi-sport.sk/aktivity/page/11
Processing url: https://multi-sport.sk/aktivity/page/12
Processing url: https://multi-sport.sk/aktivity/page/13
Processing url: https://multi-sport.sk/aktivity/page/14
Processing url: https://multi-sport.sk/aktivity/page/15
Processing url: https://multi-sport.sk/aktivity/page/16
Processing url: https://multi-sport.sk/aktivity/page/17
Processing url: https://multi-sport.sk/aktivity/page/18
Processi

In [6]:
# Number of venue links collected from the listing.
# NOTE(review): the recorded output here is 891, while the progress bars of the
# later cells show 881 rows — presumably those cells ran against a frame from a
# different crawl / the cached parquet file; confirm before relying on counts.
len(venue_urls)

891

In [29]:
# One row per crawled venue: link text and detail-page URL.
df = pd.DataFrame({
    column: [venue[column] for venue in venue_urls]
    for column in ('name', 'url')
})

In [88]:
# Download the raw HTML of every venue page (one request per row).
# A shared session reuses the connection across requests, and the timeout keeps
# a single hung request from stalling the whole run.
with requests.Session() as session:
    df['content'] = df['url'].progress_apply(
        lambda url: session.get(url, timeout=30).text
    )

100%|██████████████████████████████████████████████████████████████████████████████| 881/881 [04:06<00:00,  3.57it/s]


In [88]:
def extract_name(content):
    """Return the text of the first ``<h1>`` of a venue page.

    Args:
        content: HTML source of the venue detail page.

    Returns:
        The heading text, or ``None`` when the page has no ``<h1>``.
    """
    # Name the parser explicitly (same one `get_venue_urls` uses); otherwise
    # bs4 picks whichever parser happens to be installed and warns about it.
    soup = BeautifulSoup(content, 'html.parser')
    heading = soup.find('h1')
    if heading is None:
        return None
    return heading.text

def extract_address(content):
    """Return the text of the first ``<address>`` element of a venue page.

    Args:
        content: HTML source of the venue detail page.

    Returns:
        The address with its text fragments stripped and joined by newlines,
        or ``None`` when the page has no ``<address>`` element.
    """
    # Explicit parser keeps results independent of which parsers are installed.
    soup = BeautifulSoup(content, 'html.parser')
    address_tag = soup.find('address')
    if address_tag is None:
        return None
    return address_tag.get_text(strip=True, separator='\n')

def extract_tags(content):
    """Return the text of every ``<span class="badge">`` on a venue page.

    Args:
        content: HTML source of the venue detail page.

    Returns:
        list[str]: badge texts in document order (empty if there are none).
    """
    # Explicit parser keeps results independent of which parsers are installed.
    soup = BeautifulSoup(content, 'html.parser')
    return [badge.text for badge in soup.find_all('span', {'class': 'badge'})]

def extract_contact(content):
    """Extract contact links from the "Kontakt" section of a venue page.

    Every ``<h2>`` whose text is ``Kontakt`` is located and the links inside
    its next sibling ``<div>`` are classified by their ``href``:

    * ``tel:...``     -> phone (prefix removed)
    * ``mailto:...``  -> email (URL-decoded, prefix removed)
    * contains ``facebook`` -> facebook URL
    * anything else   -> web URL

    When several links fall into the same category the last one wins.

    Args:
        content: HTML source of the venue detail page.

    Returns:
        tuple: ``(phone, email, fb, web)``; each item is ``None`` when no
        matching link was found.
    """
    # Explicit parser keeps results independent of which parsers are installed.
    soup = BeautifulSoup(content, 'html.parser')

    phone = None
    email = None
    fb = None
    web = None

    for h2 in soup.find_all('h2'):
        # Tolerate surrounding whitespace/newlines in the heading markup.
        if h2.text.strip() != 'Kontakt':
            continue
        div = h2.find_next_sibling('div')
        if div is None:
            # Heading without a contact container: nothing to read.
            continue
        # `href=True` skips anchors that carry no href attribute at all.
        for link in div.find_all('a', href=True):
            href = link['href']
            if href.startswith('tel:'):
                phone = href[4:]
            elif href.startswith('mailto:'):
                email = unquote(href)[7:]
            elif 'facebook' in href:
                fb = href
            else:
                web = href
    return phone, email, fb, web

def extract_description(content):
    """Return the description text found in the first ``<div class="my-4">``.

    The text of the div's direct child tags is collected in order until a
    child carrying the ``mt-5`` class is reached; that child and everything
    after it are ignored. Bare text nodes between the tags are skipped.

    Args:
        content: HTML source of the venue detail page.

    Returns:
        The collected texts joined by newlines and stripped, or ``None`` when
        the div is missing or no child tag precedes the ``mt-5`` element.
    """
    # Explicit parser keeps results independent of which parsers are installed.
    soup = BeautifulSoup(content, 'html.parser')
    container = soup.find('div', {'class': 'my-4'})
    if container is None:
        return None

    text_parts = []
    for child in container.children:
        if not isinstance(child, Tag):
            continue
        if 'mt-5' in child.attrs.get('class', []):
            break
        text_parts.append(child.text)

    if not text_parts:
        return None

    return '\n'.join(text_parts).strip()

def extract_pictures(content):
    """Extract the logo URL and the slider picture URLs of a venue page.

    The logo is the ``data-src`` of the first ``<img>`` inside
    ``<figure class="text-center">``; the pictures are the ``data-src`` values
    of the images inside ``<div class="splide__list">``.

    Args:
        content: HTML source of the venue detail page.

    Returns:
        tuple: ``(logo_url, pictures)``. ``logo_url`` is ``None`` when the
        figure, its image or the ``data-src`` attribute is missing;
        ``pictures`` is ``None`` when the slider container is missing,
        otherwise a list of URLs.
    """
    logo_url = None
    pictures = None

    # Explicit parser keeps results independent of which parsers are installed.
    soup = BeautifulSoup(content, 'html.parser')

    figure = soup.find('figure', {'class': 'text-center'})
    logo_img = figure.find('img') if figure is not None else None
    if logo_img is not None:
        # `.get` avoids a KeyError when the image has no `data-src`.
        logo_url = logo_img.get('data-src')

    splide = soup.find('div', {'class': 'splide__list'})
    if splide is not None:
        # Only images that actually carry a `data-src` attribute are kept.
        pictures = [
            img['data-src']
            for img in splide.find_all('img', attrs={'data-src': True})
        ]

    return logo_url, pictures

def extract_coords(content):
    """Read the map coordinates embedded in a venue page.

    The coordinates are taken from the ``q`` query parameter of the ``src``
    URL of the first ``<iframe class="d-block">`` and split on commas.

    Args:
        content: HTML source of the venue detail page.

    Returns:
        list[str]: the comma-separated parts of ``q`` (still strings), or
        ``None`` when the iframe, its ``src`` or the ``q`` parameter is missing.
    """
    # Explicit parser keeps results independent of which parsers are installed.
    soup = BeautifulSoup(content, 'html.parser')
    iframe = soup.find('iframe', {'class': 'd-block'})
    if iframe is None:
        return None

    query = parse_qs(urlparse(iframe.get('src', '')).query)
    q_values = query.get('q')
    if not q_values:
        return None
    return q_values[0].split(',')
    

In [49]:
# Reload the previously saved scrape instead of re-downloading every page.
# NOTE(review): this file is written by the `df.to_parquet(...)` cell further
# down, so on a fresh checkout the scraping cells and that cell must run first.
df = pd.read_parquet('data/multisport.parquet')

In [50]:
# Columns present in the frame loaded from the cache.
df.columns

Index(['name', 'url', 'content', 'address', 'tags', 'phone', 'email',
       'facebook', 'web', 'description', 'logo', 'pictures'],
      dtype='object')

In [51]:
# Overwrite the listing-page name with the heading parsed from each venue page.
df['name'] = df['content'].progress_apply(extract_name)

100%|██████████████████████████████████████████████████████████████████████████████| 881/881 [00:10<00:00, 81.51it/s]


In [34]:
# Postal address parsed from each venue page.
df['address'] = df['content'].progress_apply(extract_address)

100%|██████████████████████████████████████████████████████████████████████████████| 881/881 [00:10<00:00, 83.54it/s]


In [35]:
# Badge labels parsed from each venue page (one list per row).
df['tags'] = df['content'].progress_apply(extract_tags)

100%|██████████████████████████████████████████████████████████████████████████████| 881/881 [00:10<00:00, 82.48it/s]


In [89]:
# One (phone, email, facebook, web) tuple per venue page.
contacts = df['content'].progress_apply(extract_contact).tolist()
# Assign column by column instead of unpacking `zip(*contacts)`, which raises
# ValueError when the frame has no rows.
for position, column in enumerate(('phone', 'email', 'facebook', 'web')):
    df[column] = [contact[position] for contact in contacts]

100%|██████████████████████████████████████████████████████████████████████████████| 881/881 [00:10<00:00, 81.80it/s]


In [37]:
# Free-text description parsed from each venue page (None when absent).
df['description'] = df['content'].progress_apply(extract_description)

100%|██████████████████████████████████████████████████████████████████████████████| 881/881 [00:10<00:00, 82.26it/s]


In [38]:
# One (logo_url, pictures) tuple per venue page.
picture_info = df['content'].progress_apply(extract_pictures).tolist()
# Assign column by column instead of unpacking `zip(*picture_info)`, which
# raises ValueError when the frame has no rows.
for position, column in enumerate(('logo', 'pictures')):
    df[column] = [info[position] for info in picture_info]

100%|██████████████████████████████████████████████████████████████████████████████| 881/881 [00:11<00:00, 79.71it/s]


In [53]:
# Map coordinates (list of strings) parsed from each venue page.
df['coordinates'] = df['content'].progress_apply(extract_coords)

100%|██████████████████████████████████████████████████████████████████████████████| 881/881 [00:10<00:00, 81.92it/s]


In [93]:
# Persist the enriched frame so later sessions can reload it without scraping.
# Create the target folder first: `to_parquet` fails if `data/` is missing.
os.makedirs('data', exist_ok=True)
df.to_parquet('data/multisport.parquet', index=False)

In [92]:
# Number of venues for which a picture list was extracted.
int(df['pictures'].notnull().sum())

651

In [97]:
# Pictures per venue; anything that is not a list (e.g. None) counts as zero.
lens = df['pictures'].apply(
    lambda pics: len(pics) if isinstance(pics, list) else 0
)

In [98]:
# Distribution of the number of pictures per venue.
lens.value_counts()

pictures
3    481
0    230
2    103
1     67
Name: count, dtype: int64

In [99]:
# Number of venues without a description.
int(df['description'].isnull().sum())

368

In [100]:
# Missing-value count for each contact channel, printed in a fixed order.
for contact_column in ('phone', 'email', 'facebook', 'web'):
    print(sum(df[contact_column].isnull()))

71
816
130
176


In [101]:
# Number of venues with no contact channel at all.
int(df[['phone', 'email', 'facebook', 'web']].isnull().all(axis=1).sum())

10

In [102]:
# Eyeball every extracted description next to the page it came from.
described = df.loc[df['description'].notnull(), ['url', 'description']]
for venue_url, description in described.itertuples(index=False, name=None):
    print(venue_url)
    print(description)
    print('----------------')

https://multi-sport.sk/aktivity/partner/do-tela-povazska-bystrica/
Ľudský organizmus je prepracovaný systém a najlepšie funguje, keď je v harmónii, „fyzično a psychično“. Dosiahnúť maximálny potenciál v živote je snom každého človeka, avšak ako na to? BeYourBest vám odpovie.
----------------
https://multi-sport.sk/aktivity/partner/1-tenisovy-klub/
Využite svoju kartu MultiSport a prídťe si k nám zahrať  tenis.


    Informácie o voľných termínoch vám poskytne správca areálu  na telefónnom
    čísle 0948 – 022 273.


    Prajeme skvelý športový zážitok a tešíme sa na vás!
----------------
https://multi-sport.sk/aktivity/partner/24hr-fitness/
Novootvorené fitness centrum v Bidovciach.




    NO PAIN NO GAIN


    Bolesť, ktorú cítiš dnes bude sila, ktorú pocítiš zajtra.
----------------
https://multi-sport.sk/aktivity/partner/365-fit-co-eurovea/
Na začiatku bola vášeň a obdiv pre jednu z najkrajších a najrýchlejšie rastúcich lokalít Bratislavy a práve preto sme sa rozhodli tu vybudovať 