In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import duckdb
import requests
import json
import os
from dotenv import load_dotenv
import logging
from src.imports import get_stops
from src.data_processing import extract_stop_info, normalize_stops

load_dotenv()

True

In [3]:
stop_names = [
    "Hradčanská", "Sparta", "Korunovační", "Letenské náměstí",
    "Kamenická", "Strossmayerovo náměstí", "Nábřeží Kapitána Jaroše",
    "Vltavská", "Výstaviště", "Veletržní palác", "Chotkovy sady",
]
base_url = 'https://api.golemio.cz/v2/gtfs/stops'

In [4]:
letna_stops = get_stops(base_url, os.getenv('GOLEMIO_TOKEN'), stop_names)

In [5]:
stops_data_df = pd.DataFrame(extract_stop_info(letna_stops))

In [40]:
df = normalize_stops(stops_data_df)
df.to_csv('letna_stops.csv')

In [7]:
import folium
from folium.plugins import MarkerCluster, HeatMap

In [8]:
m = folium.Map(location=[df['avg_latitude'].mean(), df['avg_longitude'].mean()], zoom_start=14, tiles="CartoDB Positron")

marker_cluster = MarkerCluster().add_to(m)

# Add each stop to the map with modern, tooltip-only markers
for _, stop in df.iterrows():
    folium.CircleMarker(
        location=[stop["avg_latitude"], stop["avg_longitude"]],
        radius=8,  # Size of the circle marker
        color="#FF5733",  
        fill=True,
        fill_color="#FF5733",  
        fill_opacity=0.7,  # Fill transparency
        tooltip=stop["stop_name"],  # Tooltip displayed on hover
        popup=(f"<b>Stop Name:</b> {stop['stop_name']}<br>"
               f"<b>Base Stop ID:</b> {stop['base_stop_id']}<br>"
               f"<b>All Stop IDs:</b> {', '.join(stop['all_stop_ids'])}")
    ).add_to(marker_cluster)

m

In [13]:
def setup_azure_duckdb(duckdb_conn: duckdb.DuckDBPyConnection) -> None:
    duckdb_conn.sql(f"ATTACH 'public-transport.db';")
    duckdb_conn.sql('INSTALL azure; LOAD azure;')
    duckdb_conn.sql(f'''
        CREATE SECRET azure_spn (
            TYPE AZURE,
            PROVIDER SERVICE_PRINCIPAL,
            TENANT_ID '{os.getenv("parquetAzureTenantID")}',
            CLIENT_ID '{os.getenv("parquetAzureAppID")}',
            CLIENT_SECRET '{os.getenv("parquetAzureClientSecret")}',
            ACCOUNT_NAME '{os.getenv("parquetStorageName")}');
            ''')
    logging.info("Azure and DuckDB setup complete")
    
def get_stop_times_from_azure(duckdb_conn, stop_ids) -> pd.DataFrame:
    stop_times_sql_string = f'''
    SELECT * 
        FROM 'azure://golem-data-lake-pid/vehiclepositions_stop_times_history/*/*/*/*.parquet'
        WHERE YEAR = 2024 AND gtfs_stop_id IN ({stop_ids})
    '''
    logging.info("Getting stop times from Azure")
    return duckdb_conn.sql(stop_times_sql_string).df()

In [16]:
with duckdb.connect('duck.db') as duckdb_conn:
    stop_ids_sql = ", ".join([f"'{stop_id}'" for stop_id in stops_data_df['stop_id'].to_list()])
    setup_azure_duckdb(duckdb_conn)
    stop_times_df = get_stop_times_from_azure(duckdb_conn, stop_ids_sql)
    logging.info('Stop times data loaded')

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [21]:
stop_times_df.to_csv('stop_times.csv', index=False)

In [50]:
stop_times_df = stop_times_df.assign(
    current_stop_departure = pd.to_datetime(stop_times_df['current_stop_departure']),  
    current_stop_arrival = pd.to_datetime(stop_times_df['current_stop_arrival']),
    created_at = pd.to_datetime(stop_times_df['created_at']),
    updated_at = pd.to_datetime(stop_times_df['updated_at']),
    base_stop_id = stop_times_df['gtfs_stop_id'].str.extract(r'^(.*?)(?=[SZ])')[0]
)

In [38]:
stop_times_df.columns

Index(['rt_trip_id', 'gtfs_date', 'gtfs_trip_id', 'gtfs_direction_id',
       'gtfs_route_short_name', 'gtfs_route_type', 'run_number',
       'vehicle_registration_number', 'gtfs_stop_sequence', 'gtfs_stop_id',
       'current_stop_arrival', 'current_stop_departure',
       'current_stop_arr_delay', 'current_stop_dep_delay', 'create_batch_id',
       'created_at', 'created_by', 'update_batch_id', 'updated_at',
       'updated_by', 'origin_route_name', 'day', 'month', 'year'],
      dtype='object')

In [22]:
stop_times_df

Unnamed: 0,rt_trip_id,gtfs_date,gtfs_trip_id,gtfs_direction_id,gtfs_route_short_name,gtfs_route_type,run_number,vehicle_registration_number,gtfs_stop_sequence,gtfs_stop_id,...,create_batch_id,created_at,created_by,update_batch_id,updated_at,updated_by,origin_route_name,day,month,year
449944,2024-05-31T23:30:00+02:00_94_1867_240428_8278,2024-05-31,94_1867_240428,0,94,0,74,8278.0,23,U100Z4P,...,,2024-06-01 02:18:00.083005+02:00,,,2024-06-01 02:18:00.083005+02:00,,94,1,6,2024
449945,2024-05-31T23:30:00+02:00_94_1867_240428_8278,2024-05-31,94_1867_240428,0,94,0,74,8278.0,24,U717Z2P,...,,2024-06-01 02:18:00.083005+02:00,,,2024-06-01 02:18:00.083005+02:00,,94,1,6,2024
449946,2024-05-31T23:30:00+02:00_94_1867_240428_8278,2024-05-31,94_1867_240428,0,94,0,74,8278.0,25,U439Z1P,...,,2024-06-01 02:18:00.083005+02:00,,,2024-06-01 02:18:00.083005+02:00,,94,1,6,2024
449955,2024-06-01T00:08:00+02:00_97_1401_231206_8363,2024-05-31,97_1401_231206,0,97,0,55,8363.0,14,U163Z1P,...,,2024-06-01 02:18:00.083005+02:00,,,2024-06-01 02:18:00.083005+02:00,,97,1,6,2024
449951,2024-06-01T00:00:00+02:00_94_1860_240527_8464,2024-05-31,94_1860_240527,0,94,0,61,8464.0,23,U100Z4P,...,,2024-06-01 02:18:00.083005+02:00,,,2024-06-01 02:18:00.083005+02:00,,94,1,6,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
747532,2024-12-31T23:34:05+01:00_993_3643_210715_26_26,2024-12-31,993_3643_210715,0,C,1,26,,7,U100Z102P,...,,2025-01-01 00:48:00.058105+01:00,,,2025-01-01 00:48:00.058105+01:00,,993,31,12,2024
747512,2024-12-31T23:02:00+01:00_97_1898_241231_9425,2024-12-31,97_1898_241231,1,97,0,61,9425.0,34,U163Z2P,...,,2025-01-01 00:48:00.058105+01:00,,,2025-01-01 00:48:00.058105+01:00,,22,31,12,2024
747534,2024-12-31T23:41:40+01:00_991_1338_241224_16_16,2024-12-31,991_1338_241224,1,A,1,16,,6,U163Z101P,...,,2025-01-01 00:48:00.058105+01:00,,,2025-01-01 00:48:00.058105+01:00,,991,31,12,2024
747531,2024-12-31T23:33:35+01:00_991_1339_241223_4_4,2024-12-31,991_1339_241223,0,A,1,4,,12,U163Z102P,...,,2025-01-01 00:48:00.058105+01:00,,,2025-01-01 00:48:00.058105+01:00,,991,31,12,2024


In [47]:
df

Unnamed: 0,stop_name,avg_longitude,avg_latitude,base_stop_id,all_stop_ids
0,Chotkovy sady,14.409043,50.095165,U187,"[U187Z1P, U187Z2P]"
1,Hradčanská,14.404189,50.097498,U163,"[U163S1, U163Z1P, U163Z10P, U163Z101P, U163Z10..."
2,Kamenická,14.428636,50.099638,U231,"[U231Z1P, U231Z2P]"
3,Korunovační,14.419715,50.100393,U262,"[U262Z2P, U262Z3P, U262Z4P]"
4,Letenské náměstí,14.423605,50.099975,U324,"[U324Z1P, U324Z2P]"
5,Nábřeží Kapitána Jaroše,14.431299,50.096321,U439,[U439Z1P]
6,Sparta,14.417687,50.09916,U692,"[U692Z1P, U692Z2P, U692Z3P, U692Z4P]"
7,Strossmayerovo náměstí,14.433265,50.09891,U717,"[U717Z1P, U717Z2P, U717Z4P, U717Z5P]"
8,Veletržní palác,14.433064,50.101734,U841,"[U841Z1P, U841Z2P, U841Z4P]"
9,Vltavská,14.4383,50.099185,U100,"[U100S1, U100Z1P, U100Z101P, U100Z102P, U100Z2..."


In [49]:
stop_times_df.merge(df, left_on='stop_id_base', right_on='base_stop_id', how='inner')

Unnamed: 0,rt_trip_id,gtfs_date,gtfs_trip_id,gtfs_direction_id,gtfs_route_short_name,gtfs_route_type,run_number,vehicle_registration_number,gtfs_stop_sequence,gtfs_stop_id,...,origin_route_name,day,month,year,stop_id_base,stop_name,avg_longitude,avg_latitude,base_stop_id,all_stop_ids
0,2024-05-31T23:30:00+02:00_94_1867_240428_8278,2024-05-31,94_1867_240428,0,94,0,74,8278.0,23,U100Z4P,...,94,1,6,2024,U100,Vltavská,14.438300,50.099185,U100,"[U100S1, U100Z1P, U100Z101P, U100Z102P, U100Z2..."
1,2024-06-01T00:00:00+02:00_94_1860_240527_8464,2024-05-31,94_1860_240527,0,94,0,61,8464.0,23,U100Z4P,...,94,1,6,2024,U100,Vltavská,14.438300,50.099185,U100,"[U100S1, U100Z1P, U100Z101P, U100Z102P, U100Z2..."
2,2024-06-01T00:00:00+02:00_911_490_240428_6741,2024-05-31,911_490_240428,0,911,3,55,6741.0,24,U100Z6P,...,911,1,6,2024,U100,Vltavská,14.438300,50.099185,U100,"[U100S1, U100Z1P, U100Z101P, U100Z102P, U100Z2..."
3,2024-06-01T00:17:00+02:00_905_257_240428_6820,2024-05-31,905_257_240428,0,905,3,56,6820.0,22,U100Z6P,...,905,1,6,2024,U100,Vltavská,14.438300,50.099185,U100,"[U100S1, U100Z1P, U100Z101P, U100Z102P, U100Z2..."
4,2024-06-01T00:06:00+02:00_911_493_240428_6669,2024-05-31,911_493_240428,1,911,3,51,6669.0,19,U100Z5P,...,911,1,6,2024,U100,Vltavská,14.438300,50.099185,U100,"[U100S1, U100Z1P, U100Z101P, U100Z102P, U100Z2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2425988,2024-12-31T22:42:00+01:00_91_1980_241231_9430,2024-12-31,91_1980_241231,1,91,0,60,9430.0,30,U231Z2P,...,5,31,12,2024,U231,Kamenická,14.428636,50.099638,U231,"[U231Z1P, U231Z2P]"
2425989,2024-12-31T22:58:00+01:00_91_2004_241231_9281,2024-12-31,91_2004_241231,0,91,0,56,9281.0,20,U231Z1P,...,26,31,12,2024,U231,Kamenická,14.428636,50.099638,U231,"[U231Z1P, U231Z2P]"
2425990,2024-12-31T22:54:00+01:00_96_992_241231_9229,2024-12-31,96_992_241231,1,96,0,60,9229.0,24,U231Z2P,...,6,31,12,2024,U231,Kamenická,14.428636,50.099638,U231,"[U231Z1P, U231Z2P]"
2425991,2024-12-31T23:12:00+01:00_96_980_241231_9301,2024-12-31,96_980_241231,0,96,0,56,9301.0,13,U231Z1P,...,1,31,12,2024,U231,Kamenická,14.428636,50.099638,U231,"[U231Z1P, U231Z2P]"


In [41]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_sparta_matches():
    # URL of Sparta Praha's match schedule
    url = 'https://www.eurofotbal.cz/kluby/cesko/sparta-praha/zapasy/'
    
    # Send a GET request to the URL
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    
    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None
    
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all match rows
    match_rows = soup.find_all('div', class_='e-tables-table-overview__row e-tables-table-overview__row--hoverable')
    
    matches = []
    
    for row in match_rows:
        # Extract competition
        competition_cell = row.find('div', class_='e-tables-table-overview__cell--league')
        competition = competition_cell.text.strip() if competition_cell else "Unknown"
        
        # Extract round
        round_cell = row.find('div', class_='e-tables-table-overview__cell--round')
        round_text = round_cell.text.strip() if round_cell else "Unknown"
        
        # Extract date and time - specifically targeting the third and fourth cells
        date_cells = row.find_all('div', class_='e-tables-table-overview__cell--gray')
        
        date = "Unknown"
        time = "Unknown"
        
        if len(date_cells) >= 3:
            date = date_cells[2].text.strip()
        
        if len(date_cells) >= 4:
            time = date_cells[3].text.strip()
        
        # Extract teams
        team_cells = row.find_all('div', class_='e-tables-table-overview__result-team-label')
        home_team = team_cells[0].text.strip()
        away_team = team_cells[1].text.strip()
        
        # Determine Sparta's position (home or away)
        sparta_home = home_team == "Sparta Praha"
        sparta_away = away_team == "Sparta Praha"
        
        # Extract score
        score_cell = row.find('span', class_='e-tables-table-overview__result-score-inner')
        score = score_cell.text.strip() if score_cell else "- : -"
        
        # Determine result color
        result_color = score_cell['class'][-1] if score_cell and len(score_cell.get('class', [])) > 0 else "Unknown"
        
        match_data = {
            'Competition': competition,
            'Round': round_text,
            'Date': date,
            'Time': time,
            'Home Team': home_team,
            'Away Team': away_team,
            'Sparta Position': 'Home' if sparta_home else 'Away' if sparta_away else 'N/A',
            'Score': score,
            'Result Color': result_color
        }
        
        matches.append(match_data)
    
    return pd.DataFrame(matches)

# Scrape and save matches
df = scrape_sparta_matches()

if df is not None:
    # Save to CSV
    df.to_csv('sparta_matches.csv', index=False, encoding='utf-8')

    # Print summary
    print(df)
    print(f"\nTotal matches: {len(df)}")
    
    # Additional analysis
    print("\nCompetitions:")
    print(df['Competition'].value_counts())
    
    print("\nSparta's Home/Away Matches:")
    print(df['Sparta Position'].value_counts())

    Competition     Round        Date   Time        Home Team  \
0   Chance Liga   1. kolo  19.07.2024  19:00     Sparta Praha   
1   Liga mistrů  2.př 1.z  23.07.2024  21:00  Shamrock Rovers   
2   Chance Liga   2. kolo  27.07.2024  17:00       FK Teplice   
3   Liga mistrů  2.př 2.z  30.07.2024  19:00     Sparta Praha   
4   Chance Liga   3. kolo  02.08.2024  20:00     Sparta Praha   
5   Liga mistrů  3.př 1.z  06.08.2024  20:00     Sparta Praha   
6   Chance Liga   4. kolo  10.08.2024  17:00   Bohemians 1905   
7   Liga mistrů  3.př 2.z  13.08.2024  20:30             FCSB   
8   Chance Liga   5. kolo  17.08.2024  20:00      FK Jablonec   
9   Liga mistrů  4.př 1.z  21.08.2024  21:00         Malmö FF   
10  Chance Liga   6. kolo  24.08.2024  17:00     Sparta Praha   
11  Liga mistrů  4.př 2.z  27.08.2024  21:00     Sparta Praha   
12  Chance Liga   7. kolo  31.08.2024  20:00     Hradec Král.   
13  Liga mistrů      Hl.f  18.09.2024  18:45     Sparta Praha   
14  Chance Liga   9. kolo

In [42]:
df

Unnamed: 0,Competition,Round,Date,Time,Home Team,Away Team,Sparta Position,Score,Result Color
0,Chance Liga,1. kolo,19.07.2024,19:00,Sparta Praha,FK Pardubice,Home,2:1,e-tables-table-overview__bg--green
1,Liga mistrů,2.př 1.z,23.07.2024,21:00,Shamrock Rovers,Sparta Praha,Away,0:2,e-tables-table-overview__bg--green
2,Chance Liga,2. kolo,27.07.2024,17:00,FK Teplice,Sparta Praha,Away,1:4,e-tables-table-overview__bg--green
3,Liga mistrů,2.př 2.z,30.07.2024,19:00,Sparta Praha,Shamrock Rovers,Home,4:2,e-tables-table-overview__bg--green
4,Chance Liga,3. kolo,02.08.2024,20:00,Sparta Praha,Dukla Praha,Home,2:0,e-tables-table-overview__bg--green
5,Liga mistrů,3.př 1.z,06.08.2024,20:00,Sparta Praha,FCSB,Home,1:1,e-tables-table-overview__bg--yellow
6,Chance Liga,4. kolo,10.08.2024,17:00,Bohemians 1905,Sparta Praha,Away,1:2,e-tables-table-overview__bg--green
7,Liga mistrů,3.př 2.z,13.08.2024,20:30,FCSB,Sparta Praha,Away,2:3,e-tables-table-overview__bg--green
8,Chance Liga,5. kolo,17.08.2024,20:00,FK Jablonec,Sparta Praha,Away,1:2,e-tables-table-overview__bg--green
9,Liga mistrů,4.př 1.z,21.08.2024,21:00,Malmö FF,Sparta Praha,Away,0:2,e-tables-table-overview__bg--green
