In [1]:
import numpy as np
import pandas as pd
import datetime

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

In [2]:
def download_link(url):
  '''Convert a Google Drive share link into a directly fetchable download URL.

  The file id is looked up in this order:
    1. the path segment following ``/d/`` (``.../file/d/<id>/view?usp=sharing``
       as well as links that end right after the id),
    2. an ``id=<id>`` query parameter (``.../open?id=<id>``),
    3. the second-to-last ``/``-separated segment, as a fallback for any
       other link layout.

  Args:
    url: Share link as a string.

  Returns:
    ``'https://drive.google.com/uc?id=<id>'``.

  Raises:
    IndexError: if none of the patterns match and ``url`` has fewer than two
      ``/``-separated segments.
  '''
  import re  # local import so the cell does not depend on the notebook's import cell

  match = re.search(r'/d/([^/?#]+)', url) or re.search(r'[?&]id=([^&#]+)', url)
  file_id = match.group(1) if match else url.split('/')[-2]
  return 'https://drive.google.com/uc?id=' + file_id

In [3]:
# Cycling data: the CSV is shared on Google Drive; the share link is converted
# to a direct-download URL before pandas reads it.
CYCLING_DATA_SHARE_URL = "https://drive.google.com/file/d/15eHai6zkPwOBMq59n8uIjjohuuiaV8DF/view?usp=sharing"
cycling_data = pd.read_csv(download_link(CYCLING_DATA_SHARE_URL))

In [4]:
def clean_data(data):
    '''Reduce the raw counter export to route name, timestamp and total count.

    Steps:
      * parse ``DATUM_A_CAS`` into ``date_time`` (unparseable values become NaT),
      * keep only rows with ``date_time`` on or after 2022-01-01 (NaT rows are
        dropped by this comparison),
      * compute ``cyclists_total`` as ``POCET_DO + POCET_Z``,
      * rename ``NAZOV`` to ``name``.

    The input frame is not modified. ``date_time`` is returned as a regular
    column; no index is set here.

    Args:
        data: DataFrame with columns ``NAZOV``, ``DATUM_A_CAS``, ``POCET_DO``
            and ``POCET_Z``. Any other columns are discarded.

    Returns:
        New DataFrame with exactly the columns
        ``["name", "date_time", "cyclists_total"]``.
    '''
    parsed = data.assign(
        date_time=pd.to_datetime(data["DATUM_A_CAS"], errors="coerce")
    )
    recent = parsed.loc[parsed["date_time"] >= "2022-01-01"]

    # assign() returns a new frame, so no column is written onto a filtered view.
    # A pre-existing "POCET" column is deliberately not renamed to
    # "cyclists_total": that would duplicate the column computed here.
    cleaned = recent.assign(
        cyclists_total=recent["POCET_DO"] + recent["POCET_Z"]
    ).rename(columns={"NAZOV": "name"})

    return cleaned[["name", "date_time", "cyclists_total"]]

In [5]:
# Apply the cleaning step to the raw download; clean_data works on a copy,
# so `cycling_data` keeps the original columns.
data = clean_data(cycling_data)

In [6]:
# Preview the cleaned frame; the rendered output shows only the first and last rows.
data

Unnamed: 0,name,date_time,cyclists_total
0,#11 - Most Apollo,2025-04-22 23:00:00+00:00,0
1,#11 - Most Apollo,2025-04-22 22:00:00+00:00,0
2,#11 - Most Apollo,2025-04-22 21:00:00+00:00,0
3,#11 - Most Apollo,2025-04-22 20:00:00+00:00,0
4,#11 - Most Apollo,2025-04-22 19:00:00+00:00,0
...,...,...,...
691171,Devinska Nova Ves,2025-12-15 18:00:00+00:00,0
691172,Devinska Nova Ves,2025-12-15 19:00:00+00:00,0
691173,Devinska Nova Ves,2025-12-15 20:00:00+00:00,1
691174,Devinska Nova Ves,2025-12-15 21:00:00+00:00,0


In [7]:
def create_features(df):
    """Return a copy of ``df`` extended with calendar features from its index.

    The index must expose datetime attributes (``hour``, ``dayofweek``, ...).
    Added columns, in order: hour, dayofweek, quarter, month, year, dayofyear,
    dayofweek_sin, dayofweek_cos, is_weekend, is_spring, is_summer, is_fall,
    is_winter. The input frame is left untouched.
    """
    out = df.copy()
    stamps = out.index

    # Plain calendar attributes copied straight from the index
    for attribute in ('hour', 'dayofweek', 'quarter', 'month', 'year', 'dayofyear'):
        out[attribute] = getattr(stamps, attribute)

    # Day of week projected onto a circle so that Sunday and Monday are adjacent
    out['dayofweek_sin'] = np.sin(2 * np.pi * out['dayofweek']/7)
    out['dayofweek_cos'] = np.cos(2 * np.pi * out['dayofweek']/7)

    # Saturday (5) and Sunday (6)
    out['is_weekend'] = out['dayofweek'].isin([5, 6]).astype(int)

    # One 0/1 flag per meteorological season
    season_months = {
        'is_spring': [3, 4, 5],
        'is_summer': [6, 7, 8],
        'is_fall': [9, 10, 11],
        'is_winter': [12, 1, 2],
    }
    for flag, months in season_months.items():
        out[flag] = out['month'].isin(months).astype(int)

    return out

**Train-Val-Test Split**

In [8]:
# Getting all the routes available
routes_data = list(data["name"].unique())
routes_data

['#11 - Most Apollo',
 '#7 - Vajnorská > NTC',
 '#4 - Dolnozemská',
 '#10 - Dunajská',
 '#8 - Most SNP',
 '#9 - Páričkova',
 '#6 - Vajnorská',
 '#5 - Devínska cesta',
 '#3 - River Park',
 '#2 - Starý most 2',
 '#1 - Starý Most',
 '#14 - Vajanského 2',
 '#13 - Vajanského 1',
 '#12 - Železná studnička',
 '#16 - Trenčianska',
 '#17 - Dunajská/Lazaretská',
 '#15 - Incheba Einsteinova',
 'Cyklomost Slobody',
 'Devinska Nova Ves',
 'Viedenska',
 'Hradza Berg']

In [9]:
# Feature set 1: day of week as a plain integer
FEATURES_1 = [
    'hour',
    'dayofweek',
    'quarter',
    'month',
    'year',
    'dayofyear',
    'is_weekend',
]

# Feature set 2: day of week replaced by its sine/cosine encoding
FEATURES_2 = [
    'hour',
    'dayofweek_sin',
    'dayofweek_cos',
    'quarter',
    'month',
    'year',
    'dayofyear',
    'is_weekend',
]

# Target column names
TARGET_TOTAL = 'cyclists_total'
TARGET_TO = 'direction_to'
TARGET_FROM = 'direction_from'

In [10]:
# Index the cleaned rows by timestamp (create_features reads the calendar
# fields from the index), then add the time-based feature columns.
data_with_features = create_features(data.set_index('date_time'))

In [11]:
# Accumulator for the per-route Linear Regression results (one dict per route).
results_lr = []

In [12]:
# Chronological split boundaries: the first 60% of a route's rows train the
# model, the next 20% validate it; the last 20% is not touched by this cell.
TRAIN_FRACTION = 0.6
VALIDATION_END_FRACTION = 0.8

# Start from an empty list so that re-running this cell does not append a
# second set of entries for every route.
results_lr = []

for route in routes_data:
    print(f"=== ROUTE: {route} ===")

    # Rows of this route only, ordered by timestamp
    route_data = (
        data_with_features[data_with_features["name"] == route]
        .copy()
        .sort_index()
    )

    DATA = route_data

    train_end = int(TRAIN_FRACTION * len(DATA))
    validation_end = int(VALIDATION_END_FRACTION * len(DATA))

    TRAIN = DATA[:train_end]
    VALIDATION = DATA[train_end:validation_end]

    # With too few rows one of the slices is empty and fitting/scoring would
    # raise; skip the route instead of aborting the whole loop.
    if TRAIN.empty or VALIDATION.empty:
        print(f"Skipped: {route} | not enough rows for a train/validation split")
        continue

    X_train = TRAIN[FEATURES_2]
    y_train = TRAIN[TARGET_TOTAL]

    X_val = VALIDATION[FEATURES_2]
    y_val = VALIDATION[TARGET_TOTAL]
    mean_val = y_val.mean()

    # --- Scaling ---
    # Fit the scaler on the training rows only; validation rows are transformed
    # with the training statistics.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # --- Model ---
    MODEL = LinearRegression()
    MODEL.fit(X_train_scaled, y_train)

    # --- Validation ---
    y_pred = MODEL.predict(X_val_scaled)
    mae = mean_absolute_error(y_val, y_pred)

    # MAE as a percentage of the mean validation count. Undefined when that
    # mean is not positive, so report NaN rather than a misleading 0%.
    rel_mae = (mae / mean_val) * 100 if mean_val > 0 else np.nan

    results_lr.append({
        "route": route,
        "model": MODEL,
        "scaler": scaler,
        "mae": mae,
        "rel_mae": rel_mae
    })

    print(f"Done: {route} | Rel. MAE: {rel_mae:.2f}% | MAE: {mae:.4f}")



=== ROUTE: #11 - Most Apollo ===
Done: #11 - Most Apollo | Rel. MAE: 114.43% | MAE: 22.3981
=== ROUTE: #7 - Vajnorská > NTC ===
Done: #7 - Vajnorská > NTC | Rel. MAE: 100.90% | MAE: 5.2333
=== ROUTE: #4 - Dolnozemská ===
Done: #4 - Dolnozemská | Rel. MAE: 288.27% | MAE: 83.2393
=== ROUTE: #10 - Dunajská ===
Done: #10 - Dunajská | Rel. MAE: 121.62% | MAE: 12.3240
=== ROUTE: #8 - Most SNP ===
Done: #8 - Most SNP | Rel. MAE: 111.27% | MAE: 14.1484
=== ROUTE: #9 - Páričkova ===
Done: #9 - Páričkova | Rel. MAE: 130.31% | MAE: 20.7845
=== ROUTE: #6 - Vajnorská ===
Done: #6 - Vajnorská | Rel. MAE: 1091.51% | MAE: 5.4146
=== ROUTE: #5 - Devínska cesta ===
Done: #5 - Devínska cesta | Rel. MAE: 348.17% | MAE: 6.2262
=== ROUTE: #3 - River Park ===
Done: #3 - River Park | Rel. MAE: 152.03% | MAE: 36.2098
=== ROUTE: #2 - Starý most 2 ===
Done: #2 - Starý most 2 | Rel. MAE: 162.64% | MAE: 19.7696
=== ROUTE: #1 - Starý Most ===
Done: #1 - Starý Most | Rel. MAE: 166.15% | MAE: 26.6604
=== ROUTE: #14 -

In [None]:
# One row per entry collected in results_lr (route, model, scaler, mae, rel_mae).
df_results = pd.DataFrame(results_lr)

# Lowest-MAE entry for each route, kept in the original row order. The
# feature-importance cell reads this frame; defining it here keeps the notebook
# runnable from a fresh kernel.
best_results_lr_per_route = (
    df_results
    .sort_values("mae", kind="stable")
    .drop_duplicates(subset="route", keep="first")
    .sort_index()
)

In [None]:
feature_scores_lr = {}

for _, result_row in best_results_lr_per_route.iterrows():
    # Absolute values of the coefficients, paired with the feature names
    magnitudes = np.abs(result_row["model"].coef_)

    ranked = sorted(
        zip(FEATURES_2, magnitudes),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Scoring: 1st place = 13 points, every following place one point less;
    # at most the ten highest-ranked features score (13 down to 4 points)
    for points, (feature, _magnitude) in zip(range(13, 3, -1), ranked):
        feature_scores_lr[feature] = feature_scores_lr.get(feature, 0) + points


In [None]:
# Tabulate the accumulated points per feature, lowest total first
summary_importance_lr = pd.DataFrame(
    {
        "feature": list(feature_scores_lr.keys()),
        "total_points": list(feature_scores_lr.values()),
    }
).sort_values(by="total_points", ascending=True)


In [None]:
# The last ten rows of the summary frame are plotted as horizontal bars
top_features = summary_importance_lr.tail(10)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    top_features["feature"],
    top_features["total_points"],
    color="steelblue",
)
ax.set_title("Feature importance – Linear Regression")
ax.set_xlabel("Celkový počet bodov")
ax.set_ylabel("Premenná")
ax.grid(axis="x", linestyle="--", alpha=0.6)
fig.tight_layout()
plt.show()