# Inroduction


This sample notebook demonstrates how to process live data streams using Pathway. The dataset used here is a subset of the one provided â€” specifically, it includes data for only a single parking spot. You are expected to implement your model across all parking spots.

Please note that the pricing model used in this notebook is a simple baseline. You are expected to design and implement a more advanced and effective model.


In [1]:
!pip install pathway bokeh --quiet # This cell may take a few seconds to execute.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from datetime import datetime
import pathway as pw
import bokeh.plotting
import panel as pn

# Step 1: Importing and Preprocessing the Data

In [None]:
df = pd.read_csv('dataset.csv')
df.size

# You can find the sample dataset here: https://drive.google.com/file/d/1D479FLjp9aO3Mg8g6Lpj9oRViWacurA6/view?usp=sharing

In [None]:
# Combine the 'LastUpdatedDate' and 'LastUpdatedTime' columns into a single datetime column
df['Timestamp'] = pd.to_datetime(df['LastUpdatedDate'] + ' ' + df['LastUpdatedTime'],
                                  format='%d-%m-%Y %H:%M:%S')

# Sort the DataFrame by the new 'Timestamp' column and reset the index
df = df.sort_values('Timestamp').reset_index(drop=True)

In [None]:
# Save the selected columns to a CSV file for streaming or downstream processing
df[["Timestamp", "Occupancy", "Capacity","LotID"]].to_csv("parking_stream.csv", index=False)

# Note: Only three features are used here for simplicity.
# Participants are expected to incorporate additional relevant features in their models.

In [24]:
# Define the schema for the streaming data using Pathway
# This schema specifies the expected structure of each data row in the stream

class ParkingSchema(pw.Schema):
    Timestamp: str   # Timestamp of the observation (should ideally be in ISO format)
    Occupancy: int   # Number of occupied parking spots
    Capacity: int 
    LotID: str       # Total parking capacity at the location


In [25]:
# Load the data as a simulated stream using Pathway's replay_csv function
# This replays the CSV data at a controlled input rate to mimic real-time streaming
# input_rate=1000 means approximately 1000 rows per second will be ingested into the stream.

data = pw.demo.replay_csv("parking_stream.csv", schema=ParkingSchema, input_rate=100)

In [26]:
# Define the datetime format to parse the 'Timestamp' column
fmt = "%Y-%m-%d %H:%M:%S"

# Add new columns to the data stream:
# - 't' contains the parsed full datetime
# - 'day' extracts the date part and resets the time to midnight (useful for day-level aggregations)
data_with_time = data.with_columns(
    t = data.Timestamp.dt.strptime(fmt),
    day = data.Timestamp.dt.strptime(fmt).dt.strftime("%Y-%m-%dT00:00:00")
)


# Step 2: Making a simple pricing function

In [27]:
# Define a daily tumbling window over the data stream using Pathway
# This block performs temporal aggregation and computes a dynamic price for each day
import datetime

delta_window = (
    data_with_time.windowby(
        pw.this.t,  # Event time column to use for windowing (parsed datetime)
        instance=pw.this.day,  # Logical partitioning key: one instance per calendar day
        window=pw.temporal.tumbling(datetime.timedelta(days=1)),  # Fixed-size daily window
        behavior=pw.temporal.exactly_once_behavior()  # Guarantees exactly-once processing semantics
    )
    .reduce(
        t=pw.this._pw_window_end,                        # Assign the end timestamp of each window
        occ_max=pw.reducers.max(pw.this.Occupancy),      # Highest occupancy observed in the window
        occ_min=pw.reducers.min(pw.this.Occupancy),      # Lowest occupancy observed in the window
        cap=pw.reducers.max(pw.this.Capacity),           # Maximum capacity observed (typically constant per spot)
    )
    .with_columns(
        # Compute the price using a simple dynamic pricing formula:
        #
        # Pricing Formula:
        #     price = base_price + demand_fluctuation
        #     where:
        #         base_price = 10 (fixed minimum price)
        #         demand_fluctuation = (occ_max - occ_min) / cap
        #
        # Intuition:
        # - The greater the difference between peak and low occupancy in a day,
        #   the more volatile the demand is, indicating potential scarcity.
        # - Dividing by capacity normalizes the fluctuation (to stay in [0,1] range).
        # - This fluctuation is added to the base price of 10 to set the final price.
        # - Example: If occ_max = 90, occ_min = 30, cap = 100
        #            => price = 10 + (90 - 30)/100 = 10 + 0.6 = 10.6

        price= 10 + (pw.this.occ_max - pw.this.occ_min) / pw.this.cap
    )
)

BASE_PRICE = 10.0
ALPHA = 2.0  # Sensitivity parameter

# Stateless price (for each row, price is BASE + ALPHA * (occ/cap))
data_with_price = data_with_time.with_columns(
    Price_Model1 = BASE_PRICE + ALPHA * (pw.this.Occupancy / pw.this.Capacity)
)

# Step 3: Visualizing Daily Price Fluctuations with a Bokeh Plot

**Note:** The Bokeh plot in the next cell will only be generated after you run the `pw.run()` cell (i.e., the final cell).


In [17]:
# Activate the Panel extension to enable interactive visualizations
pn.extension()

# Define a custom Bokeh plotting function that takes a data source (from Pathway) and returns a figure
def price_plotter(source):
    # Create a Bokeh figure with datetime x-axis
    fig = bokeh.plotting.figure(
        height=400,
        width=800,
        title="Pathway: Daily Parking Price",
        x_axis_type="datetime",  # Ensure time-based data is properly formatted on the x-axis
    )
    # Plot a line graph showing how the price evolves over time
    fig.line("t", "price", source=source, line_width=2, color="navy")

    # Overlay red circles at each data point for better visibility
    fig.scatter("t", "price", source=source, size=6, color="red")

    return fig

# Use Pathway's built-in .plot() method to bind the data stream (delta_window) to the Bokeh plot
# - 'price_plotter' is the rendering function
# - 'sorting_col="t"' ensures the data is plotted in time order
viz = delta_window.plot(price_plotter, sorting_col="t")

# Create a Panel layout and make it servable as a web app
# This line enables the interactive plot to be displayed when the app is served
pn.Column(viz).servable()

In [18]:
# Start the Pathway pipeline execution in the background
# - This triggers the real-time data stream processing defined above
# - %%capture --no-display suppresses output in the notebook interface

# %%capture --no-display
pw.run()


UsageError: Line magic function `%%capture` not found.


<h1>My Work</h1>

In [34]:
# Install necessary libraries (run once in Colab)
!pip install pathway bokeh panel --quiet

# --- Imports ---
import numpy as np
import pandas as pd
import pathway as pw
import bokeh.plotting
import panel as pn

# --- 1. Data Preparation (if needed, adjust file path as per your setup) ---
# Assume you have a CSV named 'dataset.csv' with required columns.
# If you already have a cleaned 'parking_stream.csv', you can skip this step.

df = pd.read_csv('dataset.csv')

# Combine date and time columns if needed
if 'LastUpdatedDate' in df.columns and 'LastUpdatedTime' in df.columns:
    df['Timestamp'] = pd.to_datetime(df['LastUpdatedDate'] + ' ' + df['LastUpdatedTime'],
                                     format='%d-%m-%Y %H:%M:%S')
else:
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Sort by time
df = df.sort_values('Timestamp').reset_index(drop=True)

# If you have multiple lots, ensure a unique lot identifier column (e.g., LotID)
if 'LotID' not in df.columns:
    df['LotID'] = df['Latitude'].astype(str) + '_' + df['Longitude'].astype(str)

# Save a streaming-ready CSV
df[["Timestamp", "Occupancy", "Capacity", "LotID"]].to_csv("parking_stream.csv", index=False)

# --- 2. Pathway Schema ---
class ParkingSchema(pw.Schema):
    Timestamp: str
    Occupancy: int
    Capacity: int
    LotID: str

# --- 3. Real-Time Data Ingestion with Pathway ---
# Simulate real-time streaming at 100 rows/sec
data = pw.demo.replay_csv(
    "parking_stream.csv",
    schema=ParkingSchema,
    input_rate=100
)

# Parse timestamp for plotting
fmt = "%Y-%m-%d %H:%M:%S"
data_with_time = data.with_columns(
    t = data.Timestamp.dt.strptime(fmt)
)

# --- 4. Model 1: Baseline Linear Pricing ---
BASE_PRICE = 10.0
ALPHA = 2.0  # Sensitivity parameter

# Stateless price (for each row, price is BASE + ALPHA * (occ/cap))
data_with_price = data_with_time.with_columns(
    Price_Model1 = BASE_PRICE + ALPHA * (pw.this.Occupancy / pw.this.Capacity)
)

# --- 5. Real-Time Visualization with Bokeh and Panel ---
pn.extension('bokeh')

def price_plotter(source):
    fig = bokeh.plotting.figure(
        height=400,
        width=800,
        title="Model 1: Real-Time Parking Price per Lot",
        x_axis_type="datetime",
    )
    # Plot each lot as a separate line
    lots = list(set(source.data['LotID']))
    colors = bokeh.palettes.Category10[10] + bokeh.palettes.Category20[20]
    for i, lot in enumerate(lots):
        mask = [l == lot for l in source.data['LotID']]
        fig.line(
            [t for t, m in zip(source.data['t'], mask) if m],
            [p for p, m in zip(source.data['Price_Model1'], mask) if m],
            line_width=2,
            color=colors[i % len(colors)],
            legend_label=f"Lot {lot}"
        )
    fig.legend.location = "top_left"
    fig.xaxis.axis_label = 'Time'
    fig.yaxis.axis_label = 'Price ($)'
    return fig

# Use Pathway's .plot() for real-time updates
viz = data_with_price.plot(price_plotter, sorting_col="t")
pn.Column(viz).servable()




You are attempting to set `plot.legend.location` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set.

  fig.legend.location = "top_left"


In [33]:
pw.run()

Output()

In [None]:
# --- 1. Install Required Libraries ---
!pip install pathway bokeh --quiet

# --- 2. Imports ---
import numpy as np
import pandas as pd
import pathway as pw
import bokeh.plotting
from bokeh.io import output_notebook, show

# --- 3. Data Preparation ---
# Load the dataset (replace with your actual file if needed)
df = pd.read_csv('dataset.csv')

# Combine date and time columns if needed
if 'LastUpdatedDate' in df.columns and 'LastUpdatedTime' in df.columns:
    df['Timestamp'] = pd.to_datetime(
        df['LastUpdatedDate'] + ' ' + df['LastUpdatedTime'],
        format='%d-%m-%Y %H:%M:%S'
    )
else:
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Sort by time
df = df.sort_values('Timestamp').reset_index(drop=True)

# Ensure a unique lot identifier column (LotID)
if 'LotID' not in df.columns:
    df['LotID'] = df['Latitude'].astype(str) + '_' + df['Longitude'].astype(str)

# Save a streaming-ready CSV for Pathway
df[["Timestamp", "Occupancy", "Capacity", "LotID"]].to_csv("parking_stream.csv", index=False)

# --- 4. Pathway Schema ---
class ParkingSchema(pw.Schema):
    Timestamp: str
    Occupancy: int
    Capacity: int
    LotID: str

# --- 5. Real-Time Data Ingestion with Pathway ---
# Simulate real-time streaming at 100 rows/sec
data = pw.demo.replay_csv(
    "parking_stream.csv",
    schema=ParkingSchema,
    input_rate=100
)

# Parse timestamp for plotting
fmt = "%Y-%m-%d %H:%M:%S"
data_with_time = data.with_columns(
    t = data.Timestamp.dt.strptime(fmt)
)

# --- 6. Model 1: Baseline Linear Pricing ---
BASE_PRICE = 10.0
ALPHA = 2.0  # Sensitivity parameter

# Stateless price (for each row, price is BASE + ALPHA * (occ/cap))
data_with_price = data_with_time.with_columns(
    Price_Model1 = BASE_PRICE + ALPHA * (pw.this.Occupancy / pw.this.Capacity)
)

# --- 7. Collect Results for Visualization ---
# Convert the Pathway table to a Pandas DataFrame for plotting
df_price = data_with_price.to_pandas()

# --- 8. Visualization with Bokeh ---
output_notebook()

def plot_all_lots(df):
    fig = bokeh.plotting.figure(
        height=400,
        width=800,
        title="Model 1: Real-Time Parking Price per Lot",
        x_axis_type="datetime",
    )
    lots = df['LotID'].unique()
    palette = bokeh.palettes.Category10[10] + bokeh.palettes.Category20[20]
    for i, lot in enumerate(lots):
        lot_df = df[df['LotID'] == lot]
        fig.line(
            lot_df['t'],
            lot_df['Price_Model1'],
            line_width=2,
            color=palette[i % len(palette)],
            legend_label=f"Lot {lot}"
        )
    fig.legend.location = "top_left"
    fig.xaxis.axis_label = 'Time'
    fig.yaxis.axis_label = 'Price ($)'
    show(fig)

plot_all_lots(df_price)

# --- 9. (Optional) Save Results ---
# df_price.to_csv("model1_price_output.csv", index=False)
