In [None]:
# Install the notebook's Python dependencies into the kernel's environment.
# Only pyspark is pinned to an exact version (3.5.4); nbformat has a lower bound and the rest are unpinned.
%pip install pyspark==3.5.4 findspark plotly altair folium pydeck "nbformat>=4.2.0"

In [None]:
import os

# Lab participant number; later cells interpolate it into the
# HOL_USER_<user_num>_DB table names.
user_num                            = 999999

# Polaris / Open Catalog connection settings used when the Spark session is built.
# Each value is taken from the POLARIS_* environment variable when it is set, so
# the client secret does not have to be typed into (and saved with) the notebook.
# The fallback is the original empty placeholder, which can still be edited by hand.
CLIENT_ID                           = os.environ.get("POLARIS_CLIENT_ID", "")
CLIENT_SECRET                       = os.environ.get("POLARIS_CLIENT_SECRET", "")
PRINCIPAL_ROLE                      = os.environ.get("POLARIS_PRINCIPAL_ROLE", "")
POLARIS_CATALOG_ACCOUNT_IDENTIFIER  = os.environ.get("POLARIS_CATALOG_ACCOUNT_IDENTIFIER", "")
POLARIS_WAREHOUSE                   = os.environ.get("POLARIS_WAREHOUSE", "")
REGION                              = os.environ.get("POLARIS_REGION", "")


In [None]:
import findspark
findspark.init()
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when


# Maven coordinates passed to spark.jars.packages: the Iceberg Spark runtime
# and the Iceberg AWS bundle, both at 1.9.1.
PACKAGES = ",".join([
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.1",
    "org.apache.iceberg:iceberg-aws-bundle:1.9.1",
])

# Spark settings, applied to the builder in this order.
spark_settings = {
    # Local driver sizing and networking.
    "spark.driver.memory": "3g",
    "spark.jars.packages": PACKAGES,
    "spark.driver.host": "localhost",
    "spark.driver.bindAddress": "127.0.0.1",
    # Iceberg integration.
    "spark.sql.iceberg.vectorization.enabled": "false",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    # REST catalog registered under the name "polaris" and made the default,
    # so unqualified table names in later cells resolve against it.
    "spark.sql.defaultCatalog": "polaris",
    "spark.sql.catalog.polaris": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.polaris.type": "rest",
    "spark.sql.catalog.polaris.header.X-Iceberg-Access-Delegation": "vended-credentials",
    "spark.sql.catalog.polaris.uri": f"https://{POLARIS_CATALOG_ACCOUNT_IDENTIFIER}.snowflakecomputing.com/polaris/api/catalog",
    "spark.sql.catalog.polaris.credential": f"{CLIENT_ID}:{CLIENT_SECRET}",
    "spark.sql.catalog.polaris.warehouse": POLARIS_WAREHOUSE,
    "spark.sql.catalog.polaris.scope": f"PRINCIPAL_ROLE:{PRINCIPAL_ROLE}",
    "spark.sql.catalog.polaris.client.region": REGION,
    # NOTE(review): this key targets a catalog named "snowflake_catalog", while
    # every other setting configures "polaris" -- confirm whether it was meant
    # to be spark.sql.catalog.polaris.io-impl.
    "spark.sql.catalog.snowflake_catalog.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
}

session_builder = SparkSession.builder.appName("iceberg_lab")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# Smoke test: list the namespaces of the default catalog.
spark.sql("show namespaces").show()


In [None]:
from pyspark.sql.functions import col, count as spark_count, max as spark_max, lit
from pyspark.sql.functions import to_timestamp

# Milliseconds per minute. ACTUAL_TIMESTAMP is treated as epoch milliseconds
# throughout this notebook (it is divided by 1000 before to_timestamp).
MS_IN_MIN = 60000

# Load the TRAIN_MOVEMENTS table for this lab user.
train_movements = spark.table(f"HOL_USER_{user_num}_DB.GOLD.TRAIN_MOVEMENTS")

# 1️⃣ Latest non-null ACTUAL_TIMESTAMP; every window below is anchored on it
#    rather than on the wall clock.
max_ts_row = train_movements \
    .filter(col("ACTUAL_TIMESTAMP").isNotNull()) \
    .agg(spark_max(col("ACTUAL_TIMESTAMP")).alias("max_ts_epoch")) \
    .collect()[0]

max_ts_epoch = max_ts_row["max_ts_epoch"]

# 2️⃣ Build the time spine: 48 half-hour bucket start values (seq 0..47).
seq_df = spark.range(0, 48).withColumnRenamed("id", "seq")

# Bucket k starts at max_ts - k * 30 min and covers the following 30 minutes,
# so bucket 0 starts at the latest timestamp itself and bucket 47 at max_ts - 23.5 h.
time_spine = seq_df.withColumn(
    "time_bucket", lit(max_ts_epoch) - (col("seq") * lit(30 * MS_IN_MIN))
)

# 3️⃣ Keep only movements from the 24 hours before the latest timestamp.
#    Rows between max_ts - 24 h and max_ts - 23.5 h pass this filter but precede
#    the earliest bucket, so the join below does not pick them up.
filtered_train_movements = train_movements.filter(
    (col("ACTUAL_TIMESTAMP") >= lit(max_ts_epoch) - lit(24 * 60 * MS_IN_MIN))
    & (col("ACTUAL_TIMESTAMP").isNotNull())
)

# 4️⃣ Left join so that buckets without any movement are still present.
joined_df = time_spine.alias("t").join(
    filtered_train_movements.alias("m"),
    (col("m.ACTUAL_TIMESTAMP") >= col("t.time_bucket")) &
    (col("m.ACTUAL_TIMESTAMP") < col("t.time_bucket") + lit(30 * MS_IN_MIN)),
    how="left"
)

# 5️⃣ Group and aggregate.
#    Count a right-side column instead of rows: a bucket with no movements
#    survives the left join as one all-null row, which a plain row count
#    would report as 1 arrival instead of 0.
result_df = joined_df.groupBy(
    col("t.time_bucket").alias("time_bucket"),
    col("m.VARIATION_STATUS").alias("status")
).agg(spark_count(col("m.ACTUAL_TIMESTAMP")).alias("arrival_count"))

# 6️⃣ Order by time_bucket, status
final_df = result_df.orderBy("time_bucket", "status")

# Show the result with the bucket start rendered as a timestamp (ms -> s).
final_df.select(to_timestamp(col("time_bucket")/1000).alias("time_bucket"),"status","arrival_count").show(truncate=False)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count as spark_count, max as spark_max, expr, lit, to_timestamp
from pyspark.sql.types import TimestampType
import datetime

# TrainMovementsWithLocations
# Arrival counts per half-hour bucket, location name and variation status.
# MS_IN_MIN is the module-level constant defined in the arrivals-per-bucket cell above.

train_movements = spark.table(f"HOL_USER_{user_num}_DB.GOLD.TRAIN_MOVEMENTS").alias("tm")
locations_raw = spark.table(f"HOL_USER_{user_num}_DB.BRONZE.LOCATIONS_RAW").alias("loc")

# 1️⃣ Latest non-null ACTUAL_TIMESTAMP; the time spine is anchored on it.
max_ts_row = train_movements \
    .filter(col("ACTUAL_TIMESTAMP").isNotNull()) \
    .agg(spark_max(col("ACTUAL_TIMESTAMP")).alias("max_ts_epoch")) \
    .collect()[0]

max_ts_epoch = max_ts_row["max_ts_epoch"]


# 2️⃣ Build time spine (48 half-hour bucket start values, newest first)
seq_df = spark.range(0, 48).withColumnRenamed("id", "seq")

time_spine = seq_df.withColumn(
    "time_bucket",
    lit(max_ts_epoch) - (col("seq") * lit(30 * MS_IN_MIN))
)

# 3️⃣ Movements enriched with the location name. Left join on STANOX so that
#    movements without a matching location are kept with a null name.
movements_with_name = train_movements \
    .filter(col("ACTUAL_TIMESTAMP").isNotNull()) \
    .join(locations_raw, col("tm.LOC_STANOX") == col("loc.STANOX"), how="left") \
    .select(
        col("tm.ACTUAL_TIMESTAMP").alias("actual_ts"),
        col("tm.VARIATION_STATUS"),
        col("loc.NAME").alias("location_name")
    )

# 4️⃣ Left join the spine to the movements that fall inside each bucket and
#    inside the 24 hours before the latest timestamp.
joined_df = time_spine.alias("t").join(
    movements_with_name.alias("m"),
    (col("m.actual_ts") >= col("t.time_bucket")) &
    (col("m.actual_ts") < col("t.time_bucket") + lit(30 * MS_IN_MIN)) &
    (col("m.actual_ts") >= lit(max_ts_epoch) - lit(24 * 60 * MS_IN_MIN)),
    how="left"
)

# 5️⃣ Group and aggregate.
#    Count the non-null movement timestamp rather than rows: an empty bucket
#    comes out of the left join as a single all-null row, and a plain row
#    count would report it as 1 arrival instead of 0.
result_df = joined_df.groupBy(
    col("t.time_bucket").alias("time_bucket"),
    col("m.location_name"),
    col("m.VARIATION_STATUS").alias("status")
).agg(spark_count(col("m.actual_ts")).alias("arrival_count"))

# 6️⃣ Order by time_bucket, location_name, status
final_df = result_df.orderBy("time_bucket", "location_name", "status")

# Show the result with the bucket start rendered as a timestamp (ms -> s).
final_df.select(to_timestamp(col("time_bucket")/1000).alias("time_bucket"),"location_name","status","arrival_count").show(truncate=False)



In [None]:
from pyspark.sql.functions import col, max as spark_max, to_timestamp, floor, lit

# TopStationsDelays
# Delay counts per 30-minute bucket for the ten stations with the most delays
# in the 24 hours before the latest movement. The result is left in
# `top_stations` for the charting cells that follow.
# MS_IN_MIN is the module-level constant defined in an earlier cell.
train_movements = spark.table(f"HOL_USER_{user_num}_DB.GOLD.TRAIN_MOVEMENTS").alias("tm")
locations_raw = spark.table(f"HOL_USER_{user_num}_DB.BRONZE.LOCATIONS_RAW").alias("loc")

# 1️⃣ Latest non-null ACTUAL_TIMESTAMP (treated as epoch milliseconds).
max_ts_row = train_movements \
    .filter(col("ACTUAL_TIMESTAMP").isNotNull()) \
    .agg(spark_max(col("ACTUAL_TIMESTAMP")).alias("max_ts_epoch")) \
    .collect()[0]

max_ts_epoch = max_ts_row["max_ts_epoch"]


# 2️⃣ converted_data: movements from the last 24 hours, trimmed to the
#    columns needed below. actual_ts stays in epoch milliseconds.
converted_data = train_movements \
    .filter(col("ACTUAL_TIMESTAMP").isNotNull()) \
    .withColumn("actual_ts", col("ACTUAL_TIMESTAMP")) \
    .filter(col("actual_ts") >= lit(max_ts_epoch) - lit(24 * 60 * MS_IN_MIN)) \
    .select(
        col("LOC_STANOX"),
        col("actual_ts"),
        col("TIMETABLE_VARIATION"),
        col("LATE_IND")
    )

# 3️⃣ bucketed_data: delayed movements counted per station and 30-min bucket.
#    The bucket width must be expressed in milliseconds because actual_ts is in
#    milliseconds (it is compared against MS_IN_MIN offsets above and divided by
#    1000 before to_timestamp below). Flooring with second-based constants
#    (3600 / 1800) would produce 1.8-second buckets.
bucket_ms = 30 * MS_IN_MIN
bucketed_data = converted_data \
    .filter((col("TIMETABLE_VARIATION") > 0) | (col("LATE_IND") == 1)) \
    .withColumn(
        "time_bucket",
        floor(col("actual_ts") / lit(bucket_ms)) * lit(bucket_ms)
    ) \
    .groupBy("LOC_STANOX", "time_bucket") \
    .count() \
    .withColumnRenamed("count", "delay_count")

# 4️⃣ The ten stations with the highest total delay count over the window.
top_station_totals = bucketed_data \
    .groupBy("LOC_STANOX") \
    .sum("delay_count") \
    .withColumnRenamed("sum(delay_count)", "total_delay_count") \
    .orderBy(col("total_delay_count").desc()) \
    .limit(10)

# 5️⃣ Final SELECT + JOINs
# First, attach the location description (left join keeps stations without a match).
joined_with_locations = bucketed_data.alias("b") \
    .join(locations_raw.alias("l"), col("b.LOC_STANOX") == col("l.STANOX"), how="left") \
    .select(
        col("b.time_bucket"),
        col("b.LOC_STANOX"),
        col("b.delay_count"),
        col("l.DESCRIPTION")
    )

# Then keep only the buckets of the top stations (inner join).
final_df = joined_with_locations.alias("b") \
    .join(top_station_totals.alias("t"), col("b.LOC_STANOX") == col("t.LOC_STANOX")) \
    .orderBy("b.time_bucket", "b.LOC_STANOX")

# Result consumed by the charting cells: time_bucket is still epoch milliseconds.
top_stations = final_df.select(
    col("b.time_bucket").alias("time_bucket"),
    col("b.LOC_STANOX").alias("LOC_STANOX"),
    col("b.delay_count").alias("DELAY_COUNT"),
    col("b.DESCRIPTION").alias("DESCRIPTION"))


# Show the result with the bucket start rendered as a timestamp (ms -> s).
top_stations.select(to_timestamp(col("time_bucket")/lit(1000)).alias("TIME_BUCKET"),
"LOC_STANOX","DELAY_COUNT","DESCRIPTION").show(truncate=False)


In [None]:
import pandas as pd
import altair as alt
from IPython.display import display

# Bring the Spark result to pandas for plotting.
df = top_stations.toPandas()
# Lift altair's default row limit so the whole frame can be charted.
alt.data_transformers.disable_max_rows()


# Rename columns to lowercase. Keys that are not present (e.g. 'TIME_BUCKET'
# when the column is already 'time_bucket') are ignored by rename.
df = df.rename(columns={
    'TIME_BUCKET': 'time_bucket',
    'LOC_STANOX': 'loc_stanox',
    'DELAY_COUNT': 'delay_count',
    'DESCRIPTION': 'description'
})

# Convert time_bucket (epoch milliseconds) to datetime.
df['time_bucket'] = pd.to_datetime(df['time_bucket'], unit='ms')


# Optional: show table to debug. display() is required here because only the
# last expression of a cell is rendered automatically.
print("DataFrame head:")
display(df.head())


print("-------")
# Create stacked bar chart: one bar per bucket, coloured by station.
chart = alt.Chart(df).mark_bar().encode(
    x=alt.X('time_bucket:T', title='Time (30-min buckets)'),
    y=alt.Y('delay_count:Q', title='Delay Count'),
    color=alt.Color('loc_stanox:N', title='Station'),
    tooltip=['time_bucket:T', 'loc_stanox:N', 'delay_count:Q']
).properties(
    width=800,
    height=400,
    title="Top Stations by Delay (per 30-min buckets)"
)

# Show chart inline
chart


In [None]:
# Restrict the chart to the last HOURS hours of data.
# Relies on `df` (pandas frame with a datetime `time_bucket` column) from the previous cell.
HOURS = 4
# Get max timestamp in data
max_time = df['time_bucket'].max()

# Define cutoff time (HOURS hours before the latest bucket)
cutoff_time = max_time - pd.Timedelta(hours=HOURS)

# Filter data
df_recent = df[df['time_bucket'] >= cutoff_time]

# Optional: print to verify
print(f"Max time in data: {max_time}")
print(f"Cutoff time: {cutoff_time}")
print(f"Rows after filter: {len(df_recent)}")

chart = alt.Chart(df_recent).mark_bar().encode(
    x=alt.X('time_bucket:T', title='Time (30-min buckets)'),
    y=alt.Y('delay_count:Q', title='Delay Count'),
    color=alt.Color('loc_stanox:N', title='Station'),
    tooltip=['time_bucket:T', 'loc_stanox:N', 'delay_count:Q']
).properties(
    width=800,
    height=400,
    # Derive the title from HOURS so it stays correct when the window is changed.
    title=f"Top Stations by Delay (last {HOURS} hours)"
)

chart

In [None]:
from pyspark.sql.functions import col, max as spark_max, expr, lit, to_timestamp

# Delay Data Query
# Late movements with coordinates from the 24 hours before the latest
# movement, enriched with station details. The result is left in `delay_data`
# for the map cells. MS_IN_MIN is the module-level constant defined in an earlier cell.

# Load the movement and location tables for this lab user.
train_movements = spark.table(f"HOL_USER_{user_num}_DB.GOLD.TRAIN_MOVEMENTS").alias("tm")
locations_raw = spark.table(f"HOL_USER_{user_num}_DB.BRONZE.LOCATIONS_RAW").alias("loc")

# Step 1️⃣ - Latest non-null ACTUAL_TIMESTAMP (treated as epoch milliseconds).
max_ts_row = train_movements \
    .filter(col("ACTUAL_TIMESTAMP").isNotNull()) \
    .agg(spark_max(col("ACTUAL_TIMESTAMP")).alias("max_ts_epoch")) \
    .collect()[0]

max_ts_epoch = max_ts_row["max_ts_epoch"]

# Start of the 24-hour window, in the same unit as ACTUAL_TIMESTAMP.
# Comparing against max_ts_epoch / 1000 mixes milliseconds with seconds and
# never filters anything out.
window_start = lit(max_ts_epoch) - lit(24 * 60 * MS_IN_MIN)


# Step 2️⃣ - Build delay_data DataFrame
# Assuming tm.MVT_LAT_LON is a STRUCT with fields 'lat' and 'long'

delay_data = train_movements \
    .filter((col("LATE_IND") == 1) &
            (col("ACTUAL_TIMESTAMP").isNotNull()) &
            (col("MVT_LAT_LON").isNotNull()) &
            (col("ACTUAL_TIMESTAMP") >= window_start)) \
    .join(locations_raw, col("tm.LOC_STANOX") == col("loc.STANOX"), how="left") \
    .select(
        col("tm.LOC_STANOX").alias("LOC_STANOX"),
        col("tm.LATE_IND").alias("LATE_IND"),
        col("tm.MVT_LAT_LON.lat").cast("float").alias("LATITUDE"),
        col("tm.MVT_LAT_LON.long").cast("float").alias("LONGITUDE"),
        col("loc.DESCRIPTION").alias("STATION_NAME"),
        col("loc.tiploc").alias("TIPLOC")
    )

# Optional: show result
delay_data.show(truncate=False)


In [None]:
import pandas as pd
import plotly.express as px
from IPython.display import display

# Collect the delay points into pandas, discard rows without coordinates and
# make sure both coordinate columns are Python-float typed.
df = (
    delay_data.toPandas()
    .dropna(subset=["LATITUDE", "LONGITUDE"])
    .astype({"LATITUDE": float, "LONGITUDE": float})
)

# One red marker per delayed movement.
fig = px.scatter_map(
    df,
    lat="LATITUDE",
    lon="LONGITUDE",
    hover_name="LATITUDE",  # Swap for a more descriptive column if one is available
    hover_data={"LONGITUDE": True},
    color_discrete_sequence=["red"],
    zoom=6,
    height=600,
    title="Train delays in the UK (last 24 hours)"
)

# Centre the view on lat 54.0 / lon -2.0 and leave room only for the title.
map_view = dict(center=dict(lat=54.0, lon=-2.0), zoom=6)
figure_margin = {"r": 0, "t": 50, "l": 0, "b": 0}
fig.update_layout(map=map_view, margin=figure_margin)

fig.show()

In [None]:
# Optional folium version of the delay map, disabled by default.
# Set SKIP_CELL = False to run it; do not try it in a restricted environment
# such as Codespaces.
SKIP_CELL = True
if not SKIP_CELL:

    import pandas as pd
    import folium
    from folium.plugins import MarkerCluster
    from IPython.display import display

    # Collect the delay points computed by the delay-data cell.
    df = delay_data.toPandas()

    # Drop nulls in lat/lon
    df = df.dropna(subset=["LATITUDE", "LONGITUDE"])

    # Ensure lat/lon are numeric
    df["LATITUDE"] = df["LATITUDE"].astype(float)
    df["LONGITUDE"] = df["LONGITUDE"].astype(float)

    # Create a map centered roughly over the UK
    m = folium.Map(location=[54.0, -2.0], zoom_start=6)

    # Add one clustered marker per train delay
    marker_cluster = MarkerCluster().add_to(m)

    for latitude, longitude in zip(df["LATITUDE"], df["LONGITUDE"]):
        folium.Marker(
            location=[latitude, longitude],
            popup="Train Delay"
        ).add_to(marker_cluster)

    # Display the map in Jupyter. A bare `m` is not rendered here because it
    # sits inside the `if` block rather than being the cell's last top-level
    # expression, so display() is called explicitly.
    display(m)