In [None]:
# Import python packages
import altair as alt
import pandas as pd
import streamlit as st

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
# Handle to the currently active Snowpark session, kept at module level so later cells can use it.
session = get_active_session()


# Set User-Specific Variables

This section defines variables for your username, role, database, and schema. These variables will be used throughout the notebook to ensure all operations are performed in your dedicated environment.

In [None]:
# Workshop user number, stored as a string. Replace the placeholder before running:
# the next SQL cell substitutes this value into the statement that builds USERNAME.
usernum = str('<INSERT USER NUMBER>')

In [None]:
-- Build the per-user name by appending the Python variable usernum to the HOL_USER_ prefix,
-- then echo it so the value can be checked before it is used.
-- NOTE(review): the substituted value is pasted in unquoted, so it is parsed as SQL
-- (presumably a number; a leading zero would be lost) -- confirm the expected format.
SET USERNAME = 'HOL_USER_' || {{usernum}};
SELECT $USERNAME;

In [None]:
-- Derive the role and database names from USERNAME; the schema name is fixed to GOLD.
-- These session variables are consumed by the USE statements in the next cell.
SET HOLROLE = $USERNAME || '_FULL_ROLE';
SET DB_NAME = $USERNAME || '_DB';
SET SCHEMANAME = 'GOLD';

In [None]:
-- Switch the session context to the user's role, database and schema.
-- IDENTIFIER() lets the object names come from the session variables set above.
USE ROLE IDENTIFIER($HOLROLE);
USE DATABASE IDENTIFIER($DB_NAME);
USE SCHEMA IDENTIFIER($SCHEMANAME);

In [None]:
-- Movement counts per 30-minute bucket and variation status, anchored on the most
-- recent ACTUAL_TIMESTAMP found in TRAIN_MOVEMENTS.
-- NOTE(review): ACTUAL_TIMESTAMP is divided by 1000 before conversion, so it is
-- presumably stored as epoch milliseconds -- confirm.
WITH latest AS (
  -- Newest movement time; every bucket start is an offset back from this instant.
  SELECT TO_TIMESTAMP_NTZ(MAX(ACTUAL_TIMESTAMP)/1000) AS max_ts
  FROM TRAIN_MOVEMENTS
  WHERE ACTUAL_TIMESTAMP IS NOT NULL
),
bucket_offsets AS (
  -- Gap-free sequence 0..47, one entry per half-hour bucket.
  SELECT ROW_NUMBER() OVER (ORDER BY SEQ4()) - 1 AS seq
  FROM TABLE(GENERATOR(ROWCOUNT => 48))
),
time_spine AS (
  -- Bucket start times: max_ts, max_ts - 30 min, ..., max_ts - 23.5 h.
  SELECT DATEADD(MINUTE, -seq * 30, (SELECT max_ts FROM latest)) AS time_bucket
  FROM bucket_offsets
)
SELECT
  spine.time_bucket,
  mv.VARIATION_STATUS AS status,
  COUNT(*) AS arrival_count
FROM time_spine AS spine
LEFT JOIN TRAIN_MOVEMENTS AS mv
  -- A movement belongs to the half-open interval [time_bucket, time_bucket + 30 min).
  ON TO_TIMESTAMP_NTZ(mv.ACTUAL_TIMESTAMP/1000) >= spine.time_bucket
 AND TO_TIMESTAMP_NTZ(mv.ACTUAL_TIMESTAMP/1000) < DATEADD(MINUTE, 30, spine.time_bucket)
 AND TO_TIMESTAMP_NTZ(mv.ACTUAL_TIMESTAMP/1000) >= DATEADD(HOUR, -24, (SELECT max_ts FROM latest))
-- This filter on the joined table removes spine rows that found no movement,
-- so buckets without data do not appear in the result.
WHERE mv.ACTUAL_TIMESTAMP IS NOT NULL
GROUP BY spine.time_bucket, mv.VARIATION_STATUS
ORDER BY spine.time_bucket, mv.VARIATION_STATUS;


In [None]:
-- Movement counts per 30-minute bucket and variation status, anchored on the most
-- recent ACTUAL_TIMESTAMP found in TRAIN_MOVEMENTS.
-- NOTE(review): ACTUAL_TIMESTAMP is divided by 1000 before conversion, so it is
-- presumably stored as epoch milliseconds -- confirm.
WITH latest AS (
  -- Newest movement time; every bucket start is an offset back from this instant.
  SELECT TO_TIMESTAMP_NTZ(MAX(ACTUAL_TIMESTAMP)/1000) AS max_ts
  FROM TRAIN_MOVEMENTS
  WHERE ACTUAL_TIMESTAMP IS NOT NULL
),
bucket_offsets AS (
  -- Gap-free sequence 0..47, one entry per half-hour bucket.
  SELECT ROW_NUMBER() OVER (ORDER BY SEQ4()) - 1 AS seq
  FROM TABLE(GENERATOR(ROWCOUNT => 48))
),
time_spine AS (
  -- Bucket start times: max_ts, max_ts - 30 min, ..., max_ts - 23.5 h.
  SELECT DATEADD(MINUTE, -seq * 30, (SELECT max_ts FROM latest)) AS time_bucket
  FROM bucket_offsets
)
SELECT
  spine.time_bucket,
  mv.VARIATION_STATUS AS status,
  COUNT(*) AS arrival_count
FROM time_spine AS spine
LEFT JOIN TRAIN_MOVEMENTS AS mv
  -- A movement belongs to the half-open interval [time_bucket, time_bucket + 30 min).
  ON TO_TIMESTAMP_NTZ(mv.ACTUAL_TIMESTAMP/1000) >= spine.time_bucket
 AND TO_TIMESTAMP_NTZ(mv.ACTUAL_TIMESTAMP/1000) < DATEADD(MINUTE, 30, spine.time_bucket)
 AND TO_TIMESTAMP_NTZ(mv.ACTUAL_TIMESTAMP/1000) >= DATEADD(HOUR, -24, (SELECT max_ts FROM latest))
-- This filter on the joined table removes spine rows that found no movement,
-- so buckets without data do not appear in the result.
WHERE mv.ACTUAL_TIMESTAMP IS NOT NULL
GROUP BY spine.time_bucket, mv.VARIATION_STATUS
ORDER BY spine.time_bucket, mv.VARIATION_STATUS;

In [None]:
# Stacked bar chart of movement counts per 30-minute bucket, split by variation status.
# `station_arrivals` is presumably the result of the SQL cell of that name -- confirm.
# Convert it to pandas once; the result is reused for both the pivot and the chart.
df = station_arrivals.to_pandas()

# --- HANDLE NULL STATUSES
# Done before pivoting so a missing status becomes a real 'UNKNOWN' column
# rather than a NaN column label.
df['STATUS'] = df['STATUS'].fillna('UNKNOWN')

# --- PREPARE FOR ALTAIR
df['TIME_BUCKET'] = pd.to_datetime(df['TIME_BUCKET'])

# Wide view (one column per status, 0 where a status has no rows in a bucket), sorted by time.
df_pivot = (
    df.pivot(index='TIME_BUCKET', columns='STATUS', values='ARRIVAL_COUNT')
    .fillna(0)
    .sort_index()
)

# Altair needs "long" format (already is)
chart = alt.Chart(df).mark_bar().encode(
    x=alt.X('TIME_BUCKET:T', title='Time (30-min buckets)'),
    y=alt.Y('ARRIVAL_COUNT:Q', stack='zero', title='Arrival Count'),
    color=alt.Color('STATUS:N', title='Variation Status'),
    tooltip=['TIME_BUCKET:T', 'STATUS:N', 'ARRIVAL_COUNT:Q']
).properties(
    title='Train Movements by Variation Status (Last 24h)',
    width=900,
    height=400
)

# --- DISPLAY IN STREAMLIT
st.title("Train Arrival Variations")
st.altair_chart(chart, use_container_width=True)

In [None]:
-- Movement counts per 30-minute bucket, location name and variation status.
-- Buckets with no movements are kept (LEFT JOIN from the time spine) and report a count of 0.
-- NOTE(review): ACTUAL_TIMESTAMP is divided by 1000 before conversion, so it is
-- presumably stored as epoch milliseconds -- confirm.
WITH max_time AS (
  -- Newest movement time; every bucket start is an offset back from this instant.
  SELECT
    TO_TIMESTAMP_NTZ(MAX(ACTUAL_TIMESTAMP/1000)) AS max_ts
  FROM
    TRAIN_MOVEMENTS
  WHERE
    ACTUAL_TIMESTAMP IS NOT NULL
),
time_spine AS (
  -- Bucket start times: max_ts, max_ts - 30 min, ..., max_ts - 23.5 h.
  SELECT
    DATEADD(MINUTE, -seq * 30, (SELECT max_ts FROM max_time)) AS time_bucket
  FROM (
    SELECT
      ROW_NUMBER() OVER (ORDER BY SEQ4()) - 1 AS seq
    FROM
      TABLE(GENERATOR(ROWCOUNT => 48))  -- 48 half-hour buckets
  )
),
movements_with_name AS (
  -- Movements with a converted timestamp and the location name looked up by STANOX.
  -- NOTE(review): if LOCATIONS_RAW holds several rows per STANOX this join multiplies
  -- movements -- verify the key is unique.
  SELECT
    TO_TIMESTAMP_NTZ(m.ACTUAL_TIMESTAMP/1000) AS actual_ts,
    m.VARIATION_STATUS,
    l.NAME AS location_name
  FROM
    TRAIN_MOVEMENTS m
    LEFT JOIN BRONZE.LOCATIONS_RAW l
      ON m.LOC_STANOX = l.STANOX
  WHERE
    m.ACTUAL_TIMESTAMP IS NOT NULL
)
SELECT
  t.time_bucket,
  m.location_name,
  m.VARIATION_STATUS AS status,
  -- Count a joined column instead of COUNT(*): the LEFT JOIN emits one all-NULL row for
  -- a bucket without movements, which COUNT(*) would report as 1 instead of 0.
  COUNT(m.actual_ts) AS arrival_count
FROM
  time_spine AS t
  LEFT JOIN movements_with_name AS m
    ON m.actual_ts >= t.time_bucket
   AND m.actual_ts < DATEADD(MINUTE, 30, t.time_bucket)
   AND m.actual_ts >= DATEADD(HOUR, -24, (SELECT max_ts FROM max_time))
GROUP BY
  1, 2, 3
ORDER BY
  1, 2, 3;


In [None]:
-- Delayed-movement counts per 30-minute bucket for the 10 locations with the most
-- delayed movements in the 24 hours before the newest ACTUAL_TIMESTAMP.
-- NOTE(review): ACTUAL_TIMESTAMP is divided by 1000 before conversion, so it is
-- presumably stored as epoch milliseconds -- confirm.
WITH max_time AS (
    -- Newest movement time; defines the end of the 24-hour window.
    SELECT TO_TIMESTAMP_NTZ(MAX(ACTUAL_TIMESTAMP)/1000) AS max_ts
    FROM TRAIN_MOVEMENTS
    WHERE ACTUAL_TIMESTAMP IS NOT NULL
),
converted_data AS (
    -- Movements inside the window, with the timestamp converted once.
    SELECT
        LOC_STANOX,
        TO_TIMESTAMP_NTZ(ACTUAL_TIMESTAMP/1000) AS actual_ts,
        TIMETABLE_VARIATION,
        LATE_IND
    FROM TRAIN_MOVEMENTS, max_time
    WHERE ACTUAL_TIMESTAMP IS NOT NULL
      AND TO_TIMESTAMP_NTZ(ACTUAL_TIMESTAMP/1000) >= DATEADD(HOUR, -24, max_ts)
),
bucketed_data AS (
    SELECT
        LOC_STANOX,
        -- Floor to the containing half hour: truncate to the minute, then step back by
        -- (minute mod 30). Truncating to the HOUR first would put 10:17 in 09:43 instead
        -- of 10:00, so the buckets would not be 30-minute buckets.
        DATEADD(
            MINUTE,
            -MOD(DATE_PART('MINUTE', actual_ts), 30),
            DATE_TRUNC('MINUTE', actual_ts)
        ) AS time_bucket,
        COUNT(*) AS delay_count
    FROM converted_data
    -- NOTE(review): a movement counts as delayed when either condition holds; confirm
    -- that TIMETABLE_VARIATION > 0 cannot also describe early running.
    WHERE TIMETABLE_VARIATION > 0 OR LATE_IND = 1
    GROUP BY LOC_STANOX, time_bucket
),
top_stations AS (
    -- The 10 locations with the highest total across all buckets.
    SELECT
        LOC_STANOX,
        SUM(delay_count) AS total_delay_count
    FROM bucketed_data
    GROUP BY LOC_STANOX
    ORDER BY total_delay_count DESC
    LIMIT 10
)
SELECT
    b.time_bucket,
    b.LOC_STANOX,
    b.delay_count,
    l.DESCRIPTION
FROM bucketed_data b
-- NOTE(review): if LOCATIONS_RAW holds several rows per STANOX this join repeats
-- bucket rows -- verify the key is unique.
LEFT JOIN BRONZE.LOCATIONS_RAW l on b.LOC_STANOX = l.STANOX
JOIN top_stations t ON b.LOC_STANOX = t.LOC_STANOX
ORDER BY b.time_bucket, b.LOC_STANOX;


In [None]:
import streamlit as st
import pandas as pd
import altair as alt

# Bar chart of delay counts per 30-minute bucket, coloured by station.
# `top_stations` is presumably the result of the SQL cell of that name -- confirm.
df = (
    top_stations.to_pandas()
    # Lower-case exactly the four expected columns; any other column is left as is.
    .rename(columns={
        name: name.lower()
        for name in ('TIME_BUCKET', 'LOC_STANOX', 'DELAY_COUNT', 'DESCRIPTION')
    })
    # Make sure the bucket column is a datetime for the temporal axis.
    .assign(time_bucket=lambda frame: pd.to_datetime(frame['time_bucket']))
)

# Optional: show table to debug
st.dataframe(df)

# Bars for the same bucket stack by default, one colour per station.
chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('time_bucket:T', title='Time (30-min buckets)'),
        y=alt.Y('delay_count:Q', title='Delay Count'),
        color=alt.Color('loc_stanox:N', title='Station'),
        tooltip=['time_bucket:T', 'loc_stanox:N', 'delay_count:Q'],
    )
    .properties(
        width=800,
        height=400,
        title="Top Stations by Delay (per 30-min buckets)",
    )
)

st.altair_chart(chart)


In [None]:
-- List the columns of the locations lookup table that the surrounding queries join to.
DESCRIBE TABLE BRONZE.LOCATIONS_RAW;

In [None]:
-- Late movements (LATE_IND = 1) that carry coordinates, from the 24 hours before the
-- newest ACTUAL_TIMESTAMP, with location details looked up by STANOX.
-- NOTE(review): ACTUAL_TIMESTAMP is divided by 1000 before conversion, so it is
-- presumably stored as epoch milliseconds -- confirm.
WITH latest AS (
  -- Newest movement time; defines the end of the 24-hour window.
  SELECT TO_TIMESTAMP_NTZ(MAX(ACTUAL_TIMESTAMP/1000)) AS max_ts
  FROM TRAIN_MOVEMENTS
  WHERE ACTUAL_TIMESTAMP IS NOT NULL
)
SELECT
  mv.LOC_STANOX,
  mv.LATE_IND,
  -- Coordinates are read from the lat / long keys of the MVT_LAT_LON value.
  mv.MVT_LAT_LON:lat::FLOAT AS latitude,
  mv.MVT_LAT_LON:long::FLOAT AS longitude,
  loc.NAME AS station_name,
  loc.tiploc,
  loc.DESCRIPTION
FROM TRAIN_MOVEMENTS AS mv
LEFT JOIN BRONZE.LOCATIONS_RAW AS loc
  ON mv.LOC_STANOX = loc.STANOX
-- Single-row CTE, cross joined so max_ts is available to the window filter.
CROSS JOIN latest
WHERE mv.LATE_IND = 1
  AND mv.ACTUAL_TIMESTAMP IS NOT NULL
  AND mv.MVT_LAT_LON IS NOT NULL
  AND TO_TIMESTAMP_NTZ(mv.ACTUAL_TIMESTAMP/1000) >= DATEADD(HOUR, -24, latest.max_ts);


In [None]:
import streamlit as st
import pandas as pd
import streamlit as st
import pandas as pd
import pydeck as pdk

# Map of late train movements.
# `delayed` is presumably the result of the SQL cell of that name -- confirm.
# Rows without coordinates are dropped, then both coordinate columns are cast to float.
df = (
    delayed.to_pandas()
    .dropna(subset=["LATITUDE", "LONGITUDE"])
    .astype({"LATITUDE": float, "LONGITUDE": float})
)

# Display the map using only the coordinate columns
st.subheader("Train delays in the UK (last 24 hours)")
st.map(df[["LATITUDE", "LONGITUDE"]])

