In [0]:
import dlt
from pyspark.sql.functions import (
    abs, col, concat, date_format, datediff, dayofmonth, dayofweek, 
    dayofyear, expr, first, hash, last_day, lpad, lit, month, quarter, 
    row_number, sum, weekofyear, when, year
)

from pyspark.sql.types import BooleanType, StringType, IntegerType
from pyspark.sql.window import Window

In [0]:
# Pipeline settings, looked up in the Spark configuration one key at a time
# (same order as the names on the left-hand side).
catalog_name, schema_name, landing_volume_name = (
    spark.conf.get(conf_key)
    for conf_key in ("catalog_name", "schema_name", "landing_volume_name")
)

## Dim Time of Day

In [0]:
@dlt.table(
    name="gold_dim_time_of_day",
    comment="Time of day dimension table",
    table_properties={
        "quality": "gold"
    }
)
def gold_dim_time_of_day():
    """Build the time-of-day dimension with one row per minute of the day.

    The table is generated, not derived from source data: it enumerates
    minute_of_day 0..1439 and derives every other attribute from that value.
    second_of_minute is always 0, so every time_of_day_key ends in "00".

    Returns:
        A DataFrame with the columns, in order: minute_of_day, hour_of_day,
        minute_of_hour, second_of_minute, time_string (HH:MM:SS),
        time_of_day_key (HHMMSS cast to integer), period_of_day, am_pm,
        hour_12_str, time_12_hour, part_of_hour, is_business_hour,
        is_rush_hour.
    """
    # Enforce dependency on table: the returned DataFrame is deliberately
    # discarded, the call exists only so this table is tied to silver_bike_point.
    dlt.read("silver_bike_point")

    # All functions/types used below (col, concat, lit, lpad, when,
    # IntegerType, StringType) come from the imports at the top of the file.
    hour_of_day = col("hour_of_day")
    minute_of_hour = col("minute_of_hour")

    # Base range: 1440 minutes in a day (24 * 60).
    minutes = spark.sql(
        "SELECT explode(sequence(0, 1439, 1)) as minute_of_day"
    )

    return (minutes
        # "/" yields a double; the integer cast truncates it to the hour.
        .withColumn("hour_of_day", (col("minute_of_day") / 60).cast(IntegerType()))
        .withColumn("minute_of_hour", col("minute_of_day") % 60)
        .withColumn("second_of_minute", lit(0))

        # Zero-padded two-character parts (temporary, dropped at the end).
        .withColumn("hour_str", lpad(hour_of_day.cast(StringType()), 2, "0"))
        .withColumn("minute_str", lpad(minute_of_hour.cast(StringType()), 2, "0"))
        .withColumn("second_str", lpad(col("second_of_minute").cast(StringType()), 2, "0"))

        # Time string in format HH:MM:SS
        .withColumn("time_string", concat(
            col("hour_str"), lit(":"), col("minute_str"), lit(":"), col("second_str")
        ))

        # time_of_day_key in format HHMMSS, stored as an integer
        # (so midnight is 0 and 09:05 is 90500).
        .withColumn("time_of_day_key", concat(
            col("hour_str"), col("minute_str"), col("second_str")
        ).cast(IntegerType()))

        # Period of day: 05-11 Morning, 12-16 Afternoon, 17-20 Evening, else Night.
        .withColumn("period_of_day",
            when((hour_of_day >= 5) & (hour_of_day < 12), lit("Morning"))
            .when((hour_of_day >= 12) & (hour_of_day < 17), lit("Afternoon"))
            .when((hour_of_day >= 17) & (hour_of_day < 21), lit("Evening"))
            .otherwise(lit("Night"))
        )

        # AM/PM indicator
        .withColumn("am_pm", when(hour_of_day < 12, lit("AM")).otherwise(lit("PM")))

        # 12-hour clock: hours 0 and 12 are both shown as 12.
        .withColumn("hour_12", when(hour_of_day % 12 == 0, lit(12))
                             .otherwise(hour_of_day % 12))
        .withColumn("hour_12_str", lpad(col("hour_12").cast(StringType()), 2, "0"))
        .withColumn("time_12_hour", concat(
            col("hour_12_str"), lit(":"), col("minute_str"), lit(" "), col("am_pm")
        ))

        # Quarter of the hour, in 15-minute buckets.
        .withColumn("part_of_hour",
            when(minute_of_hour < 15, lit("First Quarter"))
            .when(minute_of_hour < 30, lit("Second Quarter"))
            .when(minute_of_hour < 45, lit("Third Quarter"))
            .otherwise(lit("Fourth Quarter"))
        )

        # Business hours: 09:00 up to (not including) 17:00.
        .withColumn("is_business_hour",
            when((hour_of_day >= 9) & (hour_of_day < 17), lit(True))
            .otherwise(lit(False))
        )

        # Rush hours: 07:00-08:59 and 16:00-17:59.
        .withColumn("is_rush_hour",
            when(((hour_of_day >= 7) & (hour_of_day < 9)) |
                 ((hour_of_day >= 16) & (hour_of_day < 18)), lit(True))
            .otherwise(lit(False))
        )

        # Clean up temporary columns (hour_12_str is kept in the output).
        .drop("hour_str", "minute_str", "second_str", "hour_12")
    )

## Dim Date

In [0]:

@dlt.table(
    name="gold_dim_date",
    comment="Date dimension table",
    table_properties={
        "quality": "gold"
    }
)
def gold_dim_date():
    """Build the date dimension with one row per calendar day.

    The table is generated, not derived from source data. Only the fixed
    range 2023-01-01 through 2023-01-31 is produced.

    Returns:
        A DataFrame with the columns, in order: date, date_key, year,
        month_num, day_num, day_of_week, week_of_year, day_of_year,
        month_name, month_short_name, day_name, day_short_name, quarter,
        year_month, year_month_num, last_day_of_month, first_day_of_month,
        day_of_month, is_weekend, is_weekday, season.
        date_key is the date formatted as a "yyyyMMdd" STRING.
    """
    # Enforce dependency on table: the returned DataFrame is deliberately
    # discarded, the call exists only so this table is tied to silver_bike_point.
    dlt.read("silver_bike_point")

    # Base date range (both bounds inclusive).
    # NOTE(review): the range is hard-coded to January 2023; dates outside it
    # get no row in this dimension - confirm that is intended.
    days = spark.sql(
        "SELECT explode(sequence(to_date('2023-01-01'), to_date('2023-01-31'), interval 1 day)) as date"
    )

    month_start = expr("trunc(date, 'MM')")
    # Spark's dayofweek numbers Sunday as 1 and Saturday as 7.
    falls_on_weekend = dayofweek("date").isin(1, 7)
    month_of_date = month("date")

    return (days
        # Previously this key was first built as an integer and then silently
        # overwritten by a string concatenation of year/month/day; it is now
        # computed once, with the same resulting value, type and position.
        .withColumn("date_key", date_format("date", "yyyyMMdd"))
        .withColumn("year", year("date"))
        .withColumn("month_num", month_of_date)
        .withColumn("day_num", dayofmonth("date"))
        .withColumn("day_of_week", dayofweek("date"))
        .withColumn("week_of_year", weekofyear("date"))
        .withColumn("day_of_year", dayofyear("date"))
        .withColumn("month_name", date_format("date", "MMMM"))
        .withColumn("month_short_name", date_format("date", "MMM"))
        .withColumn("day_name", date_format("date", "EEEE"))
        .withColumn("day_short_name", date_format("date", "EEE"))
        .withColumn("quarter", quarter("date"))
        .withColumn("year_month", date_format("date", "yyyy-MM"))
        .withColumn("year_month_num", date_format("date", "yyyyMM"))
        .withColumn("last_day_of_month", last_day("date"))
        .withColumn("first_day_of_month", month_start)
        # Days elapsed since the first of the month, 1-based.
        .withColumn("day_of_month", datediff("date", month_start) + 1)
        .withColumn("is_weekend", when(falls_on_weekend, lit(True)).otherwise(lit(False)))
        .withColumn("is_weekday", when(falls_on_weekend, lit(False)).otherwise(lit(True)))
        # Season by month: Mar-May Spring, Jun-Aug Summer, Sep-Nov Fall, else Winter.
        .withColumn("season", when(month_of_date.isin(3, 4, 5), lit("Spring"))
                        .when(month_of_date.isin(6, 7, 8), lit("Summer"))
                        .when(month_of_date.isin(9, 10, 11), lit("Fall"))
                        .otherwise(lit("Winter")))
    )

## Dim Bikepoint


In [0]:
@dlt.table(
    name="gold_dim_bikepoint",
    comment="Bikepoint dimension table",
    table_properties={
        "quality": "gold"
    }
)
def create_gold_dim_bikepoint():
    """Build the bikepoint dimension: the most recent silver row per bikepoint.

    Rows of silver_bike_point are ranked per bikepoint_id by landing_timestamp
    (newest first) and only the top-ranked row is kept. bikepoint_id itself is
    replaced by the surrogate bikepoint_key = abs(hash(bikepoint_id)).

    Returns:
        A DataFrame with the silver columns except bikepoint_id, plus
        row_number and bikepoint_key.
    """
    # Read the silver table through the pipeline (as the other gold tables in
    # this file do) instead of a hard-coded <catalog>.<schema> name, so the
    # pipeline is not tied to one environment.
    silver = dlt.read("silver_bike_point")

    # NOTE(review): rows of one bikepoint sharing the same landing_timestamp
    # are ranked in an unspecified order.
    newest_first = (
        Window.partitionBy("bikepoint_id")
        .orderBy(col("landing_timestamp").desc())
    )

    # `abs` and `hash` are the pyspark.sql.functions versions imported at the
    # top of the file, not the Python builtins.
    # NOTE(review): the helper column row_number (always 1 after the filter)
    # is still part of the output, as before - drop it if nothing reads it.
    return (
        silver
        .withColumn("row_number", row_number().over(newest_first))
        .filter(col("row_number") == 1)
        .withColumn("bikepoint_key", abs(hash(col("bikepoint_id"))))
        .drop("bikepoint_id")
    )

## Fact Bike Utilization



In [0]:
@dlt.table(
    name="gold_fact_bike_utilization_total",
    comment="The total amount of bikes which are used in a specific time",
    table_properties={"quality": "gold"},
)
def create_gold_fact_bike_utilization_total():
    """Build the bike-utilization fact table from silver_bike_point.

    The count columns are summed per (bikepoint_id, landing_timestamp), then
    the two grouping columns are replaced by dimension keys:

    * date_key        - landing_timestamp formatted as "yyyyMMdd" (string)
    * time_of_day_key - landing_timestamp formatted as "HHmmss", cast to integer
    * bikepoint_key   - abs(hash(bikepoint_id))

    Returns:
        A DataFrame with the columns, in order: date_key, time_of_day_key,
        bikepoint_key, empty_dock_count, dock_count, bike_count,
        standard_bike_count, ebike_count.
    """
    # Read the silver table through the pipeline (as the dimension tables in
    # this file do) instead of a hard-coded <catalog>.<schema> name, so the
    # pipeline is not tied to one environment.
    silver = dlt.read("silver_bike_point")

    measures = (
        "empty_dock_count",
        "dock_count",
        "bike_count",
        "standard_bike_count",
        "ebike_count",
    )

    # `sum`, `abs` and `hash` are the pyspark.sql.functions versions imported
    # at the top of the file, not the Python builtins.
    totals = silver.groupBy("bikepoint_id", "landing_timestamp").agg(
        *[sum(col(measure)).alias(measure) for measure in measures]
    )

    landed_at = col("landing_timestamp")
    return totals.select(
        date_format(landed_at, "yyyyMMdd").alias("date_key"),
        # NOTE(review): this key keeps the seconds, while gold_dim_time_of_day
        # in this file only generates keys whose seconds are 00 - a timestamp
        # with non-zero seconds has no matching dimension row. Confirm that
        # landing timestamps are minute-aligned.
        date_format(landed_at, "HHmmss").cast("integer").alias("time_of_day_key"),
        abs(hash(col("bikepoint_id"))).alias("bikepoint_key"),
        *measures,
    )