In [0]:
import dlt
from pyspark.sql.functions import (
    year, month, dayofmonth, dayofweek, weekofyear, dayofyear, 
    date_format, last_day, datediff, expr, concat, col, lit, lpad, 
    when, quarter, first, row_number
)
from pyspark.sql.types import BooleanType, StringType, IntegerType
from pyspark.sql.window import Window

In [0]:
# Pipeline parameters, each read from the Spark configuration under the key
# that matches the variable name.
catalog_name, schema_name, landing_volume_name = (
    spark.conf.get(conf_key)
    for conf_key in ("catalog_name", "schema_name", "landing_volume_name")
)

## Dim Date

In [0]:

@dlt.table(
    name="gold_dim_date",
    comment="Dimension table for date",
)
def gold_dim_date():
    """Build the date dimension: one row per calendar day plus derived attributes.

    The covered range defaults to 2023-01-01 .. 2023-01-31 (both inclusive). It
    can be widened without a code change through the optional Spark
    configuration keys ``dim_date_start`` and ``dim_date_end`` (``yyyy-MM-dd``).

    Returns:
        DataFrame with, in order: ``date``, ``date_key``, ``year``,
        ``month_num``, ``day_num``, ``day_of_week``, ``week_of_year``,
        ``day_of_year``, ``month_name``, ``month_short_name``, ``day_name``,
        ``day_short_name``, ``quarter``, ``year_month``, ``year_month_num``,
        ``last_day_of_month``, ``first_day_of_month``, ``day_of_month``,
        ``is_weekend``, ``is_weekday``, ``season``.
    """
    # NOTE(review): the default range only spans January 2023 -- confirm this
    # is intended, or set dim_date_start / dim_date_end in the pipeline config.
    range_start = spark.conf.get("dim_date_start", "2023-01-01")
    range_end = spark.conf.get("dim_date_end", "2023-01-31")

    # Base date range: one row per day between the two bounds.
    calendar_df = spark.sql(
        f"SELECT explode(sequence(to_date('{range_start}'), to_date('{range_end}'), interval 1 day)) as date"
    )

    # dayofweek() numbers days 1 (Sunday) .. 7 (Saturday).
    falls_on_weekend = dayofweek("date").isin(1, 7)

    return (calendar_df
        # yyyyMMdd key, deliberately kept as a STRING: the previous version
        # computed an integer key first and then overwrote it with a string
        # concatenation of year/month/day, so consumers already see a string.
        .withColumn("date_key", date_format("date", "yyyyMMdd"))
        .withColumn("year", year("date"))
        .withColumn("month_num", month("date"))
        .withColumn("day_num", dayofmonth("date"))
        .withColumn("day_of_week", dayofweek("date"))
        .withColumn("week_of_year", weekofyear("date"))
        .withColumn("day_of_year", dayofyear("date"))
        .withColumn("month_name", date_format("date", "MMMM"))
        .withColumn("month_short_name", date_format("date", "MMM"))
        .withColumn("day_name", date_format("date", "EEEE"))
        .withColumn("day_short_name", date_format("date", "EEE"))
        .withColumn("quarter", quarter("date"))
        .withColumn("year_month", date_format("date", "yyyy-MM"))
        .withColumn("year_month_num", date_format("date", "yyyyMM"))
        .withColumn("last_day_of_month", last_day("date"))
        .withColumn("first_day_of_month", expr("trunc(date, 'MM')"))
        # Same value as day_num; the column is retained so the schema is unchanged.
        .withColumn("day_of_month", dayofmonth("date"))
        .withColumn("is_weekend", falls_on_weekend)
        .withColumn("is_weekday", ~falls_on_weekend)
        # Seasons by calendar month; December through February fall through to Winter.
        .withColumn("season", when(month("date").isin(3, 4, 5), lit("Spring"))
                        .when(month("date").isin(6, 7, 8), lit("Summer"))
                        .when(month("date").isin(9, 10, 11), lit("Fall"))
                        .otherwise(lit("Winter")))
    )

## Dim Bikepoint


In [0]:
@dlt.table(name="gold_dim_bikepoint")
def create_gold_dim_bikepoint():
    """Build the bikepoint dimension from the silver bike point table.

    Rows are ranked per ``bikepoint_id`` by ``landing_timestamp`` descending and
    only the top-ranked row of each bikepoint is kept.

    Returns:
        DataFrame with the silver table's columns, one row per ``bikepoint_id``.
    """
    # Resolve the source from the pipeline configuration instead of a
    # hard-coded personal catalog.
    # NOTE(review): this was "robin_huebner.tfl_analytics.silver_bike_point";
    # confirm catalog_name / schema_name point at the schema holding the silver table.
    source_table = f"{catalog_name}.{schema_name}.silver_bike_point"
    silver_df = spark.read.table(source_table)

    newest_first = Window.partitionBy("bikepoint_id").orderBy(col("landing_timestamp").desc())

    # NOTE(review): rows sharing the same landing_timestamp for a bikepoint are
    # ranked in no defined order; add a tie-breaker column if that can happen.
    return (
        silver_df
        .withColumn("_recency_rank", row_number().over(newest_first))
        .filter(col("_recency_rank") == 1)
        # The rank is only a selection helper and must not reach the gold table.
        .drop("_recency_rank")
    )

In [0]:



# @dlt.table(
#     name="gold_dim_bikepoint", 
#     comment="Gold dimension of bikepoints"
# )
# def gold_dim_bikepoint():
#     return dlt.apply_changes_from_snapshot(
#         target = "gold_dim_bikepoint",
#         source = "silver_dim_bike_point",
#         sequence_by = "processing_time",
#         keys = ["bikepoint_id"],
#         stored_as_scd_type = 1
# )

# @dlt.view(
#     name="gold_dim_bikepoint_latest",
#     comment="Latest bikepoint data based on silver_timestamp",
# )
# def gold_dim_bikepoint_latest():
#     window_spec = (Window.partitionBy("bikepoint_id").orderBy(col("silver_timestamp").desc()))
#     source_df = dlt.read("silver_bike_point")
#     source_df = source_df.withColumn("row_num", row_number().over(window_spec))
#     source_df = source_df.filter(col("row_num") == 1).drop("row_num")
#     return source_df

# @dlt.table(
#     name="gold_dim_bikepoint", 
#     comment="Batch table for gold dimension of bikepoints"
# )
# def gold_dim_bikepoint():
#     return dlt.read("gold_dim_bikepoint_latest")

# def gold_dim_bikepoint_scd():
#         return dlt.apply_changes(
#         target="gold_dim_bikepoint",
#         source="gold_dim_bikepoint_latest",
#         keys=["bikepoint_id"],
#         sequence_by=col("silver_timestamp"),
#         stored_as_scd_type=1
#     )