In [0]:
import re
from datetime import datetime

def extract_date_from_string(input_string):
    """Extract the first ``20YY[-_]MM[-_]DD`` date embedded in a string.

    The year must start with ``20``; the month and day may be separated from
    the year and from each other by ``-``, ``_`` or nothing at all
    (e.g. ``2024-01-15``, ``2024_01_15``, ``20240115``).

    Args:
        input_string: Text to search, typically a file name. ``None`` (for
            example a null value when this function is used as a UDF) and
            other non-string values are treated as containing no date.

    Returns:
        A ``datetime.date`` for the first match, or ``None`` when there is no
        match or the matched digits are not a real calendar date
        (e.g. ``20240230``).
    """
    # re.search raises TypeError on None / non-string input; report "no date"
    # instead of failing the caller.
    if not isinstance(input_string, str):
        return None

    match = re.search(
        r'20\d\d[-_]?(?:0[1-9]|1[0-2])[-_]?(?:0[1-9]|[12]\d|3[01])',
        input_string
    )
    if match is None:
        return None

    # Normalise every accepted separator style down to plain YYYYMMDD.
    date_str = match.group(0).replace("-", "").replace("_", "")
    try:
        return datetime.strptime(date_str, "%Y%m%d").date()
    except ValueError:
        # The pattern allows day 01-31 for every month, so impossible dates
        # such as Feb 30 reach strptime and are rejected here.
        return None

In [0]:
# Import modules
from pyspark import pipelines as dp
from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType, IntegerType, StringType, StructType, StructField,DateType


# Directory the pipeline reads its incoming source files from
incoming_dir = "/Volumes/demos_standard/public_data/public_incoming/"
# The source path is the incoming directory itself
file_path = incoming_dir

# Nullable string columns, listed in schema order.
_string_columns = (
    "ident",
    "type",
    "name",
    "elevation_ft",
    "continent",
    "iso_country",
    "iso_region",
    "municipality",
    "gps_code",
    "iata_code",
    "local_code",
    "coordinates",
    "source_file_name",
)

# Airports layout: every column is a nullable string except the trailing
# nullable `source_file_date`, which is a date.
# NOTE(review): `schema` is not referenced by the code visible in this
# section - confirm whether a reader is expected to apply it.
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in _string_columns]
    + [StructField("source_file_date", DateType(), True)]
)

@dp.table(
  comment="Raw data of airports to kick the tires."
)
def airports_raw():
  """Read the incoming airport CSV files and tag each row with its source file.

  Returns the CSV contents (header row, comma separated) read from
  `file_path`, plus two added columns:
    * `source_file_name` - last path segment of `_metadata.file_path`
    * `source_file_date` - date parsed out of that file name by
      `extract_date_from_string` (null when no date is found)
  """
  # Everything after the final "/" of the file path is the file name.
  file_name = regexp_extract(col("_metadata.file_path"), r'([^/]+)$', 1)
  # Wrap the plain-Python date parser so it can be applied to a column.
  file_date_udf = udf(extract_date_from_string, DateType())

  airports_csv = (
    spark.read
      .option("header", "true")
      .option("sep", ",")
      .csv(file_path)
  )
  with_file_name = airports_csv.withColumn("source_file_name", file_name)
  return with_file_name.withColumn(
    "source_file_date",
    file_date_udf(col("source_file_name"))
  )


# Materialized view over airports_raw with presentation-style column names
@dp.materialized_view(
  comment="view of airports"
)
@dp.expect("ID valid", "ID IS NOT NULL")
def airports_prepared():
  """Rename the core airport columns and return only those columns.

  Reads the `airports_raw` table, renames each source column to its
  presentation name, and selects the renamed columns in mapping order.
  The `ID valid` expectation checks the renamed `ID` column.
  """
  # source column -> output column; insertion order is the output column order
  renames = {
    "ident": "ID",
    "type": "AirportType",
    "name": "AirportName",
    "elevation_ft": "ElevationFt",
    "continent": "Continent",
    "iso_country": "Country",
    "iso_region": "Region",
    "municipality": "Municipality",
  }

  airports = spark.read.table("airports_raw")
  for old_name, new_name in renames.items():
    airports = airports.withColumnRenamed(old_name, new_name)

  return airports.select(*renames.values())
  


```
# Define a materialized view that shows it works
@dp.materialized_view(
  comment="view of airports"
)
# The expectation must name a column of the returned DataFrame. `ident` is
# renamed to `ID` below, so the constraint checks `ID` rather than `ident`.
@dp.expect("ID valid", "ID IS NOT NULL")
def airports_prepared():
  # Read the raw airports table, rename `ident` to `ID`, and keep the core
  # descriptive columns under their original source names.
  return (
    spark.read.table("airports_raw")
      .withColumnRenamed("ident", "ID")
      .select("ID", "type", "name", "elevation_ft", "continent", "iso_country", "iso_region", "municipality")
  )
  ```