In [0]:
# Notebook parameter: free-text label for the origin of this load (defaults to empty).
# Its value is stamped onto every row as the `data_source` column further down.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Incremental-load parameter: names the dated sub-folder of the raw path to ingest
# and is also recorded on every row as `file_date`. Defaults to "2021-03-21".
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../../Includes/Configuration"

In [0]:
%run "../../Includes/Common Functions"

##### Step 1 - Read the JSON file using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType, FloatType

In [0]:
# Explicit schema for the raw qualifying JSON, so the reader does not have to infer types.
# Identifier, number and position fields are integers; q1-q3 are read as plain strings.
qualifying_schema = (
    StructType()
    .add("qualifyId", IntegerType(), False)
    .add("raceId", IntegerType(), True)
    .add("driverId", IntegerType(), True)
    .add("constructorId", IntegerType(), True)
    .add("number", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("q1", StringType(), True)
    .add("q2", StringType(), True)
    .add("q3", StringType(), True)
)

In [0]:
# Read the qualifying JSON from the raw path for the selected file date.
# multiLine=True allows a single JSON record to span several lines.
qualifying_df = (
    spark.read
    .schema(qualifying_schema)
    .option("multiLine", True)
    .json(f"{raw_folder_path}/{v_file_date}/qualifying")
)
         

##### Step 2 - Rename columns and add new columns
1. Rename qualifyId, driverId, constructorId and raceId to snake_case
1. Add ingestion_date with current timestamp
1. Add data_source and file_date from the notebook widgets


In [0]:
from pyspark.sql.functions import current_timestamp, lit

In [0]:
# Rename the camelCase id columns to snake_case, then append the audit columns:
# ingestion timestamp, data-source label and file date.
final_df = (
    qualifying_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))  # value of the p_file_date widget
)
                                    

##### Step 3 - Write the output to the processed container by merging into the `f1_processed.qualifying` Delta table

In [0]:
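# Earlier write strategies, kept for reference only (the active write is the merge in the next cell):
# final_df.write.mode("overwrite").format("parquet").saveAsTable("f1_processed.qualifying")
#overwrite_partition(final_df,'f1_processed','qualifying','race_id')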

In [0]:
# Match rows on qualify_id and race_id. The tgt/src aliases are presumably the ones
# merge_delta_table assigns to the existing table and the incoming DataFrame
# (the helper comes from the Common Functions include) - confirm there.
merge_condition = (
    "tgt.qualify_id = src.qualify_id "
    "AND tgt.race_id = src.race_id"
)
merge_delta_table(
    final_df,
    'f1_processed',
    'qualifying',
    processed_folder_path,
    merge_condition,
    'race_id',
)

In [0]:
# End the run and hand "Success" back as the notebook's exit value, so a calling
# notebook or job can check the outcome.
dbutils.notebook.exit("Success")