### **Ingest Players JSON**
Process the `player_file*.json` files into a table in the silver container.

### Configuration

In [1]:
%run /utils/general_functions

In [2]:
# create_mounts is not defined in this notebook (presumably it comes from /utils/general_functions).
# NOTE(review): presumably it mounts the storage containers that later cells read through
# synfs:/<job_id>/mnt/... paths — confirm against the helper's definition.
create_mounts()

In [None]:
# Set the number of partitions Spark SQL uses when shuffling data for joins and aggregations.
# NOTE(review): the value 300 is not explained anywhere in this notebook — confirm it suits the data volume.
spark.conf.set('spark.sql.shuffle.partitions', 300)

### Define Schema

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, TimestampType

# Schema applied when reading the raw player JSON files.
# `id` is the only field declared non-nullable; all other attributes are nullable.
player_schema = (
    StructType()
    .add('id', IntegerType(), False)
    .add('first_name', StringType(), True)
    .add('last_name', StringType(), True)
    .add('position', StringType(), True)
    .add('height_feet', IntegerType(), True)
    .add('height_inches', IntegerType(), True)
    .add('weight_pounds', IntegerType(), True)
)

# Schema of the dimension table: the raw player attributes (same fields, same order
# as player_schema) followed by the slowly-changing-dimension tracking columns.
player_gold_schema = StructType(
    player_schema.fields
    + [
        StructField('is_active', BooleanType(), True),
        StructField('eff_start_date', TimestampType(), True),
        StructField('eff_end_date', TimestampType(), True),
        StructField('player_key', IntegerType(), True),
    ]
)

### Read Player Files

In [23]:
job_id = mssparkutils.env.getJobId()

# Players in bronze container
players_df = spark.read.json(f'synfs:/{job_id}/mnt/bronze/players/*.json', player_schema)

# Players in gold container.
# Decide ONCE whether dim_player is registered. Assigning inside a loop over all tables
# would let any table listed after dim_player replace the loaded data with an empty frame,
# and would leave the variable unset when the database contains no tables at all.
dim_player_exists = any(
    tbl.name == 'dim_player' for tbl in spark.catalog.listTables('prize_picks_gold')
)

if dim_player_exists:
    players_gold_df = spark.read.format('delta').load(f'synfs:/{job_id}/mnt/gold/dim_player')
else:
    # First run: start from an empty frame that already has the dimension's columns
    players_gold_df = spark.createDataFrame([], player_gold_schema)

# New Players: bronze rows with no identical row among the existing dimension rows.
# exceptAll matches columns by POSITION, so after dropping the tracking columns the
# remaining columns must be in the same order as player_schema.
# NOTE(review): the comparison runs against every historical row, not only rows with
# is_active = true, so a player whose attributes revert to an earlier version is not
# detected as changed — confirm whether that is intended.
players_diff_df = players_df.exceptAll(
    players_gold_df.drop('is_active', 'eff_start_date', 'eff_end_date', 'player_key')
)

### Transformations

Get Max player_key From Gold Container

In [24]:
from pyspark.sql.functions import max

# Highest surrogate key already present in the dimension.
# A global max() over an empty DataFrame still returns exactly one row whose value is
# NULL (None in Python), so no separate count() action is needed to guard the lookup.
# `max` here is pyspark.sql.functions.max, imported at the top of this cell.
max_value_df = players_gold_df.agg(max(players_gold_df.player_key))
max_value = max_value_df.collect()[0][0]

# Fall back to 0 so the first generated key is 1 when the dimension is empty
max_player_key = max_value if max_value is not None else 0

Add Date and key attributes

In [25]:
from pyspark.sql.functions import lit, row_number, current_timestamp
from pyspark.sql import Window

# Window without partitioning: row_number() numbers all new rows in ascending id order
spec = Window.orderBy(players_diff_df['id'].asc())

# Placeholder end date given to every newly added row
placeholder_end_date = lit('1900-01-01 00:00:00.000').cast('timestamp')

# Attach the tracking columns; new keys continue after the current maximum key
players_diff_updated_df = (
    players_diff_df
    .withColumn('is_active', lit(True))
    .withColumn('eff_start_date', current_timestamp())
    .withColumn('eff_end_date', placeholder_end_date)
    .withColumn('player_key', row_number().over(spec) + max_player_key)
)

In [26]:
# Use the same key column name ('id') as the new rows so the frames can be combined by name.
# withColumnRenamed leaves the frame unchanged when 'player_id' is not present.
players_gold_df = (
    players_gold_df
    .withColumnRenamed('player_id', 'id')
)

In [27]:
# Existing dimension rows plus the newly detected rows, matched by column name
combined_players_df = (
    players_gold_df
    .unionByName(players_diff_updated_df)
)

In [36]:
# Most recent eff_start_date per player id; the grouping key is aliased to
# 'groupby_id' so it does not clash with 'id' when joined back.
# `max` is pyspark.sql.functions.max, imported in an earlier cell.
max_date_df = (
    combined_players_df
    .groupBy(combined_players_df['id'].alias('groupby_id'))
    .agg(max(combined_players_df['eff_start_date']).alias('max_date'))
)

Mark Rows With A Dimension Change

In [35]:

# A row matches only when it carries the latest eff_start_date for its id
is_latest_version = (
    (combined_players_df['id'] == max_date_df['groupby_id'])
    & (combined_players_df['eff_start_date'] == max_date_df['max_date'])
)

# Left join keeps every row; rows that are not the latest version end up with max_date = NULL
players_scd_df = (
    combined_players_df
    .join(max_date_df, is_latest_version, 'left')
    .drop(max_date_df['groupby_id'])
)

Update Dimensions

In [13]:
from pyspark.sql.functions import when, col 

# max_date is NULL for every row that is not the latest version of its player
is_superseded = col('max_date').isNull()

# Only rows that are still flagged active are being closed by THIS run. Rows closed by an
# earlier run must keep the end date they were given then; stamping every superseded row
# would rewrite historical end dates with the current time on each execution.
is_closed_now = is_superseded & col('is_active')

# eff_end_date is set BEFORE is_active is overwritten, because is_closed_now reads the
# incoming is_active value.
players_final_df = (
    players_scd_df
    .withColumnRenamed('id', 'player_id')
    .withColumn('eff_end_date', when(is_closed_now, current_timestamp()).otherwise(col('eff_end_date')))
    .withColumn('is_active', when(is_superseded, lit(False)).otherwise(col('is_active')))
    .drop('max_date')
)

### Write file as table

In [None]:
%%sql
-- Removes the prize_picks_silver.dim_player table definition if it exists.
-- NOTE(review): the next cell merges into this same table; dropping it first may discard
-- previously merged history — confirm this cell is meant for manual resets only.
DROP TABLE IF EXISTS prize_picks_silver.dim_player;

In [55]:
# Target of the write: container, database, table name and storage format
container, database = 'silver', 'prize_picks_silver'
table, file_format = 'dim_player', 'delta'

# Source rows are matched to target rows on the surrogate key
merge_condition = 'tgt.player_key == src.player_key'

# merge_data is defined outside this notebook.
# NOTE(review): existing rows are read from the gold container / prize_picks_gold earlier
# in this notebook, while the result is written to silver — confirm this is intended.
merge_data(
    players_final_df,
    container,
    database,
    table,
    file_format,
    merge_condition=merge_condition,
)