### **Ingest Stats JSON**
Process the `stat_file*.json` files into a table in the silver container.

### Parameters

In [147]:
# Season to ingest: selects the bronze `season=<value>` folder that is read
# and is the value written to the `season` column of the output.
season = 2021

### Configuration

In [148]:
%run /utils/general_functions

In [149]:
# NOTE(review): create_mounts is not defined in this notebook — presumably it
# comes from /utils/general_functions (loaded via %run) and mounts the storage
# containers that the synfs:/.../mnt/... paths rely on; confirm.
create_mounts()

### Define Schema

In [150]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, FloatType, TimestampType

# Layout of the nested `game` object carried by each stat record.
game_schema = (
    StructType()
    .add('id', IntegerType(), False)
    .add('date', TimestampType(), True)
    .add('home_team_id', IntegerType(), False)
    .add('home_team_score', IntegerType(), True)
    .add('season', IntegerType(), True)
    .add('visitor_team_id', IntegerType(), False)
    .add('visitor_team_score', IntegerType(), True)
)

# Layout of the nested `team` object carried by each stat record.
team_schema = (
    StructType()
    .add('id', IntegerType(), False)
    .add('abbreviation', StringType(), True)
    .add('city', StringType(), True)
    .add('conference', StringType(), True)
    .add('division', StringType(), True)
    .add('full_name', StringType(), True)
    .add('name', StringType(), True)
)

# Layout of the nested `player` object carried by each stat record.
player_schema = (
    StructType()
    .add('id', IntegerType(), False)
    .add('first_name', StringType(), True)
    .add('last_name', StringType(), True)
    .add('position', StringType(), True)
    .add('team_id', IntegerType(), True)
)

In [151]:
# Top-level layout of one stat record. The `game`, `player` and `team` fields
# are nested structs described by their own schemas; fields are declared in
# the same order as before so the resulting column order is unchanged.
stat_schema = (
    StructType()
    .add('id', IntegerType(), False)
    .add('ast', IntegerType(), True)
    .add('blk', IntegerType(), True)
    .add('dreb', IntegerType(), True)
    .add('fg3_pct', FloatType(), True)
    .add('fg3a', IntegerType(), True)
    .add('fg3m', IntegerType(), True)
    .add('fg_pct', FloatType(), True)
    .add('fga', IntegerType(), True)
    .add('fgm', IntegerType(), True)
    .add('ft_pct', FloatType(), True)
    .add('fta', IntegerType(), True)
    .add('ftm', IntegerType(), True)
    .add('game', game_schema)
    .add('min', StringType(), True)
    .add('oreb', IntegerType(), True)
    .add('pf', IntegerType(), True)
    .add('player', player_schema)
    .add('pts', IntegerType(), True)
    .add('reb', IntegerType(), True)
    .add('stl', IntegerType(), True)
    .add('team', team_schema)
    .add('turnover', IntegerType(), True)
)

### Read Stat Files

In [152]:
# The current job id forms the first path segment of the synfs URI below.
job_id = mssparkutils.env.getJobId()

# Glob matching every stat file in the selected season's bronze folder.
stat_files_glob = f'synfs:/{job_id}/mnt/bronze/stats/season={season}/stat_file*.json'

# Read with the explicit schema rather than letting Spark infer one.
stat_df = spark.read.schema(stat_schema).json(stat_files_glob)

### Transformations

In [153]:
from pyspark.sql.functions import col, lit, current_timestamp

# Give the record's own id an unambiguous name before the nested ids are
# lifted to the top level; the merge condition later matches on `stat_id`.
stat_df = stat_df.withColumnRenamed(existing='id', new='stat_id')

In [154]:
# Lift the ids out of the nested structs into top-level key columns and stamp
# every row with the season being ingested.
stat_df = (
    stat_df
    .withColumn('game_id', col('game.id'))
    .withColumn('player_id', col('player.id'))
    .withColumn('team_id', col('team.id'))
    .withColumn('season', lit(season))
)

In [155]:
# The nested structs are no longer needed once their ids have been extracted.
stat_df = stat_df.drop('game', 'player', 'team')

In [156]:
# Collapse rows that are identical across every column.
stat_final_df = stat_df.dropDuplicates()

### Write file as table

In [None]:
%%sql
-- Removes prize_picks_silver.fact_stats if it exists; this is the same table
-- the merge step later in this notebook writes to.
-- NOTE(review): dropping the table before a merge discards previously loaded
-- data — presumably meant for full rebuilds only; confirm before running.
DROP TABLE IF EXISTS prize_picks_silver.fact_stats;

In [157]:
# Target of the write: storage container plus database/table name.
container = 'silver'
database, table = 'prize_picks_silver', 'fact_stats'

# Storage format and partitioning column passed to the merge helper.
file_format, partition_col = 'delta', 'season'

# Rows are matched between target (tgt) and source (src) on the stat id.
merge_condition = 'tgt.stat_id = src.stat_id'

# NOTE(review): merge_data is not defined in this notebook — presumably it is
# provided by /utils/general_functions; confirm its argument order there.
merge_data(
    stat_final_df,
    container,
    database,
    table,
    file_format,
    partition_col,
    merge_condition,
)