### **Transform Stat Facts**

### Parameters

In [79]:
# Season to process: it selects the silver fact_stats partition that is read
# below and is stamped onto every output row as the `season` column.
season = 2022

### Configuration

In [80]:
# Executes the shared utilities notebook in this session. create_mounts() and
# merge_data(), called later, are not defined in this notebook — presumably
# they come from here (TODO confirm).
%run /utils/general_functions

In [81]:
# Helper defined outside this notebook. Presumably mounts the storage
# containers that the synfs:/<job_id>/mnt/... paths used below rely on — TODO confirm.
create_mounts()

### Load Data

In [82]:
# The synfs mount paths are addressed through the id of the running job
job_id = mssparkutils.env.getJobId()


def _read_silver(relative_path):
    """Load the Delta dataset found at `relative_path` under the job's silver mount."""
    return spark.read.format('delta').load(f'synfs:/{job_id}/mnt/silver/{relative_path}')


# Facts for the requested season only, plus the two dimensions they are joined to
stat_df = _read_silver(f'fact_stats/season={season}')
player_df = _read_silver('dim_player')
team_df = _read_silver('dim_team')

### Transformations

In [83]:
# Rename the dimension id columns so they do not collide with the fact's
# player_id / team_id columns once the frames are joined
player_df = player_df.withColumnRenamed('player_id', 'p_player_id')
team_df = team_df.withColumnRenamed('team_id', 't_team_id')

# Default (inner) joins: a stat row survives only if it matches a player row
# and a team row that are flagged is_active
player_match = (stat_df.player_id == player_df.p_player_id) & (player_df.is_active == True)
team_match = (stat_df.team_id == team_df.t_team_id) & (team_df.is_active == True)

stat_joined_df = (
    stat_df
    .join(player_df, player_match)
    .join(team_df, team_match)
)

In [84]:
from pyspark.sql.functions import lit

# Identifier columns: the dimension ids are replaced by their surrogate keys
# (team_key / player_key) picked up from the joined dimension rows
key_cols = ['stat_id', 'game_id', 'team_key', 'player_key']

# Box-score measures carried over unchanged
measure_cols = [
    'ast', 'blk', 'dreb', 'fg3_pct', 'fg3a', 'fg3m', 'fg_pct', 'fga', 'fgm', 'ft_pct',
    'fta', 'ftm', 'min', 'oreb', 'pf', 'pts', 'reb', 'stl', 'turnover',
]

stat_final_df = (
    stat_joined_df
    .select(*key_cols, *measure_cols)
    .withColumn('season', lit(season))  # stamp the processed season on every row
    .distinct()                         # collapse exact duplicate rows
)

In [85]:
from pyspark.sql.functions import col, when

# Replacement value for NULLs, per column. Dict order is the order in which
# the columns are rewritten.
null_defaults = {
    'ast': 0,
    'blk': 0,
    'dreb': 0,
    'fg3_pct': 0.0,
    'fg3a': 0,
    'fg3m': 0,
    'fg_pct': 0.0,
    'fga': 0,
    'fgm': 0,
    'ft_pct': 0.0,
    'fta': 0,
    'ftm': 0,
    'min': '00:00',
    'oreb': 0,
    'pf': 0,
    'pts': 0,
    'reb': 0,
    'stl': 0,
    'turnover': 0,
    'season': 0,
}

# Handle NULL values: keep the existing value unless it is NULL
for column_name, default in null_defaults.items():
    stat_final_df = stat_final_df.withColumn(
        column_name,
        when(col(column_name).isNull(), default).otherwise(col(column_name)),
    )

### Merge Data

In [86]:
# Destination of the merge
container, database, table = 'gold', 'prize_picks_gold', 'fact_stats'

# Storage format and partitioning column handed to merge_data
file_format, partition_col = 'delta', 'season'

# Target (tgt) and source (src) rows are matched on stat_id
merge_condition = 'tgt.stat_id = src.stat_id'

merge_data(
    stat_final_df,
    container,
    database,
    table,
    file_format,
    partition_col,
    merge_condition,
)

In [None]:
%%sql
-- NOTE(review): this drops prize_picks_silver.fact_stats, whereas the merge in
-- this notebook targets prize_picks_gold.fact_stats. The cell has no execution
-- count, so it looks like a manual clean-up step — confirm it is intended.
DROP TABLE IF EXISTS prize_picks_silver.fact_stats;

In [87]:
# spark.sql(f'SELECT COUNT(*) FROM prize_picks_gold.fact_stats').show()