In [0]:
from pyspark.sql.functions import *

## READ TABLE

In [0]:
# Load the raw athletes dataset from the bronze container.
# Parquet files carry their own schema, so no CSV-style "header" option is
# needed (it has no meaning for the Parquet data source).
BRONZE_ATHLETES_PATH = "abfss://bronze@olympicsprogectstorage.dfs.core.windows.net/athletes/"
df = spark.read.format("parquet").load(BRONZE_ATHLETES_PATH)

In [0]:
# Render the freshly loaded DataFrame so the raw columns can be inspected.
display(df)

## MAKE THE TABLE USABLE IN SQL

In [0]:
# Register the DataFrame as a temporary view named "df" so %sql cells can query it.
# createOrReplaceTempView returns None, so its result is deliberately not assigned.
# The SQL cells refer to the view as DF; Spark SQL resolves identifiers
# case-insensitively by default.
# The view captures df as it is at this point: later reassignments of the Python
# variable `df` (casts, renames, ...) produce new DataFrames and are not
# reflected in the view.
df.createOrReplaceTempView("df")

In [0]:
%sql
-- All rows of the temp view whose name contains the substring 'ISMAIL'.
SELECT *
  FROM DF
 WHERE DF.name LIKE '%ISMAIL%';

## CHANGE ROW VALUE

In [0]:
# Shorten the long country label: every occurrence of the pattern inside
# country_long is replaced by "Iran" (regexp_replace treats the pattern as a regex).
country_long_short = regexp_replace(
    col("country_long"),
    "Islamic Republic of Iran",
    "Iran",
)
df = df.withColumn("country_long", country_long_short)
display(df)

## CHANGE DATA TYPE

In [0]:
# Cast the body-measurement columns to float, in place (same column names).
df = (
    df
    .withColumn("weight", col("weight").cast("float"))
    .withColumn("height", col("height").cast("float"))
)

## ISNULL

In [0]:
# Replace null values in the occupation column with the placeholder "unknown".
# Only the listed column is touched; nulls in other columns are left as they are.
df = df.na.fill("unknown", subset=["occupation"])

## CASE WHEN

In [0]:
# CASE WHEN height = 0.0 THEN NULL ELSE height END
# A height of exactly 0.0 is turned into null; every other value is kept.
height_col = col("height")
height_or_null = when(height_col == 0.0, None).otherwise(height_col)
df = df.withColumn("height", height_or_null)

display(df)

## FILTER ROWS

In [0]:
# Keep only rows flagged as current whose name is one of two exact values.
from pyspark.sql.functions import col

# Alternative (regex match anywhere in the name instead of exact values):
#   df.filter(col("name").rlike("ISMAIL"))
is_current = col("current") == True
is_selected_name = col("name").isin("ISMAIL Mohamed", "ISMAIL Malak")
df_filtered = df.filter(is_current & is_selected_name)

display(df_filtered)

## GROUP BY

In [0]:
# Number of rows per country, largest group first.
# count("country") counts non-null values of the column within each group.
rows_per_country = count("country").alias("count")
df_grouped = (
    df
    .groupBy("country")
    .agg(rows_per_country)
    .orderBy(col("count").desc())
)

display(df_grouped)

## RENAME COLUMN

In [0]:
# Rename the "code" column to "athlete_id".
# From here on, anything derived from `df` must refer to the column as
# "athlete_id"; "code" no longer exists on this DataFrame.
df = df.withColumnRenamed(existing="code", new="athlete_id")

## SORT DATA

In [0]:
# Order by height descending, breaking ties by weight ascending.
df_sorted = df.orderBy(col("height").desc(), col("weight").asc())
display(df_sorted)

## SPLIT DATA

In [0]:
# Split the name on single spaces and keep the first token as "first_name".
# NOTE(review): name values used elsewhere in this notebook look like
# "ISMAIL Mohamed", which suggests the first token may be the family name
# rather than the given name; confirm the intended column meaning.
name_tokens = split(col("name"), " ")
df = df.withColumn("first_name", name_tokens[0])

display(df)

## WINDOW FUNCTION: AVG OVER PARTITION BY

### HOW YOU DO IT IN SQL

In [0]:
%sql
-- For each male athlete with a positive, non-null height, show the athlete's
-- height next to the average height of all such athletes from the same country.
-- The frame spans the whole partition, so every row of a country gets the same average.
SELECT
  DF.code,
  DF.name,
  DF.country,
  DF.height,
  AVG(DF.height) OVER (
    PARTITION BY DF.country
    ORDER BY DF.height
    ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
  ) AS avg_height
FROM DF
WHERE DF.gender = 'Male'
  AND DF.height IS NOT NULL
  AND DF.height > 0.0
ORDER BY DF.height DESC, avg_height DESC

### HOW YOU DO IT IN PYTHON

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import avg, col

# Window covering every row of an athlete's country: the frame runs from the
# first to the last row of the partition, so the average is the same for all
# rows of a given country.
country_window = (
    Window.partitionBy("country")
    .orderBy(col("height"))
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

# Male athletes with a known, positive height, each annotated with the average
# height of the athletes from the same country that passed the filter.
df_final = (
    df.filter(
        col("height").isNotNull() & (col("gender") == "Male") & (col("height") > 0)
    )
    .withColumn("height_avg", avg(col("height")).over(country_window))
    # The identifier column was renamed from "code" to "athlete_id" in the
    # RENAME COLUMN section, so "code" no longer exists on df; selecting it
    # would fail when the notebook is run top to bottom.
    .select("athlete_id", "country", "name", "height", "height_avg")
    # Tallest athletes first; ties ordered by country average, also descending.
    .sort("height", "height_avg", ascending=[0, 0])
)
display(df_final)


## LOAD TABLE TO SILVER

In [0]:
# Persist df_final as a Delta table registered in the catalog, with its data
# stored at the given path in the silver container. mode("overwrite") replaces
# any existing contents of the table.
# NOTE(review): df_final holds only the filtered, selected columns produced by
# the window-function cell; confirm that this subset is what the silver
# "athletes" table is meant to contain.
(
    df_final.write
    .mode("overwrite")
    .format("delta")
    .option("path", "abfss://silver@olympicsprogectstorage.dfs.core.windows.net/athletes")
    .saveAsTable("olympicspariscatalag.silver.athletes")
)