# Exercises for Apache Spark™ and Scala Workshops

These are my own PySpark solutions to the exercises proposed by Jacek Laskowski in https://github.com/jaceklaskowski/spark-workshop/tree/gh-pages/exercises

## Exercises:

31. [Calculating percent rank](#31) 
32. [Finding First Non-Null Value per Group](#32)
33. [Finding Longest Sequence (Window Aggregation)](#33)
34. [Finding Most Common Non-null Prefix per Group (Occurrences)](#34)
35. [Using rollup Operator for Total and Average Salaries by Department and Company-Wide](#35)

# SET UP

In [1]:
# Notebook shell escape: install the findspark package into the current environment.
!pip install findspark

# Initialise findspark before the pyspark imports in the following cell.
import findspark
findspark.init()



In [2]:
# Load PySpark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf


# Create (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .appName("Test_spark")
    .master("local[*]")
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

# LIBRARIES

In [3]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *

## 31. Calculating percent rank <a id='31'></a>

In [4]:
# Sample data for exercise 31: one row per employee with their salary.
df_input = spark.createDataFrame(
    data=[
        ("Tony", 50),
        ("Alan", 45),
        ("Lee", 60),
        ("David", 35),
        ("Steve", 65),
        ("Paul", 48),
        ("Micky", 62),
        ("George", 80),
        ("Nigel", 64),
        ("John", 42),
    ],
    schema=["Employee", "Salary"],
)

df_input.show()

+--------+------+
|Employee|Salary|
+--------+------+
|    Tony|    50|
|    Alan|    45|
|     Lee|    60|
|   David|    35|
|   Steve|    65|
|    Paul|    48|
|   Micky|    62|
|  George|    80|
|   Nigel|    64|
|    John|    42|
+--------+------+



In [12]:
# Rank every salary relative to all the others (single, unpartitioned window).
salary_window = Window.orderBy("Salary")
ranked = df_input.withColumn("Percentage", percent_rank().over(salary_window))

# Replace the numeric rank with a label: above 0.7 is "High",
# above 0.3 is "Average", everything else is "Low".
salary_band = (
    when(col("Percentage") > 0.7, "High")
    .when(col("Percentage") > 0.3, "Average")
    .otherwise("Low")
)

# Highest salaries first.
df_output = (
    ranked
    .withColumn("Percentage", salary_band)
    .orderBy(col("Salary").desc())
)
df_output.show()

+--------+------+----------+
|Employee|Salary|Percentage|
+--------+------+----------+
|  George|    80|      High|
|   Steve|    65|      High|
|   Nigel|    64|      High|
|   Micky|    62|   Average|
|     Lee|    60|   Average|
|    Tony|    50|   Average|
|    Paul|    48|   Average|
|    Alan|    45|       Low|
|    John|    42|       Low|
|   David|    35|       Low|
+--------+------+----------+



## 32. Finding First Non-Null Value per Group <a id='32'></a>

In [13]:
# Sample data for exercise 32: ids (some missing) tagged with a group.
df_input = spark.createDataFrame(
    data=[
        (None, 0),
        (None, 1),
        (2, 0),
        (None, 1),
        (4, 1),
    ],
    schema=["id", "group"],
)

df_input.show()

+----+-----+
|  id|group|
+----+-----+
|null|    0|
|null|    1|
|   2|    0|
|null|    1|
|   4|    1|
+----+-----+



In [23]:
# Window spanning the whole group, ordered by id, so every row of a group
# sees the same frame.
firstWindow = (
    Window
    .partitionBy("group")
    .orderBy("id")
    .rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

# First id in the frame, skipping nulls.
first_id = first("id", ignorenulls=True).over(firstWindow)
df_output = df_input.withColumn("first_non_null", first_id)
df_output.show()

+----+-----+--------------+
|  id|group|first_non_null|
+----+-----+--------------+
|null|    0|             2|
|   2|    0|             2|
|null|    1|             4|
|null|    1|             4|
|   4|    1|             4|
+----+-----+--------------+



In [24]:
# Collapse to one row per (group, first_non_null) pair.
df_output = (
    df_output
    .select("group", "first_non_null")
    .distinct()
)
df_output.show()

+-----+--------------+
|group|first_non_null|
+-----+--------------+
|    0|             2|
|    1|             4|
+-----+--------------+



## 33. Finding Longest Sequence (Window Aggregation) <a id='33'></a>

In [25]:
# Sample data for exercise 33: (ID, time) observations.
df_input = spark.createDataFrame(
    data=[
        (1, 1),
        (1, 2),
        (1, 4),
        (1, 7),
        (1, 8),
        (1, 9),
        (2, 1),
        (3, 1),
        (3, 2),
        (3, 3),
    ],
    schema=["ID", "time"],
)

df_input.show()

+---+----+
| ID|time|
+---+----+
|  1|   1|
|  1|   2|
|  1|   4|
|  1|   7|
|  1|   8|
|  1|   9|
|  2|   1|
|  3|   1|
|  3|   2|
|  3|   3|
+---+----+



## 34. Finding Most Common Non-null Prefix per Group (Occurrences) <a id='34'></a>

In [26]:
# Sample data for exercise 34: guest ids with an optional title prefix.
df_input = spark.createDataFrame(
    data=[
        (1, "Mr"),
        (1, "Mme"),
        (1, "Mr"),
        (1, None),
        (1, None),
        (1, None),
        (2, "Mr"),
        (3, None),
    ],
    schema=["UNIQUE_GUEST_ID", "PREFIX"],
)

df_input.show()

+---------------+------+
|UNIQUE_GUEST_ID|PREFIX|
+---------------+------+
|              1|    Mr|
|              1|   Mme|
|              1|    Mr|
|              1|  null|
|              1|  null|
|              1|  null|
|              2|    Mr|
|              3|  null|
+---------------+------+



In [46]:
# Window that ranks the prefixes of each guest from most to least frequent.
# PREFIX is a secondary sort key so that ties on Repetitions are always
# broken the same way (row_number() is otherwise arbitrary among equal rows).
rowWindow = (Window
             .partitionBy("UNIQUE_GUEST_ID")
             .orderBy(col("Repetitions").desc(), col("PREFIX")))

# Count occurrences per (guest, prefix). count("PREFIX") does not count nulls,
# so a null prefix gets 0 repetitions and is only selected when the guest has
# no non-null prefix at all.
df_output = (df_input
             .groupby("UNIQUE_GUEST_ID", "PREFIX")
             .agg(count("PREFIX").alias("Repetitions")))

# Keep only the top-ranked prefix of every guest, sorted by guest id.
df_output = (df_output
             .withColumn("rn", row_number().over(rowWindow))
             .where(col("rn") == 1)
             .drop("rn")
             .orderBy("UNIQUE_GUEST_ID"))
df_output.show()

+---------------+------+-----------+
|UNIQUE_GUEST_ID|PREFIX|Repetitions|
+---------------+------+-----------+
|              1|    Mr|          2|
|              3|  null|          0|
|              2|    Mr|          1|
+---------------+------+-----------+



## 35. Using rollup Operator for Total and Average Salaries by Department and Company-Wide <a id='35'></a>

In [47]:
# Sample data for exercise 35: employees with their department and salary.
df_input = spark.createDataFrame(
    data=[
        (1, "Hunter Fields", "IT", 15),
        (2, "Leonard Lewis", "Support", 81),
        (3, "Jason Dawson", "Support", 90),
        (4, "Andre Grant", "Support", 25),
        (5, "Earl Walton", "IT", 40),
        (6, "Alan Hanson", "IT", 24),
        (7, "Clyde Matthews", "Support", 31),
        (8, "Josephine Leonard", "Support", 1),
        (9, "Owen Boone", "HR", 27),
        (10, "Max McBride", "IT", 75),
    ],
    schema=["id", "name", "department", "salary"],
)

df_input.show()

+---+-----------------+----------+------+
| id|             name|department|salary|
+---+-----------------+----------+------+
|  1|    Hunter Fields|        IT|    15|
|  2|    Leonard Lewis|   Support|    81|
|  3|     Jason Dawson|   Support|    90|
|  4|      Andre Grant|   Support|    25|
|  5|      Earl Walton|        IT|    40|
|  6|      Alan Hanson|        IT|    24|
|  7|   Clyde Matthews|   Support|    31|
|  8|Josephine Leonard|   Support|     1|
|  9|       Owen Boone|        HR|    27|
| 10|      Max McBride|        IT|    75|
+---+-----------------+----------+------+



In [48]:
# Total and average salary per department; rollup also adds a company-wide
# row whose department is null.
by_department = df_input.rollup("department")
df_output = by_department.agg(sum("salary"), avg("salary"))
df_output.show()

+----------+-----------+-----------+
|department|sum(salary)|avg(salary)|
+----------+-----------+-----------+
|      null|        409|       40.9|
|   Support|        228|       45.6|
|        IT|        154|       38.5|
|        HR|         27|       27.0|
+----------+-----------+-----------+



In [49]:
# rollup works with any aggregate function: here the highest salary per
# department plus an overall row (department = null).
(df_input
 .rollup("department")
 .agg(max("salary"))
 .show())

+----------+-----------+
|department|max(salary)|
+----------+-----------+
|      null|         90|
|   Support|         90|
|        IT|         75|
|        HR|         27|
+----------+-----------+

