In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import IntegerType

In [25]:
spark = SparkSession.builder.getOrCreate()

data_path = "../StudentsPerformance.csv"
df = spark.read.csv(data_path, header=True)

print(df.columns)

keys = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
target = 'math score'

df = df \
    .withColumnRenamed(keys[0], keys[0][0]) \
    .withColumnRenamed(keys[1], keys[1][0]) \
    .withColumnRenamed(keys[2], keys[2][0]) \
    .withColumnRenamed(keys[3], keys[3][0]) \
    .withColumnRenamed(keys[4], keys[4][0])
keys = list(map(lambda key: key[0], keys))

In [26]:
print("columns: ", df.columns)
print("# of rows: ", df.count())

df.select('gender').distinct().show(100)
df.select('race/ethnicity').distinct().show(100)
df.select('parental level of education').distinct().show(100)
df.select('lunch').distinct().show(100)
df.select('test preparation course').distinct().show(100)

df.select(F.isnan('gender'),
          F.isnan('race/ethnicity'),
          F.isnan('parental level of education'),
          F.isnan('lunch'),
          F.isnan('test preparation course'))

columns:  ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course', 'math score', 'reading score', 'writing score']
# of rows:  1000


In [27]:
agg_df = df.cube(*keys).agg(F.avg(F.col(target)), F.var_pop(F.col(target))) \
   .fillna('All', keys)
    
agg_df.filter("g='female' and r='All' and p='All' and l='All' and t='All'").show()

w = Window.partitionBy("g", "r", "p", "l", "t").orderBy('math score')
rank_df = df.withColumn("rank", F.rank().over(w))
rank_df.orderBy(*keys).show()

df.groupBy(*keys).agg(F.avg(target), F.var_pop(target), F.count(F.lit(1))/2) \
  .withColumnRenamed('avg(math score)', 'avg') \
  .withColumnRenamed('var_pop(math score)', 'var') \
  .withColumnRenamed('(count(1) / 2)', 'med_rank_double') \
  .withColumn('med_rank', F.col('med_rank_double').cast(IntegerType())) \
  .join(rank_df, keys, 'left') \
  .filter("rank == med_rank") \
  .select(*keys, 'avg', 'var', 'math score') \
  .orderBy(*keys) \
  .show()



+------+
|gender|
+------+
|female|
|  male|
+------+

+--------------+
|race/ethnicity|
+--------------+
|       group B|
|       group C|
|       group D|
|       group A|
|       group E|
+--------------+

+---------------------------+
|parental level of education|
+---------------------------+
|           some high school|
|         associate's degree|
|                high school|
|          bachelor's degree|
|            master's degree|
|               some college|
+---------------------------+

+------------+
|       lunch|
+------------+
|free/reduced|
|    standard|
+------------+

+-----------------------+
|test preparation course|
+-----------------------+
|              completed|
|                   none|
+-----------------------+



DataFrame[isnan(gender): boolean, isnan(race/ethnicity): boolean, isnan(parental level of education): boolean, isnan(lunch): boolean, isnan(test preparation course): boolean]

In [29]:
df.select('gender').distinct().show(100)

+------+
|gender|
+------+
|female|
|  male|
+------+



+------+-------+------------------+------------+---------+----------+-------------+-------------+----+
|     g|      r|                 p|           l|        t|math score|reading score|writing score|rank|
+------+-------+------------------+------------+---------+----------+-------------+-------------+----+
|female|group A|associate's degree|free/reduced|     none|        37|           57|           56|   1|
|female|group A|associate's degree|free/reduced|     none|        41|           51|           48|   2|
|female|group A|associate's degree|free/reduced|     none|        65|           85|           76|   3|
|female|group A|associate's degree|    standard|completed|        55|           65|           62|   1|
|female|group A|associate's degree|    standard|completed|        65|           70|           74|   2|
|female|group A|associate's degree|    standard|     none|        82|           93|           93|   1|
|female|group A| bachelor's degree|    standard|     none|        59|    

+------+-------+------------------+------------+---------+------------------+------------------+----------+
|     g|      r|                 p|           l|        t|               avg|               var|math score|
+------+-------+------------------+------------+---------+------------------+------------------+----------+
|female|group A|associate's degree|free/reduced|     none|47.666666666666664|152.88888888888889|        37|
|female|group A|associate's degree|    standard|completed|              60.0|              25.0|        55|
|female|group A| bachelor's degree|    standard|     none|51.666666666666664|32.888888888888886|        45|
|female|group A|       high school|free/reduced|completed|54.666666666666664|309.55555555555554|        34|
|female|group A|       high school|    standard|completed|              71.5|             12.25|        68|
|female|group A|       high school|    standard|     none|              58.0|               9.0|        55|
|female|group A|      some c

+------+---+---+---+---+------------------+-------------------+
|     g|  r|  p|  l|  t|   avg(math score)|var_pop(math score)|
+------+---+---+---+---+------------------+-------------------+
|female|All|All|All|All|63.633204633204635|   239.521831815268|
+------+---+---+---+---+------------------+-------------------+



In [33]:
keys

['g', 'r', 'p', 'l', 't']