# Spark API Lesson

## Sorting and Ordering

In [1]:
import pandas as pd
import numpy as np
import pyspark
from pydataset import data


# Seed NumPy's global random number generator so random results repeat across runs.
np.random.seed(13)

# Attach to an existing Spark session, or start one if none exists yet.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
# Load the "tips" example dataset via pydataset's data() helper.
pandas_dataframe = data("tips")

In [3]:
# Convert the dataset loaded above into a Spark DataFrame.
df = spark.createDataFrame(pandas_dataframe)
# Displaying a Spark DataFrame prints only its column names and types;
# call .show() to print actual rows.
df

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

In [4]:
# Sort by day, then by party size; both keys are ascending by default.
df.sort('day', 'size').show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|      8.58|1.92|  Male|   Yes|Fri| Lunch|   1|
|     12.46| 1.5|  Male|    No|Fri|Dinner|   2|
|     12.03| 1.5|  Male|   Yes|Fri|Dinner|   2|
|     16.32| 4.3|Female|   Yes|Fri|Dinner|   2|
|     12.16| 2.2|  Male|   Yes|Fri| Lunch|   2|
|     27.28| 4.0|  Male|   Yes|Fri|Dinner|   2|
|     11.35| 2.5|Female|   Yes|Fri|Dinner|   2|
|     22.49| 3.5|  Male|    No|Fri|Dinner|   2|
|     10.09| 2.0|Female|   Yes|Fri| Lunch|   2|
|      5.75| 1.0|Female|   Yes|Fri|Dinner|   2|
|     15.38| 3.0|Female|   Yes|Fri|Dinner|   2|
|     21.01| 3.0|  Male|   Yes|Fri|Dinner|   2|
|     13.42|3.48|Female|   Yes|Fri| Lunch|   2|
|     16.27| 2.5|Female|   Yes|Fri| Lunch|   2|
|     13.42|1.58|  Male|   Yes|Fri| Lunch|   2|
|     28.97| 3.0|  Male|   Yes|Fri|Dinner|   2|
|     22.75|3.25|Female|    No|Fri|Dinner|   2|
|     15.98| 3.0|Female|    No|Fri| Lunc

In [5]:
from pyspark.sql.functions import asc, desc

In [6]:
# Ascending by day, then largest party size first within each day.
df.sort(asc('day'), desc('size')).show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     40.17|4.73|  Male|   Yes|Fri|Dinner|   4|
|     15.98| 3.0|Female|    No|Fri| Lunch|   3|
|     22.75|3.25|Female|    No|Fri|Dinner|   2|
|     22.49| 3.5|  Male|    No|Fri|Dinner|   2|
|     28.97| 3.0|  Male|   Yes|Fri|Dinner|   2|
|     12.46| 1.5|  Male|    No|Fri|Dinner|   2|
|     13.42|3.48|Female|   Yes|Fri| Lunch|   2|
|     15.38| 3.0|Female|   Yes|Fri|Dinner|   2|
|     27.28| 4.0|  Male|   Yes|Fri|Dinner|   2|
|     10.09| 2.0|Female|   Yes|Fri| Lunch|   2|
|     13.42|1.58|  Male|   Yes|Fri| Lunch|   2|
|     21.01| 3.0|  Male|   Yes|Fri|Dinner|   2|
|     12.03| 1.5|  Male|   Yes|Fri|Dinner|   2|
|     16.27| 2.5|Female|   Yes|Fri| Lunch|   2|
|      5.75| 1.0|Female|   Yes|Fri|Dinner|   2|
|     11.35| 2.5|Female|   Yes|Fri|Dinner|   2|
|     12.16| 2.2|  Male|   Yes|Fri| Lunch|   2|
|     16.32| 4.3|Female|   Yes|Fri|Dinne

In [7]:
# Sort by day, then by time alphabetically (Dinner before Lunch),
# then by party size from largest to smallest.
df.sort(asc('day'), asc('time'), desc('size')).show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     40.17|4.73|  Male|   Yes|Fri|Dinner|   4|
|     27.28| 4.0|  Male|   Yes|Fri|Dinner|   2|
|     12.46| 1.5|  Male|    No|Fri|Dinner|   2|
|     21.01| 3.0|  Male|   Yes|Fri|Dinner|   2|
|     22.75|3.25|Female|    No|Fri|Dinner|   2|
|     28.97| 3.0|  Male|   Yes|Fri|Dinner|   2|
|     15.38| 3.0|Female|   Yes|Fri|Dinner|   2|
|     22.49| 3.5|  Male|    No|Fri|Dinner|   2|
|     11.35| 2.5|Female|   Yes|Fri|Dinner|   2|
|      5.75| 1.0|Female|   Yes|Fri|Dinner|   2|
|     12.03| 1.5|  Male|   Yes|Fri|Dinner|   2|
|     16.32| 4.3|Female|   Yes|Fri|Dinner|   2|
|     15.98| 3.0|Female|    No|Fri| Lunch|   3|
|     13.42|1.58|  Male|   Yes|Fri| Lunch|   2|
|     10.09| 2.0|Female|   Yes|Fri| Lunch|   2|
|     13.42|3.48|Female|   Yes|Fri| Lunch|   2|
|     12.16| 2.2|  Male|   Yes|Fri| Lunch|   2|
|     16.27| 2.5|Female|   Yes|Fri| Lunc

In [9]:
# Alternative spelling that builds the sort keys with col(): largest party size
# first, then by time. Left disabled because `col` is not imported in the cells
# shown above; it needs `from pyspark.sql.functions import col` before it can run.
# df.sort(col('size').desc(), col('time')).show()

In [10]:
# .where() and .filter() are interchangeable; keep only rows with a tip under 4.
df.where(df['tip'] < 4).show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinner|   3|
|     16.97| 3.5|Female|    No|Sun|Dinner|   3|
|     20.65|3.35|  Male|    No|Sat|Dinne

In [None]:
# In pandas, a comparison like this would give boolean values.
# In Spark we instead get a Column expression representing the comparison.
# That expression can be stored as a mask and passed to .where()/.filter()
# before calling .show().

df.tip < 4

In [None]:
# Store the Column expression under a name so it can be reused as a filter mask.
mask = df.tip < 4

# Apply the mask and display the matching rows; assigning the expression alone
# produces no output, so the mask was previously defined but never shown.
df.where(mask).show()


In [12]:
# The pipe `|` means "or" between Column conditions. Each comparison needs its
# own parentheses because `|` binds more tightly than `==` and `<=` in Python.
(
    df.filter((df['time'] == 'Dinner') | (df['tip'] <= 2))
    .sort('tip')
    .show()
)

+----------+----+------+------+----+------+----+
|total_bill| tip|   sex|smoker| day|  time|size|
+----------+----+------+------+----+------+----+
|      5.75| 1.0|Female|   Yes| Fri|Dinner|   2|
|      7.25| 1.0|Female|    No| Sat|Dinner|   1|
|      12.6| 1.0|  Male|   Yes| Sat|Dinner|   2|
|      3.07| 1.0|Female|   Yes| Sat|Dinner|   1|
|     16.99|1.01|Female|    No| Sun|Dinner|   2|
|      12.9| 1.1|Female|   Yes| Sat|Dinner|   2|
|     32.83|1.17|  Male|   Yes| Sat|Dinner|   2|
|     10.07|1.25|  Male|    No| Sat|Dinner|   2|
|     10.51|1.25|  Male|    No| Sat|Dinner|   2|
|      8.51|1.25|Female|    No|Thur| Lunch|   2|
|      9.68|1.32|  Male|    No| Sun|Dinner|   2|
|     18.64|1.36|Female|    No|Thur| Lunch|   3|
|      7.74|1.44|  Male|   Yes| Sat|Dinner|   2|
|      7.56|1.44|  Male|    No|Thur| Lunch|   2|
|      9.55|1.45|  Male|    No| Sat|Dinner|   2|
|     10.77|1.47|  Male|    No| Sat|Dinner|   2|
|      8.52|1.48|  Male|    No|Thur| Lunch|   2|
|     19.08| 1.5|  M

In [14]:
# Chaining multiple .where() calls acts like "and": a row must pass every condition.
(
    df.where(df['smoker'] == 'Yes')
    .where(df['day'] == 'Sat')
    .show()
)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     38.01| 3.0|  Male|   Yes|Sat|Dinner|   4|
|     11.24|1.76|  Male|   Yes|Sat|Dinner|   2|
|     20.29|3.21|  Male|   Yes|Sat|Dinner|   2|
|     13.81| 2.0|  Male|   Yes|Sat|Dinner|   2|
|     11.02|1.98|  Male|   Yes|Sat|Dinner|   2|
|     18.29|3.76|  Male|   Yes|Sat|Dinner|   4|
|      3.07| 1.0|Female|   Yes|Sat|Dinner|   1|
|     15.01|2.09|  Male|   Yes|Sat|Dinner|   2|
|     26.86|3.14|Female|   Yes|Sat|Dinner|   2|
|     25.28| 5.0|Female|   Yes|Sat|Dinner|   2|
|     17.92|3.08|  Male|   Yes|Sat|Dinner|   2|
|      44.3| 2.5|Female|   Yes|Sat|Dinner|   3|
|     22.42|3.48|Female|   Yes|Sat|Dinner|   2|
|     15.36|1.64|  Male|   Yes|Sat|Dinner|   2|
|     20.49|4.06|  Male|   Yes|Sat|Dinner|   2|
|     25.21|4.29|  Male|   Yes|Sat|Dinner|   2|
|     14.31| 4.0|Female|   Yes|Sat|Dinner|   2|
|     10.59|1.61|Female|   Yes|Sat|Dinne

## Grouping and Aggregating