# Spark Exercises

In [39]:
from pydataset import data
import pyspark
from pyspark.sql.functions import *

In [40]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()
df = spark.createDataFrame(data("mpg"))
df.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [107]:
df.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [110]:
# Check columns

df.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



In [111]:
# Other ways to check columns

df.dtypes

[('manufacturer', 'string'),
 ('model', 'string'),
 ('displ', 'double'),
 ('year', 'bigint'),
 ('cyl', 'bigint'),
 ('trans', 'string'),
 ('drv', 'string'),
 ('cty', 'bigint'),
 ('hwy', 'bigint'),
 ('fl', 'string'),
 ('class', 'string')]

#### Create new Columns

In [42]:
df2 = df.select('model', 'trans', 'class', (df.cty + df.hwy).alias('city_hwy'))

In [43]:
df2.show(6)

+-----+----------+-------+--------+
|model|     trans|  class|city_hwy|
+-----+----------+-------+--------+
|   a4|  auto(l5)|compact|      47|
|   a4|manual(m5)|compact|      50|
|   a4|manual(m6)|compact|      51|
|   a4|  auto(av)|compact|      51|
|   a4|  auto(l5)|compact|      42|
|   a4|manual(m5)|compact|      44|
+-----+----------+-------+--------+
only showing top 6 rows



#### Get every columns

In [44]:
df.select('*').show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



### Calculations

In [45]:
from pyspark.sql.functions import sum

In [46]:
import pyspark.sql.functions as F

In [47]:
f.sum

<function pyspark.sql.functions.sum(col)>

In [48]:
# df.select(F.sum('cty') )

In [49]:
df.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [52]:
df.select('class')

DataFrame[class: string]

In [53]:
df1 = spark.createDataFrame([('100-200',)], ['str'])
df1.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
df1.show()

+-------+
|    str|
+-------+
|100-200|
+-------+



In [54]:

df1 = spark.createDataFrame([('foo',)], ['str'])
df1.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()
df1.show()

+---+
|str|
+---+
|foo|
+---+



In [55]:
df1 = spark.createDataFrame([('aaaac',)], ['str'])
df1.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
df1.show()

+-----+
|  str|
+-----+
|aaaac|
+-----+



#### Sorting and filtering

- where and filter does the same thing
- Orderby and sort does same thing

In [61]:
df.filter(df['class'] == 'compact')
df.show(3)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 3 rows



In [68]:
df.where((df['class'] == 'compact') & (df.year > 1999)).show(3)


+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 3 rows



In [77]:
# Order by
df.orderBy(col('model')).show()

+------------+-----------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|      model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----------+-----+----+---+----------+---+---+---+---+-------+
|      toyota|4runner 4wd|  2.7|1999|  4|  auto(l4)|  4| 16| 20|  r|    suv|
|      toyota|4runner 4wd|  4.7|2008|  8|  auto(l5)|  4| 14| 17|  r|    suv|
|      toyota|4runner 4wd|  2.7|1999|  4|manual(m5)|  4| 15| 20|  r|    suv|
|      toyota|4runner 4wd|  4.0|2008|  6|  auto(l5)|  4| 16| 20|  r|    suv|
|      toyota|4runner 4wd|  3.4|1999|  6|manual(m5)|  4| 15| 17|  r|    suv|
|      toyota|4runner 4wd|  3.4|1999|  6|  auto(l4)|  4| 15| 19|  r|    suv|
|        audi|         a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|         a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|         a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|         a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|

### Aggregation
- groupBy
- pivot table

In [83]:
df.groupBy('manufacturer', 'model').agg(mean('cty')).alias('average_city').show()

+------------+-------------------+------------------+
|manufacturer|              model|          avg(cty)|
+------------+-------------------+------------------+
|        audi|         a4 quattro|            17.125|
|        audi|                 a4|18.857142857142858|
|   chevrolet| c1500 suburban 2wd|              12.8|
|   chevrolet|           corvette|              15.4|
|        audi|         a6 quattro|              16.0|
|       dodge|        caravan 2wd|15.818181818181818|
|   chevrolet|    k1500 tahoe 4wd|              12.5|
|   chevrolet|             malibu|              18.8|
|       dodge|  dakota pickup 4wd|12.777777777777779|
|       dodge|ram 1500 pickup 4wd|              11.4|
|       dodge|        durango 4wd|11.857142857142858|
|        ford|    f150 pickup 4wd|              13.0|
|        ford|       explorer 4wd|13.666666666666666|
|        ford|     expedition 2wd|11.333333333333334|
|        ford|            mustang| 15.88888888888889|
|     hyundai|             s

In [92]:
df.groupBy('model').pivot('cty').agg(mean('cty')).show(3)

+------------------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|             model|   9|  11|  12|  13|  14|  15|  16|  17|  18|  19|  20|  21|  22|  23|  24|  25|  26|  28|  29|  33|  35|
+------------------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|grand cherokee 4wd| 9.0|11.0|null|13.0|14.0|15.0|null|17.0|null|null|null|null|null|null|null|null|null|null|null|null|null|
|            altima|null|null|null|null|null|null|null|null|null|19.0|null|21.0|null|23.0|null|null|null|null|null|null|null|
|     navigator 2wd|null|11.0|12.0|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|
+------------------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
only showing top 3 rows



### Additional Features

In [93]:
df.explain()

== Physical Plan ==
*(1) Scan ExistingRDD[manufacturer#486,model#487,displ#488,year#489L,cyl#490L,trans#491,drv#492,cty#493L,hwy#494L,fl#495,class#496]




### Null Values

In [106]:
df.groupBy('class').pivot('year').agg(mean('cty')).na.drop(subset=['hwy']).show()

AnalysisException: Cannot resolve column name "hwy" among (class, 1999, 2008)

### Spark Sql 

In [96]:
df.createOrReplaceTempView('mpg')

In [103]:
spark.sql('SELECT * FROM mpg').sort('cty').show()

+------------+--------------------+-----+----+---+----------+---+---+---+---+------+
|manufacturer|               model|displ|year|cyl|     trans|drv|cty|hwy| fl| class|
+------------+--------------------+-----+----+---+----------+---+---+---+---+------+
|       dodge|         durango 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|   suv|
|       dodge| ram 1500 pickup 4wd|  4.7|2008|  8|manual(m6)|  4|  9| 12|  e|pickup|
|       dodge| ram 1500 pickup 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|pickup|
|        jeep|  grand cherokee 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|   suv|
|       dodge|   dakota pickup 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|pickup|
|       dodge|         durango 4wd|  5.9|1999|  8|  auto(l4)|  4| 11| 15|  r|   suv|
|        ford|     f150 pickup 4wd|  5.4|1999|  8|  auto(l4)|  4| 11| 15|  r|pickup|
|       dodge| ram 1500 pickup 4wd|  5.9|1999|  8|  auto(l4)|  4| 11| 15|  r|pickup|
|     lincoln|       navigator 2wd|  5.4|1999|  8|  auto(l4)|  r|