# Spark Exercises

In [1]:
# Get libraries
import re
import pyspark
from pydataset import data
spark = pyspark.sql.SparkSession.builder.getOrCreate()
from pyspark.sql.functions import *

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/24 14:59:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### (1). Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [2]:
# Make dictionary

data = [{'language': 'python'}, {'language':'Java'}, {'language':'html'}, {'langauge':'sql'},{'language': 'css'}, {'language':'go'}, {'language':'c++'}, {'langauge':'javascript'}]


In [3]:
# Create spark context object

from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("projectName").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)

In [4]:
# Create rdd

rdd = sc.parallelize(data)
type(rdd)

pyspark.rdd.RDD

In [5]:
# Convert rdd to DF
df = rdd.toDF()
type(df)

                                                                                

pyspark.sql.dataframe.DataFrame

In [6]:
# Display the DF
df.show()

+--------+
|language|
+--------+
|  python|
|    Java|
|    html|
|    null|
|     css|
|      go|
|     c++|
|    null|
+--------+



In [7]:
# Fill nulls 

df.na.fill('', subset = 'language').show()

+--------+
|language|
+--------+
|  python|
|    Java|
|    html|
|        |
|     css|
|      go|
|     c++|
|        |
+--------+



In [8]:
# Drop nulls 

df.na.drop(subset = 'language').show()

# # or (with one column)
# df.na.drop().show()


+--------+
|language|
+--------+
|  python|
|    Java|
|    html|
|     css|
|      go|
|     c++|
+--------+



In [9]:
# First 5

df.show(5)

+--------+
|language|
+--------+
|  python|
|    Java|
|    html|
|    null|
|     css|
+--------+
only showing top 5 rows



In [10]:
# Shape pf DF
print((df.count(), len(df.columns)))

(8, 1)


### (2). Load the mpg dataset as a spark dataframe.

- Create 1 column of output that contains a message like the one below:
        
        The 1999 audi a4 has a 4 cylinder engine.
- For each vehicle.

    - Transform the trans column so that it only contains either manual or auto.

In [11]:
# Import data

from pydataset import data
mpg = spark.createDataFrame(data('mpg'))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [12]:
# Create desctiption col with message (The 1999 audi a4 has a 4 cylinder engine)

mpg.select('*', concat(lit('The '), 'year', lit(' '),\
                       'manufacturer', lit(' '), 'model',\
                       lit(' has a '), 'cyl', lit(' cylinder engine'))\
           .alias('description')).show(5, truncate = False)


+------------+-----+-----+----+---+----------+---+---+---+---+-------+----------------------------------------+
|manufacturer|model|displ|year|cyl|trans     |drv|cty|hwy|fl |class  |description                             |
+------------+-----+-----+----+---+----------+---+---+---+---+-------+----------------------------------------+
|audi        |a4   |1.8  |1999|4  |auto(l5)  |f  |18 |29 |p  |compact|The 1999 audi a4 has a 4 cylinder engine|
|audi        |a4   |1.8  |1999|4  |manual(m5)|f  |21 |29 |p  |compact|The 1999 audi a4 has a 4 cylinder engine|
|audi        |a4   |2.0  |2008|4  |manual(m6)|f  |20 |31 |p  |compact|The 2008 audi a4 has a 4 cylinder engine|
|audi        |a4   |2.0  |2008|4  |auto(av)  |f  |21 |30 |p  |compact|The 2008 audi a4 has a 4 cylinder engine|
|audi        |a4   |2.8  |1999|6  |auto(l5)  |f  |16 |26 |p  |compact|The 1999 audi a4 has a 6 cylinder engine|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+----------------------------------

In [13]:
# Get trans col to either manual or auto

mpg.select('*', regexp_extract('trans', r'^(.*).{4}', 1).alias('transmission')).show(3)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+------------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|transmission|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+------------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|        auto|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|      manual|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|      manual|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+------------+
only showing top 3 rows



In [14]:
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



### (3). Load the tips dataset as a spark dataframe.

- What percentage of observations are smokers?
- Create a column that contains the tip percentage
- Calculate the average tip percentage for each combination of sex and smoker.

In [15]:
# Load the dataframe
tips = spark.createDataFrame(data('tips'))
tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

#### Create tip Percent

In [16]:
tips.select('*', ((tips.tip / tips.total_bill)*100).alias('tip_percentage')).show(5)

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|    tip_percentage|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|5.9446733372572105|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|16.054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|16.658733936220845|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 13.97804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|14.680764538430255|
+----------+----+------+------+---+------+----+------------------+
only showing top 5 rows



> #### Cols using col, expr

The __expr__ function is more powerful than __col__. It does everything col does and more. expr returns the same type of column object, but allows us to express manipulations to the column within the string that defines the column.

In [17]:
# Import req libraries
from pyspark.sql.functions import col, expr

In [18]:
# using col

tip_percent = (col('tip') / col('total_bill') * 100)

tips.select('*', tip_percent.alias('tip_percent')).show()

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|       tip_percent|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|5.9446733372572105|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|16.054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|16.658733936220845|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 13.97804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|14.680764538430255|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4| 18.62396204033215|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2| 22.80501710376283|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|11.607142857142858|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|13.031914893617023|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|21.853856562922868|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 16.65043816942551|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|14.180374361883

In [19]:
# Using expr
tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [20]:
# Average tip by sex and smoker

tip_percent = (col('tip') / col('total_bill') * 100)

tp= tips.select('*', tip_percent.alias('tip_percent'))
tp.groupBy('sex', 'smoker').agg(mean('tip')).show()

+------+------+------------------+
|   sex|smoker|          avg(tip)|
+------+------+------------------+
|  Male|    No|3.1134020618556706|
|Female|    No| 2.773518518518518|
|  Male|   Yes| 3.051166666666666|
|Female|   Yes| 2.931515151515151|
+------+------+------------------+



### (4). Use the seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to fahrenheit.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What is the average high and low temperature on sunny days in July in 2013 and 2014?
- What percentage of days were rainy in q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [21]:
# Get Seatle weather set
from vega_datasets import data

weather = data.seattle_weather().assign(date = lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



###    Conversion Formula

-    (0°C × 9/5) + 32 = 32°F
    
#### Convert the temperatures to fahrenheit.


In [22]:
# Temperatures to fahrenheit

weather.select('*', \
               expr('ROUND((temp_max * (9/5) + 32), 2) AS temp_max_farenheight'), \
               expr('ROUND((temp_min * (9/5) + 32), 2) AS temp_min_farenheight')).show(6)

+----------+-------------+--------+--------+----+-------+--------------------+--------------------+
|      date|precipitation|temp_max|temp_min|wind|weather|temp_max_farenheight|temp_min_farenheight|
+----------+-------------+--------+--------+----+-------+--------------------+--------------------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|               55.04|                41.0|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|               51.08|               37.04|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|               53.06|               44.96|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|               53.96|               42.08|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|               48.02|               37.04|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|               39.92|               35.96|
+----------+-------------+--------+--------+----+-------+--------------------+--------------------+


In [23]:
# Alternative way 
weather = weather.withColumn(
'temp_max', (col('temp_max') * 9 / 5 + 32)
).withColumn('temp_min', (col('temp_min') * 9/5 + 32))
weather.show(5)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03|          0.8|   53.06|   44.96| 2.3|   rain|
|2012-01-04|         20.3|   53.96|   42.08| 4.7|   rain|
|2012-01-05|          1.3|   48.02|   37.04| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



#### Temperatures to fahrenheit (alternative way with manipulating expr)

In [24]:
# Temperatures to fahrenheit (alternative way with manipulating expr)

weather.selectExpr('date','temp_max', 'temp_min',\
               ('ROUND((temp_max * (9/5) + 32), 2) AS temp_max_farenheight'), \
               ('ROUND((temp_min * (9/5) + 32), 2) AS temp_min_farenheight')).show(6)

+----------+--------+--------+--------------------+--------------------+
|      date|temp_max|temp_min|temp_max_farenheight|temp_min_farenheight|
+----------+--------+--------+--------------------+--------------------+
|2012-01-01|   55.04|    41.0|              131.07|               105.8|
|2012-01-02|   51.08|   37.04|              123.94|               98.67|
|2012-01-03|   53.06|   44.96|              127.51|              112.93|
|2012-01-04|   53.96|   42.08|              129.13|              107.74|
|2012-01-05|   48.02|   37.04|              118.44|               98.67|
|2012-01-06|   39.92|   35.96|              103.86|               96.73|
+----------+--------+--------+--------------------+--------------------+
only showing top 6 rows



#### Which month has the most rain, on average?


In [25]:
# Which month has the most rain, on average?
weather.select('*',\
               regexp_extract('date', r'^(.\d{3,5})', 1).alias('year'),\
#                regexp_extract('date', r'(\d\d)\-', 1).alias('month'),\
               regexp_extract('date', r'(\d\d$)', 1).alias('day'),

              ).show(6)

+----------+-------------+--------+--------+----+-------+----+---+
|      date|precipitation|temp_max|temp_min|wind|weather|year|day|
+----------+-------------+--------+--------+----+-------+----+---+
|2012-01-01|          0.0|   55.04|    41.0| 4.7|drizzle|2012| 01|
|2012-01-02|         10.9|   51.08|   37.04| 4.5|   rain|2012| 02|
|2012-01-03|          0.8|   53.06|   44.96| 2.3|   rain|2012| 03|
|2012-01-04|         20.3|   53.96|   42.08| 4.7|   rain|2012| 04|
|2012-01-05|          1.3|   48.02|   37.04| 6.1|   rain|2012| 05|
|2012-01-06|          2.5|   39.92|   35.96| 2.2|   rain|2012| 06|
+----------+-------------+--------+--------+----+-------+----+---+
only showing top 6 rows



In [26]:
# Correct Example (most rain on average)

row = (
    weather.withColumn("month", month("date"))
    .withColumn("year", year("date"))
    .groupBy("month", "year")
    .agg(sum("precipitation").alias("total_monthly_precipitation"))
    .groupBy("month")
    .agg(mean("total_monthly_precipitation").alias("avg_monthly_rain"))
    .sort(col("avg_monthly_rain").desc())
    .first()
)
row

Row(month=11, avg_monthly_rain=160.625)

#### Which year was the windiest?

In [27]:
(
    weather.withColumn("year", year("date"))
    .groupBy("year")
    .agg(sum("wind").alias("total_winds"))
    .sort(col("total_winds").desc())
    .head(5)
)

[Row(year=2012, total_winds=1244.6999999999998),
 Row(year=2014, total_winds=1236.5),
 Row(year=2015, total_winds=1153.3),
 Row(year=2013, total_winds=1100.8000000000002)]

#### What is the most frequent type of weather in January?

In [28]:
# What is the most frequent type of weather in January?

(weather.withColumn('month', month('date'))
 .filter(col('month') == 1)
 .groupBy('Weather')
 .count()
 .sort(col('count').desc())
 .show()
)

+-------+-----+
|Weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



In [29]:
# Group By
weather.groupBy('weather').count().show()

+-------+-----+
|weather|count|
+-------+-----+
|drizzle|   54|
|   rain|  259|
|    sun|  714|
|   snow|   23|
|    fog|  411|
+-------+-----+



#### What is the average high and low temperature on sunny days in July in 2013 and 2014?

In [30]:
(
    weather.filter(month("date") == 7)
    .filter(year("date") > 2012)
    .filter(year("date") < 2015)
    .filter(col("weather") == lit("sun"))
    .agg(
        avg("temp_max").alias("average_high_temp"),
        avg("temp_min").alias("average_low_temp"),
    )
    .show()
)

+-----------------+-----------------+
|average_high_temp| average_low_temp|
+-----------------+-----------------+
|80.29192307692308|57.52884615384615|
+-----------------+-----------------+



#### What percentage of days were rainy in q3 of 2015?


In [31]:
# What percentage of days were rainy in q3 of 2015?

# in pandas -- (df.weather == "rain").mean()
# measure a rainy day by weather == rain
(
    weather.filter(year("date") == 2015)
    .filter(quarter("date") == 3)
    .select(when(col("weather") == "rain", 1).otherwise(0).alias("rain"))
    .agg(mean("rain"))
    .show()
)

+--------------------+
|           avg(rain)|
+--------------------+
|0.021739130434782608|
+--------------------+



#### For each year, find what percentage of days it rained (had non-zero precipitation).

In [32]:
(
weather.withColumn('year', year('date'))
    .select(when(col('precipitation') > 0, 1).otherwise(0).alias('did_rain'), 'year')
    .groupBy('year')
    .agg(mean('did_rain'))
    .show()
)

+----+-------------------+
|year|      avg(did_rain)|
+----+-------------------+
|2012|0.48360655737704916|
|2013|0.41643835616438357|
|2014|  0.410958904109589|
|2015|0.39452054794520547|
+----+-------------------+

