In [7]:
import pyspark
import pandas as pd
import numpy as np
from pyspark.sql.functions import col, expr
from pyspark.sql.functions import *
from pyspark.sql.functions import lit
from pyspark.sql.functions import asc, desc
from pyspark.sql.functions import month, year, quarter

from pydataset import data



# Spark 101 Exercises

## Create a spark data frame that contains your favorite programming languages.

* The name of the column should be language
* View the schema of the dataframe
* Output the shape of the dataframe
* Show the first 5 records in the dataframe

In [3]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("ProgrammingLanguages").getOrCreate()

# Define the data
data = [("Python",),
        ("Java",),
        ("JavaScript",),
        ("C++",),
        ("Ruby",),
        ("Go",)]

# Create the DataFrame
df = spark.createDataFrame(data, ["language"])

# View the schema of the DataFrame
df.printSchema()

# Output the shape of the DataFrame
print("Shape of the DataFrame: ", (df.count(), len(df.columns)))

# Show the first 5 records in the DataFrame
df.show(5)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/30 13:43:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


root
 |-- language: string (nullable = true)



                                                                                

Shape of the DataFrame:  (6, 1)
+----------+
|  language|
+----------+
|    Python|
|      Java|
|JavaScript|
|       C++|
|      Ruby|
+----------+
only showing top 5 rows



DataFrame[manufacturer: string, model: string, displ: double, year: bigint, cyl: bigint, trans: string, drv: string, cty: bigint, hwy: bigint, fl: string, class: string]

## 2. Load the mpg dataset as a spark dataframe.

Create 1 column of output that contains a message like the one below:


The 1999 audi a4 has a 4 cylinder engine.
For each vehicle.

Transform the trans column so that it only contains either manual or auto.

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

mpg_df = spark.createDataFrame(data('mpg'))

In [11]:
mpg_df.show(5)


+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, lit, regexp_extract, when

# Create a SparkSession
spark = SparkSession.builder.appName("MPG").getOrCreate()

# Assuming you have already loaded the DataFrame as 'mpg_df'

# Create a new column 'output' with the desired message
mpg_df = mpg_df.withColumn("output", concat(
    lit("The "),
    mpg_df.year.cast("String"),
    lit(" "),
    mpg_df.manufacturer,
    lit(" "),
    mpg_df.model,
    lit(" has a "),
    mpg_df.cyl.cast("String"),
    lit(" cylinder engine.")
))

# Transform the 'trans' column to contain either 'manual' or 'auto'
mpg_df = mpg_df.withColumn("trans", when(regexp_extract(mpg_df.trans, r"(\w+)", 1) == "manual", "manual").otherwise("auto"))

# Show the DataFrame
mpg_df.show(5, truncate=False)


+------------+-----+-----+----+---+------+---+---+---+---+-------+-----------------------------------------+
|manufacturer|model|displ|year|cyl|trans |drv|cty|hwy|fl |class  |output                                   |
+------------+-----+-----+----+---+------+---+---+---+---+-------+-----------------------------------------+
|audi        |a4   |1.8  |1999|4  |auto  |f  |18 |29 |p  |compact|The 1999 audi a4 has a 4 cylinder engine.|
|audi        |a4   |1.8  |1999|4  |manual|f  |21 |29 |p  |compact|The 1999 audi a4 has a 4 cylinder engine.|
|audi        |a4   |2.0  |2008|4  |manual|f  |20 |31 |p  |compact|The 2008 audi a4 has a 4 cylinder engine.|
|audi        |a4   |2.0  |2008|4  |auto  |f  |21 |30 |p  |compact|The 2008 audi a4 has a 4 cylinder engine.|
|audi        |a4   |2.8  |1999|6  |auto  |f  |16 |26 |p  |compact|The 1999 audi a4 has a 6 cylinder engine.|
+------------+-----+-----+----+---+------+---+---+---+---+-------+-----------------------------------------+
only showing top 5 

# 3. Load the tips dataset as a spark dataframe.

* What percentage of observations are smokers?
* Create a column that contains the tip percentage
* Calculate the average tip percentage for each combination of sex and smoker.

In [15]:
tips_df = spark.createDataFrame(data('tips'))

In [17]:
tips_df.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [18]:


# Calculate the percentage of observations that are smokers
num_smokers = tips_df.filter(col("smoker") == "Yes").count()
num_observations = tips_df.count()
smokers_percentage = (num_smokers / num_observations) * 100

print("Percentage of observations that are smokers: {:.2f}%".format(smokers_percentage))

# Create a column for tip percentage
tips_df = tips_df.withColumn("tip_percentage", (col("tip") / col("total_bill")) * 100)

# Calculate the average tip percentage for each combination of sex and smoker
average_tip_percentage = tips_df.groupBy("sex", "smoker").avg("tip_percentage")

average_tip_percentage.show()


Percentage of observations that are smokers: 38.11%


[Stage 20:>                                                       (0 + 12) / 12]

+------+------+-------------------+
|   sex|smoker|avg(tip_percentage)|
+------+------+-------------------+
|  Male|    No|  16.06687151291298|
|Female|    No|  15.69209707691836|
|  Male|   Yes| 15.277117520248511|
|Female|   Yes|  18.21503526994103|
+------+------+-------------------+



                                                                                

## 4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

* Convert the temperatures to fahrenheit.
* Which month has the most rain, on average?
* Which year was the windiest?
* What is the most frequent type of weather in January?
* What is the average high and low temperature on sunny days in July in 2013 and 2014?
* What percentage of days were rainy in q3 of 2015?
* For each year, find what percentage of days it rained (had non-zero precipitation).



In [20]:
from vega_datasets import data

weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)
weather.show(6)


+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import month, year, avg, max, count, when, sum, expr


# Assuming you have already loaded the DataFrame as 'weather'

# Convert temperatures to Fahrenheit
weather = weather.withColumn("temp_max", expr("temp_max * 9/5 + 32"))
weather = weather.withColumn("temp_min", expr("temp_min * 9/5 + 32"))

# Which month has the most rain, on average?
rainiest_month = weather.groupBy(month("date").alias("month")).avg("precipitation").orderBy("avg(precipitation)", ascending=False).first()["month"]
print("Month with the most rain, on average:", rainiest_month)

# Which year was the windiest?
windiest_year = weather.groupBy(year("date").alias("year")).sum("wind").orderBy("sum(wind)", ascending=False).first()["year"]
print("Windiest year:", windiest_year)

# What is the most frequent type of weather in January?
most_frequent_weather_january = weather.filter(month("date") == 1).groupBy("weather").count().orderBy("count", ascending=False).first()["weather"]
print("Most frequent weather in January:", most_frequent_weather_january)

# What is the average high and low temperature on sunny days in July in 2013 and 2014?
average_high_low_sunny_july = weather.filter((year("date").isin([2013, 2014])) & (month("date") == 7) & (weather.weather == "sunny")).agg(avg("temp_max").alias("average_high"), avg("temp_min").alias("average_low")).first()
print("Average high temperature on sunny days in July (2013 and 2014):", average_high_low_sunny_july["average_high"])
print("Average low temperature on sunny days in July (2013 and 2014):", average_high_low_sunny_july["average_low"])

# What percentage of days were rainy in Q3 of 2015?
total_days_q3_2015 = weather.filter((year("date") == 2015) & (month("date").between(7, 9))).count()
rainy_days_q3_2015 = weather.filter((year("date") == 2015) & (month("date").between(7, 9)) & (weather.precipitation > 0)).count()
rainy_percentage_q3_2015 = (rainy_days_q3_2015 / total_days_q3_2015) * 100
print("Percentage of rainy days in Q3 of 2015:", rainy_percentage_q3_2015)

# For each year, find the percentage of days it rained (had non-zero precipitation)
rainy_days_percentage_by_year = weather.groupBy(year("date").alias("year")).agg((sum(when(weather.precipitation > 0, 1)) / count("*") * 100).alias("rainy_percentage"))
rainy_days_percentage_by_year.show()


                                                                                

Month with the most rain, on average: 11


                                                                                

Windiest year: 2012


                                                                                

Most frequent weather in January: fog
Average high temperature on sunny days in July (2013 and 2014): None
Average low temperature on sunny days in July (2013 and 2014): None
Percentage of rainy days in Q3 of 2015: 18.478260869565215


[Stage 42:>                                                       (0 + 12) / 12]

+----+-----------------+
|year| rainy_percentage|
+----+-----------------+
|2012|48.36065573770492|
|2013|41.64383561643836|
|2014| 41.0958904109589|
|2015|39.45205479452055|
+----+-----------------+



                                                                                