In [1]:
# first things:
# imports and session intitialization
import pyspark
from pyspark.sql import functions as F
import pandas
from pydataset import data

In [2]:
#  initialize a session
spark = pyspark.sql.SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/19 11:36:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/19 11:36:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/19 11:36:34 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


### Create a spark data frame that contains your favorite programming languages.

In [4]:
langs = pandas.DataFrame(
{
    'languages': [
        'python',
        'julia',
        'c',
        'r',
        'sql',
        'java',
        'scala']
}
)

In [5]:
langs

Unnamed: 0,languages
0,python
1,julia
2,c
3,r
4,sql
5,java
6,scala


In [7]:
# using the spark session, turn our tabular data into a spark df
langs = spark.createDataFrame(langs)

The name of the column should be language

In [13]:
langs = langs.withColumnRenamed('languages', 'language')

In [9]:
# langs.withColumn(
# 'language', langs.languages
# ).drop('languages').show()

+--------+
|language|
+--------+
|  python|
|   julia|
|       c|
|       r|
|     sql|
|    java|
|   scala|
+--------+



View the schema of the dataframe

In [14]:
langs.printSchema()

root
 |-- language: string (nullable = true)



Output the shape of the dataframe

In [17]:
print(f'{langs.count()} rows, {len(langs.columns)} columns')

7 rows, 1 columns


Show the first 5 records in the dataframe

In [19]:
langs.show(5)

+--------+
|language|
+--------+
|  python|
|   julia|
|       c|
|       r|
|     sql|
+--------+
only showing top 5 rows



### Load the mpg dataset as a spark dataframe.

In [20]:
mpg = spark.createDataFrame(data('mpg'))

Create 1 column of output that contains a message like the one below:

The 1999 audi a4 has a 4 cylinder engine.

In [27]:
mpg.select(
    (F.concat(
        F.lit('The '),
        mpg.year,
        F.lit(' '),
        mpg.manufacturer,
        F.lit(' '),
        mpg.model,
        F.lit(' has a '),
        mpg.cyl,
        F.lit(' cylinder engine')
    )).alias('statement')).show(5, truncate=False)

+----------------------------------------+
|statement                               |
+----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine|
|The 1999 audi a4 has a 4 cylinder engine|
|The 2008 audi a4 has a 4 cylinder engine|
|The 2008 audi a4 has a 4 cylinder engine|
|The 1999 audi a4 has a 6 cylinder engine|
+----------------------------------------+
only showing top 5 rows



In [30]:
mpg.withColumn(
    'statement',
    F.concat(
        F.lit('The '),
        mpg.year,
        F.lit(' '),
        mpg.manufacturer,
        F.lit(' '),
        mpg.model,
        F.lit(' has a '),
        mpg.cyl,
        F.lit(' cylinder engine')
    )
).show(1, truncate=False, vertical=True)

-RECORD 0------------------------------------------------
 manufacturer | audi                                     
 model        | a4                                       
 displ        | 1.8                                      
 year         | 1999                                     
 cyl          | 4                                        
 trans        | auto(l5)                                 
 drv          | f                                        
 cty          | 18                                       
 hwy          | 29                                       
 fl           | p                                        
 class        | compact                                  
 statement    | The 1999 audi a4 has a 4 cylinder engine 
only showing top 1 row



For each vehicle.

Transform the trans column so that it only contains either manual or auto.

In [32]:
mpg.select(
    (
        F.when(
        mpg.trans.contains('auto'), 'auto'
    ).otherwise('manual')
    ).alias('trans_type')
).show()

+----------+
|trans_type|
+----------+
|      auto|
|    manual|
|    manual|
|      auto|
|      auto|
|    manual|
|      auto|
|    manual|
|      auto|
|    manual|
|      auto|
|      auto|
|    manual|
|      auto|
|    manual|
|      auto|
|      auto|
|      auto|
|      auto|
|      auto|
+----------+
only showing top 20 rows



### Load the tips dataset as a spark dataframe.

In [33]:
# make the variable tips,
# use the spark session to create a tips dataframe
# tips dataframe comes from pydataset tips function
tips = spark.createDataFrame(data('tips'))

What percentage of observations are smokers?

In [35]:
tips.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [36]:
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [38]:
(tips.filter(tips.smoker == 'Yes').count() / tips.count()) * 100

38.114754098360656

In [44]:
tips.groupby(tips.smoker).agg(
    F.round((F.count('time') / tips.count()),2).alias('perc')
).show()

+------+----+
|smoker|perc|
+------+----+
|    No|0.62|
|   Yes|0.38|
+------+----+



Create a column that contains the tip percentage

In [46]:
tips.withColumn(
    'tip_perc',
    F.round((tips.tip / tips.total_bill) * 100, 2)
).select('tip','total_bill','tip_perc').show(5)

+----+----------+--------+
| tip|total_bill|tip_perc|
+----+----------+--------+
|1.01|     16.99|    5.94|
|1.66|     10.34|   16.05|
| 3.5|     21.01|   16.66|
|3.31|     23.68|   13.98|
|3.61|     24.59|   14.68|
+----+----------+--------+
only showing top 5 rows



Calculate the average tip percentage for each combination of sex and smoker.

In [51]:
tips.withColumn(
    'tip_perc',
    F.round((tips.tip / tips.total_bill) * 100, 2)
).groupby('sex','smoker').agg(
    F.round(F.mean('tip_perc'), 2).alias('avg_perc')).show()

+------+------+--------+
|   sex|smoker|avg_perc|
+------+------+--------+
|  Male|    No|   16.07|
|Female|    No|   15.69|
|  Male|   Yes|   15.28|
|Female|   Yes|   18.21|
+------+------+--------+



In [None]:
tips = tips.withColumn(
    'tip_perc',
    F.round((tips.tip / tips.total_bill) * 100, 2)
)

In [53]:
tips.withColumn(
    'tip_perc',
    F.round((tips.tip / tips.total_bill) * 100, 2)
).groupby('sex').pivot('smoker').agg(
    F.round(F.avg('tip_perc'), 2).alias('avg_perc')).show()

[Stage 104:>                                                      (0 + 12) / 12]

+------+-----+-----+
|   sex|   No|  Yes|
+------+-----+-----+
|Female|15.69|18.21|
|  Male|16.07|15.28|
+------+-----+-----+



                                                                                

### Use the seattle weather dataset referenced in the lesson to answer the questions below.

In [54]:
from vega_datasets import data
weather = data.seattle_weather()
weather = spark.createDataFrame(weather)

In [58]:
# head equivalent:
weather.show(2, vertical=True)

-RECORD 0----------------------------
 date          | 2012-01-01 00:00:00 
 precipitation | 0.0                 
 temp_max      | 12.8                
 temp_min      | 5.0                 
 wind          | 4.7                 
 weather       | drizzle             
-RECORD 1----------------------------
 date          | 2012-01-02 00:00:00 
 precipitation | 10.9                
 temp_max      | 10.6                
 temp_min      | 2.8                 
 wind          | 4.5                 
 weather       | rain                
only showing top 2 rows



In [55]:
# c to f: (x°C × 9/5) + 32 = 32°F

Convert the temperatures to fahrenheit.

In [64]:
weather = weather.withColumn('temp_min', 
                   weather.temp_min * (9/5) + 32
                  ).withColumn(
    'temp_max', 
    weather.temp_max * (9/5) + 32
                  )

In [65]:
weather.select('temp_min', 'temp_max').show(5)

+--------+------------------+
|temp_min|          temp_max|
+--------+------------------+
|    41.0|55.040000000000006|
|   37.04|             51.08|
|   44.96|             53.06|
|   42.08|             53.96|
|   37.04|48.019999999999996|
+--------+------------------+
only showing top 5 rows



Which month has the most rain, on average?

In [66]:
# weather.groupby('weather').sum('precipitation').show()

[Stage 118:>                                                      (0 + 12) / 12]

+-------+------------------+
|weather|sum(precipitation)|
+-------+------------------+
|drizzle|               1.0|
|   rain|1321.8000000000004|
|    sun|239.39999999999998|
|   snow|             208.1|
|    fog|            2655.7|
+-------+------------------+



                                                                                

In [76]:
weather.filter(weather.weather == 'rain'
).\
withColumn(
    'month',
    F.month(weather.date)
).groupby('month').agg(
    (F.mean('precipitation')).alias('mean_precip'),
    (F.sum('precipitation')).alias('total_precip')
    ).sort('total_precip').show()

+-----+-------------------+------------------+
|month|        mean_precip|      total_precip|
+-----+-------------------+------------------+
|    9|0.22499999999999998|0.8999999999999999|
|    7| 1.8785714285714286|              26.3|
|    8|  6.433333333333334|              38.6|
|    5| 3.2624999999999997|52.199999999999996|
|    4|  3.429999999999999| 68.59999999999998|
|    6|  3.952631578947368|              75.1|
|   12| 5.0260869565217385|115.59999999999998|
|    2| 3.1725000000000003|             126.9|
|    3|  4.921621621621622|             182.1|
|   10|              9.675|             193.5|
|   11|               8.42|             210.5|
|    1|  6.614285714285714|231.49999999999997|
+-----+-------------------+------------------+



Which year was the windiest?

In [81]:
weather.withColumn(
    'year',
    F.year(weather.date)
).groupby(
'year'
).agg(
 (F.mean('wind')).alias('mean_wind'),
    (F.sum('wind')).alias('total_wind')   
).sort(F.col('total_wind').desc()).first()

Row(year=2012, mean_wind=3.4008196721311483, total_wind=1244.7000000000003)

What is the most frequent type of weather in January?

In [84]:
weather.filter(
    F.month(weather.date) == 1
).groupby('weather').count().sort(F.col('count').desc()).show()

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



What is the average high and low temperature on sunny days in July in 2013 and 2014?

In [None]:
weather.select(
    F.when(weather.date.contains('2012'))

In [96]:
# steps ahead:
# filter for: july (month)
# year (2013, 2014)
# weather: sunny
# aggregate on high and low temp averages
weather.filter(
    (F.month(weather.date) == 7) &
    (F.year(weather.date).isin(['2013','2014'])) &
    (weather.weather == 'sun')
).agg(
    F.round(F.mean('temp_max'),2).alias('avg_high'), 
      F.round(F.mean('temp_min'), 2).alias('avg_low')
).show()

[Stage 173:>                                                      (0 + 12) / 12]

+--------+-------+
|avg_high|avg_low|
+--------+-------+
|   80.29|  57.53|
+--------+-------+



                                                                                

What percentage of days were rainy in q3 of 2015?

In [102]:
# filter for q3:
# filter for year
# calculate percentage of rainy days
weather.filter(
    F.quarter(weather.date) == 3).filter(
    F.year(weather.date) == 2015
).select(
    (F.when(weather.weather == 'rain', 1).otherwise(0)).alias('rain')
).agg((F.round(F.mean('rain') * 100, 2)).alias('perc_rain')).show()

+---------+
|perc_rain|
+---------+
|     2.17|
+---------+



For each year, find what percentage of days it rained (had non-zero precipitation).

In [109]:
# nonzero precipitation
weather.withColumn(
    'year',
    F.year(weather.date)).withColumn(
    'has_rained',
    F.expr('precipitation > 0').cast('int')
).groupby('year').agg(
    F.round(F.avg('has_rained') * 100, 2).alias('percent_precip')
).show()
    

[Stage 200:>                                                      (0 + 12) / 12]

+----+--------------+
|year|percent_precip|
+----+--------------+
|2012|         48.36|
|2013|         41.64|
|2014|          41.1|
|2015|         39.45|
+----+--------------+



                                                                                