### 1. Create a Spark DataFrame that contains your favorite programming languages.

- The name of the column should be `language`
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [1]:
# imports
import pandas as pd
import pyspark

# get the notebook's Spark session (an existing one is reused, otherwise one is created)
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

In [10]:
# build a one-column pandas frame holding the favorite languages
p_df = pd.DataFrame(
    ['python', 'sql', 'html', 'ruby', 'c', 'scala'],
    columns=['language'],
)
p_df

Unnamed: 0,language
0,python
1,sql
2,html
3,ruby
4,c
5,scala


In [11]:
# convert the pandas frame into a Spark dataframe
df = spark.createDataFrame(data=p_df)

In [9]:
# print the schema tree: each column's name, type and nullability
df.printSchema()

root
 |-- language: string (nullable = true)



In [14]:
# output shape: number of rows x number of columns
row_total = df.count()
column_total = len(df.columns)
print("DataFrame shape: ", row_total, " x ", column_total)

DataFrame shape:  6  x  1


In [13]:
# display the first 5 rows of the Spark dataframe
df.show(n=5)

+--------+
|language|
+--------+
|  python|
|     sql|
|    html|
|    ruby|
|       c|
+--------+
only showing top 5 rows



### 2. Load the `mpg` dataset as a Spark DataFrame.

#### a. Create 1 column of output that contains a message like the one below for each record:

    The 1999 audi a4 has a 4 cylinder engine.

> Hint: You will need to concatenate values that already exist in the data with string literals

In [15]:
# import
from pydataset import data

# load the mpg sample dataset and hand it to Spark, then preview it
mpg = spark.createDataFrame(
    data("mpg")
)
mpg.show(n=5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [28]:
# imports
from pyspark.sql.functions import concat, lit

# build one sentence per record by interleaving column values with string literals
mpg.select(
    concat(
        lit("The "), mpg["year"],
        lit(" "), mpg["manufacturer"],
        lit(" "), mpg["model"],
        lit(" has a "), mpg["cyl"],
        lit(" cylinder engine."),
    )
).show(n=5, truncate=False)

+------------------------------------------------------------------------------+
|concat(The , year,  , manufacturer,  , model,  has a , cyl,  cylinder engine.)|
+------------------------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.                                     |
|The 1999 audi a4 has a 4 cylinder engine.                                     |
|The 2008 audi a4 has a 4 cylinder engine.                                     |
|The 2008 audi a4 has a 4 cylinder engine.                                     |
|The 1999 audi a4 has a 6 cylinder engine.                                     |
+------------------------------------------------------------------------------+
only showing top 5 rows



#### b. Transform the `trans` column so that it only contains either `manual` or `auto`.

> Hint: Consider spark string methods and `when().otherwise()` chaining

In [31]:
# look at the first 10 rows to see what the trans values look like
mpg.show(n=10)

+------------+----------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|     model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+----------+-----+----+---+----------+---+---+---+---+-------+
|        audi|        a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|        a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|        a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|        a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|        a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|        a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|        a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|compact|
|        audi|a4 quattro|  2.0|2008|  4|manual(m6)|  4| 20| 28|  p|compact|
+------------+----------+-----+----+---+----------+---+---+---+---+-------+
only showing top 10 rows

In [51]:
# import
from pyspark.sql.functions import when, regexp_extract, regexp_replace

# Reduce `trans` (values such as "auto(l5)" / "manual(m5)") to a plain
# "auto" / "manual" label, shown next to the original value for comparison.
# Each label has its own explicit prefix condition: with a single catch-all
# .otherwise("manual"), a null or unexpected trans value would be silently
# reported as "manual". Without a final .otherwise(), rows that match neither
# prefix come out as null, so they stand out instead of being mislabeled.
mpg.select(mpg.trans,
           when(mpg.trans.like("auto%"), "auto")
           .when(mpg.trans.like("manual%"), "manual")
           .alias("trans_type")
          ).show()

+----------+----------+
|     trans|trans_type|
+----------+----------+
|  auto(l5)|      auto|
|manual(m5)|    manual|
|manual(m6)|    manual|
|  auto(av)|      auto|
|  auto(l5)|      auto|
|manual(m5)|    manual|
|  auto(av)|      auto|
|manual(m5)|    manual|
|  auto(l5)|      auto|
|manual(m6)|    manual|
|  auto(s6)|      auto|
|  auto(l5)|      auto|
|manual(m5)|    manual|
|  auto(s6)|      auto|
|manual(m6)|    manual|
|  auto(l5)|      auto|
|  auto(s6)|      auto|
|  auto(s6)|      auto|
|  auto(l4)|      auto|
|  auto(l4)|      auto|
+----------+----------+
only showing top 20 rows

