In [1]:
import pandas as pd
import numpy as np

np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

Convert the pandas dataframe to a spark dataframe. From this point
       forward, do all of your work with the spark dataframe, not the pandas
       dataframe.

In [2]:
import pyspark
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean, lit, regexp_extract, regexp_replace
spark = pyspark.sql.SparkSession.builder.getOrCreate()

Show the first 3 rows of the dataframe.

In [3]:
df = spark.createDataFrame(pandas_dataframe)
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



Show the first 7 rows of the dataframe.

In [4]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



View a summary of the data using `.describe`.

In [5]:
df.describe().show()

+-------+-------------------+-----+
|summary|                  n|group|
+-------+-------------------+-----+
|  count|                 20|   20|
|   mean|0.36640264498852165| null|
| stddev| 0.8905322898155364| null|
|    min| -1.261605945319069|    x|
|    max| 2.1503829673811126|    z|
+-------+-------------------+-----+



Use `.select` to create a new dataframe with just the `n` and `abool`
       columns. View the first 5 rows of this dataframe.

In [6]:
df.select('n', df.abool).show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



Use `.select` to create a new dataframe with just the `group` and `abool`
       columns. View the first 5 rows of this dataframe.

In [7]:
df2 = df.select(df.group, df.abool)
df2.show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



Use `.select` to create a new dataframe with the `group` column and the
       `abool` column renamed to `a_boolean_value`. Show the first 3 rows of
       this dataframe.

In [8]:
df3 = df.select(df.group, df.abool.alias('a_boolean_value'))
df3.show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



Use `.select` to create a new dataframe with the `group` column and the
       `n` column renamed to `a_numeric_value`. Show the first 6 rows of this
       dataframe.

In [9]:
df4 = df.select(df.group, df.n.alias('a_numeric_value'))
df4.show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



Use `.select` to add 4 to the `n` column. Show the results.

In [10]:
df.select(df.n, df.n +4).show()

+--------------------+------------------+
|                   n|           (n + 4)|
+--------------------+------------------+
|  -0.712390662050588|3.2876093379494122|
|   0.753766378659703| 4.753766378659703|
|-0.04450307833805...|3.9554969216619464|
| 0.45181233874578974|  4.45181233874579|
|  1.3451017084510097|5.3451017084510095|
|  0.5323378882945463| 4.532337888294546|
|  1.3501878997225267| 5.350187899722527|
|  0.8612113741693206|  4.86121137416932|
|  1.4786857374358966| 5.478685737435897|
| -1.0453771305385342| 2.954622869461466|
| -0.7889890249515489|3.2110109750484512|
|  -1.261605945319069| 2.738394054680931|
|  0.5628467852810314| 4.562846785281032|
|-0.24332625188556253|3.7566737481144377|
|  0.9137407048596775| 4.913740704859677|
| 0.31735092273633597| 4.317350922736336|
| 0.12730328020698067| 4.127303280206981|
|  2.1503829673811126| 6.150382967381113|
|  0.6062886568962988| 4.606288656896298|
|-0.02677164998644...|3.9732283500135592|
+--------------------+------------

Subtract 5 from the `n` column and view the results.

In [11]:
df.select(df.n, df.n -5).show(5)

+--------------------+-------------------+
|                   n|            (n - 5)|
+--------------------+-------------------+
|  -0.712390662050588| -5.712390662050588|
|   0.753766378659703| -4.246233621340297|
|-0.04450307833805...| -5.044503078338053|
| 0.45181233874578974|  -4.54818766125421|
|  1.3451017084510097|-3.6548982915489905|
+--------------------+-------------------+
only showing top 5 rows



Multiply the `n` column by 2. View the results along with the original
       numbers.

In [12]:
df.select(df.n, df.n*2).show()

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|
|   0.753766378659703|   1.507532757319406|
|-0.04450307833805...|-0.08900615667610691|
| 0.45181233874578974|  0.9036246774915795|
|  1.3451017084510097|  2.6902034169020195|
|  0.5323378882945463|  1.0646757765890926|
|  1.3501878997225267|  2.7003757994450535|
|  0.8612113741693206|  1.7224227483386412|
|  1.4786857374358966|   2.957371474871793|
| -1.0453771305385342| -2.0907542610770684|
| -0.7889890249515489| -1.5779780499030978|
|  -1.261605945319069|  -2.523211890638138|
|  0.5628467852810314|  1.1256935705620628|
|-0.24332625188556253|-0.48665250377112507|
|  0.9137407048596775|   1.827481409719355|
| 0.31735092273633597|  0.6347018454726719|
| 0.12730328020698067| 0.25460656041396135|
|  2.1503829673811126|   4.300765934762225|
|  0.6062886568962988|  1.2125773137925977|
|-0.02677164998644...|-0.0535432

Add a new column named `n2` that is the `n` value multiplied by -1. Show
       the first 4 rows of your dataframe. You should see the original `n` value
       as well as `n2`.

In [13]:
from pyspark.sql.functions import col, expr

In [14]:
col('n')

Column<b'n'>

In [15]:
df.select(
    col('n'),
    (col('n')*-1).alias('n2')).show(5)

+--------------------+--------------------+
|                   n|                  n2|
+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|
|   0.753766378659703|  -0.753766378659703|
|-0.04450307833805...|0.044503078338053455|
| 0.45181233874578974|-0.45181233874578974|
|  1.3451017084510097| -1.3451017084510097|
+--------------------+--------------------+
only showing top 5 rows



Add a new column named `n3` that is the n value squared. Show the first 5
       rows of your dataframe. You should see both `n`, `n2`, and `n3`.

In [16]:
df.select(
    col('n'),
    (col('n')*-1).alias('n2'),
    (col('n') ** 2 ).alias('n3')
).show(5)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
+--------------------+--------------------+--------------------+
only showing top 5 rows



What happens when you run the code below? 

In [17]:
df.group + df.abool

Column<b'(group + abool)'>

Try adding various other columns together. What are the results of
       combining the different data types?

In [18]:
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



In [19]:
df.n + df.abool

Column<b'(n + abool)'>

In [20]:
(df.n + df.group).show()

TypeError: 'Column' object is not callable

Use `.printSchema` to view the datatypes in your dataframe.  df.Use `.dtypes` to view the datatypes in your dataframe.

In [21]:
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)



In [22]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

What is the difference between the two code samples below?

In [23]:
df.abool.cast('int').show()

TypeError: 'Column' object is not callable

In [24]:
df.select(df.abool.cast('int')).show()

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



Use `.select` and `.cast` to convert the `abool` column to an integer
       type. View the results.

In [25]:
df.select(df.abool.cast('int')).show()

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



Convert the `group` column to a integer data type and view the results.
       What happens?

In [26]:
df.select(df.group.cast('int')).show()

+-----+
|group|
+-----+
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
+-----+



Convert the `n` column to a integer data type and view the results. What
       happens?

In [27]:
df.select(df.n.cast('int')).show()

+---+
|  n|
+---+
|  0|
|  0|
|  0|
|  0|
|  1|
|  0|
|  1|
|  0|
|  1|
| -1|
|  0|
| -1|
|  0|
|  0|
|  0|
|  0|
|  0|
|  2|
|  0|
|  0|
+---+



Convert the `abool` column to a string data type and view the results.
       What happens?

In [28]:
df.select(df.abool.cast('string')).show()

+-----+
|abool|
+-----+
|false|
|false|
|false|
|false|
|false|
|false|
|false|
|false|
| true|
| true|
|false|
|false|
| true|
| true|
|false|
|false|
|false|
| true|
|false|
| true|
+-----+



Find the highest `n` value.
    1. Find the lowest `n` value.
    1. Find the average `n` value.

In [29]:
df.createOrReplaceTempView("df")

In [30]:
from pyspark.sql.functions import round

In [31]:
df.select(min(df.n)).round(3).show()

AttributeError: 'DataFrame' object has no attribute 'round'

In [None]:
df.select(max(df.n)).show()

In [None]:
df.select(mean(df.n)).show()

Use `concat` to change the `group` column to say, e.g. "Group: x" or
       "Group: y"

In [None]:
df.select(concat(lit('Group: '), col('group' ))).show()

Use `concat` to combine the `n` and `group` columns to produce results
       that look like this: "x: -1.432" or "z: 2.352"

In [None]:
df.select(concat(col('group'), lit(': '), col('n' ))).show()

In [32]:
from pyspark.sql.functions import regexp_extract, regexp_replace

Use the starter code above to re-create a spark dataframe.
    1. Use `.filter` or `.where` to select just the rows where the group is `y`
       and view the results.
    1. Select just the columns where the `abool` column is false and view the
       results.
    1. Find the columns where the `group` column is *not* `y`.
    1. Find the columns where `n` is positive.
    1. Find the columns where `abool` is true and the `group` column is `z`.
    1. Find the columns where `abool` is true or the `group` column is `z`.
    1. Find the columns where `abool` is false and `n` is less than 1
    1. Find the columns where `abool` is false or `n` is less than 1

In [33]:
df.filter(df.group == 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
| -1.0453771305385342|    y| true|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
|  2.1503829673811126|    y| true|
+--------------------+-----+-----+



In [34]:
df.filter(df.abool == 'false').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+



In [35]:
df.filter(df.group != 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -0.7889890249515489|    x|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



In [36]:
df.filter(df.n > 0).show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  0.753766378659703|    x|false|
|0.45181233874578974|    y|false|
| 1.3451017084510097|    z|false|
| 0.5323378882945463|    y|false|
| 1.3501878997225267|    z|false|
| 0.8612113741693206|    x|false|
| 1.4786857374358966|    z| true|
| 0.5628467852810314|    y| true|
| 0.9137407048596775|    y|false|
|0.31735092273633597|    x|false|
|0.12730328020698067|    z|false|
| 2.1503829673811126|    y| true|
| 0.6062886568962988|    x|false|
+-------------------+-----+-----+



In [38]:
df.filter(df.abool == True).where(df.group == 'z').show()

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.4786857374358966|    z| true|
+------------------+-----+-----+



In [43]:
df.filter((df.abool == True) | (df.group == 'z')).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



In [45]:
df.filter(df.abool == False).where(df.n < 1).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.8612113741693206|    x|false|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+



In [46]:
df.filter((df.abool == False) | (df.n < 1)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



When / Otherwise

    1. Use the starter code above to re-create a spark dataframe.
    1. Use `when` and `.otherwise` to create a column that contains the text "It
       is true" when `abool` is true and "It is false"" when `abool` is false.
    1. Create a column that contains 0 if n is less than 0, otherwise, the
       original n value.

In [48]:
from pyspark.sql.functions import when

In [50]:
df.select(
    df.abool,
    when(df.abool == True, 'It is true')
    .otherwise('It is false')
    .alias('T/F')
).show(2)

+-----+-----------+
|abool|        T/F|
+-----+-----------+
|false|It is false|
|false|It is false|
+-----+-----------+
only showing top 2 rows



In [51]:
df.select(
    df.n,
    when(df.n < 0, 0)
    .otherwise(df.n)
    .alias('Less than 0')
).show(20)

+--------------------+-------------------+
|                   n|        Less than 0|
+--------------------+-------------------+
|  -0.712390662050588|                0.0|
|   0.753766378659703|  0.753766378659703|
|-0.04450307833805...|                0.0|
| 0.45181233874578974|0.45181233874578974|
|  1.3451017084510097| 1.3451017084510097|
|  0.5323378882945463| 0.5323378882945463|
|  1.3501878997225267| 1.3501878997225267|
|  0.8612113741693206| 0.8612113741693206|
|  1.4786857374358966| 1.4786857374358966|
| -1.0453771305385342|                0.0|
| -0.7889890249515489|                0.0|
|  -1.261605945319069|                0.0|
|  0.5628467852810314| 0.5628467852810314|
|-0.24332625188556253|                0.0|
|  0.9137407048596775| 0.9137407048596775|
| 0.31735092273633597|0.31735092273633597|
| 0.12730328020698067|0.12730328020698067|
|  2.1503829673811126| 2.1503829673811126|
|  0.6062886568962988| 0.6062886568962988|
|-0.02677164998644...|                0.0|
+----------

Sorting

    1. Use the starter code above to re-create a spark dataframe.
    1. Sort by the `n` value.
    1. Sort by the `group` value, both ascending and descending.
    1. Sort by the group value first, then, within each group, sort by `n`
       value.
    1. Sort by `abool`, `group`, and `n`. Does it matter in what order you
       specify the columns when sorting?



In [52]:
from pyspark.sql.functions import asc, desc

In [54]:
df.sort(df.n.desc()).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  2.1503829673811126|    y| true|
|  1.4786857374358966|    z| true|
|  1.3501878997225267|    z|false|
|  1.3451017084510097|    z|false|
|  0.9137407048596775|    y|false|
|  0.8612113741693206|    x|false|
|   0.753766378659703|    x|false|
|  0.6062886568962988|    x|false|
|  0.5628467852810314|    y| true|
|  0.5323378882945463|    y|false|
| 0.45181233874578974|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|-0.02677164998644...|    x| true|
|-0.04450307833805...|    z|false|
|-0.24332625188556253|    y| true|
|  -0.712390662050588|    z|false|
| -0.7889890249515489|    x|false|
| -1.0453771305385342|    y| true|
|  -1.261605945319069|    y|false|
+--------------------+-----+-----+



In [55]:
df.sort(df.n.asc()).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -0.712390662050588|    z|false|
|-0.24332625188556253|    y| true|
|-0.04450307833805...|    z|false|
|-0.02677164998644...|    x| true|
| 0.12730328020698067|    z|false|
| 0.31735092273633597|    x|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.5628467852810314|    y| true|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  0.9137407048596775|    y|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
|  2.1503829673811126|    y| true|
+--------------------+-----+-----+



In [56]:
df.sort(col('n').desc()).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  2.1503829673811126|    y| true|
|  1.4786857374358966|    z| true|
|  1.3501878997225267|    z|false|
|  1.3451017084510097|    z|false|
|  0.9137407048596775|    y|false|
|  0.8612113741693206|    x|false|
|   0.753766378659703|    x|false|
|  0.6062886568962988|    x|false|
|  0.5628467852810314|    y| true|
|  0.5323378882945463|    y|false|
| 0.45181233874578974|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|-0.02677164998644...|    x| true|
|-0.04450307833805...|    z|false|
|-0.24332625188556253|    y| true|
|  -0.712390662050588|    z|false|
| -0.7889890249515489|    x|false|
| -1.0453771305385342|    y| true|
|  -1.261605945319069|    y|false|
+--------------------+-----+-----+



In [57]:
df.sort(desc('n')).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  2.1503829673811126|    y| true|
|  1.4786857374358966|    z| true|
|  1.3501878997225267|    z|false|
|  1.3451017084510097|    z|false|
|  0.9137407048596775|    y|false|
|  0.8612113741693206|    x|false|
|   0.753766378659703|    x|false|
|  0.6062886568962988|    x|false|
|  0.5628467852810314|    y| true|
|  0.5323378882945463|    y|false|
| 0.45181233874578974|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|-0.02677164998644...|    x| true|
|-0.04450307833805...|    z|false|
|-0.24332625188556253|    y| true|
|  -0.712390662050588|    z|false|
| -0.7889890249515489|    x|false|
| -1.0453771305385342|    y| true|
|  -1.261605945319069|    y|false|
+--------------------+-----+-----+



mpg.sort(mpg.hwy.desc())

is the same as
mpg.sort(col("hwy").desc())

is the same as
mpg.sort(desc("hwy")).show(5)

Sort by the group value first, then, within each group, sort by `n`
   value.

In [59]:
df.sort(asc('group'), df.n.desc()).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  0.8612113741693206|    x|false|
|   0.753766378659703|    x|false|
|  0.6062886568962988|    x|false|
| 0.31735092273633597|    x|false|
|-0.02677164998644...|    x| true|
| -0.7889890249515489|    x|false|
|  2.1503829673811126|    y| true|
|  0.9137407048596775|    y|false|
|  0.5628467852810314|    y| true|
|  0.5323378882945463|    y|false|
| 0.45181233874578974|    y|false|
|-0.24332625188556253|    y| true|
| -1.0453771305385342|    y| true|
|  -1.261605945319069|    y|false|
|  1.4786857374358966|    z| true|
|  1.3501878997225267|    z|false|
|  1.3451017084510097|    z|false|
| 0.12730328020698067|    z|false|
|-0.04450307833805...|    z|false|
|  -0.712390662050588|    z|false|
+--------------------+-----+-----+



In [None]:
mpg.sort(desc("class"), mpg.cyl.asc(), col("hwy").desc()).show()

Sort by `abool`, `group`, and `n`. Does it matter in what order you
   specify the columns when sorting?

In [64]:
df.sort(desc('abool'), 
        col('group').desc(), 
        col('n').desc()
       ).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.4786857374358966|    z| true|
|  2.1503829673811126|    y| true|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
| -1.0453771305385342|    y| true|
|-0.02677164998644...|    x| true|
|  1.3501878997225267|    z|false|
|  1.3451017084510097|    z|false|
| 0.12730328020698067|    z|false|
|-0.04450307833805...|    z|false|
|  -0.712390662050588|    z|false|
|  0.9137407048596775|    y|false|
|  0.5323378882945463|    y|false|
| 0.45181233874578974|    y|false|
|  -1.261605945319069|    y|false|
|  0.8612113741693206|    x|false|
|   0.753766378659703|    x|false|
|  0.6062886568962988|    x|false|
| 0.31735092273633597|    x|false|
| -0.7889890249515489|    x|false|
+--------------------+-----+-----+



In [65]:
df.sort(desc('group'), 
        col('abool').desc(), 
        col('n').desc()
       ).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.4786857374358966|    z| true|
|  1.3501878997225267|    z|false|
|  1.3451017084510097|    z|false|
| 0.12730328020698067|    z|false|
|-0.04450307833805...|    z|false|
|  -0.712390662050588|    z|false|
|  2.1503829673811126|    y| true|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
| -1.0453771305385342|    y| true|
|  0.9137407048596775|    y|false|
|  0.5323378882945463|    y|false|
| 0.45181233874578974|    y|false|
|  -1.261605945319069|    y|false|
|-0.02677164998644...|    x| true|
|  0.8612113741693206|    x|false|
|   0.753766378659703|    x|false|
|  0.6062886568962988|    x|false|
| 0.31735092273633597|    x|false|
| -0.7889890249515489|    x|false|
+--------------------+-----+-----+



Aggregating

    1. What is the average `n` value for each group in the `group` column?
    1. What is the maximum `n` value for each group in the `group` column?
    1. What is the minimum `n` value by `abool`?
    1. What is the average `n` value for each unique combination of the `group`
       and `abool` column?

In [69]:
df.groupBy(df.group).agg(avg(df.n)).show()

+-----+-------------------+
|group|             avg(n)|
+-----+-------------------+
|    x|0.28714277625394485|
|    z|  0.590730814237962|
|    y| 0.2576014196023739|
+-----+-------------------+



In [70]:
df.groupBy(df.group).agg(max(df.n)).show()

+-----+------------------+
|group|            max(n)|
+-----+------------------+
|    x|0.8612113741693206|
|    z|1.4786857374358966|
|    y|2.1503829673811126|
+-----+------------------+



In [71]:
df.groupBy(df.abool).agg(min(df.n)).show()

+-----+-------------------+
|abool|             min(n)|
+-----+-------------------+
| true|-1.0453771305385342|
|false| -1.261605945319069|
+-----+-------------------+



#### Use a pivot table

In [73]:
df.groupBy('abool').pivot('group').agg(mean('n')).show()

+-----+--------------------+-------------------+-------------------+
|abool|                   x|                  y|                  z|
+-----+--------------------+-------------------+-------------------+
| true|-0.02677164998644...|0.35613159255951177| 1.4786857374358966|
|false|   0.349925661502022|0.15907124664523611|0.41313982959837514|
+-----+--------------------+-------------------+-------------------+



What is the average `n` value for each unique combination of the `group`
   and `abool` column?