#### References
https://docs.azuredatabricks.net/user-guide/visualizations/index.html<br>

https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame<br>

https://stackoverflow.com/questions/tagged/apache-spark+apache-spark-sql+python<br>
https://stackoverflow.com/questions/tagged/databricks+python

#### Run data ingest so we can use dataframe here

In [3]:
%run ./adb_3_ingest_to_df

### Dataframe API for data operations

In [5]:
# Group by state so we can count airports per state; rename count column (confusing with describe() otherwise); sort by count descending
df_airports_by_state = df_airports\
  .groupBy("state")\
  .count()\
  .withColumnRenamed("count", "n")\
  .sort("n", ascending=False)

In [6]:
# Basic descriptive statistics

display(df_airports_by_state.describe())

summary,state,n
count,56,56.0
mean,,6.517857142857143
stddev,,6.234773660478992
min,AK,1.0
max,WY,32.0


In [7]:
# Spark execution plan

df_airports_by_state.explain()

In [8]:
display(df_airports_by_state)

state,n
CA,32
TX,29
AK,22
FL,20
NY,14
MI,12
CO,11
NC,11
IL,8
WI,8


In [9]:
# We can also use dataframe select, passing it a list of column names, which emits a new dataframe that can be operated on with dataframe API

In [10]:
display(df_airports\
  .select("*")\
  .sort("name", ascending=True))

airport_id,city,state,name
10141,Aberdeen,SD,Aberdeen Regional
10136,Abilene,TX,Abilene Regional
14952,Springfield,IL,Abraham Lincoln Capital
10165,Adak Island,AK,Adak
12992,Little Rock,AR,Adams Field
10874,Akron,OH,Akron-Canton Regional
10257,Albany,NY,Albany International
13795,Jacksonville/Camp Lejeune,NC,Albert J Ellis
10140,Albuquerque,NM,Albuquerque International Sunport
10185,Alexandria,LA,Alexandria International


### Spark SQL for data operations

In [12]:
# To use a DF in explicit SQL queries, register it as a temp view (cluster lifetime scope)

df_airports.createOrReplaceTempView("df_airports")

In [13]:
# A Spark SQL SELECT query will emit a new dataframe. This is the same query as the dataframe API query above, for example.

display(sql("SELECT state, COUNT(state) AS n FROM df_airports GROUP BY state ORDER BY n DESC").limit(10))

state,n
CA,32
TX,29
AK,22
FL,20
NY,14
MI,12
CO,11
NC,11
GA,8
IL,8
