In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("chapter10").getOrCreate()

df = spark.read.format("csv").option("header", "true").option("header", "true").load("flight-data/csv/2010-summary.csv")

df.createOrReplaceTempView("someview")
df.write.mode("overwrite").saveAsTable("flights_someview")
spark.sql("select * from flights_someview").show(5,False)

spark.sql(""" select dest_country_name, origin_country_name, sum(count) as sum_count
from someview group by dest_country_name, origin_country_name """)\
.where("dest_country_name like 'S%'").where(col("sum_count") > 25).show(10, False)


spark.sql("drop table if exists flights")
spark.sql("""CREATE TABLE if not exists flights (
  DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count LONG)
USING JSON location 'flight-data/json/2015-summary.json'""")
# workaround started
flightDf = spark.read.format("json").load('flight-data/json/2015-summary.json')
flightDf.write.mode("overwrite").saveAsTable("flights")
# workaround ended
spark.sql("select * from flights").show(5, False)
spark.sql("show tables").show()

spark.sql("drop table flights_csv")
spark.sql(""" create table if not exists flights_csv(dest_country_name string, origin_country_name string, count long) 
using csv options (header true, path 'flight-data/csv/2010-summary.csv')""")
# workaround start
flightsCsvDf = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load('flight-data/csv/2015-summary.csv')
flightsCsvDf.write.mode("overwrite").saveAsTable("flights_csv")
# workaround end
spark.sql("select * from flights_csv").show(10, False)
# spark.sql("show tables").show()

# spark.sql(""" create table if not exists flights_from_select using parquet as select * from flights""")

# spark.sql("show tables").show()

# spark.sql(""" create table if not exists flights_from_hive(dest_country_name string, origin_country_name string, count long)
# row format delimited fields terminated by ',' location 'flight-data-hive'""")

# spark.sql("show tables").show()

spark.sql(""" insert into flights_from_select select * from flights limit 20""")

spark.sql("select * from flights_from_select").show(20, False)

spark.sql("describe table flights_csv").show()

spark.sql("""create view if not exists flights_usa as select * from flights 
where dest_country_name='United States' or origin_country_name='United States'""")

spark.sql("select * from flights_usa").show(10, False)

spark.sql("create or replace view flights_view as select (dest_country_name, origin_country_name) as complex, count from flights_csv")
spark.sql("select * from flights_view").show(5, False)
spark.sql("select complex.dest_country_name, count from flights_view").show(5, False)

spark.sql("""select S.* from (select dest_country_name, collect_list(count) as countList, 
collect_set(origin_country_name) as originSet from 
flights_csv group by dest_country_name) S where size(S.originSet) > 1 """).show(1, False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |1    |
|United States    |Ireland            |264  |
|United States    |India              |69   |
|Egypt            |United States      |24   |
|Equatorial Guinea|United States      |1    |
+-----------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+---------+
|dest_country_name|origin_country_name|sum_count|
+-----------------+-------------------+---------+
|Sint Maarten     |United States      |61.0     |
|Senegal          |United States      |29.0     |
|Samoa            |United States      |28.0     |
|Saint Barthelemy |United States      |28.0     |
|South Korea      |United States      |683.0    |
|Sweden           |United States      |65.0     |
|Saint Lucia      |United States      |116.0    |
|Saudi Arabia     |United States      |42.0     |
|Spain     