In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = (SparkSession
        .builder
        .appName("SparkSQLExampleApp")
        .getOrCreate())

In [3]:
csv_file = "./flights/departuredelays.csv"

In [4]:
# Read and create a temporary view
# Infer schema (note that for larger files you
# may want to specify the schema)
df = (spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(csv_file))

In [5]:
df.show(10)

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011245|    6|     602|   ABE|        ATL|
|1020600|   -8|     369|   ABE|        DTW|
|1021245|   -2|     602|   ABE|        ATL|
|1020605|   -4|     602|   ABE|        ATL|
|1031245|   -4|     602|   ABE|        ATL|
|1030605|    0|     602|   ABE|        ATL|
|1041243|   10|     602|   ABE|        ATL|
|1040605|   28|     602|   ABE|        ATL|
|1051245|   88|     602|   ABE|        ATL|
|1050605|    9|     602|   ABE|        ATL|
+-------+-----+--------+------+-----------+
only showing top 10 rows



In [6]:
df.createOrReplaceTempView("us_delay_flights_tbl")

In [7]:
# In Python
# schema = "'date' STRING, 'delay' INT, 'distance' INT,'origin' STRING, 'destination' STRING"

In [8]:
spark.sql("""DESC us_delay_flights_tbl""").show()

+-----------+---------+-------+
|   col_name|data_type|comment|
+-----------+---------+-------+
|       date|      int|   NULL|
|      delay|      int|   NULL|
|   distance|      int|   NULL|
|     origin|   string|   NULL|
|destination|   string|   NULL|
+-----------+---------+-------+



In [9]:
spark.sql("""select * from us_delay_flights_tbl limit 10""").show()

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011245|    6|     602|   ABE|        ATL|
|1020600|   -8|     369|   ABE|        DTW|
|1021245|   -2|     602|   ABE|        ATL|
|1020605|   -4|     602|   ABE|        ATL|
|1031245|   -4|     602|   ABE|        ATL|
|1030605|    0|     602|   ABE|        ATL|
|1041243|   10|     602|   ABE|        ATL|
|1040605|   28|     602|   ABE|        ATL|
|1051245|   88|     602|   ABE|        ATL|
|1050605|    9|     602|   ABE|        ATL|
+-------+-----+--------+------+-----------+



In [10]:
spark.sql("""SELECT distance, origin, destination
FROM us_delay_flights_tbl WHERE distance > 1000
ORDER BY distance DESC""").show(10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [11]:
spark.sql("""SELECT date, delay, origin, destination
FROM us_delay_flights_tbl
WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD'
ORDER by delay DESC""").show(10)

+-------+-----+------+-----------+
|   date|delay|origin|destination|
+-------+-----+------+-----------+
|2190925| 1638|   SFO|        ORD|
|1031755|  396|   SFO|        ORD|
|1022330|  326|   SFO|        ORD|
|1051205|  320|   SFO|        ORD|
|1190925|  297|   SFO|        ORD|
|2171115|  296|   SFO|        ORD|
|1071040|  279|   SFO|        ORD|
|1051550|  274|   SFO|        ORD|
|3120730|  266|   SFO|        ORD|
|1261104|  258|   SFO|        ORD|
+-------+-----+------+-----------+
only showing top 10 rows



In [12]:
spark.sql("""SELECT date, concat(substr(date, 1, 1), '-', 
substr(date, 2, 2), ' ', substr(date, 4, 2), ':', substr(date, 6, 2), ' ') DATE,
delay, origin, destination
FROM us_delay_flights_tbl
WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD'
ORDER by delay DESC""").show(10)

+-------+-----------+-----+------+-----------+
|   date|       DATE|delay|origin|destination|
+-------+-----------+-----+------+-----------+
|2190925|2-19 09:25 | 1638|   SFO|        ORD|
|1031755|1-03 17:55 |  396|   SFO|        ORD|
|1022330|1-02 23:30 |  326|   SFO|        ORD|
|1051205|1-05 12:05 |  320|   SFO|        ORD|
|1190925|1-19 09:25 |  297|   SFO|        ORD|
|2171115|2-17 11:15 |  296|   SFO|        ORD|
|1071040|1-07 10:40 |  279|   SFO|        ORD|
|1051550|1-05 15:50 |  274|   SFO|        ORD|
|3120730|3-12 07:30 |  266|   SFO|        ORD|
|1261104|1-26 11:04 |  258|   SFO|        ORD|
+-------+-----------+-----+------+-----------+
only showing top 10 rows



In [14]:
spark.sql("""SELECT any_value(date), delay, 
FROM us_delay_flights_tbl
WHERE delay > 120
GROUP BY SUBSTR(date, 1, 1), delay
ORDER by delay DESC""").show(10)

AnalysisException: [UNRESOLVED_COLUMN.WITHOUT_SUGGESTION] A column or function parameter with name `delay` cannot be resolved. ; line 3 pos 6;
'Sort ['delay DESC NULLS LAST], true
+- 'Aggregate ['SUBSTR('date, 1, 1), 'delay], [unresolvedalias(any_value('date, false), None), 'delay, 'FROM AS us_delay_flights_tbl#182]
   +- 'Filter ('delay > 120)
      +- OneRowRelation


In [53]:
spark.sql("""SELECT any_value(date), delay,  concat(substr(date, 1, 1), '-', 
substr(date, 2, 2), ' ', substr(date, 4, 2), ':', substr(date, 6, 2), ' ') DATE
FROM us_delay_flights_tbl
WHERE delay > 120
GROUP BY SUBSTR(date, 1, 1), delay
ORDER by delay DESC""").show(10)

AnalysisException: [MISSING_AGGREGATION] The non-aggregating expression "date" is based on columns which are not participating in the GROUP BY clause.
Add the columns or the expression to the GROUP BY, aggregate the expression, or use "any_value(date)" if you do not care which of the values within a group is returned.;
Sort [delay#18 DESC NULLS LAST], true
+- Aggregate [substr(cast(date#17 as string), 1, 1), delay#18], [any_value(date#17, false) AS any_value(date)#810, delay#18, concat(substr(cast(date#17 as string), 1, 1), -, substr(cast(date#17 as string), 2, 2),  , substr(cast(date#17 as string), 4, 2), :, substr(cast(date#17 as string), 6, 2),  ) AS DATE#809]
   +- Filter (delay#18 > 120)
      +- SubqueryAlias us_delay_flights_tbl
         +- View (`us_delay_flights_tbl`, [date#17,delay#18,distance#19,origin#20,destination#21])
            +- Relation [date#17,delay#18,distance#19,origin#20,destination#21] csv


In [49]:
spark.sql("""SELECT date, delay, concat(substr(date, 1, 1), '-', 
substr(date, 2, 2), ' ', substr(date, 4, 2), ':', substr(date, 6, 2), ' ') DATE,
FROM us_delay_flights_tbl
WHERE delay > 120
GROUP BY SUBSTR(date, 1, 1)
ORDER by delay DESC""").show(10)

AnalysisException: [UNRESOLVED_COLUMN.WITHOUT_SUGGESTION] A column or function parameter with name `delay` cannot be resolved. ; line 4 pos 6;
'Sort ['delay DESC NULLS LAST], true
+- 'Aggregate ['SUBSTR('date, 1, 1)], ['date, 'delay, 'concat('substr('date, 1, 1), -, 'substr('date, 2, 2),  , 'substr('date, 4, 2), :, 'substr('date, 6, 2),  ) AS DATE#782, 'FROM AS us_delay_flights_tbl#783]
   +- 'Filter ('delay > 120)
      +- OneRowRelation


In [15]:
spark.sql("""SELECT delay, origin, destination,
CASE
WHEN delay > 360 THEN 'Very Long Delays'
WHEN delay > 120 AND delay < 360 THEN 'Long Delays'
WHEN delay > 60 AND delay < 120 THEN 'Short Delays'
WHEN delay > 0 and delay < 60 THEN 'Tolerable Delays'
WHEN delay = 0 THEN 'No Delays'
ELSE 'Early'
END AS Flight_Delays
FROM us_delay_flights_tbl
ORDER BY origin, delay DESC""").show(10)

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
+-----+------+-----------+-------------+
only showing top 10 rows



ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it