In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Spark DataFrames").getOrCreate()

In [0]:
spark.sql("""select 1 + 1 as compute""").show()

+-------+
|compute|
+-------+
|      2|
+-------+



In [0]:
flight_data = spark.read.option('header',True).csv("/tmp/2015-summary.csv")

In [0]:
import os
os.makedirs("/dbfs/tmp/", exist_ok=True)

In [0]:
import urllib.request

url = "https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/refs/heads/master/data/flight-data/csv/2015-summary.csv"

path = "/dbfs/tmp/2015-summary.csv"

urllib.request.urlretrieve(url, path)

('/dbfs/tmp/2015-summary.csv', <http.client.HTTPMessage at 0x771ca350ecd0>)

In [0]:
spark.read.options(header=True).csv("/tmp/2015-summary.csv").createOrReplaceTempView("some_sql_view") 

In [0]:
%sql
select * from some_sql_view limit 5

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62


In [0]:
spark.sql("select sum(count), DEST_COUNTRY_NAME from some_sql_view group by DEST_COUNTRY_NAME") \
    .where("DEST_COUNTRY_NAME like 'S%'").where("`sum(count)` > 10").count()

12

In [0]:
%sql
create table flights_from_select using parquet as select * from some_sql_view;

In [0]:
%sql
CREATE TABLE partitioned_flights USING parquet PARTITIONED BY (DEST_COUNTRY_NAME)
AS SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count FROM some_sql_view LIMIT 5

In [0]:
%sql
INSERT INTO flights_from_select
SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count FROM some_sql_view LIMIT 20

In [0]:
%sql
describe table flights_from_select

col_name,data_type,comment
DEST_COUNTRY_NAME,string,
ORIGIN_COUNTRY_NAME,string,
count,string,


In [0]:
%sql
show partitions partitioned_flights

partition
DEST_COUNTRY_NAME=Egypt
DEST_COUNTRY_NAME=United States


In [0]:
%sql
REFRESH table partitioned_flights

In [0]:
%sql
show partitions partitioned_flights

partition
DEST_COUNTRY_NAME=Egypt
DEST_COUNTRY_NAME=United States


In [0]:
%sql
MSCK REPAIR TABLE partitioned_flights

In [0]:
%sql
DROP TABLE IF EXISTS flights_csv;

In [0]:
%sql
cache table flights_from_select

In [0]:
%sql
uncache table flights_from_select

In [0]:
%sql
create view just_usa_view as select * from flights_from_select where DEST_COUNTRY_NAME = 'United States'

In [0]:
%sql
CREATE TEMP VIEW just_usa_view_temp AS
SELECT * FROM flights_from_select WHERE dest_country_name = 'United States'

In [0]:
%sql
CREATE GLOBAL TEMP VIEW just_usa_global_view_temp AS
SELECT * FROM flights_from_select WHERE dest_country_name = 'United States'

In [0]:
%sql
CREATE OR REPLACE TEMP VIEW just_usa_view_temp AS
SELECT * FROM flights_from_select WHERE dest_country_name = 'United States'

In [0]:
%sql
select * from flights_from_select limit 5

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62


In [0]:
%sql
EXPLAIN SELECT * FROM just_usa_view

plan
"== Physical Plan == *(1) ColumnarToRow +- PhotonResultStage  +- PhotonScan parquet spark_catalog.default.flights_from_select[DEST_COUNTRY_NAME#1858,ORIGIN_COUNTRY_NAME#1859,count#1860] DataFilters: [isnotnull(DEST_COUNTRY_NAME#1858), (DEST_COUNTRY_NAME#1858 = United States)], DictionaryFilters: [(DEST_COUNTRY_NAME#1858 = United States)], Format: parquet, Location: InMemoryFileIndex(1 paths)[dbfs:/user/hive/warehouse/flights_from_select], OptionalDataFilters: [], PartitionFilters: [], ReadSchema: struct, RequiredDataFilters: [isnotnull(DEST_COUNTRY_NAME#1858), (DEST_COUNTRY_NAME#1858 = United States)] == Photon Explanation == The query is fully supported by Photon."


In [0]:
%sql
DROP VIEW IF EXISTS just_usa_view;

In [0]:
%sql
CREATE DATABASE some_db

In [0]:
%sql
USE some_db

In [0]:
%sql
SHOW tables

database,tableName,isTemporary
,_sqldf,True
,just_usa_view_temp,True
,some_sql_view,True


In [0]:
%sql
SELECT * FROM default.flights_from_select limit 5

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62


In [0]:
%sql
SELECT current_database()

current_schema()
some_db


In [0]:
%sql
USE default;

In [0]:
%sql
DROP DATABASE IF EXISTS some_db;

In [0]:
%sql
CREATE VIEW IF NOT EXISTS nested_data AS
SELECT (DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME) as country, count FROM flights_from_select

In [0]:
%sql
SELECT * FROM nested_data limit 5

country,count
"List(United States, Romania)",15
"List(United States, Croatia)",1
"List(United States, Ireland)",344
"List(Egypt, United States)",15
"List(United States, India)",62


In [0]:
%sql
SELECT country.DEST_COUNTRY_NAME, count FROM nested_data limit 5

DEST_COUNTRY_NAME,count
United States,15
United States,1
United States,344
Egypt,15
United States,62


In [0]:
%sql
SELECT country.*, count FROM nested_data limit 5

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62


In [0]:
%sql
SELECT DEST_COUNTRY_NAME as new_name, collect_list(count) as flight_counts,
collect_set(ORIGIN_COUNTRY_NAME) as origin_set
FROM flights_from_select GROUP BY DEST_COUNTRY_NAME

new_name,flight_counts,origin_set
Algeria,"List(4, 4)",List(United States)
Angola,List(15),List(United States)
Anguilla,"List(41, 41)",List(United States)
Antigua and Barbuda,List(126),List(United States)
Argentina,List(180),List(United States)
Aruba,List(346),List(United States)
Australia,List(329),List(United States)
Austria,List(62),List(United States)
Azerbaijan,List(21),List(United States)
Bahrain,List(19),List(United States)


In [0]:
%sql
SELECT DEST_COUNTRY_NAME, ARRAY(1, 2, 3) FROM flights_from_select

DEST_COUNTRY_NAME,"array(1, 2, 3)"
United States,"List(1, 2, 3)"
United States,"List(1, 2, 3)"
United States,"List(1, 2, 3)"
Egypt,"List(1, 2, 3)"
United States,"List(1, 2, 3)"
United States,"List(1, 2, 3)"
United States,"List(1, 2, 3)"
Costa Rica,"List(1, 2, 3)"
Senegal,"List(1, 2, 3)"
Moldova,"List(1, 2, 3)"


In [0]:
%sql
SELECT DEST_COUNTRY_NAME as new_name, collect_list(count)[0]
FROM flights_from_select
GROUP BY DEST_COUNTRY_NAME

new_name,collect_list(count)[0]
Anguilla,41
Russia,176
Paraguay,60
Senegal,40
Sweden,118
Kiribati,26
Guyana,64
Philippines,134
Djibouti,1
Malaysia,2


In [0]:
%sql
CREATE OR REPLACE TEMP VIEW flights_agg AS
SELECT DEST_COUNTRY_NAME, collect_list(count) as collected_counts
FROM flights_from_select GROUP BY DEST_COUNTRY_NAME

In [0]:
%sql
SELECT explode(collected_counts), DEST_COUNTRY_NAME FROM flights_agg

col,DEST_COUNTRY_NAME
41,Anguilla
41,Anguilla
176,Russia
60,Paraguay
40,Senegal
40,Senegal
118,Sweden
26,Kiribati
64,Guyana
64,Guyana


In [0]:
%sql
show functions

function
!
!=
%
&
*
+
-
/
<
<<


In [0]:
%sql
show system functions

function
!
!=
%
&
*
+
-
/
<
<<


In [0]:
%sql
show user functions

function
getargument


In [0]:
%sql
SHOW FUNCTIONS "s*";

function
schema_of_csv
schema_of_json
schema_of_json_agg
schema_of_variant
schema_of_variant_agg
schema_of_xml
sec
second
secret
sentences


In [0]:
%sql
show functions like "collect*"

function
collect_list
collect_set


In [0]:
%sql
SELECT dest_country_name FROM flights_from_select
GROUP BY dest_country_name ORDER BY sum(count) DESC LIMIT 5

dest_country_name
United States
Canada
Mexico
United Kingdom
Japan


In [0]:
%sql
SELECT * FROM flights_from_select
WHERE origin_country_name IN (SELECT dest_country_name FROM flights_from_select
GROUP BY dest_country_name ORDER BY sum(count) DESC LIMIT 5)

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
Egypt,United States,15
Costa Rica,United States,588
Senegal,United States,40
Moldova,United States,1
Guyana,United States,64
Malta,United States,1
Anguilla,United States,41
Bolivia,United States,30
Algeria,United States,4
Turks and Caicos Islands,United States,230


In [0]:
%sql
SELECT * FROM flights_from_select f1
WHERE EXISTS (SELECT 1 FROM flights_from_select f2
WHERE f1.dest_country_name = f2.origin_country_name)
AND EXISTS (SELECT 1 FROM flights_from_select f2
WHERE f2.dest_country_name = f1.origin_country_name)

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62
United States,Singapore,1
United States,Grenada,62
Costa Rica,United States,588
Senegal,United States,40
United States,Sint Maarten,325


In [0]:
%sql
SELECT *, (SELECT max(count) FROM flights_from_select) AS maximum FROM flights_from_select

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,maximum
United States,Romania,15,986
United States,Croatia,1,986
United States,Ireland,344,986
Egypt,United States,15,986
United States,India,62,986
United States,Singapore,1,986
United States,Grenada,62,986
Costa Rica,United States,588,986
Senegal,United States,40,986
Moldova,United States,1,986
