# Spark SQL

In [1]:
%run ./Includes/paths.py

In [15]:
# Stop a session left over from an earlier run before a new one is created below.
# On a fresh kernel `spark` is not defined yet, so there is nothing to stop.
if 'spark' in globals():
    spark.stop()

22/06/12 14:47:41 ERROR Schema: Failed initialising database.
Unable to open a test connection to the given database. JDBC url = jdbc:derby:;databaseName=metastore_db;create=true, username = APP. Terminating connection pool (set lazyInit to true if you expect to start your database after your app). Original Exception: ------
java.sql.SQLException: Failed to start database 'metastore_db' with class loader jdk.internal.loader.ClassLoaders$AppClassLoader@5ffd2b27, see the next exception for details.
	at org.apache.derby.impl.jdbc.SQLExceptionFactory.getSQLException(Unknown Source)
	at org.apache.derby.impl.jdbc.SQLExceptionFactory.getSQLException(Unknown Source)
	at org.apache.derby.impl.jdbc.Util.seeNextException(Unknown Source)
	at org.apache.derby.impl.jdbc.EmbedConnection.bootDatabase(Unknown Source)
	at org.apache.derby.impl.jdbc.EmbedConnection.<init>(Unknown Source)
	at org.apache.derby.jdbc.InternalDriver$1.run(Unknown Source)
	at org.apache.derby.jdbc.InternalDriver$1.run(Unknown

In [3]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

from delta import *
import pandas as pd

# start spark
# Hive support (so that we can register UDFs in SQL) is requested on the builder:
# enableHiveSupport() only configures sessions created afterwards, so it has to
# be applied before getOrCreate(), not on the already-running session.
builder = (pyspark.sql.SparkSession.builder.appName("Spark-Course")
                .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
                .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
                .config("spark.sql.warehouse.dir", spark_warehouse_path)
                .config("spark.sql.catalogImplementation", "hive")
                .enableHiveSupport())

spark = configure_spark_with_delta_pip(builder).getOrCreate()

# setting log-level to ERROR to decrease verbosity
# log4j log-levels are: OFF, FATAL, ERROR, WARN, INFO, DEBUG, TRACE, ALL
spark.sparkContext.setLogLevel("ERROR")

spark

In [4]:
def display(spark_df, rows=10):
    """Return the first `rows` rows of a Spark DataFrame as a pandas DataFrame.

    The row limit is applied with `limit` before the conversion, so
    `toPandas()` is only ever asked for at most `rows` rows.

    Parameters
    ----------
    spark_df : object exposing `limit(n)` and `toPandas()` (a Spark DataFrame)
    rows : int, maximum number of rows to return (default 10)
    """
    limited_df = spark_df.limit(rows)
    preview = limited_df.toPandas()
    return preview.head(rows)

In [5]:
%load_ext sparksql_magic

## Create Database

In [6]:
%%sparksql

drop database if exists spark_course_db cascade

In [7]:
# plain string: the statement has no placeholders, so no f-prefix is needed
query = 'create database spark_course_db'

_ = spark.sql(query)

In [8]:
# check if the database is in the desired location
db_location = (
    spark.sql('describe database spark_course_db')
         .where('info_name = "Location"')
         .select('info_value')
         .toPandas()
         .iloc[0, 0]  # single cell by position; `.iloc[0][0]` relies on the deprecated integer-key fallback
         .replace('file:', '')
)
expected_location = spark_warehouse_path + 'spark_course_db.db'

assert db_location == expected_location, (
    f'unexpected database location: {db_location!r} != {expected_location!r}'
)

In [9]:
%%sparksql

use spark_course_db

## Create Table

In [191]:
# query the raw CSV file directly, before defining a table over it
# (no header option is given here, so the header line comes back as a data row)

display(
    spark.sql(f'select * from csv.`{sample_data_path}fellowship.csv`')
)

Unnamed: 0,_c0,_c1,_c2
0,name,race,age
1,Frodo,Hobbit,50
2,Sam,Hobbit,38
3,Merry,Hobbit,36
4,Pippin,Hobbit,28
5,Gandalf,Maia,2000
6,Legolas,Elf,2931
7,Gimli,Dwarf,139
8,Aragorn,Man,87
9,Boromir,Man,40


In [193]:
# csv files don't store schema

spark.sql(f'select * from csv.`{sample_data_path}fellowship.csv`').printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



### External Tables

In [10]:
%%sparksql

drop table if exists fellowship

In [12]:
# create external table: the table is defined over the existing CSV file
# given in the `path` option, with an explicit schema for its three columns

query = f'''
create table fellowship (
    name string,
    race string,
    age int comment "Gandalf's age refers to his physical body's approximate age"
)
using csv
options (header true,
            path "{sample_data_path}fellowship.csv",
            mode "dropMalformed"
        )
'''

_ = spark.sql(query)


In [155]:
%%sparksql

select * from fellowship

0,1,2
name,race,age
Frodo,Hobbit,50
Sam,Hobbit,38
Merry,Hobbit,36
Pippin,Hobbit,28
Gandalf,Maia,2000
Legolas,Elf,2931
Gimli,Dwarf,139
Aragorn,Man,87
Boromir,Man,40


In [13]:
# show the table metadata without the "Created Time" and "Location" rows
# (presumably hidden because they differ between runs and machines)
display(
    spark.sql('describe extended fellowship')
         .where('col_name not in ("Created Time", "Location")'),
    rows=100,
)

Unnamed: 0,col_name,data_type,comment
0,name,string,
1,race,string,
2,age,int,Gandalf's age refers to his physical body's ap...
3,,,
4,# Detailed Table Information,,
5,Database,spark_course_db,
6,Table,fellowship,
7,Owner,siladitya,
8,Last Access,UNKNOWN,
9,Created By,Spark 3.2.1,


In [160]:
%%bash -s "$spark_warehouse_path/spark_course_db.db"
ls -a $1

.
..


No table files in the database dir since `fellowship` is an external table.

The files associated with `fellowship` will remain after the table is dropped.

In [207]:
%%sparksql

drop table if exists fellowship2

In [210]:
query = f'''create external table fellowship2
            row format delimited fields terminated by ","
            location "{data_path}fellowship2"
            as select * from fellowship'''

_ = spark.sql(query)

In [211]:
%%bash -s "$data_path/fellowship2"
ls $1

part-00000-649df399-f44e-4793-b008-d616af70047a-c000


In [212]:
%%sparksql

drop table if exists fellowship2

In [213]:
%%bash -s "$data_path/fellowship2"
ls $1

part-00000-649df399-f44e-4793-b008-d616af70047a-c000


The files associated with the dropped table still exist; they need to be deleted manually.

In [216]:
%%bash -s "$data_path"

# refuse to continue if no directory was passed or it cannot be entered,
# so that rm never runs in an unintended working directory
cd "${1:?data_path is empty}" || exit 1
rm -rf fellowship2

### Managed Tables

In [170]:
%%sparksql

drop table if exists fellowship_mgd

In [171]:
%%sparksql

-- create managed table
create table fellowship_mgd as
select * from fellowship

In [162]:
%%sparksql

select * from fellowship_mgd

0,1,2
name,race,age
Frodo,Hobbit,50
Sam,Hobbit,38
Merry,Hobbit,36
Pippin,Hobbit,28
Gandalf,Maia,2000
Legolas,Elf,2931
Gimli,Dwarf,139
Aragorn,Man,87
Boromir,Man,40


In [164]:
# show the table metadata without the "Created Time" and "Location" rows
# (presumably hidden because they differ between runs and machines)
display(
    spark.sql('describe extended fellowship_mgd')
         .where('col_name not in ("Created Time", "Location")'),
    rows=100,
)

Unnamed: 0,col_name,data_type,comment
0,name,string,
1,race,string,
2,age,int,
3,,,
4,# Detailed Table Information,,
5,Database,spark_course_db,
6,Table,fellowship_mgd,
7,Owner,siladitya,
8,Last Access,UNKNOWN,
9,Created By,Spark 3.2.1,


In [165]:
%%bash -s "$spark_warehouse_path/spark_course_db.db"
ls -a $1

.
..
fellowship_mgd


Now we have data files associated with the managed table `fellowship_mgd` (stored in the table's default format; the part file listed below carries no `.parquet` extension).

In [166]:
%%bash -s "$spark_warehouse_path/spark_course_db.db/fellowship_mgd"
ls -a $1

.
..
part-00000-4401182c-2569-4751-ac30-4113c2c8babb-c000
.part-00000-4401182c-2569-4751-ac30-4113c2c8babb-c000.crc


In [176]:
%%sparksql

drop table if exists fellowship_mgd

In [179]:
%%bash -s "$spark_warehouse_path/spark_course_db.db"
ls -a $1

.
..


Files associated with `fellowship_mgd` have been deleted.

#### Managed Delta Table

In [195]:
%%sparksql

drop table if exists fellowship_delta

In [14]:
%%sparksql

-- create managed delta table

create table fellowship_delta using delta as
select * from fellowship

                                                                                

In [183]:
%%bash -s "$spark_warehouse_path/spark_course_db.db/fellowship_delta"
ls -a $1

.
..
_delta_log
part-00000-c2ff031a-40d0-412b-92dd-b06d7a54edce-c000.snappy.parquet
.part-00000-c2ff031a-40d0-412b-92dd-b06d7a54edce-c000.snappy.parquet.crc


In [190]:
# read directly from the table's Parquet data files
# (glob instead of a hard-coded part file name: the UUID in the name changes on every run)

display(
    spark.sql(f'select * from parquet.`{spark_warehouse_path}spark_course_db.db/fellowship_delta/*.parquet`')
)

Unnamed: 0,name,race,age
0,Frodo,Hobbit,50
1,Sam,Hobbit,38
2,Merry,Hobbit,36
3,Pippin,Hobbit,28
4,Gandalf,Maia,2000
5,Legolas,Elf,2931
6,Gimli,Dwarf,139
7,Aragorn,Man,87
8,Boromir,Man,40


In [192]:
# parquet files store schema
# (glob over the table's data files; the part file name differs on every run)

spark.sql(f'select * from parquet.`{spark_warehouse_path}spark_course_db.db/fellowship_delta/*.parquet`').printSchema()

root
 |-- name: string (nullable = true)
 |-- race: string (nullable = true)
 |-- age: integer (nullable = true)



##### Delta Transaction Log

In [None]:
%%bash -s "$spark_warehouse_path/spark_course_db.db/fellowship_delta/_delta_log"
ls -a $1

.
..
00000000000000000000.json
.00000000000000000000.json.crc


In [186]:
display(
    spark.sql(f'select * from json.`{spark_warehouse_path}spark_course_db.db/fellowship_delta/_delta_log/00000000000000000000.json`')
)

Unnamed: 0,add,commitInfo,metaData,protocol
0,,,,"(1, 2)"
1,,,"(1655021106040, (parquet,), 8992d826-97a7-4617...",
2,"(True, 1655021106283, part-00000-c2ff031a-40d0...",,,
3,,"(Apache-Spark/3.2.1 Delta-Lake/1.2.1, True, Se...",,


#### Managed Partitioned Delta Table

In [None]:
%%sparksql

-- create managed delta table partitioned by race
create table fellowship_delta_part using delta partitioned by (race)
as select * from fellowship_delta

                                                                                

In [32]:
%%bash -s "$spark_warehouse_path/spark_course_db.db/fellowship_delta_part"
ls -a $1

.
..
_delta_log
race=Dwarf
race=Elf
race=Hobbit
race=Maia
race=Man
race=Pony


In [35]:
display(
    spark.sql(f'select * from parquet.`{spark_warehouse_path}spark_course_db.db/fellowship_delta_part/race=Hobbit`')
)

Unnamed: 0,name,age
0,Frodo,50
1,Sam,38
2,Merry,36
3,Pippin,28


## Insert

In [17]:
%%sparksql

-- age is an int column, so pass an integer literal rather than a string
insert into fellowship_delta
values ("Bill", "Pony", null),
       ("Gollum", "Hobbit", 589)

                                                                                

In [18]:
%%sparksql

select * from fellowship_delta

0,1,2
name,race,age
Frodo,Hobbit,50
Sam,Hobbit,38
Merry,Hobbit,36
Pippin,Hobbit,28
Gandalf,Maia,2000
Legolas,Elf,2931
Gimli,Dwarf,139
Aragorn,Man,87
Boromir,Man,40


In [None]:
%%bash -s "$spark_warehouse_path/spark_course_db.db/fellowship_delta/_delta_log"
ls -a $1

.
..
00000000000000000000.json
.00000000000000000000.json.crc
00000000000000000001.json
.00000000000000000001.json.crc


In [21]:
# new JSON log

display(
    spark.sql(f'select * from json.`{spark_warehouse_path}spark_course_db.db/fellowship_delta/_delta_log/00000000000000000001.json`')
)

Unnamed: 0,add,commitInfo
0,"(True, 1655026550019, part-00000-3b510383-2c43...",
1,"(True, 1655026550011, part-00001-4b7b0e14-04d1...",
2,,"(Apache-Spark/3.2.1 Delta-Lake/1.2.1, True, Se..."


## Delete

In [22]:
%%sparksql

delete from fellowship_delta
where name = 'Gollum'

                                                                                

In [23]:
%%bash -s "$spark_warehouse_path/spark_course_db.db/fellowship_delta/_delta_log"
ls -a $1

.
..
00000000000000000000.json
.00000000000000000000.json.crc
00000000000000000001.json
.00000000000000000001.json.crc
00000000000000000002.json
.00000000000000000002.json.crc


In [24]:
# new JSON log

display(
    spark.sql(f'select * from json.`{spark_warehouse_path}spark_course_db.db/fellowship_delta/_delta_log/00000000000000000002.json`')
)

Unnamed: 0,add,commitInfo,remove
0,,,"(True, 1655026693647, True, part-00001-4b7b0e1..."
1,"(True, 1655026693635, part-00000-1ad64207-c555...",,
2,,"(Apache-Spark/3.2.1 Delta-Lake/1.2.1, False, S...",


## Metadata

In [40]:
%%sparksql

select current_database()

0
current_database()
spark_course_db


In [39]:
%%sparksql

describe fellowship_delta_part

0,1,2
col_name,data_type,comment
name,string,
race,string,
age,int,Gandalf's age refers to his physical body's approximate age
,,
# Partitioning,,
Part 0,race,


In [37]:
%%sparksql

describe extended fellowship_delta_part

0,1,2
col_name,data_type,comment
name,string,
race,string,
age,int,Gandalf's age refers to his physical body's approximate age
,,
# Partitioning,,
Part 0,race,
,,
# Detailed Table Information,,
Name,spark_course_db.fellowship_delta_part,


In [46]:
%%sparksql

-- drop database if exists spark_course_db cascade

In [47]:
# spark.stop()