## Metastore in PySpark

In [1]:
# Build (or reuse) a local SparkSession with Hive support enabled

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Metastore with Hive Support")
    .master("local[*]")
    .config("spark.sql.warehouse.dir", "spark-warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

# Show the session object as the cell output
spark

In [2]:
# Read back the catalog implementation this session is configured with
catalog_impl = spark.conf.get("spark.sql.catalogImplementation")
catalog_impl

'hive'

In [3]:
# Load the sparksql_magic extension; the %%sparksql cells further down rely on it
%load_ext sparksql_magic

In [4]:
%%sparksql

-- List the databases (namespaces) known to the session catalog
show databases;

0
namespace
default


In [10]:
# Load the sales dataset from its Parquet file

df = (
    spark.read
    .format("parquet")
    .load("dataset/sales.parquet")
)

# Print the column types, then preview the first 10 rows
df.printSchema()
df.show(10)

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: string (nullable = true)
 |-- retailer_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- city_id: string (nullable = true)

+--------------------+----------+-----------+--------------------+--------------------+----------+
|       transacted_at|    trx_id|retailer_id|         description|              amount|   city_id|
+--------------------+----------+-----------+--------------------+--------------------+----------+
|2017-11-24T19:00:...|1995601912| 2077350195|Walgreen       11-25|197.2300000000000...| 216510442|
|2017-11-24T19:00:...|1734117021|  644879053|unkn    ppd id: 7...|8.580000000000000000| 930259917|
|2017-11-24T19:00:...|1734117022|  847200066|Wal-Mart  ppd id:...|1737.260000000000...|1646415505|
|2017-11-24T19:00:...|1734117030| 1953761884|Home Depot     pp...|384.5000000000000...| 287177635|
|2017-11-24T19:00:...|1734117089| 1898522855| Target   

In [12]:
# Write the DataFrame to the catalog as the table "sales_managed".
# The writer's default save mode raises an error when the target table already
# exists, so a table left behind by an earlier run would make this cell fail.
# Overwrite mode replaces the existing table and keeps the cell re-runnable.

df.write.mode("overwrite").saveAsTable("sales_managed")

In [5]:
%%sparksql

-- List the tables registered in the default database
show tables in default;

0,1,2
namespace,tableName,isTemporary
default,sales_managed,False


In [16]:
%%sparksql

-- Preview up to 10 rows of the sales_managed table
select * from sales_managed limit 10;

0,1,2,3,4,5
transacted_at,trx_id,retailer_id,description,amount,city_id
2017-11-24T19:00:00.000Z,1995601912,2077350195,Walgreen 11-25,197.230000000000000000,216510442
2017-11-24T19:00:00.000Z,1734117021,644879053,unkn ppd id: 768641 11-26,8.580000000000000000,930259917
2017-11-24T19:00:00.000Z,1734117022,847200066,Wal-Mart ppd id: 555914 Algiers 11-26,1737.260000000000000000,1646415505
2017-11-24T19:00:00.000Z,1734117030,1953761884,Home Depot ppd id: 265293 11-25,384.500000000000000000,287177635
2017-11-24T19:00:00.000Z,1734117089,1898522855,Target 11-25,66.330000000000000000,1855530529
2017-11-24T19:00:00.000Z,1734117117,997626433,Sears ppd id: 856095 Ashgabat,298.870000000000000000,957346984
2017-11-24T19:00:00.000Z,1734117123,1953761884,unkn ppd id: 153174 Little Rock 11-25,19.550000000000000000,45522086
2017-11-24T19:00:00.000Z,1734117152,1429095612,Ikea arc id: 527956 Saint John's 11-26,9.390000000000000000,1268541279
2017-11-24T19:00:00.000Z,1734117153,847200066,unkn Kingstown,2907.570000000000000000,1483931123


In [21]:
# Stop Spark to check if the metastore persists
spark.stop()

In [1]:
# Build a local SparkSession without Hive support (enableHiveSupport is not called)

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Metastore Without HiveSupport")
    .master("local[*]")
    .config("spark.sql.warehouse.dir", "spark-warehouse")
    .getOrCreate()
)

# Show the session object as the cell output
spark

In [2]:
# Read back the catalog implementation this session is configured with
catalog_impl = spark.conf.get("spark.sql.catalogImplementation")
catalog_impl

'in-memory'