In [9]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

In [2]:
# Obtain (or reuse) the SparkSession that every cell below works with.
session = (
    SparkSession.builder
    .appName('parquet')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/08 11:44:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Read the sample JSON records (no explicit schema is passed) and preview them.
df = session.read.format('json').load('/opt/bitnami/spark/data/people.json')
df.show()

                                                                                

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [5]:
# Persist the DataFrame as Parquet. The default save mode raises an error when
# the target path already exists, which breaks "Restart & Run All"; overwrite
# keeps the cell idempotent.
df.write.mode('overwrite').parquet('/opt/bitnami/spark/data/temp/people.parquet')

                                                                                

In [8]:
# Reload the Parquet output and register it as a temp view for Spark SQL.
df_par = session.read.format('parquet').load('/opt/bitnami/spark/data/temp/people.parquet')
df_par.createOrReplaceTempView('parquetFiles')

# Query the view with plain SQL: names of people aged 13 through 19 inclusive.
teen = session.sql('select name from parquetFiles where age >= 13 and age <=19')
teen.show()

+------+
|  name|
+------+
|Justin|
+------+



### Merging Schema

In [11]:
# Handle to the underlying SparkContext; used below for sc.parallelize(...).
sc = session.sparkContext

In [18]:
# First partition (key=1): each x in 0..5 alongside x**2.
df_o = session.createDataFrame(
    sc.parallelize(range(6)).map(lambda x: Row(single_x=x, double_x=x**2))
)
# Write into a "key=1" sub-directory so Spark discovers `key` as a partition
# column. mode('overwrite') keeps the cell re-runnable: the default save mode
# raises an error once the directory exists.
df_o.write.mode('overwrite').parquet('/opt/bitnami/spark/data/temp/testable/key=1')
df_o.show()

+--------+--------+
|single_x|double_x|
+--------+--------+
|       0|       0|
|       1|       1|
|       2|       4|
|       3|       9|
|       4|      16|
|       5|      25|
+--------+--------+



In [19]:
# Second partition (key=2): each x in 0..5 alongside x**3.
# The shared column is spelled `single_x` exactly as in the key=1 partition;
# the previous `single_X` only merged because Spark resolves column names
# case-insensitively by default (spark.sql.caseSensitive=false).
df_t = session.createDataFrame(
    sc.parallelize(range(6)).map(lambda x: Row(single_x=x, triple_x=x**3))
)

# mode('overwrite') keeps the cell re-runnable: the default save mode raises
# an error once the directory exists.
df_t.write.mode('overwrite').parquet('/opt/bitnami/spark/data/temp/testable/key=2')
df_t.show()

                                                                                

+--------+--------+
|single_X|triple_x|
+--------+--------+
|       0|       0|
|       1|       1|
|       2|       8|
|       3|      27|
|       4|      64|
|       5|     125|
+--------+--------+



In [21]:
# Read the parent directory with schema merging enabled so the columns of both
# key=... sub-directories end up in a single schema.
mergeDf = (
    session.read
    .options(mergeSchema='true')
    .parquet('/opt/bitnami/spark/data/temp/testable')
)
mergeDf.printSchema()

root
 |-- single_x: long (nullable = true)
 |-- double_x: long (nullable = true)
 |-- triple_x: long (nullable = true)
 |-- key: integer (nullable = true)



### Columnar encryption

Since Spark 3.2, columnar encryption is supported for Parquet tables with Apache Parquet 1.12+.

Parquet uses the envelope encryption practice, where file parts are encrypted with “data encryption keys” (DEKs), and the DEKs are encrypted with “master encryption keys” (MEKs). 

The DEKs are randomly generated by Parquet for each encrypted file/column. 

The MEKs are generated, stored and managed in a Key Management Service (KMS) of user’s choice. 

The Parquet Maven repository has a jar with a mock KMS implementation that allows you to run column encryption and decryption using only a spark-shell, without deploying a KMS server (download the parquet-hadoop-tests.jar file and place it in the Spark jars folder):

In [24]:
# Preview the merged rows; a column that a partition's files do not contain
# shows up as NULL for that partition's rows.
mergeDf.show()

+--------+--------+--------+---+
|single_x|double_x|triple_x|key|
+--------+--------+--------+---+
|       0|       0|    NULL|  1|
|       1|       1|    NULL|  1|
|       2|       4|    NULL|  1|
|       3|       9|    NULL|  1|
|       4|      16|    NULL|  1|
|       5|      25|    NULL|  1|
|       0|    NULL|       0|  2|
|       1|    NULL|       1|  2|
|       2|    NULL|       8|  2|
|       3|    NULL|      27|  2|
|       4|    NULL|      64|  2|
|       5|    NULL|     125|  2|
+--------+--------+--------+---+



In [25]:
# Parquet columnar encryption is only activated when a crypto factory is
# configured; without it the "parquet.encryption.*" write options have no
# effect and the files are written unencrypted.
hadoop_conf = session.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set(
    'parquet.crypto.factory.class',
    'org.apache.parquet.crypto.keytools.PropertiesDrivenCryptoFactory',
)
# Mock in-memory KMS from parquet-hadoop-tests.jar (must be in the Spark jars
# folder). Demo use only -- replace with a real KMS client in production.
hadoop_conf.set(
    'parquet.encryption.kms.client.class',
    'org.apache.parquet.crypto.keytools.mocks.InMemoryKMS',
)
# Sample base64 master keys from the Spark documentation. These are public
# demo values, not secrets; never reuse them for real data.
hadoop_conf.set(
    'parquet.encryption.key.list',
    'keyA:AAECAwQFBgcICQoLDA0ODw== ,  keyB:AAECAAECAAECAAECAAECAA==',
)

# Encrypt the `double_x` column with master key "keyA" and the file footers
# with "keyB". The column name must exist in mergeDf's schema, and the option
# prefix is "parquet.encryption." for both settings.
# mode('overwrite') keeps the cell re-runnable.
mergeDf.write \
    .mode('overwrite') \
    .option('parquet.encryption.column.keys', 'keyA:double_x') \
    .option('parquet.encryption.footer.key', 'keyB') \
    .parquet('/opt/bitnami/spark/data/temp/merged_encrypt.parquet.encrypted')

                                                                                

In [26]:
# Read the table written to the ".parquet.encrypted" path back in and display it.
df_m = session.read.format('parquet').load(
    '/opt/bitnami/spark/data/temp/merged_encrypt.parquet.encrypted'
)
df_m.show()

+--------+--------+--------+---+
|single_x|double_x|triple_x|key|
+--------+--------+--------+---+
|       0|       0|    NULL|  1|
|       1|       1|    NULL|  1|
|       4|      16|    NULL|  1|
|       5|      25|    NULL|  1|
|       0|    NULL|       0|  2|
|       1|    NULL|       1|  2|
|       4|    NULL|      64|  2|
|       5|    NULL|     125|  2|
|       2|       4|    NULL|  1|
|       3|       9|    NULL|  1|
|       2|    NULL|       8|  2|
|       3|    NULL|      27|  2|
+--------+--------+--------+---+

