In [46]:
from pyspark.sql import SparkSession
import getpass
# Current OS user; used below to place the warehouse under that user's home directory.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [47]:
# Echo the session object as the cell result to confirm it exists.
spark

In [48]:
# Schema version 1 (used for orders1.csv): only order_id and order_date.
# customer_id and order_status are added by later schema versions.
order_schema = (
    'order_id long, order_date date '
)

In [49]:
# Load the first CSV batch, applying the explicit schema instead of inferring one.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/datasets/parquet-schema-evol-demo/csv/orders1.csv")
)

In [50]:
# Preview the first batch: two rows with the two columns declared in the schema.
order_df.show()

+--------+----------+
|order_id|order_date|
+--------+----------+
|       1|2013-07-25|
|       2|2013-07-25|
+--------+----------+



In [51]:
# Persist the first batch as the initial contents of the Parquet dataset.
# "overwrite" replaces whatever is already at the target path, so running the
# notebook from here starts the demo from a clean dataset.
# The format is named explicitly instead of being inherited from the session
# default (spark.sql.sources.default).
# NOTE(review): the target path hard-codes user "itv017244", while the
# warehouse dir in the session setup is derived from getpass.getuser().
(
    order_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv017244/datasets/parquet-schema-evol-demo1/parquet")
    .save()
)

In [52]:
# Read the dataset back to verify the round-trip of the first batch.
# The format is named explicitly instead of being inherited from the session
# default (spark.sql.sources.default).
order_parquet_df = (
    spark.read
    .format("parquet")
    .load("/user/itv017244/datasets/parquet-schema-evol-demo1/parquet")
)

In [53]:
# Confirm the round-trip: same two rows and two columns as the CSV batch.
order_parquet_df.show()

+--------+----------+
|order_id|order_date|
+--------+----------+
|       1|2013-07-25|
|       2|2013-07-25|
+--------+----------+



In [54]:
# Schema version 2 (used for orders2.csv): adds customer_id as a third column.
# order_status is still left out at this stage.
order_schema = (
    'order_id long, order_date date , customer_id long'
)

In [55]:
# Load the second CSV batch using the three-column schema.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/datasets/parquet-schema-evol-demo/csv/orders2.csv")
)

In [56]:
# Preview the second batch: orders 3 and 4, now carrying customer_id.
order_df.show()

+--------+----------+-----------+
|order_id|order_date|customer_id|
+--------+----------+-----------+
|       3|2013-07-25|      12111|
|       4|2013-07-25|       8827|
+--------+----------+-----------+



In [57]:
# Append the second batch (which has the extra customer_id column) to the same
# dataset directory, leaving the files written earlier untouched.
# The format is named explicitly instead of being inherited from the session
# default (spark.sql.sources.default).
# NOTE(review): "append" is not idempotent; re-running this cell alone adds
# the same rows again.
(
    order_df.write
    .format("parquet")
    .mode("append")
    .option("path", "/user/itv017244/datasets/parquet-schema-evol-demo1/parquet")
    .save()
)

In [58]:
# Read the dataset without the mergeSchema option; a later cell repeats this
# read with mergeSchema enabled for comparison.
# The format is named explicitly instead of being inherited from the session
# default (spark.sql.sources.default).
order_parquet_df = (
    spark.read
    .format("parquet")
    .load("/user/itv017244/datasets/parquet-schema-evol-demo1/parquet")
)

In [59]:
# Without mergeSchema all four rows are returned, but only order_id and
# order_date are exposed; the appended customer_id column is absent.
order_parquet_df.show()

+--------+----------+
|order_id|order_date|
+--------+----------+
|       3|2013-07-25|
|       4|2013-07-25|
|       1|2013-07-25|
|       2|2013-07-25|
+--------+----------+



In [60]:
# Re-read the dataset with schema merging enabled so that columns added by
# later appends (here: customer_id) become part of the resulting schema.
# The format is named explicitly instead of being inherited from the session
# default (spark.sql.sources.default), since mergeSchema is aimed at the
# Parquet files written above.
order_parquet_merged_df = (
    spark.read
    .format("parquet")
    .option("mergeSchema", True)
    .load("/user/itv017244/datasets/parquet-schema-evol-demo1/parquet")
)

In [61]:
# With mergeSchema, customer_id is present; rows written before the column
# existed (orders 1 and 2) show null for it.
order_parquet_merged_df.show()

+--------+----------+-----------+
|order_id|order_date|customer_id|
+--------+----------+-----------+
|       3|2013-07-25|      12111|
|       4|2013-07-25|       8827|
|       1|2013-07-25|       null|
|       2|2013-07-25|       null|
+--------+----------+-----------+



In [62]:
# Schema version 3 (used for orders4.csv): adds order_status, declared
# before customer_id in this version.
order_schema = (
    'order_id long, order_date date ,  order_status string,customer_id long'
)

In [63]:
# Load the CSV batch whose columns place order_status ahead of customer_id.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/datasets/parquet-schema-evol-demo/csv/orders4.csv")
)

In [64]:
# Preview this batch: orders 5 and 6 with order_status listed before customer_id.
order_df.show()

+--------+----------+------------+-----------+
|order_id|order_date|order_status|customer_id|
+--------+----------+------------+-----------+
|       5|2013-07-25|    COMPLETE|      11318|
|       6|2013-07-25|    COMPLETE|       7130|
+--------+----------+------------+-----------+



In [65]:
# Append the batch that introduces order_status (declared before customer_id)
# to the same dataset directory.
# The format is named explicitly instead of being inherited from the session
# default (spark.sql.sources.default).
# NOTE(review): "append" is not idempotent; re-running this cell alone adds
# the same rows again.
(
    order_df.write
    .format("parquet")
    .mode("append")
    .option("path", "/user/itv017244/datasets/parquet-schema-evol-demo1/parquet")
    .save()
)

In [66]:
# Read the dataset with schema merging enabled so the newly appended
# order_status column is included alongside the earlier columns.
# The format is named explicitly instead of being inherited from the session
# default (spark.sql.sources.default).
order_parquet_df = (
    spark.read
    .format("parquet")
    .option("mergeSchema", True)
    .load("/user/itv017244/datasets/parquet-schema-evol-demo1/parquet")
)

In [67]:
# Merged view now includes order_status. It is listed after customer_id here
# even though the latest batch declared it before customer_id; older rows
# show null for the columns they never had.
order_parquet_df.show()

+--------+----------+-----------+------------+
|order_id|order_date|customer_id|order_status|
+--------+----------+-----------+------------+
|       5|2013-07-25|      11318|    COMPLETE|
|       6|2013-07-25|       7130|    COMPLETE|
|       3|2013-07-25|      12111|        null|
|       4|2013-07-25|       8827|        null|
|       1|2013-07-25|       null|        null|
|       2|2013-07-25|       null|        null|
+--------+----------+-----------+------------+



In [68]:
# Schema version 4 (used for orders3.csv): same four columns as the previous
# version, but with customer_id declared before order_status.
order_schema = (
    'order_id long, order_date date , customer_id long, order_status string'
)

In [69]:
# Load the CSV batch whose columns place customer_id ahead of order_status.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/datasets/parquet-schema-evol-demo/csv/orders3.csv")
)

In [70]:
# Append the batch that uses the customer_id-before-order_status column order
# to the same dataset directory.
# The format is named explicitly instead of being inherited from the session
# default (spark.sql.sources.default).
# NOTE(review): "append" is not idempotent; re-running this cell alone adds
# the same rows again.
(
    order_df.write
    .format("parquet")
    .mode("append")
    .option("path", "/user/itv017244/datasets/parquet-schema-evol-demo1/parquet")
    .save()
)

In [71]:
# Read the dataset once more with schema merging enabled, after appending a
# batch whose columns are in a different order from the previous batch.
# The format is named explicitly instead of being inherited from the session
# default (spark.sql.sources.default).
order_parquet_df = (
    spark.read
    .format("parquet")
    .option("mergeSchema", True)
    .load("/user/itv017244/datasets/parquet-schema-evol-demo1/parquet")
)

In [72]:
# The merged schema is unchanged by the differing column order of the last
# batch. Orders 5 and 6 now appear twice.
# NOTE(review): presumably orders3.csv holds the same orders as orders4.csv
# with the columns swapped - verify against the source files.
order_parquet_df.show()

+--------+----------+-----------+------------+
|order_id|order_date|customer_id|order_status|
+--------+----------+-----------+------------+
|       5|2013-07-25|      11318|    COMPLETE|
|       6|2013-07-25|       7130|    COMPLETE|
|       5|2013-07-25|      11318|    COMPLETE|
|       6|2013-07-25|       7130|    COMPLETE|
|       3|2013-07-25|      12111|        null|
|       4|2013-07-25|       8827|        null|
|       1|2013-07-25|       null|        null|
|       2|2013-07-25|       null|        null|
+--------+----------+-----------+------------+

