## Delta table Column Mapping

In [1]:
# Create Spark Session with Delta JARS and conf

from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession configured for Delta Lake.
spark = (
    SparkSession.builder
    .appName("Delta table with Column Mapping")
    # Delta Lake package coordinates to load alongside the session
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.1.1")
    # Register the Delta SQL extension and make the Delta catalog the session catalog
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    # Managed tables are stored under this relative directory
    .config("spark.sql.warehouse.dir", "spark-warehouse")
    .master("local[*]")
    .enableHiveSupport()
    .getOrCreate()
)

# Show the session summary as the cell output
spark

In [2]:
# Prerequisite: the sparksql-magic package must be installed (pip install sparksql-magic).
# Load the extension so the %%sparksql cell magic used by the following cells is available.
%load_ext sparksql_magic

In [97]:
%%sparksql

-- Create the demo Delta table as a full copy of sales_managed (CTAS)
CREATE TABLE sales_delta_mapping
USING delta
AS
SELECT *
FROM sales_managed;

In [98]:
%%sparksql

-- Inspect the table schema before any column changes
DESCRIBE TABLE sales_delta_mapping

0,1,2
col_name,data_type,comment
transacted_at,string,
trx_id,string,
retailer_id,string,
description,string,
amount,string,
city_id,string,
,,
# Partitioning,,
Not partitioned,,


In [12]:
%%sparksql

-- Attempt the rename before column mapping is enabled; this raises the AnalysisException shown below
ALTER TABLE sales_delta_mapping
RENAME COLUMN amount TO total_amount;

AnalysisException: Column rename is not supported for your Delta table. 
Please upgrade your Delta table to reader version 2 and writer version 5
and change the column mapping mode to 'name' mapping. You can use the following command:

ALTER TABLE <table_name> SET TBLPROPERTIES (
   'delta.columnMapping.mode' = 'name',
   'delta.minReaderVersion' = '2',
   'delta.minWriterVersion' = '5')


In [73]:
%%sparksql

-- Upgrade the reader/writer protocol versions and switch to name-based column mapping,
-- using the properties suggested by the AnalysisException from the failed rename
ALTER TABLE sales_delta_mapping
SET TBLPROPERTIES (
    'delta.columnMapping.mode' = 'name',
    'delta.minReaderVersion' = '2',
    'delta.minWriterVersion' = '5'
);

In [75]:
%%sparksql

-- Retry the rename now that the column mapping mode is set to name
ALTER TABLE sales_delta_mapping
RENAME COLUMN amount TO total_amount;

In [84]:
%%sparksql

-- Check the schema after the rename; the amount column is now listed as total_amount
DESCRIBE TABLE sales_delta_mapping

0,1,2
col_name,data_type,comment
transacted_at,string,
trx_id,string,
retailer_id,string,
description,string,
total_amount,string,
city_id,string,
,,
# Partitioning,,
Not partitioned,,


In [99]:
%%sparksql

-- Preview a handful of rows from the table.
-- NOTE(review): the saved output below still shows the column header as amount; it looks like
-- it was captured from a run made after the table was recreated. Re-run in order to confirm.
SELECT *
FROM sales_delta_mapping
LIMIT 10;

0,1,2,3,4,5
transacted_at,trx_id,retailer_id,description,amount,city_id
2017-11-24T19:00:00.000Z,1995601912,2077350195,Walgreen 11-25,197.230000000000000000,216510442
2017-11-24T19:00:00.000Z,1734117021,644879053,unkn ppd id: 768641 11-26,8.580000000000000000,930259917
2017-11-24T19:00:00.000Z,1734117022,847200066,Wal-Mart ppd id: 555914 Algiers 11-26,1737.260000000000000000,1646415505
2017-11-24T19:00:00.000Z,1734117030,1953761884,Home Depot ppd id: 265293 11-25,384.500000000000000000,287177635
2017-11-24T19:00:00.000Z,1734117089,1898522855,Target 11-25,66.330000000000000000,1855530529
2017-11-24T19:00:00.000Z,1734117117,997626433,Sears ppd id: 856095 Ashgabat,298.870000000000000000,957346984
2017-11-24T19:00:00.000Z,1734117123,1953761884,unkn ppd id: 153174 Little Rock 11-25,19.550000000000000000,45522086
2017-11-24T19:00:00.000Z,1734117152,1429095612,Ikea arc id: 527956 Saint John's 11-26,9.390000000000000000,1268541279
2017-11-24T19:00:00.000Z,1734117153,847200066,unkn Kingstown,2907.570000000000000000,1483931123


In [78]:
# Build a one-row test dataset to append to the table after the rename.
# NOTE: despite the `old_df` name, the schema below uses the renamed column
# "total_amount" (the pre-rename name was "amount").
_data = [
    ["2022-11-19T21:00:00.000Z", "0000", "10001", "Test data for rename", "100.00", "111"]
]

# Column names matching the table's post-rename schema
_schema = ["transacted_at", "trx_id", "retailer_id", "description", "total_amount", "city_id"]

old_df = spark.createDataFrame(data=_data, schema=_schema)
# Print the schema and the row to confirm every column is a string, like the table's columns
old_df.printSchema()
old_df.show()

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: string (nullable = true)
 |-- retailer_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- city_id: string (nullable = true)

+--------------------+------+-----------+--------------------+------------+-------+
|       transacted_at|trx_id|retailer_id|         description|total_amount|city_id|
+--------------------+------+-----------+--------------------+------------+-------+
|2022-11-19T21:00:...|  0000|      10001|Test data for rename|      100.00|    111|
+--------------------+------+-----------+--------------------+------------+-------+



In [79]:
# Append the test row to the existing Delta table

(
    old_df.write
    .format("delta")
    .mode("append")
    .saveAsTable("sales_delta_mapping")
)

In [80]:
%%sparksql

-- Confirm the appended test row is returned with the renamed total_amount column
SELECT *
FROM sales_delta_mapping
WHERE city_id = '111';

0,1,2,3,4,5
transacted_at,trx_id,retailer_id,description,total_amount,city_id
2022-11-19T21:00:00.000Z,0000,10001,Test data for rename,100.00,111


In [85]:
%%sparksql

-- Review the commit log: the CTAS, property change, rename and append each appear as a version
DESCRIBE HISTORY sales_delta_mapping;

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2022-11-19 06:14:30.370000,,,WRITE,"{'mode': 'Append', 'partitionBy': '[]'}",,,,2,Serializable,True,"{'numOutputRows': '1', 'numOutputBytes': '3321', 'numFiles': '2'}",,Apache-Spark/3.3.0 Delta-Lake/2.1.1
2,2022-11-19 06:13:50.667000,,,RENAME COLUMN,"{'newColumnPath': 'total_amount', 'oldColumnPath': 'amount'}",,,,1,Serializable,True,{},,Apache-Spark/3.3.0 Delta-Lake/2.1.1
1,2022-11-19 06:13:43.237000,,,SET TBLPROPERTIES,"{'properties': '{""delta.columnMapping.mode"":""name"",""delta.minReaderVersion"":""2"",""delta.minWriterVersion"":""5""}'}",,,,0,Serializable,True,{},,Apache-Spark/3.3.0 Delta-Lake/2.1.1
0,2022-11-19 06:13:32.797000,,,CREATE TABLE AS SELECT,"{'description': None, 'partitionBy': '[]', 'properties': '{}', 'isManaged': 'true'}",,,,,Serializable,True,"{'numOutputRows': '4132056', 'numOutputBytes': '123051140', 'numFiles': '2'}",,Apache-Spark/3.3.0 Delta-Lake/2.1.1


In [86]:
%%sparksql

-- Drop a column through DDL on the column-mapping-enabled table
ALTER TABLE sales_delta_mapping
DROP COLUMN description;

In [87]:
%%sparksql

-- Review the commit log again; the column drop is recorded as a new version
DESCRIBE HISTORY sales_delta_mapping;

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
4,2022-11-19 06:22:24.961000,,,DROP COLUMNS,"{'columns': '[""description""]'}",,,,3,Serializable,True,{},,Apache-Spark/3.3.0 Delta-Lake/2.1.1
3,2022-11-19 06:14:30.370000,,,WRITE,"{'mode': 'Append', 'partitionBy': '[]'}",,,,2,Serializable,True,"{'numOutputRows': '1', 'numOutputBytes': '3321', 'numFiles': '2'}",,Apache-Spark/3.3.0 Delta-Lake/2.1.1
2,2022-11-19 06:13:50.667000,,,RENAME COLUMN,"{'newColumnPath': 'total_amount', 'oldColumnPath': 'amount'}",,,,1,Serializable,True,{},,Apache-Spark/3.3.0 Delta-Lake/2.1.1
1,2022-11-19 06:13:43.237000,,,SET TBLPROPERTIES,"{'properties': '{""delta.columnMapping.mode"":""name"",""delta.minReaderVersion"":""2"",""delta.minWriterVersion"":""5""}'}",,,,0,Serializable,True,{},,Apache-Spark/3.3.0 Delta-Lake/2.1.1
0,2022-11-19 06:13:32.797000,,,CREATE TABLE AS SELECT,"{'description': None, 'partitionBy': '[]', 'properties': '{}', 'isManaged': 'true'}",,,,,Serializable,True,"{'numOutputRows': '4132056', 'numOutputBytes': '123051140', 'numFiles': '2'}",,Apache-Spark/3.3.0 Delta-Lake/2.1.1


In [88]:
%%sparksql

-- Verify the description column is no longer returned after the DROP COLUMN
SELECT *
FROM sales_delta_mapping
WHERE city_id = '111';

0,1,2,3,4
transacted_at,trx_id,retailer_id,total_amount,city_id
2022-11-19T21:00:00.000Z,0000,10001,100.00,111


In [91]:
# Rename a column by rewriting the table: read it, rename trx_id in the
# DataFrame, then overwrite both the data and the schema of the same table.

(
    spark.read.table("sales_delta_mapping")
    .withColumnRenamed("trx_id", "transaction_id")
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("sales_delta_mapping")
)

In [92]:
%%sparksql

-- Verify trx_id now appears as transaction_id after the overwrite
SELECT *
FROM sales_delta_mapping
WHERE city_id = '111';

0,1,2,3,4
transacted_at,transaction_id,retailer_id,total_amount,city_id
2022-11-19T21:00:00.000Z,0000,10001,100.00,111


In [93]:
# Drop a column by rewriting the table: read it, remove transaction_id from
# the DataFrame, then overwrite both the data and the schema of the same table.

(
    spark.read.table("sales_delta_mapping")
    .drop("transaction_id")
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("sales_delta_mapping")
)

In [94]:
%%sparksql

-- Verify transaction_id is no longer returned after the overwrite
SELECT *
FROM sales_delta_mapping
WHERE city_id = '111';

0,1,2,3
transacted_at,retailer_id,total_amount,city_id
2022-11-19T21:00:00.000Z,10001,100.00,111


In [96]:
%%sparksql

-- Clean up: remove the demo table
DROP TABLE sales_delta_mapping