# Schema Evolution
1. Adding New Columns (Manual / Automatic)
2. Widening Data Types (Supported Delta >= 3.2): Sometimes we need to expand a column's data type to accommodate larger values. Delta Lake allows "widening" type conversions that won't lose data, such as:
- `INT` to `BIGINT`
- `FLOAT` to `DOUBLE`
- `VARCHAR(10)` to `VARCHAR(20)`

3. Nested Structure Evolution (Manual / Automatic): Delta Lake supports evolution of complex data types like structs and arrays. We can:
- Add new fields to structs
- Modify nested field types
- Add new elements to arrays

4. Column Position Changes (Manual / Automatic): we can reorganize our columns

Note: 
- `INSERT` works by matching columns by position
- `MERGE` works by matching columns by name

#### Scenario 1: Adding New Columns 

In [0]:
DROP TABLE deltacatalog.deltadb.invoices_se;

In [0]:
SELECT *
FROM PARQUET.`abfss://labdata@dbdeltalabstorageacct.dfs.core.windows.net/invoices/invoices_1_100.parquet`
WHERE customer_id BETWEEN 1 AND 5

customer_id,invoice_no,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall,_rescued_data
1,I178410,Male,61,Clothing,5,1500.4,Credit Card,2021-11-26,Metrocity,
2,I158163,Male,34,Shoes,2,1200.34,Cash,2023-03-03,Kanyon,
3,I262373,Male,44,Toys,3,107.52,Credit Card,2022-12-01,Cevahir AVM,
4,I334895,Male,25,Food & Beverage,5,26.15,Cash,2021-08-15,Kanyon,
5,I202043,Female,21,Toys,1,35.84,Credit Card,2021-07-25,Metrocity,


In [0]:
CREATE OR REPLACE TABLE deltacatalog.deltadb.invoices_se (
  customer_id INT NOT NULL,  
  invoice_no STRING,
  price FLOAT, 
  invoice_date DATE
); 

INSERT INTO deltacatalog.deltadb.invoices_se
SELECT customer_id, invoice_no, price, invoice_date
FROM PARQUET.`abfss://labdata@dbdeltalabstorageacct.dfs.core.windows.net/invoices/invoices_1_100.parquet`
WHERE customer_id BETWEEN 1 AND 5

num_affected_rows,num_inserted_rows
5,5


In [0]:
SELECT * FROM deltacatalog.deltadb.invoices_se;

customer_id,invoice_no,price,invoice_date
1,I178410,1500.4,2021-11-26
2,I158163,1200.34,2023-03-03
3,I262373,107.52,2022-12-01
4,I334895,26.15,2021-08-15
5,I202043,35.84,2021-07-25


In [0]:
ALTER TABLE deltacatalog.deltadb.invoices_se
ADD COLUMNS (quantity INT);

In [0]:
INSERT INTO deltacatalog.deltadb.invoices_se
SELECT customer_id, invoice_no, price, invoice_date, quantity
FROM PARQUET.`abfss://labdata@dbdeltalabstorageacct.dfs.core.windows.net/invoices/invoices_1_100.parquet`
WHERE customer_id BETWEEN 6 AND 10 

num_affected_rows,num_inserted_rows
5,5


In [0]:
SELECT * FROM deltacatalog.deltadb.invoices_se;

customer_id,invoice_no,price,invoice_date,quantity
6,I303349,71.68,2021-06-03,2.0
7,I336350,15.15,2021-11-25,1.0
8,I134255,1200.32,2022-10-13,4.0
9,I306929,600.16,2021-08-06,2.0
10,I280920,900.24,2021-04-23,3.0
1,I178410,1500.4,2021-11-26,
2,I158163,1200.34,2023-03-03,
3,I262373,107.52,2022-12-01,
4,I334895,26.15,2021-08-15,
5,I202043,35.84,2021-07-25,


In [0]:
SET spark.databricks.delta.schema.autoMerge.enabled = false;

key,value
spark.databricks.delta.schema.autoMerge.enabled,False


In [0]:
INSERT INTO deltacatalog.deltadb.invoices_se
SELECT customer_id, invoice_no, price, invoice_date, quantity, payment_method 
FROM PARQUET.`abfss://labdata@dbdeltalabstorageacct.dfs.core.windows.net/invoices/invoices_1_100.parquet`
WHERE customer_id BETWEEN 11 AND 15 

num_affected_rows,num_inserted_rows
5,5


In [0]:
SELECT * FROM deltacatalog.deltadb.invoices_se;

customer_id,invoice_no,price,invoice_date,quantity,payment_method
11,I235733,162.64,2022-01-04,4.0,Cash
12,I272400,5.23,2022-01-10,1.0,Cash
13,I223729,107.52,2022-08-02,3.0,Cash
14,I299506,600.17,2021-01-06,1.0,Cash
15,I106485,30.3,2022-12-01,2.0,Debit Card
6,I303349,71.68,2021-06-03,2.0,
7,I336350,15.15,2021-11-25,1.0,
8,I134255,1200.32,2022-10-13,4.0,
9,I306929,600.16,2021-08-06,2.0,
10,I280920,900.24,2021-04-23,3.0,


#### Scenario 2: Type Widening

In [0]:
ALTER TABLE deltacatalog.deltadb.invoices_se 
SET TBLPROPERTIES ('delta.enableTypeWidening' = 'true');

In [0]:
DESCRIBE TABLE deltacatalog.deltadb.invoices_se; 

col_name,data_type,comment
customer_id,int,
invoice_no,string,
price,float,
invoice_date,date,
quantity,int,
payment_method,string,


In [0]:
INSERT INTO deltacatalog.deltadb.invoices_se 
VALUES (123456789012345,	'I106485',	30.299999237060547,	'2022-12-01',	2,	'Debit Card')

num_affected_rows,num_inserted_rows
1,1


In [0]:
ALTER TABLE deltacatalog.deltadb.invoices_se
ALTER COLUMN customer_id TYPE BIGINT;

In [0]:
DESCRIBE TABLE deltacatalog.deltadb.invoices_se;

col_name,data_type,comment
customer_id,bigint,
invoice_no,string,
price,float,
invoice_date,date,
quantity,int,
payment_method,string,


In [0]:
INSERT INTO deltacatalog.deltadb.invoices_se 
VALUES (123456789012345,	'I106485',	30.299999237060547,	'2022-12-01',	2,	'Debit Card')

num_affected_rows,num_inserted_rows
1,1


In [0]:
SELECT * FROM deltacatalog.deltadb.invoices_se
WHERE customer_id = 123456789012345;

customer_id,invoice_no,price,invoice_date,quantity,payment_method
123456789012345,I106485,30.3,2022-12-01,2,Debit Card
123456789012345,I106485,30.3,2022-12-01,2,Debit Card


#### Scenario 3: Nested Structure Evolution

In [0]:
ALTER TABLE deltacatalog.deltadb.invoices_se
ADD COLUMNS purchase_details STRUCT<
  mall_pin_code INT,
  store_code INT
>;

In [0]:
INSERT INTO deltacatalog.deltadb.invoices_se 
VALUES (16,	'I106485',	30.299999237060547,	'2022-12-01',	2,	'Debit Card', STRUCT(12345, 879));

num_affected_rows,num_inserted_rows
1,1


In [0]:
SELECT * FROM deltacatalog.deltadb.invoices_se;

customer_id,invoice_no,price,invoice_date,quantity,payment_method,purchase_details
16,I106485,30.3,2022-12-01,2.0,Debit Card,"List(12345, 879)"
11,I235733,162.64,2022-01-04,4.0,Cash,
12,I272400,5.23,2022-01-10,1.0,Cash,
13,I223729,107.52,2022-08-02,3.0,Cash,
14,I299506,600.17,2021-01-06,1.0,Cash,
15,I106485,30.3,2022-12-01,2.0,Debit Card,
123456789012345,I106485,30.3,2022-12-01,2.0,Debit Card,
123456789012345,I106485,30.3,2022-12-01,2.0,Debit Card,
6,I303349,71.68,2021-06-03,2.0,,
7,I336350,15.15,2021-11-25,1.0,,


In [0]:
ALTER TABLE deltacatalog.deltadb.invoices_se
ALTER COLUMN purchase_details.mall_pin_code TYPE BIGINT;

In [0]:
INSERT INTO deltacatalog.deltadb.invoices_se 
VALUES (17,	'I106485',	30.299999237060547,	'2022-12-01',	2,	'Debit Card', STRUCT(123456789012346, 765));

num_affected_rows,num_inserted_rows
1,1


In [0]:
SELECT * FROM deltacatalog.deltadb.invoices_se;

customer_id,invoice_no,price,invoice_date,quantity,payment_method,purchase_details
17,I106485,30.3,2022-12-01,2.0,Debit Card,"List(123456789012346, 765)"
16,I106485,30.3,2022-12-01,2.0,Debit Card,"List(12345, 879)"
11,I235733,162.64,2022-01-04,4.0,Cash,
12,I272400,5.23,2022-01-10,1.0,Cash,
13,I223729,107.52,2022-08-02,3.0,Cash,
14,I299506,600.17,2021-01-06,1.0,Cash,
15,I106485,30.3,2022-12-01,2.0,Debit Card,
123456789012345,I106485,30.3,2022-12-01,2.0,Debit Card,
123456789012345,I106485,30.3,2022-12-01,2.0,Debit Card,
6,I303349,71.68,2021-06-03,2.0,,


In [0]:
ALTER TABLE deltacatalog.deltadb.invoices_se
ADD COLUMN purchase_details.store_loc STRING;

In [0]:
INSERT INTO deltacatalog.deltadb.invoices_se 
VALUES (17,	'I106485',	30.299999237060547,	'2022-12-01',	2,	'Debit Card', STRUCT(7612, 765, 'ground floor'));

num_affected_rows,num_inserted_rows
1,1


In [0]:
SELECT * FROM deltacatalog.deltadb.invoices_se;

customer_id,invoice_no,price,invoice_date,quantity,payment_method,purchase_details
17,I106485,30.3,2022-12-01,2.0,Debit Card,"List(7612, 765, ground floor)"
17,I106485,30.3,2022-12-01,2.0,Debit Card,"List(123456789012346, 765, null)"
16,I106485,30.3,2022-12-01,2.0,Debit Card,"List(12345, 879, null)"
11,I235733,162.64,2022-01-04,4.0,Cash,
12,I272400,5.23,2022-01-10,1.0,Cash,
13,I223729,107.52,2022-08-02,3.0,Cash,
14,I299506,600.17,2021-01-06,1.0,Cash,
15,I106485,30.3,2022-12-01,2.0,Debit Card,
123456789012345,I106485,30.3,2022-12-01,2.0,Debit Card,
123456789012345,I106485,30.3,2022-12-01,2.0,Debit Card,


In [0]:
INSERT INTO deltacatalog.deltadb.invoices_se 
VALUES (21,	'I106485',	30.299999237060547,	'2022-12-01',	2,	'Debit Card', 
  NAMED_STRUCT(
    'mall_pin_code', 7612, 
    'store_code', 765, 
    'store_loc', 'ground floor', 
    'staff_id', 'ST12736'
  )
);

num_affected_rows,num_inserted_rows
1,1


In [0]:
SELECT * FROM deltacatalog.deltadb.invoices_se;

customer_id,invoice_no,price,invoice_date,quantity,payment_method,purchase_details
21,I106485,30.3,2022-12-01,2.0,Debit Card,"List(7612, 765, ground floor, ST12736)"
17,I106485,30.3,2022-12-01,2.0,Debit Card,"List(7612, 765, ground floor, null)"
17,I106485,30.3,2022-12-01,2.0,Debit Card,"List(123456789012346, 765, null, null)"
16,I106485,30.3,2022-12-01,2.0,Debit Card,"List(12345, 879, null, null)"
11,I235733,162.64,2022-01-04,4.0,Cash,
12,I272400,5.23,2022-01-10,1.0,Cash,
13,I223729,107.52,2022-08-02,3.0,Cash,
14,I299506,600.17,2021-01-06,1.0,Cash,
15,I106485,30.3,2022-12-01,2.0,Debit Card,
123456789012345,I106485,30.3,2022-12-01,2.0,Debit Card,


#### Scenario 4: Column Position Changes

In [0]:
SET spark.databricks.delta.schema.autoMerge.enabled=false;

key,value
spark.databricks.delta.schema.autoMerge.enabled,False


In [0]:
SELECT * FROM deltacatalog.deltadb.invoices_se;

customer_id,invoice_no,price,age,invoice_date,quantity,payment_method,purchase_details
50,I349253,143.36,42.0,2022-10-02,4.0,Cash,
51,I192284,107.52,33.0,2023-01-05,3.0,Cash,
52,I766439,35.84,37.0,2023-01-26,1.0,Cash,
53,I142150,81.32,63.0,2021-01-31,2.0,Cash,
54,I383710,4200.0,35.0,2021-12-12,4.0,Debit Card,
55,I279206,300.08,64.0,2021-11-02,1.0,Credit Card,
21,I106485,30.3,,2022-12-01,2.0,Debit Card,"List(7612, 765, ground floor, ST12736)"
17,I106485,30.3,,2022-12-01,2.0,Debit Card,"List(7612, 765, ground floor, null)"
17,I106485,30.3,,2022-12-01,2.0,Debit Card,"List(123456789012346, 765, null, null)"
16,I106485,30.3,,2022-12-01,2.0,Debit Card,"List(12345, 879, null, null)"


In [0]:
-- ALTER TABLE deltacatalog.deltadb.invoices_se ADD COLUMNS (age INT FIRST);
ALTER TABLE deltacatalog.deltadb.invoices_se ADD COLUMNS (age INT AFTER price)

In [0]:
INSERT INTO deltacatalog.deltadb.invoices_se
SELECT customer_id, invoice_no, price, age, invoice_date, quantity, payment_method, NULL AS purchase_details
FROM PARQUET.`abfss://labdata@dbdeltalabstorageacct.dfs.core.windows.net/invoices/invoices_1_100.parquet`
WHERE customer_id BETWEEN 50 AND 55

num_affected_rows,num_inserted_rows
6,6


In [0]:
SET spark.databricks.delta.schema.autoMerge.enabled = true;

key,value
spark.databricks.delta.schema.autoMerge.enabled,True


In [0]:
SELECT * FROM deltacatalog.deltadb.invoices_se;

customer_id,invoice_no,price,age,invoice_date,quantity,payment_method,purchase_details
50,I349253,143.36,42.0,2022-10-02,4.0,Cash,
51,I192284,107.52,33.0,2023-01-05,3.0,Cash,
52,I766439,35.84,37.0,2023-01-26,1.0,Cash,
53,I142150,81.32,63.0,2021-01-31,2.0,Cash,
54,I383710,4200.0,35.0,2021-12-12,4.0,Debit Card,
55,I279206,300.08,64.0,2021-11-02,1.0,Credit Card,
21,I106485,30.3,,2022-12-01,2.0,Debit Card,"List(7612, 765, ground floor, ST12736)"
17,I106485,30.3,,2022-12-01,2.0,Debit Card,"List(7612, 765, ground floor, null)"
17,I106485,30.3,,2022-12-01,2.0,Debit Card,"List(123456789012346, 765, null, null)"
16,I106485,30.3,,2022-12-01,2.0,Debit Card,"List(12345, 879, null, null)"


In [0]:
INSERT INTO deltacatalog.deltadb.invoices_se
SELECT customer_id, invoice_no, price, age, invoice_date, quantity, payment_method, category, NULL AS purchase_details
FROM PARQUET.`abfss://labdata@dbdeltalabstorageacct.dfs.core.windows.net/invoices/invoices_1_100.parquet`
WHERE customer_id BETWEEN 56 AND 60

org.apache.spark.sql.catalyst.ExtendedAnalysisException: [DATATYPE_MISMATCH.CAST_WITHOUT_SUGGESTION] Cannot resolve "category" due to data type mismatch: cannot cast "STRING" to "STRUCT<mall_pin_code: BIGINT, store_code: INT, store_loc: STRING, staff_id: STRING>". SQLSTATE: 42K09; line 1 pos 0;
'AppendData RelationV2[customer_id#11171L, invoice_no#11172, price#11173, age#11174, invoice_date#11175, quantity#11176, payment_method#11177, purchase_details#11178] deltacatalog.deltadb.invoices_se deltacatalog.deltadb.invoices_se, false, true
+- 'Project [cast(customer_id#11179 as bigint) AS customer_id#11193L, invoice_no#11180 AS invoice_no#11194, price#11185 AS price#11195, age#11182 AS age#11196, invoice_date#11187 AS invoice_date#11197, quantity#11184 AS quantity#11198, payment_method#11186 AS payment_method#11199, cast(category#11183 as struct<mall_pin_code:bigint,store_code:int,store_loc:string,staff_id:string>) AS purchase_details#11200, purchase_details#11170]
   +- Project [customer_

In [0]:
MERGE INTO deltacatalog.deltadb.invoices_se tgt
USING (
  SELECT customer_id, invoice_no, price, age, invoice_date, quantity, payment_method, category, NULL AS purchase_details
  FROM PARQUET.`abfss://labdata@dbdeltalabstorageacct.dfs.core.windows.net/invoices/invoices_1_100.parquet`
  WHERE customer_id BETWEEN 56 AND 60
) src 
ON tgt.customer_id = src.customer_id
WHEN NOT MATCHED THEN
  INSERT *

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
5,0,0,5


In [0]:
SELECT * FROM deltacatalog.deltadb.invoices_se;

customer_id,invoice_no,price,age,invoice_date,quantity,payment_method,purchase_details,category
57,I306026,2400.68,62.0,2022-11-24,4.0,Credit Card,,Shoes
58,I100522,1500.4,26.0,2021-11-06,5.0,Cash,,Clothing
56,I260809,203.3,59.0,2022-07-30,5.0,Cash,,Cosmetics
59,I284099,2400.68,30.0,2021-01-24,4.0,Cash,,Shoes
60,I290521,71.68,26.0,2022-03-22,2.0,Debit Card,,Toys
50,I349253,143.36000061035156,42.0,2022-10-02,4.0,Cash,,
51,I192284,107.5199966430664,33.0,2023-01-05,3.0,Cash,,
52,I766439,35.84000015258789,37.0,2023-01-26,1.0,Cash,,
53,I142150,81.31999969482422,63.0,2021-01-31,2.0,Cash,,
54,I383710,4200.0,35.0,2021-12-12,4.0,Debit Card,,


In [0]:
%python
from pyspark.sql.functions import *
df = (
  spark.read.parquet("abfss://labdata@dbdeltalabstorageacct.dfs.core.windows.net/invoices/invoices_1_100.parquet")
  .filter(col("customer_id").between(1, 10))
  .select("customer_id", "price", "invoice_date")
)
df.write.saveAsTable("deltacatalog.deltadb.invoices_se_spark_df")

In [0]:
%python
df = (
  spark.read.parquet("abfss://labdata@dbdeltalabstorageacct.dfs.core.windows.net/invoices/invoices_1_100.parquet")
  .filter(col("customer_id").between(11, 25))
  .select("customer_id", "price", "invoice_date", "quantity", "payment_method")
)
df.write.mode("append").option("mergeSchema", "true").saveAsTable("deltacatalog.deltadb.invoices_se_spark_df")