In [15]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("records").getOrCreate()


ConnectionRefusedError: [Errno 111] Connection refused

In [8]:
import time
from pyspark.sql.functions import col, when, rand, expr
from pyspark.sql import SparkSession

# Initialize Spark session with memory optimizations
spark = SparkSession.builder \
    .appName("Optimized Iceberg Update") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.memoryOverhead", "2g") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.sql.autoBroadcastJoinThreshold", "-1") \
    .getOrCreate()

# Load the Iceberg table
df = spark.table("demo.nyc.taxis_10M_50COLUMNS")

# Debugging: Print total row count
total_rows = df.count()
print(f"Total rows in table: {total_rows}")

# Get update percentage from user
update_percentage = float(input("Enter update percentage (e.g., 1 for 1%): ").strip()) / 100
num_rows = max(1, int(total_rows * update_percentage))  # Ensure at least 1 row is updated

# Define batch size
batch_size = 500  # Reduced batch size to optimize memory usage
num_batches = max(1, (num_rows // batch_size) + (1 if num_rows % batch_size else 0))  # Ensure at least 1 batch

print(f"Updating {num_rows} rows (~{update_percentage*100}%) in {num_batches} batches...")

st = time.time()

for batch_num in range(num_batches):
    print(f"Processing batch {batch_num + 1}/{num_batches}...")

    # Select a batch of unique row IDs without collecting to the driver
    sampled_df = df.select("extra_col_0").distinct().orderBy(rand()).limit(min(batch_size, num_rows))

    # Merge updates instead of overwriting partitions
    updated_batch = df.alias("source").join(sampled_df.alias("updates"), "extra_col_0", "inner") \
        .withColumn("extra_col_1", col("extra_col_1") + 10)

    # Optimize writing using append mode or merge
    updated_batch.repartition(100).writeTo("demo.nyc.taxis_10M_50COLUMNS").append()

    print(f"Batch {batch_num + 1}/{num_batches} updated.")

end = time.time() - st
print(f"\nTotal update time for {num_batches} batches: {end:.2f} sec")


Total rows in table: 10004500


Enter update percentage (e.g., 1 for 1%):  1


Updating 100045 rows (~1.0%) in 201 batches...
Processing batch 1/201...


25/03/04 09:17:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:17:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:17:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:17:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:17:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:17:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:17:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:17:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

Batch 1/201 updated.
Processing batch 2/201...


25/03/04 09:20:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:20:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:20:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:20:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:20:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:20:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:20:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:20:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

Batch 2/201 updated.
Processing batch 3/201...


25/03/04 09:23:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:23:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:23:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:23:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:23:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:23:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:23:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:23:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

Batch 3/201 updated.
Processing batch 4/201...


25/03/04 09:26:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:26:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:26:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:26:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:26:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:26:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:26:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:26:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

Batch 4/201 updated.
Processing batch 5/201...


25/03/04 09:30:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:30:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:30:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:30:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:30:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:30:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:30:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:30:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

Batch 5/201 updated.
Processing batch 6/201...


25/03/04 09:32:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:32:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:32:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:32:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:32:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:32:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                

Batch 6/201 updated.
Processing batch 7/201...


25/03/04 09:36:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:36:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:36:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:36:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:36:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:36:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:36:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:36:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

Batch 7/201 updated.
Processing batch 8/201...


25/03/04 09:39:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:39:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:39:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:39:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:39:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:39:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:39:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:39:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

Batch 8/201 updated.
Processing batch 9/201...


25/03/04 09:42:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:42:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:42:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:42:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:42:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:42:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:42:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 09:42:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

Batch 9/201 updated.
Processing batch 10/201...






25/03/04 09:45:04 ERROR Executor: Exception in task 231.0 in stage 305.0 (TID 10072)
java.lang.OutOfMemoryError: Java heap space
25/03/04 09:45:04 WARN TaskSetManager: Lost task 231.0 in stage 305.0 (TID 10072) (5e0ed98abed9 executor driver): java.lang.OutOfMemoryError: Java heap space

25/03/04 09:45:04 ERROR TaskSetManager: Task 231 in stage 305.0 failed 1 times; aborting job
25/03/04 09:45:04 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker for task 231.0 in stage 305.0 (TID 10072),5,main]
java.lang.OutOfMemoryError: Java heap space
25/03/04 09:45:04 WARN DiskBlockObjectWriter: Error deleting /tmp/blockmgr-244b373c-a8e3-4302-95fa-49829b358285/15/temp_shuffle_ed1e3a84-2ee4-4891-badc-0569d2bb0869
25/03/04 09:45:04 WARN DiskBlockObjectWriter: Error deleting /tmp/blockmgr-244b373c-a8e3-4302-95fa-49829b358285/3f/temp_shuffle_6327d217-3c86-4b4b-9eeb-a525b91abbb9
25/03/04 09:45:04 WARN DiskBlockObjectWriter: Error deleting /tmp/blockmgr-2

Py4JJavaError: An error occurred while calling o497.append.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 231 in stage 305.0 failed 1 times, most recent failure: Lost task 231.0 in stage 305.0 (TID 10072) (5e0ed98abed9 executor driver): java.lang.OutOfMemoryError: Java heap space

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: java.lang.OutOfMemoryError: Java heap space


In [2]:
import time
from pyspark.sql.functions import expr, rand
from pyspark.sql import SparkSession

# Initialize Spark session with memory optimizations
spark = SparkSession.builder \
    .appName("Optimized Iceberg Update") \
    .config("spark.driver.memory", "12g") \
    .config("spark.executor.memory", "12g") \
    .config("spark.executor.memoryOverhead", "4g") \
    .config("spark.executor.cores", "4") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.memory.fraction", "0.7") \
    .config("spark.sql.shuffle.partitions", "100") \
    .config("spark.sql.autoBroadcastJoinThreshold", "50MB") \
    .getOrCreate()

# Load the Iceberg table
df = spark.table("demo.nyc.taxis_10M_50COLUMNS")

# Debugging: Print total row count
total_rows = df.count()
print(f"Total rows in table: {total_rows}")

# Get update percentage from user
update_percentage = float(input("Enter update percentage (e.g., 1 for 1%): ").strip()) / 100
num_rows = max(1, int(total_rows * update_percentage))  # Ensure at least 1 row is updated

# Define batch size dynamically
batch_size = min(500, num_rows // 10)  # Adjust batch size based on total updates
num_batches = max(1, (num_rows // batch_size) + (1 if num_rows % batch_size else 0))  # Ensure at least 1 batch

print(f"Updating {num_rows} rows (~{update_percentage*100}%) in {num_batches} batches...")

st = time.time()

for batch_num in range(num_batches):
    print(f"Processing batch {batch_num + 1}/{num_batches}...")
    
    # Select a batch of unique row IDs without collecting to the driver
    sampled_df = df.select("extra_col_0").distinct().orderBy(rand()).limit(min(batch_size, num_rows))
    
    # Use MERGE INTO for efficient updates
    spark.sql(f'''
        MERGE INTO demo.nyc.taxis_10M_50COLUMNS AS source
        USING (SELECT extra_col_0 FROM sampled_df) AS updates
        ON source.extra_col_0 = updates.extra_col_0
        WHEN MATCHED THEN
        UPDATE SET source.extra_col_1 = source.extra_col_1 + 10
    ''')

    print(f"Batch {batch_num + 1}/{num_batches} updated.")

end = time.time() - st
print(f"\nTotal update time for {num_batches} batches: {end:.2f} sec")


25/03/04 09:56:19 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Total rows in table: 10009002


Enter update percentage (e.g., 1 for 1%):  1


Updating 100090 rows (~1.0%) in 201 batches...
Processing batch 1/201...


25/03/04 09:56:30 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `sampled_df` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 3 pos 39;
'MergeIntoTable ('source.extra_col_0 = 'updates.extra_col_0), [updateaction(None, assignment('source.extra_col_1, ('source.extra_col_1 + 10)))]
:- SubqueryAlias source
:  +- SubqueryAlias demo.nyc.taxis_10M_50COLUMNS
:     +- RelationV2[extra_col_0#165, extra_col_1#166, extra_col_2#167, extra_col_3#168, extra_col_4#169, extra_col_5#170, extra_col_6#171, extra_col_7#172, extra_col_8#173, extra_col_9#174, extra_col_10#175, extra_col_11#176, extra_col_12#177, extra_col_13#178, extra_col_14#179, extra_col_15#180, extra_col_16#181, extra_col_17#182, extra_col_18#183, extra_col_19#184, extra_col_20#185, extra_col_21#186, extra_col_22#187, extra_col_23#188, ... 26 more fields] demo.nyc.taxis_10M_50COLUMNS demo.nyc.taxis_10M_50COLUMNS
+- 'SubqueryAlias updates
   +- 'Project ['extra_col_0]
      +- 'UnresolvedRelation [sampled_df], [], false


In [3]:
import time
from pyspark.sql.functions import expr, rand
from pyspark.sql import SparkSession

# Initialize Spark session with memory optimizations
spark = SparkSession.builder \
    .appName("Optimized Iceberg Update") \
    .config("spark.driver.memory", "12g") \
    .config("spark.executor.memory", "12g") \
    .config("spark.executor.memoryOverhead", "4g") \
    .config("spark.executor.cores", "4") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.memory.fraction", "0.7") \
    .config("spark.sql.shuffle.partitions", "100") \
    .config("spark.sql.autoBroadcastJoinThreshold", "50MB") \
    .getOrCreate()

# Load the Iceberg table
df = spark.table("demo.nyc.taxis_10M_50COLUMNS")

# Debugging: Print total row count
total_rows = df.count()
print(f"Total rows in table: {total_rows}")

# Get update percentage from user
update_percentage = float(input("Enter update percentage (e.g., 1 for 1%): ").strip()) / 100
num_rows = max(1, int(total_rows * update_percentage))  # Ensure at least 1 row is updated

# Define batch size dynamically
batch_size = min(500, num_rows // 10)  # Adjust batch size based on total updates
num_batches = max(1, (num_rows // batch_size) + (1 if num_rows % batch_size else 0))  # Ensure at least 1 batch

print(f"Updating {num_rows} rows (~{update_percentage*100}%) in {num_batches} batches...")

st = time.time()

for batch_num in range(num_batches):
    print(f"Processing batch {batch_num + 1}/{num_batches}...")

    # Select a batch of unique row IDs without collecting to the driver
    sampled_df = df.select("extra_col_0").distinct().orderBy(rand()).limit(min(batch_size, num_rows))

    # Register the sampled DataFrame as a temporary view
    sampled_df.createOrReplaceTempView("sampled_temp_view")

    # Use MERGE INTO for efficient updates
    spark.sql(f'''
        MERGE INTO demo.nyc.taxis_10M_50COLUMNS AS source
        USING sampled_temp_view AS updates
        ON source.extra_col_0 = updates.extra_col_0
        WHEN MATCHED THEN
        UPDATE SET source.extra_col_1 = source.extra_col_1 + 10
    ''')

    print(f"Batch {batch_num + 1}/{num_batches} updated.")

end = time.time() - st
print(f"\nTotal update time for {num_batches} batches: {end:.2f} sec")


Total rows in table: 10009002


Enter update percentage (e.g., 1 for 1%):  1


Updating 100090 rows (~1.0%) in 201 batches...
Processing batch 1/201...


AnalysisException: [INVALID_NON_DETERMINISTIC_EXPRESSIONS] The operator expects a deterministic expression, but the actual expression is "(extra_col_0 = extra_col_0)", "exists(extra_col_0)".; line 2 pos 8;
ReplaceData
:  +- Filter (outer(extra_col_0#380) = extra_col_0#490)
:     +- SubqueryAlias updates
:        +- SubqueryAlias sampled_temp_view
:           +- View (`sampled_temp_view`, [extra_col_0#490])
:              +- GlobalLimit 500
:                 +- LocalLimit 500
:                    +- Project [extra_col_0#490]
:                       +- Sort [_nondeterministic#379 ASC NULLS FIRST], true
:                          +- Project [extra_col_0#490, rand(-1231214445441557296) AS _nondeterministic#379]
:                             +- Deduplicate [extra_col_0#490]
:                                +- Project [extra_col_0#490]
:                                   +- SubqueryAlias demo.nyc.taxis_10M_50COLUMNS
:                                      +- RelationV2[extra_col_0#490, extra_col_1#491, extra_col_2#492, extra_col_3#493, extra_col_4#494, extra_col_5#495, extra_col_6#496, extra_col_7#497, extra_col_8#498, extra_col_9#499, extra_col_10#500, extra_col_11#501, extra_col_12#502, extra_col_13#503, extra_col_14#504, extra_col_15#505, extra_col_16#506, extra_col_17#507, extra_col_18#508, extra_col_19#509, extra_col_20#510, extra_col_21#511, extra_col_22#512, extra_col_23#513, ... 26 more fields] demo.nyc.taxis_10M_50COLUMNS demo.nyc.taxis_10M_50COLUMNS
+- MergeRows[extra_col_0#438, extra_col_1#439, extra_col_2#440, extra_col_3#441, extra_col_4#442, extra_col_5#443, extra_col_6#444, extra_col_7#445, extra_col_8#446, extra_col_9#447, extra_col_10#448, extra_col_11#449, extra_col_12#450, extra_col_13#451, extra_col_14#452, extra_col_15#453, extra_col_16#454, extra_col_17#455, extra_col_18#456, extra_col_19#457, extra_col_20#458, extra_col_21#459, extra_col_22#460, extra_col_23#461, ... 27 more fields]
   +- Join LeftOuter, (extra_col_0#380 = extra_col_0#215), leftHint=(strategy=no_broadcast_and_replication)
      :- Project [extra_col_0#380, extra_col_1#381, extra_col_2#382, extra_col_3#383, extra_col_4#384, extra_col_5#385, extra_col_6#386, extra_col_7#387, extra_col_8#388, extra_col_9#389, extra_col_10#390, extra_col_11#391, extra_col_12#392, extra_col_13#393, extra_col_14#394, extra_col_15#395, extra_col_16#396, extra_col_17#397, extra_col_18#398, extra_col_19#399, extra_col_20#400, extra_col_21#401, extra_col_22#402, extra_col_23#403, ... 29 more fields]
      :  +- RelationV2[extra_col_0#380, extra_col_1#381, extra_col_2#382, extra_col_3#383, extra_col_4#384, extra_col_5#385, extra_col_6#386, extra_col_7#387, extra_col_8#388, extra_col_9#389, extra_col_10#390, extra_col_11#391, extra_col_12#392, extra_col_13#393, extra_col_14#394, extra_col_15#395, extra_col_16#396, extra_col_17#397, extra_col_18#398, extra_col_19#399, extra_col_20#400, extra_col_21#401, extra_col_22#402, extra_col_23#403, ... 27 more fields] demo.nyc.taxis_10M_50COLUMNS demo.nyc.taxis_10M_50COLUMNS
      +- Project [extra_col_0#215, true AS __row_from_source#437]
         +- SubqueryAlias updates
            +- SubqueryAlias sampled_temp_view
               +- View (`sampled_temp_view`, [extra_col_0#215])
                  +- GlobalLimit 500
                     +- LocalLimit 500
                        +- Project [extra_col_0#215]
                           +- Sort [_nondeterministic#379 ASC NULLS FIRST], true
                              +- Project [extra_col_0#215, rand(-1231214445441557296) AS _nondeterministic#379]
                                 +- Deduplicate [extra_col_0#215]
                                    +- Project [extra_col_0#215]
                                       +- SubqueryAlias demo.nyc.taxis_10M_50COLUMNS
                                          +- RelationV2[extra_col_0#215, extra_col_1#216, extra_col_2#217, extra_col_3#218, extra_col_4#219, extra_col_5#220, extra_col_6#221, extra_col_7#222, extra_col_8#223, extra_col_9#224, extra_col_10#225, extra_col_11#226, extra_col_12#227, extra_col_13#228, extra_col_14#229, extra_col_15#230, extra_col_16#231, extra_col_17#232, extra_col_18#233, extra_col_19#234, extra_col_20#235, extra_col_21#236, extra_col_22#237, extra_col_23#238, ... 26 more fields] demo.nyc.taxis_10M_50COLUMNS demo.nyc.taxis_10M_50COLUMNS


In [12]:
from pyspark.sql import SparkSession
import time

# Initialize Spark session with Iceberg support
spark = SparkSession.builder \
    .appName("Iceberg Merge") \
    .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.demo.type", "hadoop") \
    .config("spark.sql.catalog.demo.warehouse", "s3://your-bucket/warehouse") \
    .getOrCreate()

# Load the Iceberg table
df = spark.read.format("iceberg").load("demo.nyc.taxis_10M_50COLUMNS")

# Get user input for update percentage
update_percentage = float(input("Enter the percentage of records to update (0-100): ")) / 100

# Calculate batch size
num_rows = df.count()
batch_size = int(num_rows * update_percentage)
st = time.time()

# Select distinct values of extra_col_0 for sampling
sampled_df = df.select("extra_col_0").distinct().orderBy("extra_col_0").limit(batch_size)

# Register as a temporary view
sampled_df.createOrReplaceTempView("sampled_temp_view")

# Count records before update for verification
pre_update_count = df.filter(df.extra_col_0.isin([row.extra_col_0 for row in sampled_df.collect()])).count()

# Perform MERGE INTO operation
spark.sql('''
    MERGE INTO demo.nyc.taxis_10M_50COLUMNS AS source
    USING sampled_temp_view AS updates
    ON source.extra_col_0 = updates.extra_col_0
    WHEN MATCHED THEN
    UPDATE SET source.extra_col_1 = source.extra_col_1 + 10
''')

# Count records after update
post_update_count = df.filter(df.extra_col_0.isin([row.extra_col_0 for row in sampled_df.collect()])).count()

# Calculate the number of records updated
updated_count = post_update_count - pre_update_count

print(f"Number of records updated: {updated_count}")
print("Update completed successfully.")
print(f"Time taken: {time.time() - st} seconds")


Enter the percentage of records to update (0-100):  1


25/03/04 10:14:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 10:14:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 10:14:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 10:14:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 10:14:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 10:14:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 10:14:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 10:14:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/04 10:16:43 WARN DAGScheduler: Broadcasting large task bin



25/03/04 10:20:42 ERROR Utils: uncaught error in thread Spark Context Cleaner, stopping SparkContext
java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.ContextCleaner$$Lambda$865/0x00000008406ee840.get$Lambda(Unknown Source)
	at java.base/java.lang.invoke.DirectMethodHandle$Holder.invokeStatic(DirectMethodHandle$Holder)
	at java.base/java.lang.invoke.Invokers$Holder.linkToTargetMethod(Invokers$Holder)
	at org.apache.spark.ContextCleaner.$anonfun$keepCleaning$1(ContextCleaner.scala:195)
	at org.apache.spark.ContextCleaner$$Lambda$781/0x00000008406a8c40.apply$mcV$sp(Unknown Source)
	at org.apache.spark.util.Utils$.tryOrStopSparkContext(Utils.scala:1356)
	at org.apache.spark.ContextCleaner.org$apache$spark$ContextCleaner$$keepCleaning(ContextCleaner.scala:189)
	at org.apache.spark.ContextCleaner$$anon$1.run(ContextCleaner.scala:79)
25/03/04 10:20:43 ERROR Utils: throw uncaught fatal error in thread Spark Context Cleaner
java.lang.OutOfMemoryError: Java heap space
	at org.apa



Exception in thread "RemoteBlock-temp-file-clean-thread" java.lang.OutOfMemoryError: Java heap space
25/03/04 10:21:08 WARN Utils: Suppressing exception in finally: Cannot return unfinished footer.
java.lang.IllegalStateException: Cannot return unfinished footer.
	at org.apache.iceberg.shaded.org.apache.parquet.Preconditions.checkState(Preconditions.java:156)
	at org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileWriter.getFooter(ParquetFileWriter.java:1386)
	at org.apache.iceberg.parquet.ParquetWriter.metrics(ParquetWriter.java:144)
	at org.apache.iceberg.io.DataWriter.close(DataWriter.java:90)
	at org.apache.iceberg.io.RollingFileWriter.closeCurrentWriter(RollingFileWriter.java:122)
	at org.apache.iceberg.io.RollingFileWriter.close(RollingFileWriter.java:147)
	at org.apache.iceberg.io.RollingDataWriter.close(RollingDataWriter.java:32)
	at org.apache.iceberg.spark.source.SparkWrite$UnpartitionedDataWriter.close(SparkWrite.java:747)
	at org.apache.spark.sql.execution.dataso

Py4JJavaError: An error occurred while calling o39.sql.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 49.0 failed 1 times, most recent failure: Lost task 1.0 in stage 49.0 (TID 3210) (5e0ed98abed9 executor driver): java.lang.OutOfMemoryError: Java heap space

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:385)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:359)
	at org.apache.spark.sql.execution.datasources.v2.ReplaceDataExec.writeWithV2(WriteToDataSourceV2Exec.scala:271)
	at org.apache.spark.sql.execution.datasources.v2.V2ExistingTableWriteExec.run(WriteToDataSourceV2Exec.scala:337)
	at org.apache.spark.sql.execution.datasources.v2.V2ExistingTableWriteExec.run$(WriteToDataSourceV2Exec.scala:336)
	at org.apache.spark.sql.execution.datasources.v2.ReplaceDataExec.run(WriteToDataSourceV2Exec.scala:271)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.Dataset.<init>(Dataset.scala:220)
	at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:100)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:638)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:629)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:659)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.OutOfMemoryError: Java heap space


25/03/04 10:21:09 ERROR Executor: Exception in task 0.0 in stage 49.0 (TID 3209)
java.lang.OutOfMemoryError: Java heap space
25/03/04 10:21:09 ERROR SparkUncaughtExceptionHandler: [Container in shutdown] Uncaught exception in thread Thread[Executor task launch worker for task 0.0 in stage 49.0 (TID 3209),5,main]
java.lang.OutOfMemoryError: Java heap space
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539

In [13]:
from pyspark.sql import SparkSession
import time

# Initialize Spark session with Iceberg support
spark = SparkSession.builder \
    .appName("Iceberg Merge") \
    .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.demo.type", "hadoop") \
    .config("spark.sql.catalog.demo.warehouse", "s3://your-bucket/warehouse") \
    .getOrCreate()

# Load the Iceberg table
df = spark.read.format("iceberg").load("demo.nyc.taxis_10M_50COLUMNS")

# Get user input for update percentage
update_percentage = float(input("Enter the percentage of records to update (0-100): ")) / 100

# Calculate batch size
num_rows = df.count()
batch_size = int(num_rows * update_percentage)
st = time.time()

# Select distinct values of extra_col_0 for sampling
sampled_df = df.select("extra_col_0").distinct().orderBy("extra_col_0").limit(batch_size)

# Register as a temporary view
sampled_df.createOrReplaceTempView("sampled_temp_view")

# Perform MERGE INTO operation
spark.sql('''
    MERGE INTO demo.nyc.taxis_10M_50COLUMNS AS source
    USING sampled_temp_view AS updates
    ON source.extra_col_0 = updates.extra_col_0
    WHEN MATCHED THEN
    UPDATE SET source.extra_col_1 = source.extra_col_1 + 10
''')

print("Update completed successfully.")
print(f"Time taken: {time.time() - st} seconds")


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


Py4JError: SparkSession$ does not exist in the JVM

In [5]:
df = spark.sql("SELECT snapshot_id, made_current_at FROM demo.nyc.taxis_10M_50COLUMNS.history ORDER BY made_current_at DESC")

df.show()

+-------------------+--------------------+
|        snapshot_id|     made_current_at|
+-------------------+--------------------+
| 105668051919482591|2025-03-04 10:00:...|
|6814201661817395182|2025-03-04 09:42:...|
|6521404266517949867|2025-03-04 09:39:...|
|7604542288563918334|2025-03-04 09:36:...|
| 552092269728169030|2025-03-04 09:33:...|
|2209669498964280618|2025-03-04 09:30:...|
|7674319906980595297|2025-03-04 09:27:...|
|5468151910761078557|2025-03-04 09:23:...|
|7947044984117739091|2025-03-04 09:20:...|
|5673072808774441768|2025-03-04 09:17:...|
|1470626163957892397|2025-03-04 09:09:...|
|7160248633578306637|2025-03-04 09:06:...|
|5174434347034670474|2025-03-04 08:47:...|
|1794110359935204866|2025-03-04 08:44:...|
|6937317397142763324|2025-03-04 08:41:...|
|3089841895765010104|2025-03-04 08:38:...|
|7127818708494625786|2025-03-04 08:34:...|
|2707916599315632908|2025-03-04 08:30:...|
| 497229154627925394|2025-03-04 08:25:...|
|7776288292023736999|2025-03-04 08:13:...|
+----------

In [8]:
df=spark.sql("SELECT * FROM demo.nyc.taxis_10M_50COLUMNS.snapshots ORDER BY committed_at desc")
df.show()

+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|        committed_at|        snapshot_id|          parent_id|operation|       manifest_list|             summary|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|2025-03-04 10:00:...| 105668051919482591|6814201661817395182|overwrite|s3://warehouse/ny...|{spark.app.id -> ...|
|2025-03-04 09:42:...|6814201661817395182|6521404266517949867|   append|s3://warehouse/ny...|{spark.app.id -> ...|
|2025-03-04 09:39:...|6521404266517949867|7604542288563918334|   append|s3://warehouse/ny...|{spark.app.id -> ...|
|2025-03-04 09:36:...|7604542288563918334| 552092269728169030|   append|s3://warehouse/ny...|{spark.app.id -> ...|
|2025-03-04 09:33:...| 552092269728169030|2209669498964280618|   append|s3://warehouse/ny...|{spark.app.id -> ...|
|2025-03-04 09:30:...|2209669498964280618|7674319906980595297|   append|s3://war