In [3]:
from pyspark.sql import SparkSession

# Initialize the SparkSession (step 1)
spark = SparkSession.builder.appName("records").getOrCreate()


25/02/26 20:33:28 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
from pyspark.sql import SparkSession

# Initialize Spark session with configurations
spark = SparkSession.builder \
    .appName("records") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()


25/02/26 20:34:24 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [17]:
spark.sql("DROP TABLE IF EXISTS demo.nyc.taxis_10M_50COLUMNS_where")

DataFrame[]

In [18]:
import os
iceberg_table_dir = "../warehouse/nyc/taxis_10M_50COLUMNS_where"
metadata_dir = f"{iceberg_table_dir}/metadata"
data_dir = f"{iceberg_table_dir}/data"
input_data_dir = f"../input_data"
analysis_info = []
records_before_op = 0

def append_to_file(file_path, msg):
    open_mode = "a"
    if not os.path.exists(file_path):
        open_mode = "w"

    # Open the CSV file in write mode
    with open(file_path, open_mode) as file:
        writer = csv.writer(file)
        
        if open_mode=="w":
            #writing header of the columns
            writer.writerows([list(msg.keys())])    

        row_values = [list(msg.values())]
        # Write the data to the CSV file
        writer.writerows(row_values)

def get_size():
    # List the metadata files
    manifest_pattern = re.compile(r".*-m\d+\.avro$")
    metadata_files = os.listdir(metadata_dir)
    
    # Initialize variables to store the sizes of different types of metadata files
    snap_avro_size = 0
    metadata_json_size = 0
    m_avro_size = 0

    data_dir_size = 0
    # get data dir size
    data_dir_files = os.listdir(data_dir)
    # print(data_dir_files)
    for filename in data_dir_files:
        file_path = os.path.join(data_dir, filename)
        data_dir_size += os.path.getsize(file_path) / 1024  # Convert size to KB
    
    # Iterate through the metadata files and calculate their sizes
    for file in metadata_files:
        file_path = os.path.join(metadata_dir, file)
        file_size_kb = os.path.getsize(file_path) / 1024  # Convert size to KB
        
        if file.startswith("snap-") and file.endswith(".avro"):
            snap_avro_size += file_size_kb
        elif file.endswith(".metadata.json"):
            metadata_json_size += file_size_kb
        elif manifest_pattern.match(file):
            m_avro_size += file_size_kb
    
    # Print the time taken and the sizes of the metadata files
    # print(f"Time taken to read Parquet files: {time_taken:.2f} seconds")
    # print(f"Size of snap-*.avro files: {snap_avro_size:.2f} KB")
    # print(f"Size of *.metadata.json files: {metadata_json_size:.2f} KB")
    # print(f"Size of *m{0-9}{1,}.avro files: {m_avro_size:.2f} KB")

    return {"data_dir_size": data_dir_size,"metadata_size": metadata_json_size,"snapshot_size": snap_avro_size,"manifest_size": m_avro_size}


In [19]:
from pyspark.sql.types import (
    DoubleType, FloatType, LongType, StructType, StructField, 
    StringType, IntegerType, DateType
)

# Define the schema with 50 columns based on the required data types
schema = StructType([
    # StructField("vendor_id", LongType(), True),  # INT
    # StructField("trip_id", LongType(), True),  # INT
    # StructField("trip_distance", FloatType(), True),  # FLOAT
    # StructField("fare_amount", DoubleType(), True),  # DOUBLE
    # StructField("store_and_fwd_flag", StringType(), True)  # STRING
# ] + [
    # Assigning VARCHAR, INT, STRING, and DATE data types in a cyclic pattern
    StructField(f"extra_col_{i}", StringType(), True) if i % 4 == 0 else  # VARCHAR
    StructField(f"extra_col_{i}", IntegerType(), True) if i % 4 == 1 else  # INT
    StructField(f"extra_col_{i}", StringType(), True) if i % 4 == 2 else  # STRING
    StructField(f"extra_col_{i}", DateType(), True)  # DATE
    for i in range(50)
])

# Create an empty DataFrame with the schema
df = spark.createDataFrame([], schema)

# Create the Iceberg table
df.writeTo("demo.nyc.taxis_10M_50COLUMNS_where").create()


In [20]:
df = spark.table("demo.nyc.taxis_10M_50COLUMNS_where")
df.show()

+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+
|extra_col_0|extra_col_1|extra_col_2|extra_col_3|extra_col_4|extra_col_5|extra_col_6|extra_col_7|extra_col_8|extra_col_9|extra_col_10|extra_col_11|extra_col_12|extra_col_13|extra_col_14|extra_col_15|extra_col_16|extra_col_17|extra_col_18|extra_col_19|extra_col_20|extra_col_21|extra_col_22|extra_col_23|extra_col_24|extra_col_25|extra_col_26|extra_col_27|ext

In [10]:
import time, csv
from pyspark.sql.functions import col, when, lit, broadcast
from pyspark.sql import functions as F
import os

input_data_dir = "../input_data"
output_dir = "../output"
analysis_info = []
records_before_op = 0
total_insertion_time = 0

file_type = input("Enter input file type csv or parquet? : ").lower().strip()
input_data_dir = os.path.join(input_data_dir, file_type)
input_files = os.listdir(input_data_dir)

analysis_file = os.path.join(output_dir, f"analysis_info_{file_type}.csv")
if os.path.exists(analysis_file):
    os.remove(analysis_file)

df = spark.table("demo.nyc.taxis_10M_50COLUMNS_where")
records_before_op = df.count()

for file in input_files:
    print(f"Started with file={file}")
    file_path = os.path.join(input_data_dir, file)

    st = time.time()
    if file_type == "parquet":
        df = spark.read.parquet(file_path)
    else:
        df = spark.read.csv(file_path, header=True)
        df = df.select(
            F.col("extra_col_0").cast("long"),
            F.col("extra_col_1").cast("int"),
            F.col("extra_col_2").cast("string"),
            F.col("extra_col_3").cast("date"),
            *[F.col(f"extra_col_{i}").cast("string" if i % 4 == 0 or i % 4 == 2 else "int" if i % 4 == 1 else "date") for i in range(4, 45)]
        )

    rows = df.count()
    
    df.writeTo("demo.nyc.taxis_10M_50COLUMNS_where").append()
    end = time.time() - st
    total_insertion_time += end

    details = {"time_taken": f"{end:.2f} sec", "Operation": f"Inserted {rows} records", "records_after_op": records_before_op + rows}
    records_before_op += rows

    print(f"Inserted {rows} records in {end:.2f} sec.")

# **PRINT INSERTION TIME BEFORE UPDATE**
print(f"\nTotal insertion time: {total_insertion_time:.2f} sec\n")

# === Get Update Percentage from User ===
update_percentage = float(input("Enter update percentage (e.g., 1 for 1%): ").strip()) / 100

df = spark.table("demo.nyc.taxis_10M_50COLUMNS_where")
total_rows = df.count()

num_rows = int(total_rows * update_percentage)
print(f"Updating {num_rows} rows (~{update_percentage*100}%)...")

# Sample random rows ensuring unique IDs (this will be done for each batch)
sampled_df = df.select("extra_col_0").distinct().orderBy(F.rand()).limit(num_rows)
sampled_ids = [row["extra_col_0"] for row in sampled_df.collect()]

# Convert sampled_ids to a DataFrame to use with join
sampled_ids_df = spark.createDataFrame([(id,) for id in sampled_ids], ["extra_col_0"])

# Broadcast the sampled DataFrame to avoid broadcasting large task binary
broadcast_sampled_ids = broadcast(sampled_ids_df)

st = time.time()

# === Using JOIN for Update Operation ===
# Instead of using `withColumn()` and `isin()` for the update, we join the sampled_ids to the original dataframe
updated_df = df.join(broadcast_sampled_ids, on="extra_col_0", how="left_outer") \
    .withColumn(
        "extra_col_1",
        when(col("extra_col_0").isNotNull(), col("extra_col_1") + 10)
        .otherwise(col("extra_col_1"))
    ) \
    .drop("extra_col_0")  # Drop the extra_col_0 after the join since it's no longer needed

# Overwrite the table partitions with the final updated data
updated_df.writeTo("demo.nyc.taxis_10M_50COLUMNS_where").overwritePartitions()

end = time.time() - st

print(f"Updated {num_rows} rows in {end:.2f} sec")


Enter input file type csv or parquet? :  parquet


Started with file=records_1000000_part_10_1740401457.66906.parquet


                                                                                

Inserted 1000000 records in 35.03 sec.
Started with file=records_1000000_part_1_1740398687.6853974.parquet


                                                                                

Inserted 1000000 records in 36.23 sec.
Started with file=records_1000000_part_2_1740398997.7710938.parquet


                                                                                

Inserted 1000000 records in 30.20 sec.
Started with file=records_1000000_part_3_1740399303.6597402.parquet


                                                                                

Inserted 1000000 records in 35.27 sec.
Started with file=records_1000000_part_4_1740399611.4401598.parquet


                                                                                

Inserted 1000000 records in 31.94 sec.
Started with file=records_1000000_part_5_1740399918.8825066.parquet


                                                                                

Inserted 1000000 records in 30.66 sec.
Started with file=records_1000000_part_6_1740400229.5675209.parquet


                                                                                

Inserted 1000000 records in 37.72 sec.
Started with file=records_1000000_part_7_1740400532.7327414.parquet


                                                                                

Inserted 1000000 records in 40.70 sec.
Started with file=records_1000000_part_8_1740400841.6608176.parquet


                                                                                

Inserted 1000000 records in 45.97 sec.
Started with file=records_1000000_part_9_1740401151.339735.parquet


                                                                                

Inserted 1000000 records in 37.14 sec.

Total insertion time: 360.86 sec



Enter update percentage (e.g., 1 for 1%):  1


Updating 100000 rows (~1.0%)...


25/02/26 20:42:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:42:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:42:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:42:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:42:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:42:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:42:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:42:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

AnalysisException: [INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA] Cannot write incompatible data for the table `demo`.`nyc`.`taxis_10M_50COLUMNS_where`: Cannot find data for the output column `extra_col_0`.

In [15]:
import time, csv
from pyspark.sql.functions import col, when, lit, broadcast
from pyspark.sql import functions as F
import os

input_data_dir = "../input_data"
output_dir = "../output"
analysis_info = []
records_before_op = 0
total_insertion_time = 0

file_type = input("Enter input file type csv or parquet? : ").lower().strip()
input_data_dir = os.path.join(input_data_dir, file_type)
input_files = os.listdir(input_data_dir)

analysis_file = os.path.join(output_dir, f"analysis_info_{file_type}.csv")
if os.path.exists(analysis_file):
    os.remove(analysis_file)

df = spark.table("demo.nyc.taxis_10M_50COLUMNS_where")
records_before_op = df.count()

for file in input_files:
    print(f"Started with file={file}")
    file_path = os.path.join(input_data_dir, file)

    st = time.time()
    if file_type == "parquet":
        df = spark.read.parquet(file_path)
    else:
        df = spark.read.csv(file_path, header=True)
        df = df.select(
            F.col("extra_col_0").cast("long"),
            F.col("extra_col_1").cast("int"),
            F.col("extra_col_2").cast("string"),
            F.col("extra_col_3").cast("date"),
            *[F.col(f"extra_col_{i}").cast("string" if i % 4 == 0 or i % 4 == 2 else "int" if i % 4 == 1 else "date") for i in range(4, 45)]
        )

    rows = df.count()
    
    # Writing the records into the existing table
    df.writeTo("demo.nyc.taxis_10M_50COLUMNS_where").append()
    end = time.time() - st
    total_insertion_time += end

    details = {"time_taken": f"{end:.2f} sec", "Operation": f"Inserted {rows} records", "records_after_op": records_before_op + rows}
    records_before_op += rows

    print(f"Inserted {rows} records in {end:.2f} sec.")

# **PRINT INSERTION TIME BEFORE UPDATE**
print(f"\nTotal insertion time: {total_insertion_time:.2f} sec\n")

# === Get Update Percentage from User ===
update_percentage = float(input("Enter update percentage (e.g., 1 for 1%): ").strip()) / 100

df = spark.table("demo.nyc.taxis_10M_50COLUMNS_where")
total_rows = df.count()

num_rows = int(total_rows * update_percentage)
print(f"Updating {num_rows} rows (~{update_percentage*100}%)...")

# Sample random rows ensuring unique IDs (this will be done for each batch)
sampled_df = df.select("extra_col_0").distinct().orderBy(F.rand()).limit(num_rows)
sampled_ids = [row["extra_col_0"] for row in sampled_df.collect()]

# Convert sampled_ids to a DataFrame to use with join
sampled_ids_df = spark.createDataFrame([(id,) for id in sampled_ids], ["extra_col_0"])

# Broadcast the sampled DataFrame to avoid broadcasting large task binary
broadcast_sampled_ids = broadcast(sampled_ids_df)

st = time.time()

# === Using JOIN for Update Operation ===
# Instead of using `withColumn()` and `isin()` for the update, we join the sampled_ids to the original dataframe
updated_df = df.join(broadcast_sampled_ids, on="extra_col_0", how="left_outer") \
    .withColumn(
        "extra_col_1",
        when(col("extra_col_0").isNotNull(), col("extra_col_1") + 10)
        .otherwise(col("extra_col_1"))
    )

# Ensure we keep the necessary column and write it to the table
updated_df = updated_df.dropna(subset=["extra_col_1"])  # Drop rows with NaN if needed

# **Ensure extra_col_0 is part of the final DataFrame for compatibility with the table schema**
updated_df = updated_df.select(
    "extra_col_0",
    "extra_col_1",  # Ensure other columns are selected
    *[col(c) for c in df.columns if c != "extra_col_0"]  # Include all other original columns
)

# Overwrite the table partitions with the final updated data
updated_df.writeTo("demo.nyc.taxis_10M_50COLUMNS_where").overwritePartitions()

end = time.time() - st

print(f"Updated {num_rows} rows in {end:.2f} sec")


Enter input file type csv or parquet? :  parquet


Started with file=records_1000000_part_10_1740401457.66906.parquet


                                                                                

Inserted 1000000 records in 38.04 sec.
Started with file=records_1000000_part_1_1740398687.6853974.parquet


                                                                                

Inserted 1000000 records in 31.85 sec.
Started with file=records_1000000_part_2_1740398997.7710938.parquet


                                                                                

Inserted 1000000 records in 35.17 sec.
Started with file=records_1000000_part_3_1740399303.6597402.parquet


                                                                                

Inserted 1000000 records in 30.31 sec.
Started with file=records_1000000_part_4_1740399611.4401598.parquet


                                                                                

Inserted 1000000 records in 36.82 sec.
Started with file=records_1000000_part_5_1740399918.8825066.parquet


                                                                                

Inserted 1000000 records in 28.93 sec.
Started with file=records_1000000_part_6_1740400229.5675209.parquet


                                                                                

Inserted 1000000 records in 34.69 sec.
Started with file=records_1000000_part_7_1740400532.7327414.parquet


                                                                                

Inserted 1000000 records in 32.40 sec.
Started with file=records_1000000_part_8_1740400841.6608176.parquet


                                                                                

Inserted 1000000 records in 33.85 sec.
Started with file=records_1000000_part_9_1740401151.339735.parquet


                                                                                

Inserted 1000000 records in 30.38 sec.

Total insertion time: 332.44 sec



Enter update percentage (e.g., 1 for 1%):  1


Updating 100000 rows (~1.0%)...


25/02/26 20:51:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:51:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:51:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:51:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:51:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:51:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:51:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 20:51:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

AnalysisException: [INSERT_COLUMN_ARITY_MISMATCH.TOO_MANY_DATA_COLUMNS] Cannot write to `demo`.`nyc`.`taxis_10M_50COLUMNS_where`, the reason is too many data columns:
Table columns: `extra_col_0`, `extra_col_1`, `extra_col_2`, `extra_col_3`, `extra_col_4`, `extra_col_5`, `extra_col_6`, `extra_col_7`, `extra_col_8`, `extra_col_9`, `extra_col_10`, `extra_col_11`, `extra_col_12`, `extra_col_13`, `extra_col_14`, `extra_col_15`, `extra_col_16`, `extra_col_17`, `extra_col_18`, `extra_col_19`, `extra_col_20`, `extra_col_21`, `extra_col_22`, `extra_col_23`, `extra_col_24`, `extra_col_25`, `extra_col_26`, `extra_col_27`, `extra_col_28`, `extra_col_29`, `extra_col_30`, `extra_col_31`, `extra_col_32`, `extra_col_33`, `extra_col_34`, `extra_col_35`, `extra_col_36`, `extra_col_37`, `extra_col_38`, `extra_col_39`, `extra_col_40`, `extra_col_41`, `extra_col_42`, `extra_col_43`, `extra_col_44`, `extra_col_45`, `extra_col_46`, `extra_col_47`, `extra_col_48`, `extra_col_49`.
Data columns: `extra_col_0`, `extra_col_1`, `extra_col_1`, `extra_col_2`, `extra_col_3`, `extra_col_4`, `extra_col_5`, `extra_col_6`, `extra_col_7`, `extra_col_8`, `extra_col_9`, `extra_col_10`, `extra_col_11`, `extra_col_12`, `extra_col_13`, `extra_col_14`, `extra_col_15`, `extra_col_16`, `extra_col_17`, `extra_col_18`, `extra_col_19`, `extra_col_20`, `extra_col_21`, `extra_col_22`, `extra_col_23`, `extra_col_24`, `extra_col_25`, `extra_col_26`, `extra_col_27`, `extra_col_28`, `extra_col_29`, `extra_col_30`, `extra_col_31`, `extra_col_32`, `extra_col_33`, `extra_col_34`, `extra_col_35`, `extra_col_36`, `extra_col_37`, `extra_col_38`, `extra_col_39`, `extra_col_40`, `extra_col_41`, `extra_col_42`, `extra_col_43`, `extra_col_44`, `extra_col_45`, `extra_col_46`, `extra_col_47`, `extra_col_48`, `extra_col_49`.

In [21]:
import time, csv
from pyspark.sql.functions import col, when, lit, broadcast
from pyspark.sql import functions as F
import os

input_data_dir = "../input_data"
output_dir = "../output"
analysis_info = []
records_before_op = 0
total_insertion_time = 0

file_type = input("Enter input file type csv or parquet? : ").lower().strip()
input_data_dir = os.path.join(input_data_dir, file_type)
input_files = os.listdir(input_data_dir)

analysis_file = os.path.join(output_dir, f"analysis_info_{file_type}.csv")
if os.path.exists(analysis_file):
    os.remove(analysis_file)

df = spark.table("demo.nyc.taxis_10M_50COLUMNS_where")
records_before_op = df.count()

for file in input_files:
    print(f"Started with file={file}")
    file_path = os.path.join(input_data_dir, file)

    st = time.time()
    if file_type == "parquet":
        df = spark.read.parquet(file_path)
    else:
        df = spark.read.csv(file_path, header=True)
        df = df.select(
            F.col("extra_col_0").cast("long"),
            F.col("extra_col_1").cast("int"),
            F.col("extra_col_2").cast("string"),
            F.col("extra_col_3").cast("date"),
            *[F.col(f"extra_col_{i}").cast("string" if i % 4 == 0 or i % 4 == 2 else "int" if i % 4 == 1 else "date") for i in range(4, 45)]
        )

    rows = df.count()
    
    df.writeTo("demo.nyc.taxis_10M_50COLUMNS_where").append()
    end = time.time() - st
    total_insertion_time += end

    details = {"time_taken": f"{end:.2f} sec", "Operation": f"Inserted {rows} records", "records_after_op": records_before_op + rows}
    records_before_op += rows

    print(f"Inserted {rows} records in {end:.2f} sec.")

# **PRINT INSERTION TIME BEFORE UPDATE**
print(f"\nTotal insertion time: {total_insertion_time:.2f} sec\n")

# === Get Update Percentage from User ===
update_percentage = float(input("Enter update percentage (e.g., 1 for 1%): ").strip()) / 100

df = spark.table("demo.nyc.taxis_10M_50COLUMNS_where")
total_rows = df.count()

num_rows = int(total_rows * update_percentage)
print(f"Updating {num_rows} rows (~{update_percentage*100}%)...")

# Sample random rows ensuring unique IDs (this will be done for each batch)
sampled_df = df.select("extra_col_0").distinct().orderBy(F.rand()).limit(num_rows)
sampled_ids = [row["extra_col_0"] for row in sampled_df.collect()]

# Convert sampled_ids to a DataFrame to use with join
sampled_ids_df = spark.createDataFrame([(id,) for id in sampled_ids], ["extra_col_0"])

# Broadcast the sampled DataFrame to avoid broadcasting large task binary
broadcast_sampled_ids = broadcast(sampled_ids_df)

st = time.time()

# === Using JOIN for Update Operation ===
# Instead of using `withColumn()` and `isin()` for the update, we join the sampled_ids to the original dataframe
updated_df = df.join(broadcast_sampled_ids, on="extra_col_0", how="left_outer") \
    .withColumn(
        "extra_col_1",
        when(col("extra_col_0").isNotNull(), col("extra_col_1") + 10)
        .otherwise(col("extra_col_1"))
    ) \
    .drop("extra_col_0")  # Drop the extra_col_0 after the join since it's no longer needed

# **Correct Column Selection to Avoid Duplicates**
updated_df = updated_df.select(
    "extra_col_0",  # Include the key column
    "extra_col_1",  # Updated column
    *[col(c) for c in df.columns if c != "extra_col_0"]  # Include all other original columns except extra_col_0
)

# Check the schema to verify the columns
updated_df.printSchema()

# Overwrite the table partitions with the final updated data
updated_df.writeTo("demo.nyc.taxis_10M_50COLUMNS_where").overwritePartitions()

end = time.time() - st

print(f"Updated {num_rows} rows in {end:.2f} sec")


Enter input file type csv or parquet? :  parquet


Started with file=records_1000000_part_10_1740401457.66906.parquet


                                                                                

Inserted 1000000 records in 32.59 sec.
Started with file=records_1000000_part_1_1740398687.6853974.parquet


                                                                                

Inserted 1000000 records in 33.09 sec.
Started with file=records_1000000_part_2_1740398997.7710938.parquet


                                                                                

Inserted 1000000 records in 30.85 sec.
Started with file=records_1000000_part_3_1740399303.6597402.parquet


                                                                                

Inserted 1000000 records in 35.15 sec.
Started with file=records_1000000_part_4_1740399611.4401598.parquet


                                                                                

Inserted 1000000 records in 33.53 sec.
Started with file=records_1000000_part_5_1740399918.8825066.parquet


                                                                                

Inserted 1000000 records in 37.07 sec.
Started with file=records_1000000_part_6_1740400229.5675209.parquet


                                                                                

Inserted 1000000 records in 37.14 sec.
Started with file=records_1000000_part_7_1740400532.7327414.parquet


                                                                                

Inserted 1000000 records in 31.23 sec.
Started with file=records_1000000_part_8_1740400841.6608176.parquet


                                                                                

Inserted 1000000 records in 34.77 sec.
Started with file=records_1000000_part_9_1740401151.339735.parquet


                                                                                

Inserted 1000000 records in 31.15 sec.

Total insertion time: 336.56 sec



Enter update percentage (e.g., 1 for 1%):  1


Updating 100000 rows (~1.0%)...


25/02/26 21:00:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 21:00:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 21:00:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 21:00:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 21:00:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 21:00:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 21:00:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/02/26 21:00:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `extra_col_0` cannot be resolved. Did you mean one of the following? [`extra_col_1`, `demo`.`nyc`.`taxis_10M_50COLUMNS_where`.`extra_col_10`, `demo`.`nyc`.`taxis_10M_50COLUMNS_where`.`extra_col_2`, `demo`.`nyc`.`taxis_10M_50COLUMNS_where`.`extra_col_20`, `demo`.`nyc`.`taxis_10M_50COLUMNS_where`.`extra_col_3`].;
'Project ['extra_col_0, extra_col_1#11318, extra_col_1#11318, extra_col_2#11102, extra_col_3#11103, extra_col_4#11104, extra_col_5#11105, extra_col_6#11106, extra_col_7#11107, extra_col_8#11108, extra_col_9#11109, extra_col_10#11110, extra_col_11#11111, extra_col_12#11112, extra_col_13#11113, extra_col_14#11114, extra_col_15#11115, extra_col_16#11116, extra_col_17#11117, extra_col_18#11118, extra_col_19#11119, extra_col_20#11120, extra_col_21#11121, extra_col_22#11122, ... 27 more fields]
+- Project [extra_col_1#11318, extra_col_2#11102, extra_col_3#11103, extra_col_4#11104, extra_col_5#11105, extra_col_6#11106, extra_col_7#11107, extra_col_8#11108, extra_col_9#11109, extra_col_10#11110, extra_col_11#11111, extra_col_12#11112, extra_col_13#11113, extra_col_14#11114, extra_col_15#11115, extra_col_16#11116, extra_col_17#11117, extra_col_18#11118, extra_col_19#11119, extra_col_20#11120, extra_col_21#11121, extra_col_22#11122, extra_col_23#11123, extra_col_24#11124, ... 25 more fields]
   +- Project [extra_col_0#11100, CASE WHEN isnotnull(extra_col_0#11100) THEN (extra_col_1#11101 + 10) ELSE extra_col_1#11101 END AS extra_col_1#11318, extra_col_2#11102, extra_col_3#11103, extra_col_4#11104, extra_col_5#11105, extra_col_6#11106, extra_col_7#11107, extra_col_8#11108, extra_col_9#11109, extra_col_10#11110, extra_col_11#11111, extra_col_12#11112, extra_col_13#11113, extra_col_14#11114, extra_col_15#11115, extra_col_16#11116, extra_col_17#11117, extra_col_18#11118, extra_col_19#11119, extra_col_20#11120, extra_col_21#11121, extra_col_22#11122, extra_col_23#11123, ... 26 more fields]
      +- Project [extra_col_0#11100, extra_col_1#11101, extra_col_2#11102, extra_col_3#11103, extra_col_4#11104, extra_col_5#11105, extra_col_6#11106, extra_col_7#11107, extra_col_8#11108, extra_col_9#11109, extra_col_10#11110, extra_col_11#11111, extra_col_12#11112, extra_col_13#11113, extra_col_14#11114, extra_col_15#11115, extra_col_16#11116, extra_col_17#11117, extra_col_18#11118, extra_col_19#11119, extra_col_20#11120, extra_col_21#11121, extra_col_22#11122, extra_col_23#11123, ... 26 more fields]
         +- Join LeftOuter, (extra_col_0#11100 = extra_col_0#11266)
            :- SubqueryAlias demo.nyc.taxis_10M_50COLUMNS_where
            :  +- RelationV2[extra_col_0#11100, extra_col_1#11101, extra_col_2#11102, extra_col_3#11103, extra_col_4#11104, extra_col_5#11105, extra_col_6#11106, extra_col_7#11107, extra_col_8#11108, extra_col_9#11109, extra_col_10#11110, extra_col_11#11111, extra_col_12#11112, extra_col_13#11113, extra_col_14#11114, extra_col_15#11115, extra_col_16#11116, extra_col_17#11117, extra_col_18#11118, extra_col_19#11119, extra_col_20#11120, extra_col_21#11121, extra_col_22#11122, extra_col_23#11123, ... 26 more fields] demo.nyc.taxis_10M_50COLUMNS_where demo.nyc.taxis_10M_50COLUMNS_where
            +- ResolvedHint (strategy=broadcast)
               +- LogicalRDD [extra_col_0#11266], false
