In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("records").getOrCreate()

25/02/26 12:58:11 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# Switch to the 'nyc' database
spark.sql("USE nyc")

# List all tables in the 'nyc' database
spark.sql("SHOW TABLES").show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|      nyc|           taxis_100|      false|
|      nyc|     taxis_100M_time|      false|
|      nyc|taxis_1000_50COLUMNS|      false|
|      nyc|            taxis_1B|      false|
|      nyc|            taxis_1L|      false|
|      nyc|          taxis_1000|      false|
|      nyc|            taxis_10|      false|
|      nyc| taxis_10M_50COLUMNS|      false|
|      nyc|   taxis_1L_5COLUMNS|      false|
|      nyc|         taxis_10000|      false|
|      nyc|            taxis_1M|      false|
|      nyc|          taxis_1L_5|      false|
|      nyc|          taxis_10_M|      false|
|      nyc|taxis_1000_50COLU...|      false|
|      nyc|  taxis_10_50COLUMNS|      false|
|      nyc|           taxis_10K|      false|
|      nyc|taxis_100000_50CO...|      false|
|      nyc|            taxis_1K|      false|
|      nyc|           taxis_10L|      false|
|      nyc

In [8]:
import time
import random
from datetime import datetime, timedelta
from pyspark.sql import functions as F

# Set the configuration to avoid truncation warning
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)  # Adjust as needed

# Define table name
table_name = "demo.nyc.taxis_100_50COLUMNS_Update"  # Adjust table name accordingly

# Get total records before update
start_time = time.time()
df = spark.table(table_name)
total_records_before = df.count()
print(f"Total records before update: {total_records_before}")

# Ask user input for update percentage
update_percentage = float(input("Enter percentage of records to update: "))

# Calculate records to update
num_records_to_update = max(1, int((update_percentage / 100) * total_records_before))
print(f"Updating {num_records_to_update} records.")

# Step 1: Get table schema (cached once)
schema_df = spark.sql(f"DESCRIBE {table_name}").collect()
schema_dict = {row["col_name"]: row["data_type"] for row in schema_df if not row["col_name"].startswith("#")}
columns = list(schema_dict.keys())

# Step 2: Generate random updates while respecting column types
def generate_random_value(data_type):
    if "int" in data_type.lower():
        return random.randint(1, 10000)  # Generate random integer for INT columns
    elif "string" in data_type.lower():
        return f"Random_{random.randint(100, 999)}"  # Generate random string for STRING columns
    elif "date" in data_type.lower():
        return (datetime.today() - timedelta(days=random.randint(0, 365))).strftime('%Y-%m-%d')  # Generate random date in YYYY-MM-DD format
    else:
        return None  # Default case for unsupported data types

# Step 3: Generate a list of sampled records to update
# Sample random rows ensuring unique IDs
sampled_df = df.select("extra_col_0").distinct().orderBy(F.rand()).limit(num_records_to_update)
sampled_ids = [row["extra_col_0"] for row in sampled_df.collect()]

# Step 4: Update columns using the DataFrame API
update_df = df

for col_name in columns:
    data_type = schema_dict[col_name]
    if col_name == "extra_col_0":  # Do not update the primary key
        continue
    random_value = generate_random_value(data_type)

    # Apply the update using `withColumn`
    if random_value is not None:
        update_df = update_df.withColumn(
            col_name,
            F.when(F.col("extra_col_0").isin(sampled_ids), F.lit(random_value).cast(schema_dict[col_name])).otherwise(F.col(col_name))
        )

# Step 5: Write back the updated DataFrame
update_df.write.mode("overwrite").saveAsTable(table_name)

# Step 6: Print final output
end_time = time.time()
print(f"✅ Successfully updated {num_records_to_update} records!")
print(f"⏳ Total time taken for update: {round(end_time - start_time, 2)} seconds.")


Total records before update: 100


Enter percentage of records to update:  1


Updating 1 records.
✅ Successfully updated 1 records!
⏳ Total time taken for update: 7.18 seconds.


In [12]:
import time
import random
from datetime import datetime, timedelta
from pyspark.sql import functions as F

# Set the configuration to avoid truncation warning
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)  # Adjust as needed

# Define table name
table_name = "demo.nyc.taxis_100_50COLUMNS_Update"  # Adjust table name accordingly

# Get total records before update
start_time = time.time()
df = spark.table(table_name)
total_records_before = df.count()
print(f"Total records before update: {total_records_before}")

# Ask user input for update percentage
update_percentage = float(input("Enter percentage of records to update: "))

# Calculate records to update
num_records_to_update = max(1, int((update_percentage / 100) * total_records_before))
print(f"Updating {num_records_to_update} records.")

# Step 1: Get table schema (cached once)
schema_df = spark.sql(f"DESCRIBE {table_name}").collect()
schema_dict = {row["col_name"]: row["data_type"] for row in schema_df if not row["col_name"].startswith("#")}
columns = list(schema_dict.keys())

# Step 2: Generate random updates while respecting column types
def generate_random_value(data_type):
    if "int" in data_type.lower():
        return random.randint(1, 10000)  # Generate random integer for INT columns
    elif "string" in data_type.lower():
        return f"Random_{random.randint(100, 999)}"  # Generate random string for STRING columns
    elif "date" in data_type.lower():
        return (datetime.today() - timedelta(days=random.randint(0, 365))).strftime('%Y-%m-%d')  # Generate random date in YYYY-MM-DD format
    else:
        return None  # Default case for unsupported data types

# Step 3: Generate sampled IDs (efficiently without using collect)
sampled_df = df.select("extra_col_0").distinct().orderBy(F.rand()).limit(num_records_to_update)
sampled_ids_broadcast = sampled_df.rdd.map(lambda row: row["extra_col_0"]).collect()
sampled_ids_broadcast = set(sampled_ids_broadcast)  # Convert to set for faster lookup

# Broadcast the sampled IDs to all workers
broadcast_sampled_ids = spark.sparkContext.broadcast(sampled_ids_broadcast)

# Step 4: Combine updates into one transformation to avoid multiple `withColumn()` calls
# We will apply all updates in a single `select` operation
update_exprs = []

for col_name in columns:
    if col_name == "extra_col_0":  # Do not update the primary key
        update_exprs.append(F.col(col_name))  # Keep the original value of the primary key
        continue
    
    data_type = schema_dict[col_name]
    random_value = generate_random_value(data_type)
    
    if random_value is not None:
        # Create the update expression for the column
        update_exprs.append(
            F.when(F.col("extra_col_0").isin(broadcast_sampled_ids.value), F.lit(random_value).cast(schema_dict[col_name])).otherwise(F.col(col_name)).alias(col_name)
        )
    else:
        update_exprs.append(F.col(col_name))  # No update needed, retain original column value

# Apply the update using select with the generated expressions
update_df = df.select(*update_exprs)

# Step 5: Write back the updated DataFrame
update_df.write.mode("overwrite").saveAsTable(table_name)

# Step 6: Print final output
end_time = time.time()
print(f"✅ Successfully updated {num_records_to_update} records!")
print(f"⏳ Total time taken for update: {round(end_time - start_time, 2)} seconds.")


Total records before update: 100


Enter percentage of records to update:  10


Updating 10 records.
✅ Successfully updated 10 records!
⏳ Total time taken for update: 6.74 seconds.
