In [1]:
from pyspark.sql import SparkSession

# Initialize the SparkSession (step 1)
spark = SparkSession.builder.appName("records").getOrCreate()


25/02/28 05:38:03 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [17]:
spark.sql("SELECT COUNT(*) AS total_records FROM demo.nyc.taxis_10000_50COLUMNS;").show()

+-------------+
|total_records|
+-------------+
|        10000|
+-------------+



In [15]:
import time

# Function to measure execution time for a query and return the result along with time taken
def run_query(query):
    start_time = time.time()  # Start the timer
    result = spark.sql(query)  # Run the Spark SQL query
    end_time = time.time()  # End the timer
    
    execution_time = end_time - start_time  # Time taken for the query to execute
    return result, execution_time

# Initialize total execution time
total_execution_time = 0

# 1. Repartition and Cache taxis_10000_50COLUMNS DataFrame
df_10000 = spark.table("demo.nyc.taxis_10000_50COLUMNS")
df_10000_repartitioned = df_10000.repartition("extra_col_3", "extra_col_1")  # Repartition based on two columns
df_10000_repartitioned.cache()  # Cache the DataFrame

# Query on the repartitioned DataFrame
query_1 = "SELECT COUNT(*) AS total_records FROM demo.nyc.taxis_10000_50COLUMNS"
result_1, time_1 = run_query(query_1)
total_execution_time += time_1
result_1.show()

# At the end, print the total execution time for this query
print(f"Execution time for taxis_10000_50COLUMNS: {total_execution_time:.4f} seconds")


25/02/28 06:09:30 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------------+
|total_records|
+-------------+
|        10000|
+-------------+

Execution time for taxis_10000_50COLUMNS: 0.0183 seconds


In [16]:
import time

# Function to measure execution time for a query and return the result along with time taken
def run_query(query):
    start_time = time.time()  # Start the timer
    result = spark.sql(query)  # Run the Spark SQL query
    end_time = time.time()  # End the timer
    
    execution_time = end_time - start_time  # Time taken for the query to execute
    return result, execution_time

# Initialize total execution time
total_execution_time = 0

# Load the table into a DataFrame
df_10000 = spark.table("demo.nyc.taxis_10000_50COLUMNS")

# 1. Repartition the DataFrame by multiple columns (for complex partitioning)
# Example partitioning by two columns: "extra_col_3" (date) and "extra_col_1" (int)
df_10000_repartitioned = df_10000.repartition("extra_col_3", "extra_col_1")  # Repartition based on two columns
df_10000_repartitioned.cache()  # Cache the DataFrame for optimized access

# 2. Filter data based on a condition (complex filtering)
# Example: Filter rows where "extra_col_5" (int) > 100
df_10000_filtered = df_10000_repartitioned.filter("extra_col_5 > 100")
df_10000_filtered.cache()  # Cache after filtering

# 3. Group by and aggregate the data
# Example: Group by "extra_col_3" and calculate the sum of "extra_col_5"
df_10000_grouped = df_10000_filtered.groupBy("extra_col_3").agg({"extra_col_5": "sum"})
df_10000_grouped.cache()  # Cache after aggregation

# 4. Perform a complex join (Join with the same table for demonstration)
# Example: Join on "extra_col_3" (date) column
df_10000_joined = df_10000_repartitioned.alias("df1").join(df_10000_repartitioned.alias("df2"), "extra_col_3")
df_10000_joined.cache()  # Cache after join

# 5. Run a query on the original DataFrame (before repartitioning)
query_1 = "SELECT COUNT(*) AS total_records FROM demo.nyc.taxis_10000_50COLUMNS"
result_1, time_1 = run_query(query_1)
total_execution_time += time_1
result_1.show()

# 6. Run a query on the repartitioned and filtered DataFrame
query_2 = "SELECT COUNT(*) AS filtered_records FROM demo.nyc.taxis_10000_50COLUMNS WHERE extra_col_5 > 100"
result_2, time_2 = run_query(query_2)
total_execution_time += time_2
result_2.show()

# 7. Run a query on the grouped DataFrame (aggregated data)
query_3 = "SELECT extra_col_3, SUM(extra_col_5) AS total_sum FROM demo.nyc.taxis_10000_50COLUMNS GROUP BY extra_col_3"
result_3, time_3 = run_query(query_3)
total_execution_time += time_3
result_3.show()

# 8. Run a query on the joined DataFrame
query_4 = "SELECT COUNT(*) AS total_records_joined FROM demo.nyc.taxis_10000_50COLUMNS df1 JOIN demo.nyc.taxis_10000_50COLUMNS df2 ON df1.extra_col_3 = df2.extra_col_3"
result_4, time_4 = run_query(query_4)
total_execution_time += time_4
result_4.show()

# At the end, print the total execution time for all queries
print(f"Total execution time for all operations: {total_execution_time:.4f} seconds")


25/02/28 06:11:35 WARN CacheManager: Asked to cache already cached data.


+-------------+
|total_records|
+-------------+
|        10000|
+-------------+



                                                                                

+----------------+
|filtered_records|
+----------------+
|            9913|
+----------------+

+-----------+---------+
|extra_col_3|total_sum|
+-----------+---------+
| 2015-05-19|    26442|
| 2017-08-11|    19705|
| 2022-03-28|    27523|
| 2025-02-16|    27962|
| 2021-12-18|    13786|
| 2015-03-09|     2579|
| 2016-03-01|    14604|
| 2021-06-22|    12391|
| 2018-08-10|    18876|
| 2023-07-15|    14366|
| 2021-08-27|    11011|
| 2023-06-22|     8917|
| 2019-06-04|    21620|
| 2021-11-13|     6562|
| 2021-10-11|     9659|
| 2020-08-24|    11171|
| 2017-09-11|     8514|
| 2018-05-28|     3131|
| 2021-01-27|     7688|
| 2019-05-08|     6894|
+-----------+---------+
only showing top 20 rows

+--------------------+
|total_records_joined|
+--------------------+
|               37712|
+--------------------+

Total execution time for all operations: 0.0714 seconds
