<a href="https://colab.research.google.com/github/shrishaameenaa-cmd/Data_Processing_Challenge/blob/main/23BCS160_In_Memory_Data_Processing_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark




In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col
import pandas as pd
import time


In [3]:
spark = SparkSession.builder.appName("InMemoryProcessingDemo").getOrCreate()


In [4]:
# Create a sample dataset with 1 million rows
data = [(i, i % 100, (i * 0.5) % 50) for i in range(1_000_000)]
columns = ["id", "category", "value"]

df = spark.createDataFrame(data, columns)
df.show(5)


+---+--------+-----+
| id|category|value|
+---+--------+-----+
|  0|       0|  0.0|
|  1|       1|  0.5|
|  2|       2|  1.0|
|  3|       3|  1.5|
|  4|       4|  2.0|
+---+--------+-----+
only showing top 5 rows



In [5]:
df.cache()  # Store DataFrame in memory


DataFrame[id: bigint, category: bigint, value: double]

In [6]:
start = time.time()

result = df.groupBy("category").agg(avg("value").alias("avg_value"))
result.show(10)

end = time.time()
print("⏱ In-Memory Execution Time:", round(end - start, 2), "seconds")


+--------+---------+
|category|avg_value|
+--------+---------+
|      26|     13.0|
|      29|     14.5|
|      65|     32.5|
|      19|      9.5|
|      54|     27.0|
|       0|      0.0|
|      22|     11.0|
|       7|      3.5|
|      77|     38.5|
|      34|     17.0|
+--------+---------+
only showing top 10 rows

⏱ In-Memory Execution Time: 10.72 seconds


In [7]:
df.unpersist()  # Remove from memory

start = time.time()
result = df.groupBy("category").agg(avg("value").alias("avg_value")).collect()
end = time.time()

print("⏱ Disk-Based Execution Time:", round(end - start, 2), "seconds")


⏱ Disk-Based Execution Time: 3.88 seconds


In [8]:
for batch in range(3):
    print(f"=== Batch {batch+1} ===")
    new_data = [(i + batch * 1000000, i % 100, (i * 0.3) % 50) for i in range(100000)]
    new_df = spark.createDataFrame(new_data, columns)
    new_df.cache()
    result = new_df.groupBy("category").agg(avg("value").alias("avg_value"))
    result.show(5)


=== Batch 1 ===
+--------+------------------+
|category|         avg_value|
+--------+------------------+
|      26| 27.79999999999945|
|      29|28.699999999999935|
|      65|              29.5|
|      19|25.699999999999935|
|      54|26.199999999999935|
+--------+------------------+
only showing top 5 rows

=== Batch 2 ===
+--------+------------------+
|category|         avg_value|
+--------+------------------+
|      26| 27.79999999999945|
|      29|28.699999999999935|
|      65|              29.5|
|      19|25.699999999999935|
|      54|26.199999999999935|
+--------+------------------+
only showing top 5 rows

=== Batch 3 ===
+--------+------------------+
|category|         avg_value|
+--------+------------------+
|      26| 27.79999999999945|
|      29|28.699999999999935|
|      65|              29.5|
|      19|25.699999999999935|
|      54|26.199999999999935|
+--------+------------------+
only showing top 5 rows

