In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

spark = SparkSession.builder.getOrCreate()

# Convert Excel → CSV first if not already done
csv_path = "abfss://4818cd05-d34c-4929-abfe-b2870c622eb9@onelake.dfs.fabric.microsoft.com/2501c47d-b4c5-407d-965f-9d790532bf28/Files/sales.xlsx"

# Load CSV in Spark
df_spark = spark.read.csv(csv_path, header=True, inferSchema=True)

# Add calculated column and group
df_spark = df_spark.withColumn("TotalSales", col("Quantity") * col("UnitPrice"))

result_spark = df_spark.groupBy("Item").agg(spark_sum("TotalSales").alias("TotalSales"))

result_spark.show(5)


StatementMeta(, 140ae661-8608-4a4a-a8f6-475b387fc2e0, 4, Finished, Available, Finished)

+--------------------+------------------+
|                Item|        TotalSales|
+--------------------+------------------+
|Mountain-200 Blac...| 844474.3533999989|
|Touring-1000 Yell...| 176421.1800000003|
|Touring-1000 Blue...|159732.69000000026|
|Short-Sleeve Clas...|11661.839999999962|
|Women's Mountain ...|10218.539999999975|
+--------------------+------------------+
only showing top 5 rows



1. Python (Pandas) Notebook Result

Load Time: ~3.6 seconds (for reading Excel).

GroupBy Time: ~0.04 seconds.

Data Processed: All in memory, so very fast for a 1.6 MB file.

Output: Direct aggregation (groupby Item → TotalSales).

👉 Works great for small-to-medium datasets (up to a few million rows, depending on memory).

🔹 2. Spark (PySpark) Notebook Result

Execution Time: ~9.7 seconds (job distributed over Spark engine).

Stages / Tasks: Multiple Spark jobs created → data read, transformations, aggregations.

Rows Processed: ~32K rows.

Output: Same aggregation as Python but executed in distributed mode.

👉 Spark has overhead (job scheduling, task distribution), so for small files it is slower than Pandas. But for large datasets (GBs/TBs), Spark will scale while Pandas will run out of memory.

In [3]:
from pyspark.sql import SparkSession
import warnings

try:
    spark = SparkSession.builder.getOrCreate()
except Exception as e:
    warnings.warn(f"SparkSession already exists or failed to create: {e}")


StatementMeta(, 140ae661-8608-4a4a-a8f6-475b387fc2e0, 5, Finished, Available, Finished)

In [4]:
# Spark Notebook: Load CSV and Compare

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum
import time
import psutil

# 1. Reuse SparkSession (Fabric-friendly)
try:
    spark = SparkSession.builder.getOrCreate()
    print("SparkSession ready")
except Exception as e:
    print(f"SparkSession failed to create: {e}")
    spark = None

if spark is None:
    raise SystemExit("Cannot proceed without SparkSession")

# 2. CSV Path in OneLake
csv_path = "abfss://4818cd05-d34c-4929-abfe-b2870c622eb9@onelake.dfs.fabric.microsoft.com/2501c47d-b4c5-407d-965f-9d790532bf28/Files/sales.xlsx"

# 3. Load CSV in Spark
start = time.time()
df_spark = spark.read.csv(csv_path, header=True, inferSchema=True)
load_time = time.time() - start

# 4. Row Count
rows = df_spark.count()

# 5. Memory & CPU Approximation (Python process)
process = psutil.Process()
mem_usage = process.memory_info().rss / (1024 ** 2)  # MB
cpu_usage = psutil.cpu_percent(interval=1)

# 6. Compute TotalSales & Aggregation
start = time.time()
df_spark = df_spark.withColumn("TotalSales", col("Quantity") * col("UnitPrice"))
agg_spark = df_spark.groupBy("Item").agg(spark_sum("TotalSales").alias("TotalSales"))
agg_time = time.time() - start

# 7. Show Top 5 results
print("\nSpark Aggregation (Top 5 Items by TotalSales):")
agg_spark.orderBy(col("TotalSales").desc()).show(5, truncate=False)

# 8. Print performance metrics
print(f"\nLoad Time: {load_time:.6f} seconds")
print(f"Aggregation Time: {agg_time:.6f} seconds")
print(f"Rows: {rows}")
print(f"Memory Usage: {mem_usage:.2f} MB (approx)")
print(f"CPU Usage: {cpu_usage:.2f}% (approx)")


StatementMeta(, 140ae661-8608-4a4a-a8f6-475b387fc2e0, 6, Finished, Available, Finished)

✅ SparkSession ready

Spark Aggregation (Top 5 Items by TotalSales):
+----------------+------------------+
|Item            |TotalSales        |
+----------------+------------------+
|Road-150 Red, 48|1205876.9900000044|
|Road-150 Red, 62|1202298.7200000044|
|Road-150 Red, 52|1080637.5400000038|
|Road-150 Red, 56|1055589.6500000036|
|Road-150 Red, 44|1005493.8700000035|
+----------------+------------------+
only showing top 5 rows


Load Time: 0.942679 seconds
Aggregation Time: 0.031350 seconds
Rows: 32718
Memory Usage: 304.12 MB (approx)
CPU Usage: 47.30% (approx)


| Metric                        | Python (Pandas)                                                                                          | Spark                                                                                      |
| ----------------------------- | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------ |
| **Load Time**                 | 2.86 sec                                                                                                 | 3.64 sec                                                                                   |
| **Aggregation Time**          | 0.0035 sec                                                                                               | 0.054 sec                                                                                  |
| **Total Rows**                | 32,718                                                                                                   | 32,718                                                                                     |
| **Memory Usage (approx)**     | 331.29 MB (0.323 GB)                                                                                     | 250.09 MB (0.244 GB)                                                                       |
| **CPU Usage (approx)**        | 3%                                                                                                       | 38.9%                                                                                      |
| **Top 5 Items by TotalSales** | AWC Logo Cap 9556.37<br>All-Purpose Bike Stand 21465.00<br>Bike Wash - Dissolver 3760.35<br>Classic Vest | Road-150 Red, 48 1205876.99<br>Road-150 Red, 62 1202298.72<br>Road-150 Red, 52 1080637.54… |
| **Estimated Cost**    | **\$0.149**                                                                                              | **\$0.189**                                                                                |
