# COUNT(1) vs COUNT(*) vs COUNT(COL_NAME)

In [1]:
# Create Spark Session

from pyspark.sql import SparkSession

# Build the session, or reuse one that is already active in this kernel.
# "local[*]" is the master URL passed to the builder for a local run.
spark = (
    SparkSession.builder
    .appName("Count(1) vs Count(*)")
    .master("local[*]")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary.
spark

In [4]:
# Let's create a simple Python decorator - {get_time} - to get the execution timings
# If you don't know about Python decorators, check out: https://www.geeksforgeeks.org/decorators-in-python/
import time

def get_time(func):
    """Run ``func`` once at decoration time and print how long it took.

    The wrapped callable is executed immediately (with no arguments) when the
    decorator is applied, and a line of the form ``Execution time: <ms> ms``
    is printed. The original function is then returned, so the decorated name
    stays bound to a callable instead of being rebound to ``None``.

    Args:
        func: Zero-argument callable to time. Its return value is discarded.

    Returns:
        ``func`` itself, unmodified.
    """
    def inner_get_time() -> str:
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot go negative or jump if the system clock is adjusted.
        start_time = time.perf_counter()
        func()
        end_time = time.perf_counter()
        return (f"Execution time: {(end_time - start_time)*1000} ms")
    print(inner_get_time())
    # Hand the function back so the decorated name is still usable afterwards.
    return func

In [40]:
# Let's read the dataframe to check the data
# Load the sales CSV, taking column names from the header row.
# No schema is supplied here; the analyzed plans further down report
# trx_id as a string column.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("dataset/sales.csv")
)

# Preview the first rows as a text table.
df.show()

+--------------------+----------+-----------+--------------------+--------------------+----------+
|       transacted_at|    trx_id|retailer_id|         description|              amount|   city_id|
+--------------------+----------+-----------+--------------------+--------------------+----------+
|2017-11-24T19:00:...|1995601912| 2077350195|Walgreen       11-25|197.2300000000000...| 216510442|
|2017-11-24T19:00:...|1734117021|  644879053|unkn    ppd id: 7...|8.580000000000000000| 930259917|
|2017-11-24T19:00:...|1734117022|  847200066|Wal-Mart  ppd id:...|1737.260000000000...|1646415505|
|2017-11-24T19:00:...|1734117030| 1953761884|Home Depot     pp...|384.5000000000000...| 287177635|
|2017-11-24T19:00:...|1734117089| 1898522855| Target        11-25|66.33000000000000...|1855530529|
|2017-11-24T19:00:...|1734117117|  997626433|Sears  ppd id: 85...|298.8700000000000...| 957346984|
|2017-11-24T19:00:...|1734117123| 1953761884|unkn   ppd id: 15...|19.55000000000000...|  45522086|
|2017-11-2

In [30]:
# Get count(1) performance
from pyspark.sql.functions import lit, count

@get_time
def x():
    # count(lit(1)) counts a constant per row within each trx_id group.
    # The result goes to the "noop" sink, so the aggregation is executed
    # for timing without persisting any output.
    (
        df.groupBy("trx_id")
        .agg(count(lit(1)))
        .write.format("noop")
        .mode("overwrite")
        .save()
    )

Execution time: 2384.6654891967773 ms


In [31]:
# Get count(col_name) performance
@get_time
def x():
    # count("city_id") counts the city_id column within each trx_id group.
    # The result goes to the "noop" sink, so the aggregation is executed
    # for timing without persisting any output.
    (
        df.groupBy("trx_id")
        .agg(count("city_id"))
        .write.format("noop")
        .mode("overwrite")
        .save()
    )

Execution time: 2509.0878009796143 ms


In [32]:
# Get count(*) performance
@get_time
def x():
    # count("*") counts rows within each trx_id group.
    # The result goes to the "noop" sink, so the aggregation is executed
    # for timing without persisting any output.
    (
        df.groupBy("trx_id")
        .agg(count("*"))
        .write.format("noop")
        .mode("overwrite")
        .save()
    )

Execution time: 2393.6233520507812 ms


In [39]:
# Explain Plan for count(*)
# extended=True prints the parsed, analyzed and optimized logical plans
# in addition to the physical plan.
(
    df.groupBy("trx_id")
    .agg(count("*"))
    .explain(extended=True)
)

== Parsed Logical Plan ==
'Aggregate ['trx_id], ['trx_id, count(1) AS count(1)#512L]
+- Relation [transacted_at#17,trx_id#18,retailer_id#19,description#20,amount#21,city_id#22] csv

== Analyzed Logical Plan ==
trx_id: string, count(1): bigint
Aggregate [trx_id#18], [trx_id#18, count(1) AS count(1)#512L]
+- Relation [transacted_at#17,trx_id#18,retailer_id#19,description#20,amount#21,city_id#22] csv

== Optimized Logical Plan ==
Aggregate [trx_id#18], [trx_id#18, count(1) AS count(1)#512L]
+- Project [trx_id#18]
   +- Relation [transacted_at#17,trx_id#18,retailer_id#19,description#20,amount#21,city_id#22] csv

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[trx_id#18], functions=[count(1)], output=[trx_id#18, count(1)#512L])
   +- Exchange hashpartitioning(trx_id#18, 200), ENSURE_REQUIREMENTS, [id=#992]
      +- HashAggregate(keys=[trx_id#18], functions=[partial_count(1)], output=[trx_id#18, count#516L])
         +- FileScan csv [trx_id#18] Batched: false, 

In [38]:
# Explain Plan for count(1)
# extended=True prints the parsed, analyzed and optimized logical plans
# in addition to the physical plan.
(
    df.groupBy("trx_id")
    .agg(count(lit(1)))
    .explain(extended=True)
)

== Parsed Logical Plan ==
'Aggregate ['trx_id], ['trx_id, count(1) AS count(1)#500L]
+- Relation [transacted_at#17,trx_id#18,retailer_id#19,description#20,amount#21,city_id#22] csv

== Analyzed Logical Plan ==
trx_id: string, count(1): bigint
Aggregate [trx_id#18], [trx_id#18, count(1) AS count(1)#500L]
+- Relation [transacted_at#17,trx_id#18,retailer_id#19,description#20,amount#21,city_id#22] csv

== Optimized Logical Plan ==
Aggregate [trx_id#18], [trx_id#18, count(1) AS count(1)#500L]
+- Project [trx_id#18]
   +- Relation [transacted_at#17,trx_id#18,retailer_id#19,description#20,amount#21,city_id#22] csv

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[trx_id#18], functions=[count(1)], output=[trx_id#18, count(1)#500L])
   +- Exchange hashpartitioning(trx_id#18, 200), ENSURE_REQUIREMENTS, [id=#979]
      +- HashAggregate(keys=[trx_id#18], functions=[partial_count(1)], output=[trx_id#18, count#504L])
         +- FileScan csv [trx_id#18] Batched: false, 

In [37]:
# Explain plan with count(col_name)
# extended=True prints the parsed, analyzed and optimized logical plans
# in addition to the physical plan.
(
    df.groupBy("trx_id")
    .agg(count("city_id"))
    .explain(extended=True)
)

== Parsed Logical Plan ==
'Aggregate ['trx_id], ['trx_id, count('city_id) AS count(city_id)#488]
+- Relation [transacted_at#17,trx_id#18,retailer_id#19,description#20,amount#21,city_id#22] csv

== Analyzed Logical Plan ==
trx_id: string, count(city_id): bigint
Aggregate [trx_id#18], [trx_id#18, count(city_id#22) AS count(city_id)#488L]
+- Relation [transacted_at#17,trx_id#18,retailer_id#19,description#20,amount#21,city_id#22] csv

== Optimized Logical Plan ==
Aggregate [trx_id#18], [trx_id#18, count(city_id#22) AS count(city_id)#488L]
+- Project [trx_id#18, city_id#22]
   +- Relation [transacted_at#17,trx_id#18,retailer_id#19,description#20,amount#21,city_id#22] csv

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[trx_id#18], functions=[count(city_id#22)], output=[trx_id#18, count(city_id)#488L])
   +- Exchange hashpartitioning(trx_id#18, 200), ENSURE_REQUIREMENTS, [id=#966]
      +- HashAggregate(keys=[trx_id#18], functions=[partial_count(city_id#22)], o