#### PySpark GroupBy and Aggregate Functions
- A `groupBy` call is followed by an aggregate function (e.g. `sum`, `avg`, `count`)
- Aggregate functions can also be applied separately, without grouping, via `agg`

In [1]:
%load_ext watermark
%load_ext lab_black

In [2]:
from pyspark.sql import SparkSession

In [5]:
# Build the SparkSession used by every cell below (getOrCreate returns the
# session registered under this builder, creating it if needed).
spark = (
    SparkSession.builder
    .appName("Learning-spark")
    .getOrCreate()
)

In [6]:
# Load the tips CSV: the first row supplies the column names and the column
# types are inferred from the data instead of defaulting to strings.
df_pyspark = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("datasets/tips.csv")
)
# Preview the first five rows.
df_pyspark.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [7]:
# Inspect the inferred column types as a list of (column name, type) pairs.
df_pyspark.dtypes

[('total_bill', 'double'),
 ('tip', 'double'),
 ('sex', 'string'),
 ('smoker', 'string'),
 ('day', 'string'),
 ('time', 'string'),
 ('size', 'int')]

In [8]:
# The same information as a schema tree, which also reports nullability
# for each column.
df_pyspark.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [10]:
# Group by sex and sum every numeric column (total_bill, tip, size).
# The sum(tip) column is the total tip given per gender.
df_pyspark.groupBy("sex").sum().show()

+------+------------------+-----------------+---------+
|   sex|   sum(total_bill)|         sum(tip)|sum(size)|
+------+------------------+-----------------+---------+
|Female|1570.9499999999998|           246.51|      214|
|  Male|3256.8200000000024|485.0700000000001|      413|
+------+------------------+-----------------+---------+



In [11]:
# Group by sex and average every numeric column (total_bill, tip, size).
# The avg(tip) column is the average tip given per gender.
df_pyspark.groupBy("sex").avg().show()

+------+------------------+------------------+------------------+
|   sex|   avg(total_bill)|          avg(tip)|         avg(size)|
+------+------------------+------------------+------------------+
|Female|18.056896551724137| 2.833448275862069|2.4597701149425286|
|  Male|20.744076433121034|3.0896178343949052|2.6305732484076434|
+------+------------------+------------------+------------------+



In [18]:
# Count how many rows (bills) fall into each sex group.
df_pyspark.groupBy("sex").count().show()

+------+-----+
|   sex|count|
+------+-----+
|Female|   87|
|  Male|  157|
+------+-----+



In [14]:
# Aggregate without grouping: the dict maps a column name to the aggregate
# to apply, here the sum of all tips across the whole DataFrame.
df_pyspark.agg({"tip": "sum"}).show()

+--------+
|sum(tip)|
+--------+
|  731.58|
+--------+



In [16]:
# Aggregate without grouping: the sum of total_bill across the whole DataFrame.
df_pyspark.agg({"total_bill": "sum"}).show()

+-----------------+
|  sum(total_bill)|
+-----------------+
|4827.770000000001|
+-----------------+

