#### PySpark Configurations ####

In [1]:
# Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Build the SparkSession for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")  # cluster master URL
    .appName("Ansh-Lamba-PySpark-Tutorial-Intermediate")
    .config("spark.ui.port", "4040")  # port requested for the Spark web UI
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/10 21:21:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Reading data from CSV & JSON files ####

In [3]:
# Base path under which the input data files are read.
# This only defines a path string; no directory is created on disk.
INPUT_DATA_ROOT = "/opt/spark-data/input/ansh-lamba"

In [4]:
# Load the BigMart sales CSV: the first line is treated as the header and
# column types are inferred from the data rather than declared up front
df_big_mart_sales = (
    spark.read
    .format("csv")
    .option("inferSchema", True)
    .option("header", True)
    .load(f"{INPUT_DATA_ROOT}/BigMart Sales.csv")
)

                                                                                

In [5]:
# Preview the first 5 rows; truncate=False prints full cell values instead of shortening them
df_big_mart_sales.show(5, truncate=False)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDA15          |9.3        |Low Fat         |0.016047301    |Dairy                |249.8092|OUT049           |1999                     |Medium     |Tier 1              |Supermarket Type1|3735.138         |
|DRC01          |5.92       |Regular         |0.019278216    |Soft Drinks          |48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Su

#### Data Manipulations - DISTINCT, DROP_DUPLICATES, UNION & UNION BY NAME, STRING FUNCTIONS (INITCAP, LOWER, UPPER), CURRENT_DATE, DATE_ADD, DATE_SUB, DATEDIFF, DATE_FORMAT ####

In [6]:
# Remove rows that are exact duplicates across every column, then preview 5 rows
(
    df_big_mart_sales
    .distinct()
    .show(5, truncate=False)
)

[Stage 3:>                                                          (0 + 1) / 1]

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDB57          |20.25      |Regular         |0.018801549    |Fruits and Vegetables|222.1772|OUT035           |2004                     |Small      |Tier 2              |Supermarket Type1|5559.43          |
|FDI27          |8.71       |Regular         |0.04605781     |Dairy                |43.8744 |OUT049           |1999                     |Medium     |Tier 1              |Su

                                                                                

In [7]:
# Keep a single row per distinct Item_Type value; which of the matching rows
# survives is decided by Spark, not by this code
(
    df_big_mart_sales
    .dropDuplicates(["Item_Type"])
    .show(5, truncate=False)
)

[Stage 6:>                                                          (0 + 1) / 1]

+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type   |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDP36          |10.395     |Regular         |0.0            |Baking Goods|51.4008 |OUT018           |2009                     |Medium     |Tier 3              |Supermarket Type2|556.6088         |
|FDO23          |17.85      |Low Fat         |0.0            |Breads      |93.1436 |OUT045           |2002                     |NULL       |Tier 2              |Supermarket Type1|2174.5028        |
|FDP49    

                                                                                

In [8]:
# Same de-duplication on Item_Type as the previous cell, this time through the
# snake_case drop_duplicates spelling with an explicit subset= keyword
(
    df_big_mart_sales
    .drop_duplicates(subset=["Item_Type"])
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type   |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDP36          |10.395     |Regular         |0.0            |Baking Goods|51.4008 |OUT018           |2009                     |Medium     |Tier 3              |Supermarket Type2|556.6088         |
|FDO23          |17.85      |Low Fat         |0.0            |Breads      |93.1436 |OUT045           |2002                     |NULL       |Tier 2              |Supermarket Type1|2174.5028        |
|FDP49    

In [9]:
# Sample rows for the union examples; each tuple is (Id, Name, Age, Gender, Salary).
# Line continuations are unnecessary inside brackets, so none are used.
siblings_male = [
    (1, "Kwaku Jude", 41, "M", 10_090.50),
    (2, "Yaw David", 37, "M", 9_001.10),
    (3, "Kofi Baffuor", 35, "M", 8_200.99),
]

siblings_female = [
    (4, "Abena Salo", 32, "F", 7_905.00),
    (5, "Abena Pat", 30, "F", 7_005.19),
]

In [10]:
# DDL-style schema string: column names and types, listed in the same order
# as the fields of the sample tuples
siblings_schema = 'Id INT, Name STRING, Age INT, Gender STRING, Salary DOUBLE'

In [11]:
# Turn each list of sample tuples into a DataFrame that uses the shared schema
df_siblings_male, df_siblings_female = (
    spark.createDataFrame(data=rows, schema=siblings_schema)
    for rows in (siblings_male, siblings_female)
)

In [12]:
# Display the male-siblings DataFrame (show() with no arguments prints at most 20 rows)
df_siblings_male.show()

                                                                                

+---+------------+---+------+-------+
| Id|        Name|Age|Gender| Salary|
+---+------------+---+------+-------+
|  1|  Kwaku Jude| 41|     M|10090.5|
|  2|   Yaw David| 37|     M| 9001.1|
|  3|Kofi Baffuor| 35|     M|8200.99|
+---+------------+---+------+-------+



In [13]:
# Stack the two DataFrames into one. union() pairs columns by position, which is
# safe here because both inputs were built with the same schema.
df_siblings = df_siblings_male.union(df_siblings_female)
# Alternative that pairs columns by name instead of by position:
# df_siblings = df_siblings_male.unionByName(df_siblings_female)

In [14]:
# Display the combined male + female siblings DataFrame
df_siblings.show()

+---+------------+---+------+-------+
| Id|        Name|Age|Gender| Salary|
+---+------------+---+------+-------+
|  1|  Kwaku Jude| 41|     M|10090.5|
|  2|   Yaw David| 37|     M| 9001.1|
|  3|Kofi Baffuor| 35|     M|8200.99|
|  4|  Abena Salo| 32|     F| 7905.0|
|  5|   Abena Pat| 30|     F|7005.19|
+---+------------+---+------+-------+



In [15]:
# Title-case Item_Type with initcap(): the first letter of every word is
# upper-cased (e.g. "Fruits and Vegetables" -> "Fruits And Vegetables").
# The alias names the transformation actually applied, so it cannot be
# confused with the upper() example further down.
(
    df_big_mart_sales
    .select(initcap("Item_Type").alias("Initcap_Item_Type"))
    .show(5, truncate=False)
)

+---------------------+
|Upper_Item_Type      |
+---------------------+
|Dairy                |
|Soft Drinks          |
|Meat                 |
|Fruits And Vegetables|
|Household            |
+---------------------+
only showing top 5 rows



In [16]:
# Lower-case every character of Item_Type with lower()
(
    df_big_mart_sales
    .select(lower("Item_Type").alias("Lower_Item_Type"))
    .show(5, truncate=False)
)

+---------------------+
|Lower_Item_Type      |
+---------------------+
|dairy                |
|soft drinks          |
|meat                 |
|fruits and vegetables|
|household            |
+---------------------+
only showing top 5 rows



In [17]:
# Upper-case every character of Item_Type with upper()
(
    df_big_mart_sales
    .select(upper("Item_Type").alias("Upper_Item_Type"))
    .show(5, truncate=False)
)

+---------------------+
|Upper_Item_Type      |
+---------------------+
|DAIRY                |
|SOFT DRINKS          |
|MEAT                 |
|FRUITS AND VEGETABLES|
|HOUSEHOLD            |
+---------------------+
only showing top 5 rows



In [18]:
# Add a single TodayDate column holding the current date; the result is
# assigned back to df_siblings, so later cells see the extra column
df_siblings = df_siblings.withColumn('TodayDate', current_date())

In [19]:
# Display df_siblings to confirm the TodayDate column was added
df_siblings.show()

+---+------------+---+------+-------+----------+
| Id|        Name|Age|Gender| Salary| TodayDate|
+---+------------+---+------+-------+----------+
|  1|  Kwaku Jude| 41|     M|10090.5|2026-01-10|
|  2|   Yaw David| 37|     M| 9001.1|2026-01-10|
|  3|Kofi Baffuor| 35|     M|8200.99|2026-01-10|
|  4|  Abena Salo| 32|     F| 7905.0|2026-01-10|
|  5|   Abena Pat| 30|     F|7005.19|2026-01-10|
+---+------------+---+------+-------+----------+



In [20]:
# Derive two columns from TodayDate: NextWeekDate is 7 days later and
# LastWeekDate is 7 days earlier
df_siblings = (
    df_siblings
    .withColumn("NextWeekDate", date_add("TodayDate", 7))
    .withColumn("LastWeekDate", date_sub("TodayDate", 7))
)

In [21]:
# Display df_siblings to confirm the NextWeekDate and LastWeekDate columns
df_siblings.show()

+---+------------+---+------+-------+----------+------------+------------+
| Id|        Name|Age|Gender| Salary| TodayDate|NextWeekDate|LastWeekDate|
+---+------------+---+------+-------+----------+------------+------------+
|  1|  Kwaku Jude| 41|     M|10090.5|2026-01-10|  2026-01-17|  2026-01-03|
|  2|   Yaw David| 37|     M| 9001.1|2026-01-10|  2026-01-17|  2026-01-03|
|  3|Kofi Baffuor| 35|     M|8200.99|2026-01-10|  2026-01-17|  2026-01-03|
|  4|  Abena Salo| 32|     F| 7905.0|2026-01-10|  2026-01-17|  2026-01-03|
|  5|   Abena Pat| 30|     F|7005.19|2026-01-10|  2026-01-17|  2026-01-03|
+---+------------+---+------+-------+----------+------------+------------+



In [22]:
# Number of days from LastWeekDate (start) to NextWeekDate (end); the date
# columns must still be date-typed when this cell runs
df_siblings = df_siblings.withColumn(
    "DaysDiff",
    datediff("NextWeekDate", "LastWeekDate"),
)

In [23]:
# Display df_siblings to confirm the DaysDiff column
df_siblings.show()

+---+------------+---+------+-------+----------+------------+------------+--------+
| Id|        Name|Age|Gender| Salary| TodayDate|NextWeekDate|LastWeekDate|DaysDiff|
+---+------------+---+------+-------+----------+------------+------------+--------+
|  1|  Kwaku Jude| 41|     M|10090.5|2026-01-10|  2026-01-17|  2026-01-03|      14|
|  2|   Yaw David| 37|     M| 9001.1|2026-01-10|  2026-01-17|  2026-01-03|      14|
|  3|Kofi Baffuor| 35|     M|8200.99|2026-01-10|  2026-01-17|  2026-01-03|      14|
|  4|  Abena Salo| 32|     F| 7905.0|2026-01-10|  2026-01-17|  2026-01-03|      14|
|  5|   Abena Pat| 30|     F|7005.19|2026-01-10|  2026-01-17|  2026-01-03|      14|
+---+------------+---+------+-------+----------+------------+------------+--------+



In [24]:
# Re-render the two derived dates as text, each in its own pattern.
# date_format() produces strings, so after this cell NextWeekDate and
# LastWeekDate are no longer date-typed; do not re-run date functions such as
# datediff() on them without restarting from the earlier cells.
df_siblings = (
    df_siblings
    .withColumn("NextWeekDate", date_format(col("NextWeekDate"), "dd-MM-yyyy"))
    .withColumn("LastWeekDate", date_format(col("LastWeekDate"), "dd.MM.yyyy"))
)

In [25]:
# Display df_siblings to confirm the reformatted NextWeekDate and LastWeekDate values
df_siblings.show()

+---+------------+---+------+-------+----------+------------+------------+--------+
| Id|        Name|Age|Gender| Salary| TodayDate|NextWeekDate|LastWeekDate|DaysDiff|
+---+------------+---+------+-------+----------+------------+------------+--------+
|  1|  Kwaku Jude| 41|     M|10090.5|2026-01-10|  17-01-2026|  03.01.2026|      14|
|  2|   Yaw David| 37|     M| 9001.1|2026-01-10|  17-01-2026|  03.01.2026|      14|
|  3|Kofi Baffuor| 35|     M|8200.99|2026-01-10|  17-01-2026|  03.01.2026|      14|
|  4|  Abena Salo| 32|     F| 7905.0|2026-01-10|  17-01-2026|  03.01.2026|      14|
|  5|   Abena Pat| 30|     F|7005.19|2026-01-10|  17-01-2026|  03.01.2026|      14|
+---+------------+---+------+-------+----------+------------+------------+--------+



#### Data Manipulations - Handling NULLS, SPLIT & INDEXING, EXPLODE, ARRAY_CONTAINS, GROUP BY ####

In [26]:
# Drop only the rows in which every column is NULL
(
    df_big_mart_sales
    .dropna(how="all")
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDA15          |9.3        |Low Fat         |0.016047301    |Dairy                |249.8092|OUT049           |1999                     |Medium     |Tier 1              |Supermarket Type1|3735.138         |
|DRC01          |5.92       |Regular         |0.019278216    |Soft Drinks          |48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Su

In [27]:
# Drop every row that has a NULL in at least one column
(
    df_big_mart_sales
    .dropna(how="any")
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type   |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDA15          |9.3        |Low Fat         |0.016047301    |Dairy       |249.8092|OUT049           |1999                     |Medium     |Tier 1              |Supermarket Type1|3735.138         |
|DRC01          |5.92       |Regular         |0.019278216    |Soft Drinks |48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Supermarket Type2|443.4228         |
|FDN15    

In [28]:
# Drop rows whose Outlet_Size is NULL; NULLs in other columns are not considered
(
    df_big_mart_sales
    .dropna(subset=["Outlet_Size"])
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type   |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDA15          |9.3        |Low Fat         |0.016047301    |Dairy       |249.8092|OUT049           |1999                     |Medium     |Tier 1              |Supermarket Type1|3735.138         |
|DRC01          |5.92       |Regular         |0.019278216    |Soft Drinks |48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Supermarket Type2|443.4228         |
|FDN15    

In [29]:
# Replace NULLs with the text 'N/A'. Because the fill value is a string, only
# string-typed columns are filled; NULLs in numeric columns are left as they are.
(
    df_big_mart_sales
    .fillna("N/A")
    .show(10, truncate=False)
)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDA15          |9.3        |Low Fat         |0.016047301    |Dairy                |249.8092|OUT049           |1999                     |Medium     |Tier 1              |Supermarket Type1|3735.138         |
|DRC01          |5.92       |Regular         |0.019278216    |Soft Drinks          |48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Su

In [30]:
# Break Outlet_Type into an array of words by splitting on single spaces
# (e.g. "Supermarket Type1" -> [Supermarket, Type1])
(
    df_big_mart_sales
    .withColumn("Outlet_Type", split("Outlet_Type", " "))
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+--------------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type         |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+--------------------+-----------------+
|FDA15          |9.3        |Low Fat         |0.016047301    |Dairy                |249.8092|OUT049           |1999                     |Medium     |Tier 1              |[Supermarket, Type1]|3735.138         |
|DRC01          |5.92       |Regular         |0.019278216    |Soft Drinks          |48.2692 |OUT018           |2009                     |Medium     |Tier 3     

In [31]:
# Split Outlet_Type on spaces and keep only the first piece (array index 0)
(
    df_big_mart_sales
    .withColumn("Outlet_Type", split("Outlet_Type", " ").getItem(0))
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|FDA15          |9.3        |Low Fat         |0.016047301    |Dairy                |249.8092|OUT049           |1999                     |Medium     |Tier 1              |Supermarket|3735.138         |
|DRC01          |5.92       |Regular         |0.019278216    |Soft Drinks          |48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Supermarket|443.4228      

In [32]:
# Narrow the sales DataFrame to the columns used by the explode example
df_explode = df_big_mart_sales.select(
    "Item_Identifier",
    "Item_Weight",
    "Item_Fat_Content",
    "Item_Visibility",
    "Item_MRP",
    "Outlet_Type",
    "Item_Outlet_Sales",
)

In [33]:
# Preview the first 5 rows of the narrowed DataFrame without truncating cell values
df_explode.show(5, truncate=False)

+---------------+-----------+----------------+---------------+--------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_MRP|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------+-----------------+-----------------+
|FDA15          |9.3        |Low Fat         |0.016047301    |249.8092|Supermarket Type1|3735.138         |
|DRC01          |5.92       |Regular         |0.019278216    |48.2692 |Supermarket Type2|443.4228         |
|FDN15          |17.5       |Low Fat         |0.016760075    |141.618 |Supermarket Type1|2097.27          |
|FDX07          |19.2       |Regular         |0.0            |182.095 |Grocery Store    |732.38           |
|NCD19          |8.93       |Low Fat         |0.0            |53.8614 |Supermarket Type1|994.7052         |
+---------------+-----------+----------------+---------------+--------+-----------------+-----------------+
only showing top 5 rows



In [34]:
# Split Outlet_Type into words and emit one output row per word; the values
# of the other columns are repeated on every generated row
(
    df_explode
    .withColumn("Outlet_Type", explode(split("Outlet_Type", " ")))
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+--------+-----------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_MRP|Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------+-----------+-----------------+
|FDA15          |9.3        |Low Fat         |0.016047301    |249.8092|Supermarket|3735.138         |
|FDA15          |9.3        |Low Fat         |0.016047301    |249.8092|Type1      |3735.138         |
|DRC01          |5.92       |Regular         |0.019278216    |48.2692 |Supermarket|443.4228         |
|DRC01          |5.92       |Regular         |0.019278216    |48.2692 |Type2      |443.4228         |
|FDN15          |17.5       |Low Fat         |0.016760075    |141.618 |Supermarket|2097.27          |
+---------------+-----------+----------------+---------------+--------+-----------+-----------------+
only showing top 5 rows



In [35]:
# Split Outlet_Type into an array of words, then add a boolean Type_1_Flag
# that is true when the array has an element equal to 'Type1'
(
    df_big_mart_sales
    .withColumn("Outlet_Type", split("Outlet_Type", " "))
    .withColumn("Type_1_Flag", array_contains("Outlet_Type", "Type1"))
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+--------------------+-----------------+-----------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type         |Item_Outlet_Sales|Type_1_Flag|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+--------------------+-----------------+-----------+
|FDA15          |9.3        |Low Fat         |0.016047301    |Dairy                |249.8092|OUT049           |1999                     |Medium     |Tier 1              |[Supermarket, Type1]|3735.138         |true       |
|DRC01          |5.92       |Regular         |0.019278216    |Soft Drinks          |48.2692 |OUT018           |2

In [36]:
# Sum Item_MRP within each Item_Type and label the result Total_Sales.
# `sum` here is the Spark aggregate pulled in by the wildcard
# pyspark.sql.functions import, not Python's builtin sum.
# NOTE(review): the total is taken over Item_MRP, not Item_Outlet_Sales;
# confirm that this is the intended "sales" measure.
(
    df_big_mart_sales
    .groupBy("Item_Type")
    .agg(sum("Item_MRP").alias("Total_Sales"))
    .show(5, truncate=False)
)

+---------------------+------------------+
|Item_Type            |Total_Sales       |
+---------------------+------------------+
|Starchy Foods        |21880.027399999995|
|Baking Goods         |81894.73640000001 |
|Breads               |35379.11979999999 |
|Fruits and Vegetables|178124.08099999998|
|Meat                 |59449.863799999956|
+---------------------+------------------+
only showing top 5 rows



In [37]:
# Average Item_MRP within each Item_Type, labelled Average_Sales.
# NOTE(review): the average is taken over Item_MRP, not Item_Outlet_Sales;
# confirm that this is the intended "sales" measure.
(
    df_big_mart_sales
    .groupBy("Item_Type")
    .agg(avg("Item_MRP").alias("Average_Sales"))
    .show(5, truncate=False)
)

+---------------------+------------------+
|Item_Type            |Average_Sales     |
+---------------------+------------------+
|Starchy Foods        |147.83802297297294|
|Baking Goods         |126.38076604938273|
|Breads               |140.9526685258964 |
|Fruits and Vegetables|144.58123457792206|
|Meat                 |139.88203247058814|
+---------------------+------------------+
only showing top 5 rows



In [38]:
# Group by the (Item_Fat_Content, Item_Type) pair and sum Item_MRP per group.
# NOTE(review): Item_Fat_Content carries differently spelled labels (the output
# shows 'Low Fat', 'low fat' and 'reg'), so what looks like one category is
# split across several groups. The sum is also over Item_MRP rather than
# Item_Outlet_Sales; confirm both are intended.
(
    df_big_mart_sales
    .groupBy("Item_Fat_Content", "Item_Type")
    .agg(sum("Item_MRP").alias("Total_Sales_Per_Item"))
    .show(5, truncate=False)
)

+----------------+------------------+--------------------+
|Item_Fat_Content|Item_Type         |Total_Sales_Per_Item|
+----------------+------------------+--------------------+
|Low Fat         |Breads            |17421.3834          |
|Low Fat         |Others            |20779.215400000005  |
|reg             |Frozen Foods      |2599.9516           |
|low fat         |Health and Hygiene|1055.5562           |
|reg             |Meat              |1281.236            |
+----------------+------------------+--------------------+
only showing top 5 rows



In [39]:
# Group by the (Item_Fat_Content, Item_Type) pair and compute two aggregates of
# Item_MRP in a single pass: the sum and the average.
# NOTE(review): groups are fragmented by inconsistent Item_Fat_Content labels
# (e.g. 'Low Fat' vs 'low fat', 'reg'), and both aggregates use Item_MRP rather
# than Item_Outlet_Sales; confirm both are intended.
(
    df_big_mart_sales
    .groupBy("Item_Fat_Content", "Item_Type")
    .agg(
        sum("Item_MRP").alias("Total_Sales_Per_Item"),
        avg("Item_MRP").alias("Average_Sale_Per_Item"),
    )
    .show(5, truncate=False)
)

+----------------+------------------+--------------------+---------------------+
|Item_Fat_Content|Item_Type         |Total_Sales_Per_Item|Average_Sale_Per_Item|
+----------------+------------------+--------------------+---------------------+
|Low Fat         |Breads            |17421.3834          |138.2649476190476    |
|Low Fat         |Others            |20779.215400000005  |133.20009871794875   |
|reg             |Frozen Foods      |2599.9516           |136.83955789473683   |
|low fat         |Health and Hygiene|1055.5562           |105.55562            |
|reg             |Meat              |1281.236            |183.0337142857143    |
+----------------+------------------+--------------------+---------------------+
only showing top 5 rows

