#### PySpark Configurations ####

In [1]:
# Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Build the SparkSession for this notebook (or reuse one that already exists)
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Ansh-Lamba-PySpark-Tutorial-Beginner")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/10 20:12:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Reading data from CSV & JSON files ####

In [3]:
# Root directory that holds the input data files loaded by the read cells below.
# This only defines the path string; nothing is created on disk here.
INPUT_DATA_ROOT = "/opt/spark-data/input/ansh-lamba"

In [4]:
# Read the CSV file with an inferred schema: the first row supplies the
# column names and Spark infers each column's type from the data
df_big_mart_sales = (
    spark.read.format("csv")
    .option("inferSchema", True)
    .option("header", True)
    .load(f"{INPUT_DATA_ROOT}/BigMart Sales.csv")
)

                                                                                

In [5]:
# Preview the first 5 rows without truncating long cell values
df_big_mart_sales.show(n=5, truncate=False)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDA15          |9.3        |Low Fat         |0.016047301    |Dairy                |249.8092|OUT049           |1999                     |Medium     |Tier 1              |Supermarket Type1|3735.138         |
|DRC01          |5.92       |Regular         |0.019278216    |Soft Drinks          |48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Su

In [6]:
# Read the JSON file.
# 'inferSchema' and 'header' are CSV reader options and are not set here:
# the JSON reader infers the schema itself when none is supplied, and JSON
# has no header row, so they had no effect on this read.
# multiLine=False means each line of the file is parsed as one JSON record.
df_drivers = (
    spark.read.format("json")
    .option("multiLine", False)
    .load(f"{INPUT_DATA_ROOT}/drivers.json")
)

In [7]:
# Preview the first 5 driver rows without truncating long cell values
df_drivers.show(n=5, truncate=False)

+----+----------+--------+----------+--------------------+-----------+------+----------------------------------------------+
|code|dob       |driverId|driverRef |name                |nationality|number|url                                           |
+----+----------+--------+----------+--------------------+-----------+------+----------------------------------------------+
|HAM |1985-01-07|1       |hamilton  |{Lewis, Hamilton}   |British    |44    |http://en.wikipedia.org/wiki/Lewis_Hamilton   |
|HEI |1977-05-10|2       |heidfeld  |{Nick, Heidfeld}    |German     |\N    |http://en.wikipedia.org/wiki/Nick_Heidfeld    |
|ROS |1985-06-27|3       |rosberg   |{Nico, Rosberg}     |German     |6     |http://en.wikipedia.org/wiki/Nico_Rosberg     |
|ALO |1981-07-29|4       |alonso    |{Fernando, Alonso}  |Spanish    |14    |http://en.wikipedia.org/wiki/Fernando_Alonso  |
|KOV |1981-10-19|5       |kovalainen|{Heikki, Kovalainen}|Finnish    |\N    |http://en.wikipedia.org/wiki/Heikki_Kovalainen|


#### DDL & StructType() schemas ####

In [8]:
# Print the column names, data types and nullability of the dataframe
# that was read with an inferred schema
df_big_mart_sales.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



In [9]:
# Create a DDL schema: a comma-separated list of "<column name> <SQL type>" pairs.
# Note that Item_Weight is declared STRING here, whereas the inferred schema
# printed above reports it as double; the declared type is the one that shows
# up in the schema of the dataframe read with this DDL string.
ddl_schema = """ 
                Item_Identifier STRING, 
                Item_Weight STRING, 
                Item_Fat_Content STRING, 
                Item_Visibility DOUBLE, 
                Item_Type STRING, 
                Item_MRP DOUBLE, 
                Outlet_Identifier STRING, 
                Outlet_Establishment_Year INT, 
                Outlet_Size STRING, 
                Outlet_Location_Type STRING, 
                Outlet_Type STRING, 
                Item_Outlet_Sales DOUBLE 
            """

In [10]:
# Read the CSV file again, applying the DDL schema instead of inferring types
df_big_mart_sales_ddl_schema = (
    spark.read.format("csv")
    .schema(ddl_schema)
    .option("header", True)
    .load(f"{INPUT_DATA_ROOT}/BigMart Sales.csv")
)

In [11]:
# Print the schema of the dataframe read with the DDL schema
# (Item_Weight is reported as string, as declared in ddl_schema)
df_big_mart_sales_ddl_schema.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: string (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



In [12]:
# Build the schema programmatically with StructType: one StructField per
# column, in file order, each declared as a nullable string
struct_type_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        'Item_Identifier',
        'Item_Weight',
        'Item_Fat_Content',
        'Item_Visibility',
        'Item_Type',
        'Item_MRP',
        'Outlet_Identifier',
        'Outlet_Establishment_Year',
        'Outlet_Size',
        'Outlet_Location_Type',
        'Outlet_Type',
        'Item_Outlet_Sales',
    )
])

In [13]:
# Read the CSV file once more, this time applying the StructType schema
df_big_mart_sales_struct_type_schema = (
    spark.read.format("csv")
    .schema(struct_type_schema)
    .option("header", True)
    .load(f"{INPUT_DATA_ROOT}/BigMart Sales.csv")
)

In [14]:
# Print the schema of the dataframe read with the StructType schema
# (every column is reported as a nullable string)
df_big_mart_sales_struct_type_schema.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: string (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: string (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: string (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: string (nullable = true)



#### Data Manipulations - SELECT, ALIAS, FILTER, WHERE, withColumnRenamed, withColumn, TypeCasting, Sort/OrderBy, LIMIT, DROP ####

In [15]:
# Project three columns, referring to them by name
(
    df_big_mart_sales
    .select('Item_Identifier', 'Item_Weight', 'Item_Fat_Content')
    .show(5, truncate=False)
)

+---------------+-----------+----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|
+---------------+-----------+----------------+
|FDA15          |9.3        |Low Fat         |
|DRC01          |5.92       |Regular         |
|FDN15          |17.5       |Low Fat         |
|FDX07          |19.2       |Regular         |
|NCD19          |8.93       |Low Fat         |
+---------------+-----------+----------------+
only showing top 5 rows



In [16]:
# Project the same three columns, this time as col() expressions
(
    df_big_mart_sales
    .select(
        col('Item_Identifier'),
        col('Item_Weight'),
        col('Item_Fat_Content'),
    )
    .show(5, truncate=False)
)

+---------------+-----------+----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|
+---------------+-----------+----------------+
|FDA15          |9.3        |Low Fat         |
|DRC01          |5.92       |Regular         |
|FDN15          |17.5       |Low Fat         |
|FDX07          |19.2       |Regular         |
|NCD19          |8.93       |Low Fat         |
+---------------+-----------+----------------+
only showing top 5 rows



In [17]:
# Project three columns and rename each one in the result with alias()
(
    df_big_mart_sales
    .select(
        col('Item_Identifier').alias('Item_ID'),
        col('Item_Weight').alias('Item_Wgt'),
        col('Item_Fat_Content').alias('Item_FC'),
    )
    .show(5, truncate=False)
)

+-------+--------+-------+
|Item_ID|Item_Wgt|Item_FC|
+-------+--------+-------+
|FDA15  |9.3     |Low Fat|
|DRC01  |5.92    |Regular|
|FDN15  |17.5    |Low Fat|
|FDX07  |19.2    |Regular|
|NCD19  |8.93    |Low Fat|
+-------+--------+-------+
only showing top 5 rows



In [18]:
# Keep only the rows whose Item_Fat_Content equals 'Regular'
(
    df_big_mart_sales
    .filter(col('Item_Fat_Content') == 'Regular')
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|DRC01          |5.92       |Regular         |0.019278216    |Soft Drinks          |48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Supermarket Type2|443.4228         |
|FDX07          |19.2       |Regular         |0.0            |Fruits and Vegetables|182.095 |OUT010           |1998                     |NULL       |Tier 3              |Gr

In [19]:
# Keep only the rows where Item_Type is 'Soft Drinks' AND Item_Weight is below 10.
# Each condition is parenthesised because & binds tighter than == and <.
(
    df_big_mart_sales
    .filter(
        (col('Item_Type') == 'Soft Drinks')
        & (col('Item_Weight') < 10)
    )
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type  |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|DRC01          |5.92       |Regular         |0.019278216    |Soft Drinks|48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Supermarket Type2|443.4228         |
|DRZ11          |8.85       |Regular         |0.113123893    |Soft Drinks|122.5388|OUT018           |2009                     |Medium     |Tier 3              |Supermarket Type2|1609.9044        |
|DRF49         

In [20]:
# Keep only the rows where Outlet_Location_Type is 'Tier 1' or 'Tier 2'
# AND Outlet_Size is NULL
(
    df_big_mart_sales
    .filter(
        (col('Outlet_Location_Type').isin('Tier 1', 'Tier 2'))
        & (col('Outlet_Size').isNull())
    )
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type         |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDH17          |16.2       |Regular         |0.016687114    |Frozen Foods      |96.9726 |OUT045           |2002                     |NULL       |Tier 2              |Supermarket Type1|1076.5986        |
|FDU28          |19.2       |Regular         |0.09444959     |Frozen Foods      |187.8214|OUT017           |2007                     |NULL       |Tier 2              |Supermarket Type1

In [21]:
# New dataframe in which the Item_Weight column is renamed to Item_Wgt
df_big_mart_sales_a = df_big_mart_sales.withColumnRenamed('Item_Weight', 'Item_Wgt')

In [22]:
# Preview the first 5 rows of the renamed dataframe
df_big_mart_sales_a.show(5, truncate=False)

+---------------+--------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Wgt|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+--------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDA15          |9.3     |Low Fat         |0.016047301    |Dairy                |249.8092|OUT049           |1999                     |Medium     |Tier 1              |Supermarket Type1|3735.138         |
|DRC01          |5.92    |Regular         |0.019278216    |Soft Drinks          |48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Supermarket Type2

In [23]:
# New dataframe with an extra Item_Cost column = Item_Weight * Item_MRP
df_big_mart_sales_b = df_big_mart_sales.withColumn(
    'Item_Cost',
    col('Item_Weight') * col('Item_MRP'),
)

In [24]:
# Preview the first 5 rows, including the new Item_Cost column
df_big_mart_sales_b.show(5, truncate=False)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+------------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|Item_Cost         |
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+------------------+
|FDA15          |9.3        |Low Fat         |0.016047301    |Dairy                |249.8092|OUT049           |1999                     |Medium     |Tier 1              |Supermarket Type1|3735.138         |2323.2255600000003|
|DRC01          |5.92       |Regular         |0.019278216    |Soft Drinks          |48.2692 |OUT

                                                                                

In [25]:
# New dataframe in which Item_Fat_Content is abbreviated:
# 'Regular' becomes 'REG' first, then 'Low Fat' becomes 'LF'
df_big_mart_sales_c = (
    df_big_mart_sales
    .withColumn(
        'Item_Fat_Content',
        regexp_replace('Item_Fat_Content', 'Regular', 'REG'),
    )
    .withColumn(
        'Item_Fat_Content',
        regexp_replace('Item_Fat_Content', 'Low Fat', 'LF'),
    )
)

In [26]:
# Preview the first 5 rows with the abbreviated Item_Fat_Content values
df_big_mart_sales_c.show(5, truncate=False)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDA15          |9.3        |LF              |0.016047301    |Dairy                |249.8092|OUT049           |1999                     |Medium     |Tier 1              |Supermarket Type1|3735.138         |
|DRC01          |5.92       |REG             |0.019278216    |Soft Drinks          |48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Su

In [27]:
# New dataframe in which Item_Weight is cast from double to string
df_big_mart_sales_d = df_big_mart_sales.withColumn(
    'Item_Weight',
    col('Item_Weight').cast('string'),
)

In [28]:
# Print the schema after the cast (Item_Weight is now reported as string)
df_big_mart_sales_d.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: string (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



In [29]:
# Order the rows by Item_Weight, largest first
(
    df_big_mart_sales
    .sort(col('Item_Weight').desc())
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDC02          |21.35      |Low Fat         |0.069102831    |Canned               |259.9278|OUT018           |2009                     |Medium     |Tier 3              |Supermarket Type2|6768.5228        |
|FDC02          |21.35      |Low Fat         |0.115194717    |Canned               |258.3278|OUT010           |1998                     |NULL       |Tier 3              |Gr

In [30]:
# Order the rows by Item_Visibility, smallest first
(
    df_big_mart_sales
    .sort(col('Item_Visibility').asc())
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDO23          |17.85      |Low Fat         |0.0            |Breads               |93.1436 |OUT045           |2002                     |NULL       |Tier 2              |Supermarket Type1|2174.5028        |
|FDY07          |11.8       |Low Fat         |0.0            |Fruits and Vegetables|45.5402 |OUT049           |1999                     |Medium     |Tier 1              |Su

In [31]:
# Order by two keys, both descending: Item_Weight first, then Item_Visibility
(
    df_big_mart_sales
    .sort(
        col('Item_Weight').desc(),
        col('Item_Visibility').desc(),
    )
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDR07          |21.35      |Low Fat         |0.130127365    |Fruits and Vegetables|96.2094 |OUT010           |1998                     |NULL       |Tier 3              |Grocery Store    |190.4188         |
|FDC02          |21.35      |Low Fat         |0.115194717    |Canned               |258.3278|OUT010           |1998                     |NULL       |Tier 3              |Gr

In [32]:
# Order by two keys: Item_Weight descending, then Item_Visibility ascending
(
    df_big_mart_sales
    .sort(
        col('Item_Weight').desc(),
        col('Item_Visibility').asc(),
    )
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDC02          |21.35      |Low Fat         |0.068765205    |Canned               |260.4278|OUT013           |1987                     |High       |Tier 3              |Supermarket Type1|3644.5892        |
|FDC02          |21.35      |Low Fat         |0.068809463    |Canned               |258.5278|OUT035           |2004                     |Small      |Tier 2              |Su

In [33]:
# Restrict the dataframe to its first 10 rows and display them
df_big_mart_sales.limit(10).show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Superma

In [34]:
# Display the dataframe without the Item_Visibility column
(
    df_big_mart_sales
    .drop(col('Item_Visibility'))
    .show(5, truncate=False)
)

+---------------+-----------+----------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+-----------+----------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDA15          |9.3        |Low Fat         |Dairy                |249.8092|OUT049           |1999                     |Medium     |Tier 1              |Supermarket Type1|3735.138         |
|DRC01          |5.92       |Regular         |Soft Drinks          |48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Supermarket Type2|443.4228         |
|FDN15          |17.5       |Low Fat         

In [35]:
# Display the dataframe without the Item_Weight and Item_Visibility columns
(
    df_big_mart_sales
    .drop('Item_Weight', 'Item_Visibility')
    .show(5, truncate=False)
)

+---------------+----------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Fat_Content|Item_Type            |Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type      |Item_Outlet_Sales|
+---------------+----------------+---------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|FDA15          |Low Fat         |Dairy                |249.8092|OUT049           |1999                     |Medium     |Tier 1              |Supermarket Type1|3735.138         |
|DRC01          |Regular         |Soft Drinks          |48.2692 |OUT018           |2009                     |Medium     |Tier 3              |Supermarket Type2|443.4228         |
|FDN15          |Low Fat         |Meat                 |141.618 |OUT049           |1999                  