### Importing necessary libraries

In [501]:
import pyspark
import os,pandas
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate()
input_df = spark.read.csv('BigMart_Sales.csv',header=True)
input_df.printSchema()
input_df.show(n=1,vertical=True)

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: string (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: string (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: string (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: string (nullable = true)

-RECORD 0--------------------------------------
 Item_Identifier           | FDA15             
 Item_Weight               | 9.3               
 Item_Fat_Content          | Low Fat           
 Item_Visibility           | 0.016047301       
 Item_Type                 | Dairy             
 Item_MRP                  | 249.8092          
 Outlet_Identifier         | OUT049            
 Outlet_Establishment_Year | 1999              
 Ou

### Renaming a column in a df

In [502]:
# Fetch the column and apply name change to the column
input_df.select(col('Item_Identifier').alias('Item_ID')).show(n=5)
input_df.show(n=1,vertical=True)

+-------+
|Item_ID|
+-------+
|  FDA15|
|  DRC01|
|  FDN15|
|  FDX07|
|  NCD19|
+-------+
only showing top 5 rows
-RECORD 0--------------------------------------
 Item_Identifier           | FDA15             
 Item_Weight               | 9.3               
 Item_Fat_Content          | Low Fat           
 Item_Visibility           | 0.016047301       
 Item_Type                 | Dairy             
 Item_MRP                  | 249.8092          
 Outlet_Identifier         | OUT049            
 Outlet_Establishment_Year | 1999              
 Outlet_Size               | Medium            
 Outlet_Location_Type      | Tier 1            
 Outlet_Type               | Supermarket Type1 
 Item_Outlet_Sales         | 3735.138          
only showing top 1 row


#### withColumnRenamed method

In [503]:
# if you want to make changes to one column name and reflect the df as it is
df = input_df.withColumnRenamed('Item_Identifier','Item_ID') # it'll change the column name and returns entire df
df.printSchema()

root
 |-- Item_ID: string (nullable = true)
 |-- Item_Weight: string (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: string (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: string (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: string (nullable = true)



### No.of rows present in the df

In [504]:
# command to print no.of rows present in the df
print(df.count())

8523


### Filtering the dataframe based on a condition
#### filter() 

In [505]:
# filtering the entire df with one condition
df.filter(col('Item_Fat_Content')=='Low Fat').toPandas().head()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
2,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
3,FDP10,,Low Fat,0.127469857,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
4,FDY07,11.8,Low Fat,0.0,Fruits and Vegetables,45.5402,OUT049,1999,Medium,Tier 1,Supermarket Type1,1516.0266


In [506]:
# filtering the entire df with satisfying two conditions
df.filter((col('Item_Fat_Content')=='Low Fat') & (col('Item_Type')=='Meat')).toPandas().head()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
1,FDK43,9.8,Low Fat,0.02681843,Meat,126.002,OUT013,1987,High,Tier 3,Supermarket Type1,2150.534
2,FDH19,19.35,Low Fat,0.033082215,Meat,172.5738,OUT035,2004,Small,Tier 2,Supermarket Type1,4865.6664
3,FDN27,20.85,Low Fat,0.039624006,Meat,117.2808,OUT049,1999,Medium,Tier 1,Supermarket Type1,1523.3504
4,FDV39,11.3,Low Fat,0.007294652,Meat,198.1426,OUT045,2002,,Tier 2,Supermarket Type1,988.713


#### isNull() & isin()

In [507]:
# filtering the entire df with satisfying two conditions 
# satisfying 1. Outlet_Size columns containing Null values 2. Outlet_Location_Type containing either Tier 1 or Tier 2
df.filter((col('Outlet_Size').isNull()) & (col('Outlet_Location_Type').isin('Tier 1','Tier 2'))).toPandas().head()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDH17,16.2,Regular,0.016687114,Frozen Foods,96.9726,OUT045,2002,,Tier 2,Supermarket Type1,1076.5986
1,FDU28,19.2,Regular,0.09444959,Frozen Foods,187.8214,OUT017,2007,,Tier 2,Supermarket Type1,4710.535
2,NCD06,13.0,Low Fat,0.099887103,Household,45.906,OUT017,2007,,Tier 2,Supermarket Type1,838.908
3,FDO23,17.85,Low Fat,0.0,Breads,93.1436,OUT045,2002,,Tier 2,Supermarket Type1,2174.5028
4,NCP05,19.6,Low Fat,0.0,Health and Hygiene,153.3024,OUT045,2002,,Tier 2,Supermarket Type1,2428.8384


### Creating new columns and make changes to the existing data
#### withColumn()

In [508]:
# create a new column named 'flag' contains all values as 'new'
# make sure if you want to reflect the changes back to the df use df = df.function()
df = df.withColumn('flag',lit('new'))
df.toPandas().head()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,flag
0,FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,new
1,DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,new
2,FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,new
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,new
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,new


In [509]:
'''AnalysisException: [DATATYPE_MISMATCH.BINARY_OP_WRONG_TYPE] Cannot resolve "(Item_weight * Item_MRP)" 
due to data type mismatch: the binary operator requires the input type "NUMERIC", not "STRING". SQLSTATE: 42K09;
'''
df.printSchema() # to print current data type of the column

root
 |-- Item_ID: string (nullable = true)
 |-- Item_Weight: string (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: string (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: string (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: string (nullable = true)
 |-- flag: string (nullable = false)



In [510]:
# Case 1 : we can change the data type of a column using schme based
my_ddl_schema= '''
        Item_ID STRING,
        Item_Weight DOUBLE,
        Item_Fat_Content STRING,
        Item_Visibility STRING,
        Item_Type STRING,
        Item_MRP STRING,
        Outlet_Identifier STRING,
        Outlet_Establishment_Year INT,
        Outlet_Size STRING,
        Outlet_Location_Type STRING,
        Outlet_Type STRING,
        Item_Outlet_Sales STRING
'''
df= spark.read.csv('BigMart_Sales.csv',schema=my_ddl_schema,header=True)
df.printSchema()

root
 |-- Item_ID: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: string (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: string (nullable = true)



#### cast()

In [511]:
# Case 2: we can use cast() function to change the data type of a df
df = df.withColumn('Item_MRP',col('Item_weight').cast(DoubleType()))
df.printSchema()
df = df.withColumn('Item_Outlet_Sales',col('Item_Outlet_Sales').cast(DoubleType()))
df.printSchema()

root
 |-- Item_ID: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: string (nullable = true)

root
 |-- Item_ID: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: st

In [512]:
# create a new column of 'Total_cost' which contains values of multiplication of Item_weight and Item_MRP for each Item_ID
df = df.withColumn('Total_cost',col('Item_weight')*col('Item_MRP'))
df.toPandas().head()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Total_cost
0,FDA15,9.3,Low Fat,0.016047301,Dairy,9.3,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,86.49
1,DRC01,5.92,Regular,0.019278216,Soft Drinks,5.92,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,35.0464
2,FDN15,17.5,Low Fat,0.016760075,Meat,17.5,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,306.25
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,19.2,OUT010,1998,,Tier 3,Grocery Store,732.38,368.64
4,NCD19,8.93,Low Fat,0.0,Household,8.93,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,79.7449


#### make changes to existing data/changing columnar values
#### regexp_replace()

In [513]:
# In order to make changes to a column update the same column name and use regexp_replace function will be used to alter the contents
df = df.withColumn('Item_Fat_Content',regexp_replace(col('Item_Fat_Content'),'Low Fat','LF'))\
    .withColumn('Item_Fat_Content',regexp_replace(col('Item_Fat_Content'),'Regular','Reg'))
df.toPandas().head()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Total_cost
0,FDA15,9.3,LF,0.016047301,Dairy,9.3,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,86.49
1,DRC01,5.92,Reg,0.019278216,Soft Drinks,5.92,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,35.0464
2,FDN15,17.5,LF,0.016760075,Meat,17.5,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,306.25
3,FDX07,19.2,Reg,0.0,Fruits and Vegetables,19.2,OUT010,1998,,Tier 3,Grocery Store,732.38,368.64
4,NCD19,8.93,LF,0.0,Household,8.93,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,79.7449


### Sorting the df based on a Column
#### sort()

In [514]:
# sorting the df based on a column Item_weight in descending order
df.sort(col('Item_Weight').desc()).toPandas().head()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Total_cost
0,FDC02,21.35,LF,0.069102831,Canned,21.35,OUT018,2009,Medium,Tier 3,Supermarket Type2,6768.5228,455.8225
1,FDC02,21.35,LF,0.115194717,Canned,21.35,OUT010,1998,,Tier 3,Grocery Store,520.6556,455.8225
2,FDC02,21.35,LF,0.068809463,Canned,21.35,OUT035,2004,Small,Tier 2,Supermarket Type1,5206.556,455.8225
3,FDC02,21.35,LF,0.068765205,Canned,21.35,OUT013,1987,High,Tier 3,Supermarket Type1,3644.5892,455.8225
4,FDR07,21.35,LF,0.130127365,Fruits and Vegetables,21.35,OUT010,1998,,Tier 3,Grocery Store,190.4188,455.8225


In [515]:
# sorting the df based on a column Item_weight in ascending order
df.sort(col('Outlet_Establishment_Year').asc()).toPandas().head()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Total_cost
0,FDP10,,LF,0.127469857,Snack Foods,,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636,
1,DRI11,,LF,0.034237682,Hard Drinks,,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668,
2,FDW12,,Reg,0.035399923,Baking Goods,,OUT027,1985,Medium,Tier 3,Supermarket Type3,4064.0432,
3,FDC37,,LF,0.057556998,Baking Goods,,OUT019,1985,Small,Tier 1,Grocery Store,214.3876,
4,FDC14,,Reg,0.072221801,Canned,,OUT019,1985,Small,Tier 1,Grocery Store,125.8362,


### finding null values in each column
#### pandas: count()

In [516]:
# command to view null values in a df using count function from pandas
df.toPandas().count()

Item_ID                      8523
Item_Weight                  7060
Item_Fat_Content             8523
Item_Visibility              8523
Item_Type                    8523
Item_MRP                     7060
Outlet_Identifier            8523
Outlet_Establishment_Year    8523
Outlet_Size                  6113
Outlet_Location_Type         8523
Outlet_Type                  8523
Item_Outlet_Sales            8523
Total_cost                   7060
dtype: int64

In [517]:
# Sorting both columns Item_Weight and Outlet_Establishment_Year at a time in descending order
df.sort(['Item_Weight','Outlet_Establishment_Year'],ascending=[0,0]).toPandas().head()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Total_cost
0,FDC02,21.35,LF,0.069102831,Canned,21.35,OUT018,2009,Medium,Tier 3,Supermarket Type2,6768.5228,455.8225
1,FDR07,21.35,LF,0.078060605,Fruits and Vegetables,21.35,OUT018,2009,Medium,Tier 3,Supermarket Type2,380.8376,455.8225
2,FDC02,21.35,LF,0.068809463,Canned,21.35,OUT035,2004,Small,Tier 2,Supermarket Type1,5206.556,455.8225
3,FDC02,21.35,LF,0.115194717,Canned,21.35,OUT010,1998,,Tier 3,Grocery Store,520.6556,455.8225
4,FDR07,21.35,LF,0.130127365,Fruits and Vegetables,21.35,OUT010,1998,,Tier 3,Grocery Store,190.4188,455.8225


In [518]:
# Sorting both columns Item_Weight, Item_MRP in descending and Outlet_Establishment_Year in ascending order at a time 
df.sort(['Item_Weight','Item_MRP','Outlet_Establishment_Year'],ascending=[0,0,1]).toPandas().head()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Total_cost
0,FDC02,21.35,LF,0.068765205,Canned,21.35,OUT013,1987,High,Tier 3,Supermarket Type1,3644.5892,455.8225
1,FDC02,21.35,LF,0.068822477,Canned,21.35,OUT046,1997,Small,Tier 1,Supermarket Type1,7028.8506,455.8225
2,FDC02,21.35,LF,0.115194717,Canned,21.35,OUT010,1998,,Tier 3,Grocery Store,520.6556,455.8225
3,FDR07,21.35,LF,0.130127365,Fruits and Vegetables,21.35,OUT010,1998,,Tier 3,Grocery Store,190.4188,455.8225
4,FDC02,21.35,LF,0.068809463,Canned,21.35,OUT035,2004,Small,Tier 2,Supermarket Type1,5206.556,455.8225


### Printing limited number of columns
#### limit()

In [519]:
df.limit(5).toPandas()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Total_cost
0,FDA15,9.3,LF,0.016047301,Dairy,9.3,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,86.49
1,DRC01,5.92,Reg,0.019278216,Soft Drinks,5.92,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,35.0464
2,FDN15,17.5,LF,0.016760075,Meat,17.5,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,306.25
3,FDX07,19.2,Reg,0.0,Fruits and Vegetables,19.2,OUT010,1998,,Tier 3,Grocery Store,732.38,368.64
4,NCD19,8.93,LF,0.0,Household,8.93,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,79.7449


### Drop Columns
#### drop()

In [520]:
# dropping single column Total_cost
df.printSchema()
df = df.drop('Total_cost') 
df.printSchema()

root
 |-- Item_ID: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)
 |-- Total_cost: double (nullable = true)

root
 |-- Item_ID: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (null

In [521]:
df = df.withColumn('Test',lit('sample'))
df = df.withColumn('Total_cost',col('Item_Weight')*col('Item_MRP'))
df.printSchema()
# deleting multiple columns at a time
df = df.drop('Total_cost','Test')
df.printSchema()

root
 |-- Item_ID: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)
 |-- Test: string (nullable = false)
 |-- Total_cost: double (nullable = true)

root
 |-- Item_ID: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable =

### Removing Duplicate rows
#### distinct()

In [522]:
print(df.count())
# used to show all unique (distinct) rows in a DataFrame
df = df.distinct()
df.count()

8523


8523

### Dropping Duplicate values in a column

#### dropDuplicates()

In [523]:
# pre check the count of distinct values present in the df
df.select('Item_Type').toPandas().value_counts()

Item_Type            
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: count, dtype: int64

In [524]:
# drop duplicates in a particular column
df = df.dropDuplicates(subset=['Item_Type'])
# post check the count of distinct values present in the df 
df.select('Item_Type').toPandas().value_counts()

Item_Type            
Baking Goods             1
Breads                   1
Breakfast                1
Canned                   1
Dairy                    1
Frozen Foods             1
Fruits and Vegetables    1
Hard Drinks              1
Health and Hygiene       1
Household                1
Meat                     1
Others                   1
Seafood                  1
Snack Foods              1
Soft Drinks              1
Starchy Foods            1
Name: count, dtype: int64

In [525]:
# drop duplicate values in the entire df
df = df.dropDuplicates()
df.count()

16

### Finding Null values present in the DF

In [526]:
print(df.toPandas().count())
print(df.count())
df.filter(col('Item_weight').isNull()==True).filter(col('Item_MRP').isNull()==True).toPandas()

Item_ID                      16
Item_Weight                  13
Item_Fat_Content             16
Item_Visibility              16
Item_Type                    16
Item_MRP                     13
Outlet_Identifier            16
Outlet_Establishment_Year    16
Outlet_Size                  11
Outlet_Location_Type         16
Outlet_Type                  16
Item_Outlet_Sales            16
dtype: int64
16


Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDC14,,Reg,0.072221801,Canned,,OUT019,1985,Small,Tier 1,Grocery Store,125.8362
1,DRI11,,LF,0.034237682,Hard Drinks,,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668
2,FDG33,,Reg,0.13956116,Seafood,,OUT027,1985,Medium,Tier 3,Supermarket Type3,3435.528


### drop all Null/None values present in the DF
#### dropna()

In [527]:
# like dropDuplicates() we can use dropna() to remove rows having null values in a column
df.dropna(subset=['outlet_size']).toPandas().count()

Item_ID                      11
Item_Weight                   8
Item_Fat_Content             11
Item_Visibility              11
Item_Type                    11
Item_MRP                      8
Outlet_Identifier            11
Outlet_Establishment_Year    11
Outlet_Size                  11
Outlet_Location_Type         11
Outlet_Type                  11
Item_Outlet_Sales            11
dtype: int64

In [528]:
# if we run without specifying columns if we run dropna() it'll remove all the null values in the df
df.dropna().toPandas().count()

Item_ID                      8
Item_Weight                  8
Item_Fat_Content             8
Item_Visibility              8
Item_Type                    8
Item_MRP                     8
Outlet_Identifier            8
Outlet_Establishment_Year    8
Outlet_Size                  8
Outlet_Location_Type         8
Outlet_Type                  8
Item_Outlet_Sales            8
dtype: int64

In [531]:
# dropping rows containing all the Null values
df.dropna('all').toPandas().count()

Item_ID                      16
Item_Weight                  13
Item_Fat_Content             16
Item_Visibility              16
Item_Type                    16
Item_MRP                     13
Outlet_Identifier            16
Outlet_Establishment_Year    16
Outlet_Size                  11
Outlet_Location_Type         16
Outlet_Type                  16
Item_Outlet_Sales            16
dtype: int64

In [532]:
# dropping rows any column containing the Null values
df.dropna('any').toPandas().count()

Item_ID                      8
Item_Weight                  8
Item_Fat_Content             8
Item_Visibility              8
Item_Type                    8
Item_MRP                     8
Outlet_Identifier            8
Outlet_Establishment_Year    8
Outlet_Size                  8
Outlet_Location_Type         8
Outlet_Type                  8
Item_Outlet_Sales            8
dtype: int64

### Filling Null Values
#### fillna()

In [547]:
# filling null values present in any interger column Item_Weight with 23
df.fillna(23,subset=['Item_Weight']).toPandas().tail()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
11,FDX07,19.2,Reg,0.0,Fruits and Vegetables,19.2,OUT010,1998,,Tier 3,Grocery Store,732.38
12,DRI11,23.0,LF,0.034237682,Hard Drinks,,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668
13,NCN07,18.5,LF,0.056816465,Others,18.5,OUT010,1998,,Tier 3,Grocery Store,263.6568
14,FDG33,23.0,Reg,0.13956116,Seafood,,OUT027,1985,Medium,Tier 3,Supermarket Type3,3435.528
15,FDH35,18.25,LF,0.0,Starchy Foods,18.25,OUT045,2002,,Tier 2,Supermarket Type1,4604.6728


In [549]:
# filling all null values present in the in the string data type columns at a time with NotAvailable
df.fillna('Not_Available').toPandas().tail()

Unnamed: 0,Item_ID,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
11,FDX07,19.2,Reg,0.0,Fruits and Vegetables,19.2,OUT010,1998,Not_Available,Tier 3,Grocery Store,732.38
12,DRI11,,LF,0.034237682,Hard Drinks,,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668
13,NCN07,18.5,LF,0.056816465,Others,18.5,OUT010,1998,Not_Available,Tier 3,Grocery Store,263.6568
14,FDG33,,Reg,0.13956116,Seafood,,OUT027,1985,Medium,Tier 3,Supermarket Type3,3435.528
15,FDH35,18.25,LF,0.0,Starchy Foods,18.25,OUT045,2002,Not_Available,Tier 2,Supermarket Type1,4604.6728
