In [0]:
from pyspark.sql import functions as F

# Create parameter

In [0]:
# Register a text widget named 'storage_account' so the storage account can be
# passed in as a notebook parameter. The default '0' is presumably just a
# placeholder to be overridden by the caller -- TODO confirm.
dbutils.widgets.text(name='storage_account', defaultValue='0')

# Read data from bronze

In [0]:
# Storage account name, taken from the notebook widget of the same name.
storage_account = dbutils.widgets.get('storage_account')

# Date parts used to build the input folder path (hard-coded load date).
year, month, day = 2022, 8, 31

In [0]:
# Folder holding the raw Parquet files for the selected date in the `bronze`
# container. The date parts are interpolated as-is (no zero padding).
bronze_path = (
    f'abfss://bronze@{storage_account}.dfs.core.windows.net'
    f'/raw_data/{year}/{month}/{day}/'
)

# Parquet files carry their own schema, so the reader needs no inference
# option. `inferSchema` is an option of the CSV reader and is not a Parquet
# option, so it was dropped to avoid suggesting it influences the column types.
df = spark.read.format('parquet').load(bronze_path)

# Explore data

### Overview

In [0]:
# Render the DataFrame in the Databricks table viewer for a first look at the rows.
display(df)

Sales_Person_ID,Sales_Person,Country,Product_ID,Product,Date,Amount,Boxes_Shipped
14,Jehu Rudeforth,UK,15,Mint Chip Choco,2022-01-04,"$5,320",180
24,Van Tuxwell,India,3,85% Dark Bars,2022-08-01,"$7,896",94
10,Gigi Bohling,India,18,Peanut Butter Cubes,2022-07-07,"$4,501",91
13,Jan Morforth,Australia,18,Peanut Butter Cubes,2022-04-27,"$12,726",342
14,Jehu Rudeforth,UK,18,Peanut Butter Cubes,2022-02-24,"$13,685",184
24,Van Tuxwell,India,20,Smooth Sliky Salty,2022-06-06,"$5,376",38
21,Oby Sorrel,UK,4,99% Dark & Pure,2022-01-25,"$13,685",176
11,Gunar Cockshoot,Australia,5,After Nines,2022-03-24,"$3,080",73
14,Jehu Rudeforth,New Zealand,1,50% Dark Bites,2022-04-20,"$3,990",59
4,Brien Boise,Australia,4,99% Dark & Pure,2022-07-04,"$2,835",102


### Check column type

In [0]:
# Print every column's data type and nullability as a tree.
# NOTE(review): the recorded output shows `Amount` as a string (values look
# like "$5,320"), so it presumably needs parsing before numeric use -- confirm.
df.printSchema()

root
 |-- Sales_Person_ID: integer (nullable = true)
 |-- Sales_Person: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Product_ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Amount: string (nullable = true)
 |-- Boxes_Shipped: integer (nullable = true)



### Check for nulls

In [0]:
# Basic statistics per column; the `count` row is the number of non-null
# values, which is what this null check relies on.
# NOTE(review): in the recorded output `Date` is absent and the min/max of
# `Amount` are string comparisons ("$1,015" < "$994"), not numeric ones.
summary_stats = df.describe()
display(summary_stats)

summary,Sales_Person_ID,Sales_Person,Country,Product_ID,Product,Amount,Boxes_Shipped
count,1094.0,1094,1094,1094.0,1094,1094,1094.0
mean,12.938756855575868,,,11.696526508226691,,,161.7979890310786
stddev,7.1687856919873685,,,6.475744511043245,,,121.5441454053633
min,1.0,Andria Kimpton,Australia,1.0,50% Dark Bites,"$1,015",1.0
max,25.0,Wilone O'Kielt,USA,22.0,White Choc,$994,709.0


In [0]:
# One aggregate per column: every NULL contributes 1 and every other value 0,
# so the sum is the number of missing values in that column.
null_counts = [
    F.sum(F.when(df[column].isNull(), 1).otherwise(0)).alias(f'Null {column}')
    for column in df.columns
]
display(df.select(*null_counts))

Null Sales_Person_ID,Null Sales_Person,Null Country,Null Product_ID,Null Product,Null Date,Null Amount,Null Boxes_Shipped
0,0,0,0,0,0,0,0


### Check duplicate records

In [0]:
# Columns treated as the identity of a record for the duplicate check.
duplicate_key = ['Sales_Person_ID', 'Country', 'Product_ID', 'Date', 'Boxes_Shipped']

# Any key combination that occurs more than once is a duplicate candidate;
# an empty result means no duplicates on this key.
duplicate_rows = df.groupBy(*duplicate_key).count().where(F.col('count') > 1)
display(duplicate_rows)

Sales_Person_ID,Country,Product_ID,Date,Boxes_Shipped,count


### Check categorical values

In [0]:
# Distinct (ID, name) pairs ordered by ID; an ID listed more than once would
# indicate inconsistent salesperson names.
sales_people = df.select('Sales_Person_ID', 'Sales_Person').distinct()
display(sales_people.orderBy('Sales_Person_ID'))

Sales_Person_ID,Sales_Person
1,Andria Kimpton
2,Barr Faughny
3,Beverie Moffet
4,Brien Boise
5,Camilla Castle
6,Ches Bonnell
7,Curtice Advani
8,Dennison Crosswaite
9,Dotty Strutley
10,Gigi Bohling


In [0]:
# Distinct (ID, name) pairs ordered by ID; an ID listed more than once would
# indicate inconsistent product names.
products = df.select('Product_ID', 'Product').distinct()
display(products.orderBy('Product_ID'))

Product_ID,Product
1,50% Dark Bites
2,70% Dark Bites
3,85% Dark Bars
4,99% Dark & Pure
5,After Nines
6,Almond Choco
7,Baker's Choco Chips
8,Caramel Stuffed Bars
9,Choco Coated Almonds
10,Drinking Coco


In [0]:
# List the distinct values of `Country` to spot misspellings or variants.
countries = df.select('Country').distinct()
display(countries)

Country
India
USA
UK
Canada
New Zealand
Australia


### Find outliers

In [0]:
# Number of rows per `Boxes_Shipped` value, largest values first, so the
# upper tail of the distribution is visible at the top of the table.
boxes_frequency = df.groupBy('Boxes_Shipped').count()
display(boxes_frequency.orderBy(F.col('Boxes_Shipped').desc()))

Boxes_Shipped,count
709,1
708,1
614,1
597,1
591,1
581,1
554,1
547,1
543,1
539,1


Databricks visualization. Run in Databricks to view.

![histogram](../../docs/image/databricks_img/eda_outlier_histogram.JPG)

In [0]:
# Rows shipping more boxes than this cut-off are inspected as potential outliers.
box_outlier_threshold = 600
display(df.where(F.col('Boxes_Shipped') > box_outlier_threshold))

Sales_Person_ID,Sales_Person,Country,Product_ID,Product,Date,Amount,Boxes_Shipped
16,Karlen McCaffrey,Australia,1,50% Dark Bites,2022-01-17,"$6,678",708
20,Marney O'Breen,Canada,8,Caramel Stuffed Bars,2022-02-23,"$1,372",614
24,Van Tuxwell,Canada,10,Drinking Coco,2022-06-15,"$4,900",709
