In [0]:
# List the files available in the online-retail sample dataset folder
display(dbutils.fs.ls("/databricks-datasets/online_retail/data-001/"))

path,name,size
dbfs:/databricks-datasets/online_retail/data-001/data.csv,data.csv,5357240


In [0]:
# Read in Data to DataFrame with Column Headers

# Location of the online-retail CSV file
path = "/databricks-datasets/online_retail/data-001/data.csv"

# Load the file with Spark's built-in CSV reader:
#   header=True      -> the first line supplies the column names
#   inferSchema=True -> Spark samples the data to choose column types
#                       (this costs an extra pass over the file)
df = spark.read.csv(path, header=True, inferSchema=True)

# Render the DataFrame as an interactive grid (a preview, not the whole table)
display(df)

InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850.0,United Kingdom
536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850.0,United Kingdom
536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850.0,United Kingdom
536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850.0,United Kingdom
536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850.0,United Kingdom
536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/10 8:26,7.65,17850.0,United Kingdom
536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/10 8:26,4.25,17850.0,United Kingdom
536366,22633,HAND WARMER UNION JACK,6,12/1/10 8:28,1.85,17850.0,United Kingdom
536366,22632,HAND WARMER RED POLKA DOT,6,12/1/10 8:28,1.85,17850.0,United Kingdom
536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/10 8:34,1.69,13047.0,United Kingdom


In [0]:
# Take a look at our schema: prints each column's name and data type
df.printSchema()

In [0]:
# Project only the Country column and print it as plain text
# (show() prints the first 20 rows unless told otherwise)
df.select(df["Country"]).show()

In [0]:
# Unique Countries in Alphabetical Order

# Keep the single Country column, collapse repeated values,
# then sort the survivors; display() renders the result as a grid.
display(
    df.select(df["Country"])
      .distinct()
      .sort("Country")  # ascending is the default direction
)

Country
Australia
Austria
Bahrain
Belgium
Channel Islands
Cyprus
Denmark
EIRE
Finland
France


In [0]:
# List the DataFrame's column names
df.columns

In [0]:
# Calculate Order Totals

# Amount of a single invoice line: unit price multiplied by units sold
line_amount = df["UnitPrice"] * df["Quantity"]

# Keep just the invoice key and the line amount
invoice_lines = df.select(df["InvoiceNo"], line_amount)

# Add up the numeric columns for every invoice and show the result as a grid
# (sum() without arguments aggregates each numeric column of the grouped data)
display(invoice_lines.groupBy("InvoiceNo").sum())

InvoiceNo,sum((UnitPrice * Quantity))
536596,38.09
536938,1680.88
537252,26.35
537691,310.57
538041,0.0
538184,458.9199999999999
538517,320.28000000000003
538879,338.9799999999999
539275,403.8
539630,751.0


In [0]:
# Inspect Results with Filter

# Print the individual lines of one invoice so its total can be checked by hand
# (where() is an alias of filter())
df.where(df["InvoiceNo"] == "536596").show()

In [0]:
# Show Top 10 Products in UK

# The country filter is applied first, so it is resolved against `df` itself
# (not against the aggregated result) and only UK rows reach the aggregation.
display(
  df
    .filter(df["Country"] == "United Kingdom")
    .select(df["Country"], df["Description"], (df["UnitPrice"] * df["Quantity"]).alias("Total"))
    .groupBy("Country", "Description")
    .sum("Total")  # aggregated column is named "sum(Total)"
    .sort("sum(Total)", ascending=False)  # highest revenue first
    .limit(10)
  )

Country,Description,sum(Total)
United Kingdom,DOTCOM POSTAGE,34177.85999999999
United Kingdom,REGENCY CAKESTAND 3 TIER,30512.56000000003
United Kingdom,WHITE HANGING HEART T-LIGHT HOLDER,22248.690000000024
United Kingdom,CHILLI LIGHTS,12475.610000000004
United Kingdom,RED WOOLLY HOTTIE WHITE HEART.,9355.869999999995
United Kingdom,PAPER CHAIN KIT 50'S CHRISTMAS,9313.069999999996
United Kingdom,WHITE SKULL HOT WATER BOTTLE,8867.309999999998
United Kingdom,HEART OF WICKER LARGE,8175.289999999995
United Kingdom,HOT WATER BOTTLE TEA AND SYMPATHY,7946.580000000001
United Kingdom,CHOCOLATE HOT WATER BOTTLE,7825.719999999996
