### Set up the Spark environment.

In [1]:
import sys
sys.path.append('/class')
from initspark import *
sc, spark, conf = initspark()


initializing pyspark
pyspark initialized


### Turn a simple RDD into a DataFrame. 

In [2]:
x = sc.parallelize([(1,'alpha'),(2,'beta')])
print(x.collect())
x0 = spark.createDataFrame(x)
x0.show()
x0.collect()

[(1, 'alpha'), (2, 'beta')]
+---+-----+
| _1|   _2|
+---+-----+
|  1|alpha|
|  2| beta|
+---+-----+



[Row(_1=1, _2='alpha'), Row(_1=2, _2='beta')]

### Give the DataFrame meaningful column names by providing a list of names as string.

In [3]:
x1 = spark.createDataFrame(x, schema=['ID','Name'])
x1.show()
print(x1)
print(x1.collect())
x2 = x1.take(1)
print(x2[0])
print(x2[0].ID, x2[0]['Name'])

+---+-----+
| ID| Name|
+---+-----+
|  1|alpha|
|  2| beta|
+---+-----+

DataFrame[ID: bigint, Name: string]
[Row(ID=1, Name='alpha'), Row(ID=2, Name='beta')]
Row(ID=1, Name='alpha')
1 alpha


### Give a DataFrame a schema with column names and data types by providing a single string with the names and data types.

In [4]:
x2 = spark.createDataFrame(x, schema = 'ID:int, Name:string')
x2.show()
print(x2)


+---+-----+
| ID| Name|
+---+-----+
|  1|alpha|
|  2| beta|
+---+-----+

DataFrame[ID: int, Name: string]


### Create a schema object to be more specific. Also, some functions cannot use string but require schema objects.

In [5]:
# CREATE TABLE schema1 (ID int, Name string);
schema1 = StructType([
    StructField('ID', IntegerType()), 
    StructField('Name', StringType())
])
x3 = spark.createDataFrame(x, schema = schema1)
x3.show()
print(x3)


+---+-----+
| ID| Name|
+---+-----+
|  1|alpha|
|  2| beta|
+---+-----+

DataFrame[ID: int, Name: string]


### The built-in toDF method does the same thing.

In [6]:
x.toDF().printSchema()
x.toDF(['ID', 'Name']).printSchema()
x.toDF('ID:int, Name:string').printSchema()
x.toDF(schema = schema1).printSchema()


root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)



## LAB: ## 
### Use the regions and territories RDDs from the previous lab and convert them into DataFrames with meaningful schemas.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use sc.textFile to read the files
<br>
Use map functions to split and convert the data
<br>
Use spark.createDataFrame and toDF to convert RDD into DataFrames
<br>
<br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
regions = sc.textFile('hdfs://localhost:9000/regions')
regions = regions.map(lambda x : x.split(',')).map(lambda x : (int(x[0]), x[1]))
regionsdf = spark.createDataFrame(regions, 'RegionID:int, RegionName:string')
regionsdf.show()

territories = sc.textFile('hdfs://localhost:9000/territories')
territories = territories.map(lambda x : x.split(',')).map(lambda x : (int(x[0]), x[1], int(x[2])))
territoriesdf = territories.toDF('TerritoryID:int, TerritoryName:string, RegionID: int')
territoriesdf.show()

# instead of collecting the data as a list of rows, it can be brought back as a pandas DataFrame
regions.toPandas()
```
</p>
</details>

### Examples of reading a CSV directly into a DataFrame.

### If there is a top-level read function for the file type you want, that's the cleanest option, and pass in the parameters as named parameters. Not all formats have this, and also, legacy code written before this may use the old style syntax.

In [7]:
filename1 = 'file:///class/datasets/northwind/CSVHeaders/categories'
cat1 = spark.read.csv(filename1, header = True, inferSchema = True)
print(cat1)
cat1.printSchema()
cat1.show()
cat1.collect()
# CREATE TEMPORARY TABLE categories (CategoryID int, CategoryName string, Description string)
# ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
# LOCATION 'file:///class/datasets/northwind/CSVHeaders/categories'

DataFrame[CategoryID: int, CategoryName: string, Description: string]
root
 |-- CategoryID: integer (nullable = true)
 |-- CategoryName: string (nullable = true)
 |-- Description: string (nullable = true)

+----------+--------------+--------------------+
|CategoryID|  CategoryName|         Description|
+----------+--------------+--------------------+
|         1|     Beverages|Soft drinks coffe...|
|         2|    Condiments|Sweet and savory ...|
|         3|   Confections|Desserts candies ...|
|         4|Dairy Products|             Cheeses|
|         5|Grains/Cereals|Breads crackers p...|
|         6|  Meat/Poultry|      Prepared meats|
|         7|       Produce|Dried fruit and b...|
|         8|       Seafood|    Seaweed and fish|
+----------+--------------+--------------------+



[Row(CategoryID=1, CategoryName='Beverages', Description='Soft drinks coffees teas beers and ales'),
 Row(CategoryID=2, CategoryName='Condiments', Description='Sweet and savory sauces relishes spreads and seasonings'),
 Row(CategoryID=3, CategoryName='Confections', Description='Desserts candies and sweet breads'),
 Row(CategoryID=4, CategoryName='Dairy Products', Description='Cheeses'),
 Row(CategoryID=5, CategoryName='Grains/Cereals', Description='Breads crackers pasta and cereal'),
 Row(CategoryID=6, CategoryName='Meat/Poultry', Description='Prepared meats'),
 Row(CategoryID=7, CategoryName='Produce', Description='Dried fruit and bean curd'),
 Row(CategoryID=8, CategoryName='Seafood', Description='Seaweed and fish')]

### Legacy style is still seen and uses a few alternate syntaxes.

In [8]:
filename2 = 'file:///class/datasets/northwind/TSV/categories'
cat2 = spark.read.load(filename2, format = 'csv', sep = '\t', inferSchema = True, header = False)
cat2.printSchema()
cat2.show()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)

+---+--------------+--------------------+----+
|_c0|           _c1|                 _c2| _c3|
+---+--------------+--------------------+----+
|  1|     Beverages|Soft drinks, coff...|null|
|  2|    Condiments|Sweet and savory ...|null|
|  3|   Confections|Desserts, candies...|null|
|  4|Dairy Products|             Cheeses|null|
|  5|Grains/Cereals|Breads, crackers,...|null|
|  6|  Meat/Poultry|      Prepared meats|null|
|  7|       Produce|Dried fruit and b...|null|
|  8|       Seafood|    Seaweed and fish|null|
+---+--------------+--------------------+----+



In [9]:
filename3 = 'file:///class/datasets/northwind/CSVHeaders/categories'
cat3 = spark.read.load(filename3, format = 'csv', sep = ',', inferSchema = True, header = True)
cat3.printSchema()
cat3.show()


root
 |-- CategoryID: integer (nullable = true)
 |-- CategoryName: string (nullable = true)
 |-- Description: string (nullable = true)

+----------+--------------+--------------------+
|CategoryID|  CategoryName|         Description|
+----------+--------------+--------------------+
|         1|     Beverages|Soft drinks coffe...|
|         2|    Condiments|Sweet and savory ...|
|         3|   Confections|Desserts candies ...|
|         4|Dairy Products|             Cheeses|
|         5|Grains/Cereals|Breads crackers p...|
|         6|  Meat/Poultry|      Prepared meats|
|         7|       Produce|Dried fruit and b...|
|         8|       Seafood|    Seaweed and fish|
+----------+--------------+--------------------+



### There are several alternate syntaxes which can be confusing, but since you will encounter them, you need to learn to recognize the different options.
option and options allow you to pass parameters in different ways, but note the true is quoted and lowercase because it is a Java value, but you could also pass it as a True Python value.

In [10]:
cat4 = spark.read.format('csv').option('header','true').option('inferSchema','true').load(filename1)
cat4.printSchema()
cat5 = spark.read.format('csv').options(header=True, inferSchema='true').load(filename1)
cat5.printSchema()


root
 |-- CategoryID: integer (nullable = true)
 |-- CategoryName: string (nullable = true)
 |-- Description: string (nullable = true)

root
 |-- CategoryID: integer (nullable = true)
 |-- CategoryName: string (nullable = true)
 |-- Description: string (nullable = true)



### As the tables get more complex, there is a Jupyter command that will show the tables in a prettier format.

In [None]:
display(cat3)

## LAB: ## 
### Load the products table using any of the spark.read methods.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use spark.read.csv
<br>
Make sure to read the version that has headers if you want to infer schema
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
prod1 = spark.read.csv('file:///class/datasets/northwind/CSVHeaders/products', header=True, inferSchema=True)
prod1.printSchema()
display(prod1)```
</p>
</details>

### Using a schema is a good idea for performance if you know what it is. Usually, you can infer schema during development and use it as a helper to build the schema to use for production.

In [18]:
prodSchema = StructType([
    StructField('ProductID', IntegerType()), 
    StructField('ProductName', StringType()),
    StructField('SupplierID', IntegerType()), 
    StructField('CategoryID', IntegerType()), 
    StructField('QuantityPerUnit', StringType()), 
    StructField('UnitPrice', FloatType()), 
    StructField('UnitsInStock', IntegerType()), 
    StructField('UnitsOnOrder', IntegerType()), 
    StructField('ReorderLevel', IntegerType()), 
    StructField('Discontinued', IntegerType())
])

prod2 = spark.read.csv('file:///class/datasets/northwind/CSVHeaders/products', header=True, schema=prodSchema)
print(prod2)
display(prod2)



DataFrame[ProductID: int, ProductName: string, SupplierID: int, CategoryID: int, QuantityPerUnit: string, UnitPrice: float, UnitsInStock: int, UnitsOnOrder: int, ReorderLevel: int, Discontinued: int]


Unnamed: 0,ProductID,ProductName,SupplierID,CategoryID,QuantityPerUnit,UnitPrice,UnitsInStock,UnitsOnOrder,ReorderLevel,Discontinued
0,1,Chai,8,1,10 boxes x 30 bags,18.0,39,0,10,1
1,2,Chang,1,1,24 - 12 oz bottles,19.0,17,40,25,1
2,3,Aniseed Syrup,1,2,12 - 550 ml bottles,10.0,13,70,25,0
3,4,Chef Anton's Cajun Seasoning,2,2,48 - 6 oz jars,22.0,53,0,0,0
4,5,Chef Anton's Gumbo Mix,2,2,36 boxes,21.35,0,0,0,1
5,6,Grandma's Boysenberry Spread,3,2,12 - 8 oz jars,25.0,120,0,25,0
6,7,Uncle Bob's Organic Dried Pears,3,7,12 - 1 lb pkgs.,30.0,15,0,10,0
7,8,Northwoods Cranberry Sauce,3,2,12 - 12 oz jars,40.0,6,0,0,0
8,9,Mishi Kobe Niku,4,6,18 - 500 g pkgs.,97.0,29,0,0,1
9,10,Ikura,4,8,12 - 200 ml jars,31.0,31,0,0,0


### Convert a DataFrame into a JSON string.

In [19]:
print (cat1.toJSON().take(10))

['{"CategoryID":1,"CategoryName":"Beverages","Description":"Soft drinks coffees teas beers and ales"}', '{"CategoryID":2,"CategoryName":"Condiments","Description":"Sweet and savory sauces relishes spreads and seasonings"}', '{"CategoryID":3,"CategoryName":"Confections","Description":"Desserts candies and sweet breads"}', '{"CategoryID":4,"CategoryName":"Dairy Products","Description":"Cheeses"}', '{"CategoryID":5,"CategoryName":"Grains/Cereals","Description":"Breads crackers pasta and cereal"}', '{"CategoryID":6,"CategoryName":"Meat/Poultry","Description":"Prepared meats"}', '{"CategoryID":7,"CategoryName":"Produce","Description":"Dried fruit and bean curd"}', '{"CategoryID":8,"CategoryName":"Seafood","Description":"Seaweed and fish"}']


### Or you can chain the whole thing together into a single command.

In [20]:
j = spark.read.csv('file:///class/datasets/northwind/CSVHeaders/categories', inferSchema=True, header=True).toJSON().collect()
print(j)

['{"CategoryID":1,"CategoryName":"Beverages","Description":"Soft drinks coffees teas beers and ales"}', '{"CategoryID":2,"CategoryName":"Condiments","Description":"Sweet and savory sauces relishes spreads and seasonings"}', '{"CategoryID":3,"CategoryName":"Confections","Description":"Desserts candies and sweet breads"}', '{"CategoryID":4,"CategoryName":"Dairy Products","Description":"Cheeses"}', '{"CategoryID":5,"CategoryName":"Grains/Cereals","Description":"Breads crackers pasta and cereal"}', '{"CategoryID":6,"CategoryName":"Meat/Poultry","Description":"Prepared meats"}', '{"CategoryID":7,"CategoryName":"Produce","Description":"Dried fruit and bean curd"}', '{"CategoryID":8,"CategoryName":"Seafood","Description":"Seaweed and fish"}']


### JSON is another top-level supported format.

In [21]:
cat6 = spark.read.json('file:///class/datasets/northwind/JSON/categories')
display(cat6)
print(cat6)

Unnamed: 0,categoryid,categoryname,description,picture
0,1,Beverages,"Soft drinks, coffees, teas, beers, and ales",
1,2,Condiments,"Sweet and savory sauces, relishes, spreads, an...",
2,3,Confections,"Desserts, candies, and sweet breads",
3,4,Dairy Products,Cheeses,
4,5,Grains/Cereals,"Breads, crackers, pasta, and cereal",
5,6,Meat/Poultry,Prepared meats,
6,7,Produce,Dried fruit and bean curd,
7,8,Seafood,Seaweed and fish,


DataFrame[categoryid: bigint, categoryname: string, description: string, picture: string]


### You can also use schemas but be careful of case because it matches the schema names to the key names in the JSON and upper/lower case matters. This will fail.

In [22]:
prod = spark.read.json('file:///class/datasets/northwind/JSON/products', schema=prodSchema)
display(prod)



Unnamed: 0,ProductID,ProductName,SupplierID,CategoryID,QuantityPerUnit,UnitPrice,UnitsInStock,UnitsOnOrder,ReorderLevel,Discontinued
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
5,,,,,,,,,,
6,,,,,,,,,,
7,,,,,,,,,,
8,,,,,,,,,,
9,,,,,,,,,,


### Redefine the schema using all lowercase since that's what this particular JSON file has.

In [23]:
prodSchema = StructType([
    StructField('productid', IntegerType()), 
    StructField('productname', StringType()),
    StructField('supplierid', IntegerType()), 
    StructField('categoryid', IntegerType()), 
    StructField('quantityperunit', StringType()), 
    StructField('unitprice', FloatType()), 
    StructField('unitsinstock', IntegerType()), 
    StructField('unitsonorder', IntegerType()), 
    StructField('reorderlevel', IntegerType()), 
#    StructField('joey', StringType()),
    StructField('discontinued', IntegerType())
])

prod = spark.read.json('file:///class/datasets/northwind/JSON/products', schema=prodSchema)
print(prod)
display(prod)


DataFrame[productid: int, productname: string, supplierid: int, categoryid: int, quantityperunit: string, unitprice: float, unitsinstock: int, unitsonorder: int, reorderlevel: int, discontinued: int]


Unnamed: 0,productid,productname,supplierid,categoryid,quantityperunit,unitprice,unitsinstock,unitsonorder,reorderlevel,discontinued
0,1,Chai,8,1,10 boxes x 30 bags,18.0,39,0,10,1
1,2,Chang,1,1,24 - 12 oz bottles,19.0,17,40,25,1
2,3,Aniseed Syrup,1,2,12 - 550 ml bottles,10.0,13,70,25,0
3,4,Chef Anton's Cajun Seasoning,2,2,48 - 6 oz jars,22.0,53,0,0,0
4,5,Chef Anton's Gumbo Mix,2,2,36 boxes,21.35,0,0,0,1
5,6,Grandma's Boysenberry Spread,3,2,12 - 8 oz jars,25.0,120,0,25,0
6,7,Uncle Bob's Organic Dried Pears,3,7,12 - 1 lb pkgs.,30.0,15,0,10,0
7,8,Northwoods Cranberry Sauce,3,2,12 - 12 oz jars,40.0,6,0,0,0
8,9,Mishi Kobe Niku,4,6,18 - 500 g pkgs.,97.0,29,0,0,1
9,10,Ikura,4,8,12 - 200 ml jars,31.0,31,0,0,0


### You may also see the older style syntax.

In [24]:
prod = spark.read.format('json').load('file:///class/datasets/northwind/JSON/products')
display(prod)


Unnamed: 0,categoryid,discontinued,productid,productname,quantityperunit,reorderlevel,supplierid,unitprice,unitsinstock,unitsonorder
0,1,1,1,Chai,10 boxes x 30 bags,10,8,18.0,39,0
1,1,1,2,Chang,24 - 12 oz bottles,25,1,19.0,17,40
2,2,0,3,Aniseed Syrup,12 - 550 ml bottles,25,1,10.0,13,70
3,2,0,4,Chef Anton's Cajun Seasoning,48 - 6 oz jars,0,2,22.0,53,0
4,2,1,5,Chef Anton's Gumbo Mix,36 boxes,0,2,21.35,0,0
5,2,0,6,Grandma's Boysenberry Spread,12 - 8 oz jars,25,3,25.0,120,0
6,7,0,7,Uncle Bob's Organic Dried Pears,12 - 1 lb pkgs.,10,3,30.0,15,0
7,2,0,8,Northwoods Cranberry Sauce,12 - 12 oz jars,0,3,40.0,6,0
8,6,1,9,Mishi Kobe Niku,18 - 500 g pkgs.,0,4,97.0,29,0
9,8,0,10,Ikura,12 - 200 ml jars,0,4,31.0,31,0


### Writing a DataFrame uses a similar syntax. There is a safeguard against accidentally overwriting a destination, so that's why we are deleting it first.

In [25]:
! rm -r /tmp/prodjson
prod.write.json('file:///tmp/prodjson')
! cat /tmp/prodjson/*

{"categoryid":1,"discontinued":1,"productid":1,"productname":"Chai","quantityperunit":"10 boxes x 30 bags","reorderlevel":10,"supplierid":8,"unitprice":18.0,"unitsinstock":39,"unitsonorder":0}
{"categoryid":1,"discontinued":1,"productid":2,"productname":"Chang","quantityperunit":"24 - 12 oz bottles","reorderlevel":25,"supplierid":1,"unitprice":19.0,"unitsinstock":17,"unitsonorder":40}
{"categoryid":2,"discontinued":0,"productid":3,"productname":"Aniseed Syrup","quantityperunit":"12 - 550 ml bottles","reorderlevel":25,"supplierid":1,"unitprice":10.0,"unitsinstock":13,"unitsonorder":70}
{"categoryid":2,"discontinued":0,"productid":4,"productname":"Chef Anton's Cajun Seasoning","quantityperunit":"48 - 6 oz jars","reorderlevel":0,"supplierid":2,"unitprice":22.0,"unitsinstock":53,"unitsonorder":0}
{"categoryid":2,"discontinued":1,"productid":5,"productname":"Chef Anton's Gumbo Mix","quantityperunit":"36 boxes","reorderlevel":0,"supplierid":2,"unitprice":21.35,"unitsinstock":0,"unitsonorder"

### Alternatively, generate a unique file name with a timestamp.

In [26]:
import time

prod.write.csv(f'file:///tmp/prodcsv{time.strftime("%Y%m%d-%H%M%S")}', sep = '|', header=True)
! cat /tmp/prodcsv*/*
! ls /tmp/prodcsv*

categoryid|discontinued|productid|productname|quantityperunit|reorderlevel|supplierid|unitprice|unitsinstock|unitsonorder
1|1|1|Chai|10 boxes x 30 bags|10|8|18.0|39|0
1|1|2|Chang|24 - 12 oz bottles|25|1|19.0|17|40
2|0|3|Aniseed Syrup|12 - 550 ml bottles|25|1|10.0|13|70
2|0|4|Chef Anton's Cajun Seasoning|48 - 6 oz jars|0|2|22.0|53|0
2|1|5|Chef Anton's Gumbo Mix|36 boxes|0|2|21.35|0|0
2|0|6|Grandma's Boysenberry Spread|12 - 8 oz jars|25|3|25.0|120|0
7|0|7|Uncle Bob's Organic Dried Pears|12 - 1 lb pkgs.|10|3|30.0|15|0
2|0|8|Northwoods Cranberry Sauce|12 - 12 oz jars|0|3|40.0|6|0
6|1|9|Mishi Kobe Niku|18 - 500 g pkgs.|0|4|97.0|29|0
8|0|10|Ikura|12 - 200 ml jars|0|4|31.0|31|0
4|0|11|Queso Cabrales|1 kg pkg.|30|5|21.0|22|30
4|0|12|Queso Manchego La Pastora|10 - 500 g pkgs.|0|5|38.0|86|0
8|0|13|Konbu|2 kg box|5|6|6.0|24|0
7|0|14|Tofu|40 - 100 g pkgs.|0|6|23.25|35|0
2|0|15|Genen Shouyu|24 - 250 ml bottles|5|6|13.0|39|0
3|0|16|Pavlova|32 - 500 g boxes|10|7|17.45|29|0
6|1|17|Alice Mutton|20 - 1 

### Note the use of mode('overwrite') here as an alternative to deleting it first.

In [27]:
prod.write.mode('overwrite').orc('file:///tmp/prodorc')
! cat /tmp/prodorc/*


ORC  
MP -  

   M�P +  

   M P /  

   ��.P c  
/
     M" 
Zaanse koeken�P e  
0
     M"!
	1 kg pkg.750 cc per bottle�P -  

   M <�P -  

   M:�P N  )4
'
  M	 @4xp@R��kX�@P /  

    ��0P /  

    ��P �  B�N F�  NN
F� N


eso CabralesQ��Manchego La PastoraKonbuTofuGenen ShouyuPavlovaAlice MuttonCarnarvon TigersTeatime Chocolate BiscuitsSir Rodney's MarmaladeSir Rodney's SconesGustaf's KnackebrodTunnbrodGuarana FantasticaNuNuCa Nuss-Nougat-CremeGumbar!`@mibarchenSchoggi xkoladeRossle SauerkrautThuringe�lstbratwurstNord-Ost Matjeshe%��Gorgonzola TelinoMascarpone FabioliGeitostSasquatch AleSteeleye StoutInlagd SillGravad laxCote de BlayeChartreuse verteBoston Crab MeatJack's New England Clam ChowderSingaporean Hokkien Fried MeeIpoh CoffeeGula MalaccaRogede sildSpegesildZaanse koekenCh%�<deMaxilakuValkoiA)<suklaaManjimup Dk$ApplesFiloA�Perth

In [28]:
prod.write.mode('overwrite').parquet('file:///tmp/prodparquet')
! cat /tmp/prodparquet/*


PAR1�XL  @ 	 	   <               NR,�                               '�   �@�(cKţ��5�H�<Bڶ��g�	\�  $L  <                (,,�                                  L   �	��~���( �	,� M               M                 �    � 	 	                 	   
   Aniseed Syruphef Anton's Cajun Seasoning>   Gumbo Mix:�RGrandma's Boysenberry Spread   Uncle Bob's Organic Dried Pears   Northwoods Cranb	BXauce   Mishi Kobe Niku�\Ikura   Queso CabralesHManchego La Pastora8�Konbu   Tofu   Genen Shouyu   Pavlova   Alice Mutton!0arnarvon Tige	�dTeatime Chocolate Biscuits!4`Sir Rodney's Marmalade  6 SconesLGustaf's Knackebrounn ��uarana Fantastica   NuNuCa Nuss-Nougat-Creme	Umbar!� mibarchenISchoggi ko�T   Rossle Sauerkraut�huringe�,stbratwursty<ord-Ost Matjeshe)I@Gorgonzola TelinotDMascarpone Fabioli!tGeitostA�0Sasquatch Ale!�|Steeleye Stout   Inla

### AVRO is a little different, it is built in now but doesn't have a top-level method for it, so you need to use the old style syntax.
This doesn't always work inside of a notebook either, so take a look at the program and run it from spark-submit with the proper package dependency added.

In [None]:
! cat /class/avro.py

! spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.3 /class/avro.py
        


## LAB: 
### Try to read in a few files with different formats and write them out to other formats. 
### Read shippers found in TSV and write it out as JSON.
### Read orders found in CSVHeaders and write it out as ORC.
### Read orderdetails found in JSON and write it out as Parquet.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use the syntax that is easiest for you
<br>
If there is a top-level function for the file format, that is usually the easiest way
<br>
TSV is just the same as CSV, but if there are no headers, you need to supply a schema
<br>
Remember to either remove the destination folder before writing or use an overwrite option
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
shipperSchema = StructType([
    StructField('shipperid', StringType()), 
    StructField('shippername', StringType()),
    StructField('phone', StringType())
])

shippers = spark.read.csv('file:///class/datasets/northwind/TSV/shippers', sep='\t', header=False, inferSchema=False, schema=shipperSchema)
shippers.write.mode('overwrite').json('file:///tmp/shippersjson')

orders = spark.read.csv('file:///class/datasets/northwind/CSVHeaders/orders', header=True, inferSchema=True)
orders.write.mode('overwrite').orc('file:///tmp/ordersorc')

orderdetails = spark.read.json('file:///class/datasets/northwind/JSON/orderdetails')
orderdetails.write.mode('overwrite').parquet('file:///tmp/orderdetailsparquet')

```
</p>
</details>

### You can also read from a Hive table using `spark.read.table`.

In [32]:
spark.read.table('regions').write.mode('overwrite').json('file:///tmp/regionjson')

# insert overwrite local directory '/tmp/region_json'
# ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
# SELECT * FROM regions;


### Can start to call methods on the DataFrame.

In [33]:
prod.printSchema()
print (prod.columns, prod.count())

root
 |-- categoryid: long (nullable = true)
 |-- discontinued: long (nullable = true)
 |-- productid: long (nullable = true)
 |-- productname: string (nullable = true)
 |-- quantityperunit: string (nullable = true)
 |-- reorderlevel: long (nullable = true)
 |-- supplierid: long (nullable = true)
 |-- unitprice: double (nullable = true)
 |-- unitsinstock: long (nullable = true)
 |-- unitsonorder: long (nullable = true)

['categoryid', 'discontinued', 'productid', 'productname', 'quantityperunit', 'reorderlevel', 'supplierid', 'unitprice', 'unitsinstock', 'unitsonorder'] 77


### Choose particular columns from a DataFrame.
You can use quoted strings for the column names.

In [34]:
display(prod.select('productid', 'productname', 'unitprice'))
#SELECT productid, productname, unitprice from products


Unnamed: 0,productid,productname,unitprice
0,1,Chai,18.0
1,2,Chang,19.0
2,3,Aniseed Syrup,10.0
3,4,Chef Anton's Cajun Seasoning,22.0
4,5,Chef Anton's Gumbo Mix,21.35
5,6,Grandma's Boysenberry Spread,25.0
6,7,Uncle Bob's Organic Dried Pears,30.0
7,8,Northwoods Cranberry Sauce,40.0
8,9,Mishi Kobe Niku,97.0
9,10,Ikura,31.0


### Or you can use a Pythonic syntax using the DataFrame name and field name.

In [35]:
field1 = 'unitprice'
display(prod.select(prod.productid, prod.productname, field1, 'categoryid'))

fields = ['categoryid', 'productid']
display(prod.select(fields))

Unnamed: 0,productid,productname,unitprice,categoryid
0,1,Chai,18.0,1
1,2,Chang,19.0,1
2,3,Aniseed Syrup,10.0,2
3,4,Chef Anton's Cajun Seasoning,22.0,2
4,5,Chef Anton's Gumbo Mix,21.35,2
5,6,Grandma's Boysenberry Spread,25.0,2
6,7,Uncle Bob's Organic Dried Pears,30.0,7
7,8,Northwoods Cranberry Sauce,40.0,2
8,9,Mishi Kobe Niku,97.0,6
9,10,Ikura,31.0,8


Unnamed: 0,categoryid,productid
0,1,1
1,1,2
2,2,3
3,2,4
4,2,5
5,2,6
6,7,7
7,2,8
8,6,9
9,8,10


### Case is ignored if you use quoted strings but not if you use Python syntax.

In [37]:
display(prod.select('Productid', 'productname', 'unitprice'))

# this will fail
#display(prod.select(prod.Productid, prod.productname, prod.unitprice))

# this will not fail
#display(prod.select(prod.productid, prod.productname, prod.unitprice))

Unnamed: 0,Productid,productname,unitprice
0,1,Chai,18.0
1,2,Chang,19.0
2,3,Aniseed Syrup,10.0
3,4,Chef Anton's Cajun Seasoning,22.0
4,5,Chef Anton's Gumbo Mix,21.35
5,6,Grandma's Boysenberry Spread,25.0
6,7,Uncle Bob's Organic Dried Pears,30.0
7,8,Northwoods Cranberry Sauce,40.0
8,9,Mishi Kobe Niku,97.0
9,10,Ikura,31.0


### `distinct` is a method after the `select` method chooses the columns.

In [38]:
display(prod.select('CategoryID').distinct())

# SELECT DISTINCT categoryid from prod
# SELECT categoryID from prod GROUP BY categoryid
# FROM prod SELECT categoryID DISTINCT

Unnamed: 0,CategoryID
0,7
1,6
2,5
3,1
4,3
5,8
6,2
7,4


### Sort a DataFrame. The `sort` and `orderBy` methods are different aliases for the exact same method.

In [39]:
field = 'productid'
#display(prod.sort(prod.unitprice))
#display(prod.orderBy('unitprice', ascending = False))
display(prod.select(field, 'productname', prod.unitprice).orderBy('unitprice'))

Unnamed: 0,productid,productname,unitprice
0,33,Geitost,2.5
1,24,Guarana Fantastica,4.5
2,13,Konbu,6.0
3,52,Filo Mix,7.0
4,54,Tourtiere,7.45
5,75,Rhonbrau Klosterbier,7.75
6,23,Tunnbrod,9.0
7,19,Teatime Chocolate Biscuits,9.2
8,45,Rogede sild,9.5
9,47,Zaanse koeken,9.5


### Create a new DataFrame with a new calculated column added.

In [40]:
prod2 = (prod.withColumn('value', prod.unitprice * prod.unitsinstock)
             .withColumn('other', prod.unitprice * 10)
             .select('categoryid', 'productid', prod.unitprice, prod.unitsinstock, 'value', 'other')
        )

prod2 = (prod.select('categoryid', 'productid', prod.unitprice, prod.unitsinstock)
             .withColumn('value', prod.unitprice * prod.unitsinstock)
             .withColumn('other', prod.unitprice * 10)
             
        )

display(prod2)


Unnamed: 0,categoryid,productid,unitprice,unitsinstock,value,other
0,1,1,18.0,39,702.0,180.0
1,1,2,19.0,17,323.0,190.0
2,2,3,10.0,13,130.0,100.0
3,2,4,22.0,53,1166.0,220.0
4,2,5,21.35,0,0.0,213.5
5,2,6,25.0,120,3000.0,250.0
6,7,7,30.0,15,450.0,300.0
7,2,8,40.0,6,240.0,400.0
8,6,9,97.0,29,2813.0,970.0
9,8,10,31.0,31,961.0,310.0


### Remove an unwanted column from a DataFrame by selecting the columns you want to keep.

In [42]:
print(prod.columns)
prod.select('categoryid',
 'discontinued',
 'productid',
 'productname',
 'reorderlevel',
 'supplierid',
 'unitprice',
 'unitsinstock',
 'unitsonorder')

['categoryid', 'discontinued', 'productid', 'productname', 'quantityperunit', 'reorderlevel', 'supplierid', 'unitprice', 'unitsinstock', 'unitsonorder']


DataFrame[categoryid: bigint, discontinued: bigint, productid: bigint, productname: string, reorderlevel: bigint, supplierid: bigint, unitprice: double, unitsinstock: bigint, unitsonorder: bigint]

### `drop` is a convenient way to remove a column without enumerating through all the columns you want to keep.

In [45]:
prod2 = prod.drop('quantityperunit', 'discontinued')
display(prod2)
# SELECT * EXCEPT(col1, col2)


Unnamed: 0,categoryid,productid,productname,reorderlevel,supplierid,unitprice,unitsinstock,unitsonorder
0,1,1,Chai,10,8,18.0,39,0
1,1,2,Chang,25,1,19.0,17,40
2,2,3,Aniseed Syrup,25,1,10.0,13,70
3,2,4,Chef Anton's Cajun Seasoning,0,2,22.0,53,0
4,2,5,Chef Anton's Gumbo Mix,0,2,21.35,0,0
5,2,6,Grandma's Boysenberry Spread,25,3,25.0,120,0
6,7,7,Uncle Bob's Organic Dried Pears,10,3,30.0,15,0
7,2,8,Northwoods Cranberry Sauce,0,3,40.0,6,0
8,6,9,Mishi Kobe Niku,0,4,97.0,29,0
9,8,10,Ikura,0,4,31.0,31,0


### The `filter` and `where` methods can both be used and have alternative ways to represent the condition.

In [46]:
p = prod
display(p.filter(p.unitprice > 100))
display(p.filter('unitprice > 100'))

# Note == when using python syntax
display(p.where(p.categoryid == 2))

# Note = when using quoted SQL like syntax
display(p.where('categoryid = 2'))



Unnamed: 0,categoryid,discontinued,productid,productname,quantityperunit,reorderlevel,supplierid,unitprice,unitsinstock,unitsonorder
0,6,1,29,Thuringer Rostbratwurst,50 bags x 30 sausgs.,0,12,123.79,0,0
1,1,0,38,Cote de Blaye,12 - 75 cl bottles,15,18,263.5,17,0


Unnamed: 0,categoryid,discontinued,productid,productname,quantityperunit,reorderlevel,supplierid,unitprice,unitsinstock,unitsonorder
0,6,1,29,Thuringer Rostbratwurst,50 bags x 30 sausgs.,0,12,123.79,0,0
1,1,0,38,Cote de Blaye,12 - 75 cl bottles,15,18,263.5,17,0


Unnamed: 0,categoryid,discontinued,productid,productname,quantityperunit,reorderlevel,supplierid,unitprice,unitsinstock,unitsonorder
0,2,0,3,Aniseed Syrup,12 - 550 ml bottles,25,1,10.0,13,70
1,2,0,4,Chef Anton's Cajun Seasoning,48 - 6 oz jars,0,2,22.0,53,0
2,2,1,5,Chef Anton's Gumbo Mix,36 boxes,0,2,21.35,0,0
3,2,0,6,Grandma's Boysenberry Spread,12 - 8 oz jars,25,3,25.0,120,0
4,2,0,8,Northwoods Cranberry Sauce,12 - 12 oz jars,0,3,40.0,6,0
5,2,0,15,Genen Shouyu,24 - 250 ml bottles,5,6,13.0,39,0
6,2,0,44,Gula Malacca,20 - 2 kg bags,15,20,19.45,27,0
7,2,0,61,Sirop d'erable,24 - 500 ml bottles,25,29,28.5,113,0
8,2,0,63,Vegie-spread,15 - 625 g jars,5,7,43.9,24,0
9,2,0,65,Louisiana Fiery Hot Pepper Sauce,32 - 8 oz bottles,0,2,21.05,76,0


Unnamed: 0,categoryid,discontinued,productid,productname,quantityperunit,reorderlevel,supplierid,unitprice,unitsinstock,unitsonorder
0,2,0,3,Aniseed Syrup,12 - 550 ml bottles,25,1,10.0,13,70
1,2,0,4,Chef Anton's Cajun Seasoning,48 - 6 oz jars,0,2,22.0,53,0
2,2,1,5,Chef Anton's Gumbo Mix,36 boxes,0,2,21.35,0,0
3,2,0,6,Grandma's Boysenberry Spread,12 - 8 oz jars,25,3,25.0,120,0
4,2,0,8,Northwoods Cranberry Sauce,12 - 12 oz jars,0,3,40.0,6,0
5,2,0,15,Genen Shouyu,24 - 250 ml bottles,5,6,13.0,39,0
6,2,0,44,Gula Malacca,20 - 2 kg bags,15,20,19.45,27,0
7,2,0,61,Sirop d'erable,24 - 500 ml bottles,25,29,28.5,113,0
8,2,0,63,Vegie-spread,15 - 625 g jars,5,7,43.9,24,0
9,2,0,65,Louisiana Fiery Hot Pepper Sauce,32 - 8 oz bottles,0,2,21.05,76,0


### More complex conditions.

In [50]:
display(p.where('unitprice >= 50 and unitprice <= 100'))
display(p.where('unitprice between 50 and 100'))

display(p.where((p.unitprice >=50) & (p.unitprice <= 100)))

# fails because python uses & not and for compound conditions
display(p.where((p.unitprice >=50) & (p.unitprice <= 100)))


Unnamed: 0,categoryid,discontinued,productid,productname,quantityperunit,reorderlevel,supplierid,unitprice,unitsinstock,unitsonorder
0,6,1,9,Mishi Kobe Niku,18 - 500 g pkgs.,0,4,97.0,29,0
1,8,0,18,Carnarvon Tigers,16 kg pkg.,0,7,62.5,42,0
2,3,0,20,Sir Rodney's Marmalade,30 gift boxes,0,8,81.0,40,0
3,7,0,51,Manjimup Dried Apples,50 - 300 g pkgs.,10,24,53.0,20,0
4,4,0,59,Raclette Courdavault,5 kg pkg.,0,28,55.0,79,0


Unnamed: 0,categoryid,discontinued,productid,productname,quantityperunit,reorderlevel,supplierid,unitprice,unitsinstock,unitsonorder
0,6,1,9,Mishi Kobe Niku,18 - 500 g pkgs.,0,4,97.0,29,0
1,8,0,18,Carnarvon Tigers,16 kg pkg.,0,7,62.5,42,0
2,3,0,20,Sir Rodney's Marmalade,30 gift boxes,0,8,81.0,40,0
3,7,0,51,Manjimup Dried Apples,50 - 300 g pkgs.,10,24,53.0,20,0
4,4,0,59,Raclette Courdavault,5 kg pkg.,0,28,55.0,79,0


Unnamed: 0,categoryid,discontinued,productid,productname,quantityperunit,reorderlevel,supplierid,unitprice,unitsinstock,unitsonorder
0,6,1,9,Mishi Kobe Niku,18 - 500 g pkgs.,0,4,97.0,29,0
1,8,0,18,Carnarvon Tigers,16 kg pkg.,0,7,62.5,42,0
2,3,0,20,Sir Rodney's Marmalade,30 gift boxes,0,8,81.0,40,0
3,7,0,51,Manjimup Dried Apples,50 - 300 g pkgs.,10,24,53.0,20,0
4,4,0,59,Raclette Courdavault,5 kg pkg.,0,28,55.0,79,0


Unnamed: 0,categoryid,discontinued,productid,productname,quantityperunit,reorderlevel,supplierid,unitprice,unitsinstock,unitsonorder
0,6,1,9,Mishi Kobe Niku,18 - 500 g pkgs.,0,4,97.0,29,0
1,8,0,18,Carnarvon Tigers,16 kg pkg.,0,7,62.5,42,0
2,3,0,20,Sir Rodney's Marmalade,30 gift boxes,0,8,81.0,40,0
3,7,0,51,Manjimup Dried Apples,50 - 300 g pkgs.,10,24,53.0,20,0
4,4,0,59,Raclette Courdavault,5 kg pkg.,0,28,55.0,79,0


## LAB: 
### Find all the products in category 2 with fewer units in stock than units on order. 
### Only display with productid, name, unitsinstock, unitsonorder, and unitprice.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use the where or filter method. It's probably easier to use a quoted SQL style syntax.
<br>
Use select to get the columns you want to see.
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
display(p.where('unitsinstock < unitsonorder and categoryid = 2')
         .select('productid','productname', 'unitsinstock', 'unitsonorder', 'unitprice'))
```
</p>
</details>

### JOINs work as expected.

In [52]:
tab1 = sc.parallelize([(1, 'Alpha'), (2, 'Beta'), (3, 'Delta')]).toDF('ID:int, code:string')
tab2 = sc.parallelize([(100, 'One', 1), (101, 'Two', 2), (102, 'Three', 1), (103, 'Four', 4)]).toDF('ID:int, name:string, parentID:int')

display(tab1)
display(tab2)
tab1.join(tab2, tab1.ID == tab2.parentID).show()
tab1.join(tab2, tab1.ID == tab2.parentID, 'left').show()
tab1.join(tab2, tab1.ID == tab2.parentID, 'right').show()
tab1.join(tab2, tab1.ID == tab2.parentID, 'full').show()


Unnamed: 0,ID,code
0,1,Alpha
1,2,Beta
2,3,Delta


Unnamed: 0,ID,name,parentID
0,100,One,1
1,101,Two,2
2,102,Three,1
3,103,Four,4


+---+-----+---+-----+--------+
| ID| code| ID| name|parentID|
+---+-----+---+-----+--------+
|  1|Alpha|100|  One|       1|
|  1|Alpha|102|Three|       1|
|  2| Beta|101|  Two|       2|
+---+-----+---+-----+--------+

+---+-----+----+-----+--------+
| ID| code|  ID| name|parentID|
+---+-----+----+-----+--------+
|  1|Alpha| 100|  One|       1|
|  1|Alpha| 102|Three|       1|
|  3|Delta|null| null|    null|
|  2| Beta| 101|  Two|       2|
+---+-----+----+-----+--------+

+----+-----+---+-----+--------+
|  ID| code| ID| name|parentID|
+----+-----+---+-----+--------+
|   1|Alpha|100|  One|       1|
|   1|Alpha|102|Three|       1|
|null| null|103| Four|       4|
|   2| Beta|101|  Two|       2|
+----+-----+---+-----+--------+

+----+-----+----+-----+--------+
|  ID| code|  ID| name|parentID|
+----+-----+----+-----+--------+
|   1|Alpha| 100|  One|       1|
|   1|Alpha| 102|Three|       1|
|   3|Delta|null| null|    null|
|null| null| 103| Four|       4|
|   2| Beta| 101|  Two|       2|
+---

###  Examples of aggregate functions.

In [53]:
tab3 = sc.parallelize([(1, 10), (1, 20), (1, 30), (2, 40), (2,50)]).toDF('groupID:int, amount:int')
display(tab3)
tab3.groupby('groupID').max().show()
tab3.groupby('groupID').sum().drop('sum(groupID)').withColumnRenamed('sum(amount)', 'amount').show()


Unnamed: 0,groupID,amount
0,1,10
1,1,20
2,1,30
3,2,40
4,2,50


+-------+------------+-----------+
|groupID|max(groupID)|max(amount)|
+-------+------------+-----------+
|      1|           1|         30|
|      2|           2|         50|
+-------+------------+-----------+

+-------+------+
|groupID|amount|
+-------+------+
|      1|    60|
|      2|    90|
+-------+------+



### Alternatively, there is an `agg` method that takes a `dict` object.

In [54]:
x = tab3.groupby('groupID')
x.agg({'amount':'sum'}).show()


+-------+-----------+
|groupID|sum(amount)|
+-------+-----------+
|      1|         60|
|      2|         90|
+-------+-----------+



### Or we could import the functions from pyspark and use them directly. This allows us to call multiple aggregates at once.

In [55]:
from pyspark.sql import functions as F
#from pyspark.sql.functions import sum, max
x.agg(F.sum('amount'), F.max('amount')).show()
#dir(F)

+-------+-----------+-----------+
|groupID|sum(amount)|max(amount)|
+-------+-----------+-----------+
|      1|         60|         30|
|      2|         90|         50|
+-------+-----------+-----------+



### Or we could use SQL syntax to encode the calculations but we need to use the `expr` function to tell it how to interpret the SQL.

In [56]:
from pyspark.sql.functions import expr
f = expr('sum(amount) as total')
print(f, type(f))
x.agg(expr('sum(amount) as total'), expr('count(*) as cnt')).show()


Column<b'sum(amount) AS `total`'> <class 'pyspark.sql.column.Column'>
+-------+-----+---+
|groupID|total|cnt|
+-------+-----+---+
|      1|   60|  3|
|      2|   90|  2|
+-------+-----+---+



## LAB: 
### Join products and categories together displaying only the product and category IDs and names, sort by categoryid and productid.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Make sure not to show the common column twice.
<br>
Select which Python style makes it easier to distinguish which columns you want from a join.
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
c = spark.read.csv('file:///class/datasets/northwind/CSVHeaders/categories'
                   , inferSchema = True, header=True)
p = spark.read.json('file:///class/datasets/northwind/JSON/products')

# display(c)
j = (c.join(p, c.CategoryID == p.categoryid)
      .select(c.CategoryID, c.CategoryName, p.productid, p.productname)
      .orderBy('categoryid', 'productid'))
```
</p>
</details>

### Can also convert a DataFrame back to and RDD if we want to use low-level RDD methods.

In [59]:
j1 = j.rdd
print(j1)
print(j.rdd.map(lambda x : (x.CategoryID, x.productid)).toDF(['x','y']).take(2))

MapPartitionsRDD[515] at javaToPython at NativeMethodAccessorImpl.java:0
[Row(x=1, y=1), Row(x=1, y=2)]


### Sometimes you want to just rename a column, so here are two ways to accomplish that.

In [60]:
display(p.withColumnRenamed('unitprice','listprice'))

cols = p.columns # get a list of all the current column names
cols[5] = 'listprice' # replace a column position with the new name 
p1 = p.toDF(*cols) # create a new dataframe from the original with a list of column names
display(p1)


Unnamed: 0,categoryid,discontinued,productid,productname,quantityperunit,reorderlevel,supplierid,listprice,unitsinstock,unitsonorder
0,1,1,1,Chai,10 boxes x 30 bags,10,8,18.0,39,0
1,1,1,2,Chang,24 - 12 oz bottles,25,1,19.0,17,40
2,2,0,3,Aniseed Syrup,12 - 550 ml bottles,25,1,10.0,13,70
3,2,0,4,Chef Anton's Cajun Seasoning,48 - 6 oz jars,0,2,22.0,53,0
4,2,1,5,Chef Anton's Gumbo Mix,36 boxes,0,2,21.35,0,0
5,2,0,6,Grandma's Boysenberry Spread,12 - 8 oz jars,25,3,25.0,120,0
6,7,0,7,Uncle Bob's Organic Dried Pears,12 - 1 lb pkgs.,10,3,30.0,15,0
7,2,0,8,Northwoods Cranberry Sauce,12 - 12 oz jars,0,3,40.0,6,0
8,6,1,9,Mishi Kobe Niku,18 - 500 g pkgs.,0,4,97.0,29,0
9,8,0,10,Ikura,12 - 200 ml jars,0,4,31.0,31,0


Unnamed: 0,categoryid,discontinued,productid,productname,quantityperunit,listprice,supplierid,unitprice,unitsinstock,unitsonorder
0,1,1,1,Chai,10 boxes x 30 bags,10,8,18.0,39,0
1,1,1,2,Chang,24 - 12 oz bottles,25,1,19.0,17,40
2,2,0,3,Aniseed Syrup,12 - 550 ml bottles,25,1,10.0,13,70
3,2,0,4,Chef Anton's Cajun Seasoning,48 - 6 oz jars,0,2,22.0,53,0
4,2,1,5,Chef Anton's Gumbo Mix,36 boxes,0,2,21.35,0,0
5,2,0,6,Grandma's Boysenberry Spread,12 - 8 oz jars,25,3,25.0,120,0
6,7,0,7,Uncle Bob's Organic Dried Pears,12 - 1 lb pkgs.,10,3,30.0,15,0
7,2,0,8,Northwoods Cranberry Sauce,12 - 12 oz jars,0,3,40.0,6,0
8,6,1,9,Mishi Kobe Niku,18 - 500 g pkgs.,0,4,97.0,29,0
9,8,0,10,Ikura,12 - 200 ml jars,0,4,31.0,31,0


## HOMEWORK:  
### Join Orders, OrderDetails, and Products together. Find the sales total for each category listed in descending order by sales.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Load each file into a dataframe and give them single-letter aliases for simplicity.
<br>
Join products and order details together on productid.
<br>
Join that to orders on orderid.
<br>
Create a calculated column to get the line total for each order's details.
<br>
Group by categoryID and calculate the sum of the line totals.
<br>
Sort on the calculated total.
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
prodSchema = StructType([
    StructField('productid', IntegerType()), 
    StructField('productname', StringType()),
    StructField('supplierid', IntegerType()), 
    StructField('categoryid', IntegerType()), 
    StructField('quantityperunit', StringType()), 
    StructField('unitprice', FloatType()), 
    StructField('unitsinstock', IntegerType()), 
    StructField('unitsonorder', IntegerType()), 
    StructField('reorderlevel', IntegerType()), 
    StructField('discontinued', IntegerType())
])


```
</p>
</details>
