### Set up the Spark environment.

In [None]:
# Make the helper module under /class importable before importing it.
import sys
sys.path.append('/class')
# NOTE(review): the wildcard import presumably also supplies names used by later
# cells without their own import (StructType, IntegerType, ...) — confirm against initspark.py.
from initspark import *
# Bind the three values returned by initspark(); later cells use sc
# (parallelize/textFile) and spark (createDataFrame/read).
sc, spark, conf = initspark()


### Turn a simple RDD into a DataFrame. 

In [None]:
# Two-element RDD of (number, string) tuples; the following cells reuse x.
x = sc.parallelize([(1,'alpha'),(2,'beta')])
# No schema is passed, so column names and types are left to Spark.
x0 = spark.createDataFrame(x)
x0.show()


### Give the DataFrame meaningful column names.

In [None]:
# Same RDD, this time with a list of column names as the schema.
x1 = spark.createDataFrame(x, schema=['ID','Name'])
x1.show()
# print shows the DataFrame's summary representation rather than its rows.
print(x1)


### Give a DataFrame a schema with column names and data types.

In [None]:
# Schema given as a single 'name:type, ...' string: names and data types together.
x2 = spark.createDataFrame(x, 'ID:int, Name:string')
x2.show()
print(x2)


### Create a schema object.

In [None]:
# Explicit schema object: one StructField per column (name plus type instance).
# schema1 is reused by the toDF cell that follows.
schema1 = StructType([
    StructField('ID', IntegerType()), 
    StructField('Name', StringType())
])
x3 = spark.createDataFrame(x, schema = schema1)
x3.show()
print(x3)


### The built in toDF method does the same thing.

In [None]:
# The four schema styles shown above, applied through the RDD's toDF method:
# no schema, a list of names, a 'name:type' string, and a StructType object.
x.toDF().printSchema()
x.toDF(['ID', 'Name']).printSchema()
x.toDF('ID:int, Name:string').printSchema()
x.toDF(schema = schema1).printSchema()


## LAB: ## 
### Use the regions and territories RDDs from the previous lab and convert them into DataFrames with meaningful schemas.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use sc.textFile to read the files
<br>
Use map functions to split and convert the data
<br>
Use spark.createDataFrame and toDF to convert RDD into DataFrames
<br>
<br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
# Read the raw text lines, split each on commas, and convert the first field to int.
regions = sc.textFile('hdfs://localhost:9000/regions')
regions = regions.map(lambda x : x.split(',')).map(lambda x : (int(x[0]), x[1]))
# Variant 1: spark.createDataFrame with a 'name:type' schema string.
regionsdf = spark.createDataFrame(regions, 'RegionID:int, RegionName:string')
regionsdf.show()

# Territories carry a third field, which is also converted to int.
territories = sc.textFile('hdfs://localhost:9000/territories')
territories = territories.map(lambda x : x.split(',')).map(lambda x : (int(x[0]), x[1], int(x[2])))
# Variant 2: the RDD's own toDF method with the same style of schema string.
territoriesdf = territories.toDF('TerritoryID:int, TerritoryName:string, RegionID: int')
territoriesdf.show()
```
</p>
</details>

### Examples of reading a CSV directly into a DataFrame.

In [None]:
# filename is reused by the next few cells.
filename = '/class/datasets/northwind/CSV/categories'
# Generic load: the format is named explicitly and the first line is treated as data (header = False).
cat1 = spark.read.load(filename, format = 'csv', sep = ',', inferSchema = True, header = False)
cat1.printSchema()


In [None]:
# Same load as the previous cell, but the first line is now taken as the column names (header = True).
cat1 = spark.read.load(filename, format = 'csv', sep = ',', inferSchema = True, header = True)
cat1.printSchema()


### There are several alternate syntaxes which can be confusing, but since you will encounter them, you need to learn to recognize the different options.
option and options allow you pass parameters in different ways, but not the true is quoted and lowercase because it is a java value, but you could also pass it as a True Python value.

In [None]:
# Builder style, one option() call per setting, values passed as quoted strings.
cat2 = spark.read.format('csv').option('header','true').option('inferSchema','true').load(filename)
cat2.printSchema()
# options() takes several settings as keyword arguments; a Python True and a quoted 'true' are mixed here.
cat2 = spark.read.format('csv').options(header=True, inferSchema='true').load(filename)
cat2.printSchema()


### If there is a top-level read function for the file type you want, that's the cleanest option, and pass in the parameters as named parameters. Not all formats have this and also legacy code written before this may use the old style syntax.

In [None]:
# Format-specific reader with the settings as named parameters.
cat3 = spark.read.csv(filename, header = True, inferSchema = True)
cat3.printSchema()
cat3.show()


### As the tables get more complex, there is a Jupyter command that will show the tables in a prettier format.

In [None]:
# Notebook rendering of the same DataFrame that show() printed as text in the previous cell.
display(cat3)

## LAB: ## 
### Load the products table using any of the spark.read methods.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use spark.read.csv
<br>
Make sure to read the version that has headers if you want to infer schema
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
# Read from the CSVHeaders folder so header=True can supply the column names.
prod1 = spark.read.csv('/class/datasets/northwind/CSVHeaders/products', header=True, inferSchema=True)
prod1.printSchema()
display(prod1)
```
</p>
</details>

### Using a schema is a good idea for performance if you know what it is. Usually you can infer schema during development and use it as a helper to build the schema to use for production.

In [None]:

# Explicit schema for the products file: ten columns, mixed-case names.
prodSchema = StructType([
    StructField('ProductID', IntegerType()), 
    StructField('ProductName', StringType()),
    StructField('SupplierID', IntegerType()), 
    StructField('CategoryID', IntegerType()), 
    StructField('QuantityPerUnit', StringType()), 
    StructField('UnitPrice', FloatType()), 
    StructField('UnitsInStock', IntegerType()), 
    StructField('UnitsOnOrder', IntegerType()), 
    StructField('ReorderLevel', IntegerType()), 
    StructField('Discontinued', IntegerType())
])

# Read with the schema supplied instead of inferSchema.
prod2 = spark.read.csv('/class/datasets/northwind/CSVHeaders/products', header=True, schema=prodSchema)
print(prod2)
display(prod2)

# Disabled alternative: read without inference, then try to apply a 'name:type' schema string.
# NOTE(review): DataFrame.toDF is used elsewhere in this notebook with column names (p.toDF(*cols)),
# so passing a schema string to it likely does not set types — verify before enabling.
# prod2 = spark.read.csv('/class/datasets/northwind/CSVHeaders/products', header=True, inferSchema=False)
# print(prod2)
# prodSchema2 = "ProductID:int, ProductName:string, SupplierID:int, CategoryID:int, QuantityPerUnit:string, UnitPrice:double, UnitsInStock:int, UnitsOnOrder:int, ReorderLevel:int, Discontinued:int"

# prod3 = prod2.toDF(prodSchema2)


### Convert a DataFrame into a JSON string.

In [None]:
# toJSON() converts the DataFrame's rows to JSON strings; take(10) fetches at most ten of them to print.
print (cat2.toJSON().take(10))

### JSON is another top-level supported format.

In [None]:
# Format-specific JSON reader; no schema is supplied, so it is inferred from the data.
cat4 = spark.read.json('/class/datasets/northwind/JSON/categories')
display(cat4)


### You can also use schemas, but be careful of case.

In [None]:
# First attempt: reuse the mixed-case prodSchema that was built for the CSV read.
# NOTE(review): presumably this demonstrates the case pitfall named in the heading
# (field names not matching the JSON keys) — confirm what the output shows.
prod = spark.read.json('/class/datasets/northwind/JSON/products', schema=prodSchema)
display(prod)

# Second attempt: same ten columns and types, but with all-lowercase field names.
# This rebinds prodSchema, so the mixed-case version is no longer available afterwards.
prodSchema = StructType([
    StructField('productid', IntegerType()), 
    StructField('productname', StringType()),
    StructField('supplierid', IntegerType()), 
    StructField('categoryid', IntegerType()), 
    StructField('quantityperunit', StringType()), 
    StructField('unitprice', FloatType()), 
    StructField('unitsinstock', IntegerType()), 
    StructField('unitsonorder', IntegerType()), 
    StructField('reorderlevel', IntegerType()), 
    StructField('discontinued', IntegerType())
])
prod = spark.read.json('/class/datasets/northwind/JSON/products', schema=prodSchema)
display(prod)


### You may also see the older style syntax.

In [None]:
# Builder-style equivalent of spark.read.json. No schema is passed here, so prod is
# rebound to a DataFrame whose columns and types are inferred from the file.
prod = spark.read.format('json').load('/class/datasets/northwind/JSON/products')
display(prod)


### Writing a DataFrame uses a similar syntax.

In [None]:
! rm -r /tmp/prodjson
prod.write.json('/tmp/prodjson')
! cat /tmp/prodjson/*

In [None]:
! rm -r /tmp/prodcsv
prod.write.csv('/tmp/prodcsv', sep = '\t', header=True)
! cat /tmp/prodcsv/*

### Note the use of mode('overwrite') here.

In [None]:
# mode('overwrite') replaces existing output, so no rm step is needed beforehand.
prod.write.mode('overwrite').orc('/tmp/prodorc')
# NOTE(review): ORC is a binary format, so cat presumably prints unreadable bytes — likely intentional.
! cat /tmp/prodorc/*


In [None]:
# mode('overwrite') replaces existing output, so no rm step is needed beforehand.
prod.write.mode('overwrite').parquet('/tmp/prodparquet')
# NOTE(review): Parquet is a binary format, so cat presumably prints unreadable bytes — likely intentional.
! cat /tmp/prodparquet/*


### AVRO is a little different: it is built in now but doesn't have a top-level method, so you need to use the old style syntax.
This doesn't always work inside of a notebook either, so take a look at the program and run it from spark-submit with the proper package dependency added.

In [None]:
# Show the standalone script first, then run it outside the notebook.
! cat /class/avro.py

# --packages names the spark-avro dependency the script needs.
# NOTE(review): the coordinates pin Scala 2.11 / Spark 2.4.3 — confirm they match the installed Spark.
! spark-submit --packages org.apache.spark:spark-avro_2.11:2.4.3 /class/avro.py
        
# Disabled in-notebook equivalent using the builder-style writer (prod4 is not defined in the cells above).
#prod4.write.format("avro").mode('overwrite').save('/tmp/prodavro')


## LAB: ## 
### Try to read in a few files with different formats and write them out to other formats. 
### Read shippers found in TSV and write it out as JSON.
### Read orders found in CSVHeaders and write it out as ORC.
### Read orderdetails found in JSON and write it out as Parquet.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use the syntax that is easiest for you
<br>
If there is a top level function for the file format, that is usually the easiest way
<br>
TSV is just the same as CSV, but if there are no headers you need to supply a schema
<br>
Remember to either remove the destination folder before writing or use an overwrite option
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
# The shippers file is read with header=False, so the column names come from this schema.
# All three columns are declared as strings.
shipperSchema = StructType([
    StructField('shipperid', StringType()), 
    StructField('shippername', StringType()),
    StructField('phone', StringType())
])

# TSV is read with the CSV reader and a tab separator, then written out as JSON.
shippers = spark.read.csv('/class/datasets/northwind/TSV/shippers', sep='\t', header=False, inferSchema=False, schema=shipperSchema)
shippers.write.mode('overwrite').json('/tmp/shippersjson')

# CSV with headers -> ORC, letting Spark infer the column types.
orders = spark.read.csv('/class/datasets/northwind/CSVHeaders/orders', header=True, inferSchema=True)
orders.write.mode('overwrite').orc('/tmp/ordersorc')

# JSON -> Parquet.
orderdetails = spark.read.json('/class/datasets/northwind/JSON/orderdetails')
orderdetails.write.mode('overwrite').parquet('/tmp/orderdetailsparquet')

```
</p>
</details>

### JDBC and NoSQL sources can also be used, but we will explore those options in the next session.

In [None]:
# Quick inspection of prod: its schema, then the column-name list and the row count.
prod.printSchema()
print (prod.columns, prod.count())

### Choose particular columns from a DataFrame.
You can use quoted strings for the column names

In [None]:
# Column names passed as plain strings.
display(prod.select('productid', 'productname', 'unitprice'))

### Or you can use a pythonic syntax using the DataFrame name and field name.

In [None]:
# Same three columns, referenced as attributes of the DataFrame.
display(prod.select(prod.productid, prod.productname, prod.unitprice))

### Case is ignored if you use quoted strings but not if you use python syntax.

In [None]:
# 'Productid' differs from the column name only by case; this is the quoted-string form.
display(prod.select('Productid', 'productname', 'unitprice'))
# NOTE(review): per the heading, the attribute form with the wrong case is expected to fail —
# presumably a deliberate demonstration; it will stop a "run all" at this cell.
display(prod.select(prod.Productid, prod.productname, prod.unitprice))

### Distinct is a method after the select method chooses the columns.

In [None]:
# Narrow to one column first, then remove duplicate rows from that result.
display(prod.select('CategoryID').distinct())


### Sort a DataFrame. The sort and orderBy methods are different aliases for the exact same method.

In [None]:
# sort with an attribute-style column reference.
display(prod.sort(prod.unitprice))
# orderBy with a quoted column name; ascending = False reverses the order.
display(prod.orderBy('unitprice', ascending = False))
# Sorting can be chained after select to order a narrower result.
display(prod.select('productid', 'productname', 'unitprice').orderBy('unitprice'))

### Create a new DataFrame with a new calculated column added.

In [None]:
# Adds a 'value' column computed as unitprice * unitsinstock.
# The result is bound to prod2 (rebinding that name); prod itself is left as it was.
prod2 = prod.withColumn('value', prod.unitprice * prod.unitsinstock)
display(prod2)


### Remove an unwanted column from a DataFrame.

In [None]:
# drop returns a DataFrame without the named column; the result replaces prod2.
prod2 = prod2.drop('quantityperunit')
display(prod2)


### The filter and where methods can both be used and have alternative ways to represent the condition.

In [None]:
# Short alias for prod; later cells keep using p.
p = prod
# The same condition written as a column expression and as a quoted SQL-style string.
display(p.filter(p.unitprice > 100))
display(p.filter('unitprice > 100'))
# Equality is == in the Python column-expression form ...
display(p.where(p.categoryid == 2))
# ... and a single = in the quoted SQL-style form.
display(p.where('categoryid = 2'))



### More complex conditions.

In [None]:
# Quoted SQL-style conditions: an explicit range with 'and', then the same range with 'between'.
display(p.where('unitprice >= 50 and unitprice <= 100'))
display(p.where('unitprice between 50 and 100'))

# Column-expression form: the two comparisons are combined with &, each wrapped in parentheses.
display(p.where((p.unitprice >=50) & (p.unitprice <= 100)))


## LAB: ## 
### Find all the products in category 2 with fewer units in stock than units on order. 
### Only display productid, productname, unitsinstock, unitsonorder, and unitprice.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use the where or filter method. It's probably easier to use a quoted SQL style syntax
<br>
Use select to get the columns you want to see
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
# Filter with one quoted SQL-style condition, then keep only the five requested columns.
display(p.where('unitsinstock < unitsonorder and categoryid = 2').select('productid','productname', 'unitsinstock', 'unitsonorder', 'unitprice'))
```
</p>
</details>

### JOINs work as expected.

In [None]:
# Sample data: tab1 has IDs 1-3; tab2 rows point at parentID 1, 2, 1 and 4.
# ID 3 therefore has no matching tab2 row, and parentID 4 has no matching tab1 row.
tab1 = sc.parallelize([(1, 'Alpha'), (2, 'Beta'), (3, 'Delta')]).toDF('ID:int, code:string')
tab2 = sc.parallelize([(100, 'One', 1), (101, 'Two', 2), (102, 'Three', 1), (103, 'Four', 4)]).toDF('ID:int, name:string, parentID:int')
# Same join condition four times: no join type given, then 'left', 'right' and 'full'.
tab1.join(tab2, tab1.ID == tab2.parentID).show()
tab1.join(tab2, tab1.ID == tab2.parentID, 'left').show()
tab1.join(tab2, tab1.ID == tab2.parentID, 'right').show()
tab1.join(tab2, tab1.ID == tab2.parentID, 'full').show()


###  Examples of aggregate functions.

In [None]:
# Sample data: group 1 has amounts 10, 20, 30; group 2 has amounts 40, 50.
tab3 = sc.parallelize([(1, 10), (1, 20), (1, 30), (2, 40), (2,50)]).toDF('groupID:int, amount:int')
# Shortcut aggregates called directly on the grouped data, with no column named.
tab3.groupby('groupID').max().show()
tab3.groupby('groupID').sum().show()


In [None]:
# NOTE: this rebinds x (the sample RDD from the first cells) to the grouped data;
# the aggregate cells that follow rely on the new binding.
x = tab3.groupby('groupID')
# Dictionary form: column name -> aggregate function name.
x.agg({'amount':'sum'}).show()


In [None]:
# Function form: several aggregates over the same grouping in one agg call.
from pyspark.sql import functions as F
x.agg(F.sum('amount'), F.max('amount')).show()


In [None]:
# Expression form: the aggregate is written as a SQL-style string, with 'as total' naming the result column.
from pyspark.sql.functions import expr
x.agg(expr('sum(amount) as total')).show()


## LAB: ## 
### Join products and categories together displaying only the product and category ID's and names, sort by categoryid and productid.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Make sure not to show the common column twice
<br>
Select with python style makes it easier to distinguish which columns you want from a join
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
# Single-letter aliases for the join below.
# The cells above only define cat1..cat4 (there is no plain `cat`); cat4 is the
# categories DataFrame read from JSON alongside the JSON products.
# NOTE(review): assumes cat4 has lowercase categoryid/categoryname columns like prod — verify.
c = cat4
p = prod
display(c.join(p, c.categoryid == p.categoryid).select(c.categoryid, c.categoryname, p.productid, p.productname).orderBy('categoryid', 'productid'))
```
</p>
</details>

### Sometimes you want to just rename a column so here are two ways to accomplish that.

In [None]:
# Way 1: rename a single column by name.
display(p.withColumnRenamed('unitprice','listprice'))
# Way 2: rebuild the DataFrame from a complete replacement list of column names.
cols = p.columns # get a list of all the current column names
# Find the column's position by name instead of hard-coding it: prod was last loaded
# from JSON without a schema, so the inferred column order decides what sits at any
# fixed index, and a literal position could rename the wrong column.
cols[cols.index('unitprice')] = 'listprice' # replace that position with the new name
p1 = p.toDF(*cols) # create a new dataframe from the original with a list of column names
display(p1)


## HOMEWORK: ## 
### Join Orders, OrderDetails, and Products together. Find the sales total for each category listed in descending order by sales.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Load each file into a dataframe and give them single letter aliases for simplicity
<br>
Join products and order details together on productid
<br>
Join that to orders on orderid
<br>
Create a calculated column to get the line total for each order detail
<br>
Group by categoryID and calculate the sum of the line totals 
<br>
Sort on the calculated total
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
# Lowercase products schema, identical to the one used for the JSON read earlier in the notebook.
# Only the schema is provided here; the joins and aggregation are left to the reader.
prodSchema = StructType([
    StructField('productid', IntegerType()), 
    StructField('productname', StringType()),
    StructField('supplierid', IntegerType()), 
    StructField('categoryid', IntegerType()), 
    StructField('quantityperunit', StringType()), 
    StructField('unitprice', FloatType()), 
    StructField('unitsinstock', IntegerType()), 
    StructField('unitsonorder', IntegerType()), 
    StructField('reorderlevel', IntegerType()), 
    StructField('discontinued', IntegerType())
])


```
</p>
</details>
