### Set up the Spark environment.

In [None]:
import subprocess

# Ask the gcloud CLI which project is currently configured.
# check=True makes a failing gcloud call raise CalledProcessError here instead of
# silently leaving PROJECT_ID empty.
result = subprocess.run(
    ["gcloud", "config", "get-value", "project"],
    stdout=subprocess.PIPE,
    check=True,
)
PROJECT_ID = result.stdout.decode("utf-8").strip()

# An empty value would produce bucket paths such as 'gs://-data/datasets' in the
# cells below, so stop now with a clear message.
if not PROJECT_ID:
    raise RuntimeError(
        "gcloud returned no project id; run `gcloud config set project <PROJECT_ID>` and retry"
    )

# Print the output
print(PROJECT_ID)

In [None]:
# Both bucket names are derived from the project id obtained in the previous cell:
# DEST_PATH is the '-output' bucket, SOURCE_PATH the datasets folder of the '-data' bucket.
DEST_PATH = f'gs://{PROJECT_ID}-output'
SOURCE_PATH = f'gs://{PROJECT_ID}-data/datasets'

print(SOURCE_PATH, DEST_PATH)

### Turn a simple RDD into a DataFrame. 

In [None]:
# A tiny RDD of (number, text) tuples.
x = sc.parallelize([
    (1, 'alpha'),
    (2, 'beta'),
])

# No schema is supplied, so Spark works out the column types itself.
x0 = spark.createDataFrame(x)
x0.show()


### Give the DataFrame meaningful column names.

In [None]:
# A list of strings as the schema supplies column names only.
x1 = spark.createDataFrame(x, ['ID', 'Name'])
x1.show()

# print() displays the DataFrame object itself rather than its rows.
print(x1)


### Give a DataFrame a schema with column names and data types.

In [None]:
# A 'name:type, name:type' string supplies both column names and data types.
x2 = spark.createDataFrame(x, schema='ID:int, Name:string')
x2.show()

# print() displays the DataFrame object itself rather than its rows.
print(x2)


### Create a schema object.

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# The same ID/Name layout as before, expressed as a reusable schema object:
# one StructField per column, wrapped in a StructType.
schema1 = StructType(
    [StructField('ID', IntegerType()), StructField('Name', StringType())]
)

x3 = spark.createDataFrame(x, schema1)
x3.show()
print(x3)


### The built-in toDF method does the same thing.

In [None]:
# toDF on the RDD accepts the same four schema forms shown above for createDataFrame:
# nothing, a list of names, a 'name:type' string, or a StructType object.
x.toDF().printSchema()
x.toDF(schema=['ID', 'Name']).printSchema()
x.toDF(schema='ID:int, Name:string').printSchema()
x.toDF(schema1).printSchema()


## LAB: ## 
### Use the regions and territories RDDs from the previous lab and convert them into DataFrames with meaningful schemas.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use sc.textFile to read the files
<br>
Use map functions to split and convert the data
<br>
Use spark.createDataFrame and toDF to convert RDD into DataFrames
<br>
<br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
# Regions: read the text file, split each line on commas, convert the id to int.
regions = (
    sc.textFile(f'{SOURCE_PATH}/northwind/CSV/regions')
      .map(lambda line : line.split(','))
      .map(lambda fields : (int(fields[0]), fields[1]))
)
regionsdf = spark.createDataFrame(regions, schema='RegionID:int, RegionName:string')
regionsdf.show()

# Territories: same approach, with a third integer column, converted with toDF.
territories = (
    sc.textFile(f'{SOURCE_PATH}/northwind/CSV/territories')
      .map(lambda line : line.split(','))
      .map(lambda fields : (int(fields[0]), fields[1], int(fields[2])))
)
territoriesdf = territories.toDF(schema='TerritoryID:int, TerritoryName:string, RegionID: int')
territoriesdf.show()
```
</p>
</details>

### Examples of reading a CSV directly into a DataFrame.

In [None]:
filename = f'{SOURCE_PATH}/northwind/CSV/categories'

# Generic loader: the file format and the CSV settings are all passed as keyword arguments.
# header=False means the first line is read as data, not as column names.
cat1 = spark.read.load(
    filename,
    format='csv',
    sep=',',
    inferSchema=True,
    header=False,
)
cat1.printSchema()


In [None]:
# header=True uses the first line of the file as the column names, so it must be
# pointed at the copy of the data that has a header row (CSVHeaders, the folder the
# lab solutions read with header=True). Reading the CSV/ copy this way would turn the
# first data row into column names.
# NOTE(review): assumes CSV/categories has no header row and CSVHeaders/categories does - confirm against the bucket.
# filename is reassigned on purpose: the following cells also read it with header=True.
filename = f'{SOURCE_PATH}/northwind/CSVHeaders/categories'
cat1 = spark.read.load(filename, format = 'csv', sep = ',', inferSchema = True, header = True)
cat1.printSchema()


### There are several alternate syntaxes which can be confusing, but since you will encounter them, you need to learn to recognize the different options
`option` and `options` allow you to pass parameters in different ways. Note that `'true'` is quoted and lowercase because it is a Java value, but you could also pass it as the Python value `True`.

In [None]:
# Form 1: one .option(key, value) call per setting, values given as quoted strings.
cat2 = (
    spark.read
         .format('csv')
         .option('header','true')
         .option('inferSchema','true')
         .load(filename)
)
cat2.printSchema()

# Form 2: a single .options(...) call with keyword arguments; a Python bool and a
# quoted string are both used here.
cat2 = (
    spark.read
         .format('csv')
         .options(header=True, inferSchema='true')
         .load(filename)
)
cat2.printSchema()


### If there is a top-level read function for the file type you want, that is the cleanest option; pass the settings as named parameters. Not all formats have one, and legacy code written before these functions existed may use the older syntax.

In [None]:
# Format-specific reader: the settings are plain keyword arguments.
cat3 = spark.read.csv(
    filename,
    header=True,
    inferSchema=True,
)
cat3.printSchema()
cat3.show()


### As the tables get more complex, there is a Jupyter command that will show the tables in a prettier format.

In [None]:
# show() only prints a plain-text grid. Converting to a pandas DataFrame and leaving
# it as the last expression of the cell lets Jupyter render it as an HTML table.
# toPandas() collects the rows to the driver, so limit(20) caps the result at the
# same number of rows show() prints by default.
# NOTE(review): requires pandas on the driver - confirm it is installed in this environment.
cat3.limit(20).toPandas()

## LAB: ## 
### Load the products table using any of the spark.read methods.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use spark.read.csv
<br>
Make sure to read the version that has headers if you want to infer schema
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
# Read the products file from the CSVHeaders folder, letting Spark infer the column types.
prod1 = spark.read.csv(
    f'{SOURCE_PATH}/northwind/CSVHeaders/products',
    header=True,
    inferSchema=True,
)
prod1.printSchema()
prod1.show()
```
</p>
</details>

### Using a schema is a good idea for performance if you know what it is. Usually you can infer schema during development and use it as a helper to build the schema to use for production.

In [None]:

# Explicit schema for the products CSV: (column name, Spark type) pairs, in file order,
# turned into StructFields.
prodSchema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in [
        ('ProductID', IntegerType()),
        ('ProductName', StringType()),
        ('SupplierID', IntegerType()),
        ('CategoryID', IntegerType()),
        ('QuantityPerUnit', StringType()),
        ('UnitPrice', FloatType()),
        ('UnitsInStock', IntegerType()),
        ('UnitsOnOrder', IntegerType()),
        ('ReorderLevel', IntegerType()),
        ('Discontinued', IntegerType()),
    ]
])

# Passing schema= replaces inferSchema; header=True is still given for this file.
prod2 = spark.read.csv(
    f'{SOURCE_PATH}/northwind/CSVHeaders/products',
    header=True,
    schema=prodSchema,
)
print(prod2)
prod2.show()



### Convert a DataFrame into a JSON string.

In [None]:
# toJSON() produces the rows as JSON strings; take(10) returns up to the first ten as a list.
print(
    cat2.toJSON().take(10)
)

### JSON is another top level supported format.

In [None]:
# spark.read.json is the format-specific reader for JSON data.
cat4 = spark.read.json(
    f'{SOURCE_PATH}/northwind/JSON/categories'
)
cat4.show()


### You can also use schemas but be careful of case.

In [None]:
# First read: no schema supplied, so Spark infers one from the JSON data.
prod = spark.read.json(f'{SOURCE_PATH}/northwind/JSON/products')
prod.show()

# Explicit schema for the JSON products data. The field names are written in lower
# case here, unlike the mixed-case names used for the CSV schema earlier.
# Note: this rebinds the name prodSchema.
prodSchema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in [
        ('productid', IntegerType()),
        ('productname', StringType()),
        ('supplierid', IntegerType()),
        ('categoryid', IntegerType()),
        ('quantityperunit', StringType()),
        ('unitprice', FloatType()),
        ('unitsinstock', IntegerType()),
        ('unitsonorder', IntegerType()),
        ('reorderlevel', IntegerType()),
        ('discontinued', IntegerType()),
    ]
])

# Second read: same data, this time with the explicit schema.
prod = spark.read.json(
    f'{SOURCE_PATH}/northwind/JSON/products',
    schema=prodSchema,
)
prod.show()


### You may also see the older style syntax.

In [None]:
# Generic format(...).load(...) form of the JSON read; no schema is passed here,
# so prod is rebound to a DataFrame with an inferred schema.
prod = (
    spark.read
         .format('json')
         .load(f'{SOURCE_PATH}/northwind/JSON/products')
)
prod.show()


### We can easily save this to a Hive table

In [None]:
# Save prod as a table named 'products', using the 'overwrite' save mode.
(
    prod.write
        .mode('overwrite')
        .saveAsTable('products')
)


### It's also easy to load data from a saved Hive table

In [None]:
# Read the saved 'products' table back into a DataFrame (this rebinds prod2).
prod2 = spark.read.table(
    'products'
)
prod2.show()

### Choose particular columns from a DataFrame.
You can use quoted strings for the column names

In [None]:
# Columns are named with quoted strings.
(
    prod
    .select('productid', 'productname', 'unitprice')
    .show()
)

### Or you can use a pythonic syntax using the DataFrame name and field name

In [None]:
# Columns are referenced as attributes of the DataFrame.
(
    prod
    .select(prod.productid, prod.productname, prod.unitprice)
    .show()
)

### Case is ignored if you use quoted strings but not if you use python syntax

In [None]:
# 'Productid' is deliberately spelled with a capital P, unlike the column name used in the cells above.
(
    prod
    .select('Productid', 'productname', 'unitprice')
    .show()
)


In [None]:
# Attribute access is case-sensitive: prod.Productid does not match the column
# 'productid' and raises AttributeError. The failure is the point of this cell, so it
# is caught and printed; that way "Run All" continues past the demonstration.
try:
    prod.select(prod.Productid, prod.productname, prod.unitprice).show()
except AttributeError as err:
    print(f'AttributeError: {err}')

### Distinct is a method after the select method chooses the columns

In [None]:
# Select the column first, then call distinct() on the result.
(
    prod
    .select('CategoryID')
    .distinct()
    .show()
)


### Sort a DataFrame. The sort and orderBy methods are different aliases for the exact same method.

In [None]:
# sort, with the column given as an attribute reference.
prod.sort(prod.unitprice).show()

# orderBy, with the column given as a string and ascending=False passed.
prod.orderBy('unitprice', ascending=False).show()

# Sorting a projection: select three columns, then order by one of them.
(
    prod
    .select('productid', 'productname', 'unitprice')
    .orderBy('unitprice')
    .show()
)

### Create a new DataFrame with a new calculated column added.

In [None]:
# withColumn returns a new DataFrame with the extra column 'value'
# (unit price multiplied by units in stock).
prod2 = prod.withColumn(
    'value',
    prod.unitprice * prod.unitsinstock,
)
prod2.show()

### Remove an unwanted column from a DataFrame.

In [None]:
# drop returns a DataFrame without the named column; prod2 is rebound to that result.
prod2 = prod2.drop(
    'quantityperunit'
)
prod2.show()


### The filter and where methods can both be used and have alternative ways to represent the condition.

In [None]:
# Short alias used by the following cells.
p = prod

# filter: the same condition as a column expression, then as a quoted SQL-style string.
p.filter(p.unitprice > 100).show()
p.filter('unitprice > 100').show()

# where: equality is == in a Python column expression ...
p.where(p.categoryid == 2).show()
# ... and a single = inside a quoted SQL-style string.
p.where('categoryid = 2').show()



### More complex conditions

In [None]:
# Three ways of writing the same 50-to-100 price range condition.

# 1. SQL-style string with two comparisons joined by 'and'.
p.where('unitprice >= 50 and unitprice <= 100').show()

# 2. SQL-style string using 'between'.
p.where('unitprice between 50 and 100').show()

# 3. Column expressions: each comparison is parenthesised and the two are combined with &.
p.where(
    (p.unitprice >=50) & (p.unitprice <= 100)
).show()


## LAB: ## 
### Find all the products in category 2 with fewer units in stock than units on order 
### Display only productid, productname, unitsinstock, unitsonorder and unitprice
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use the where or filter method. It's probably easier to use a quoted SQL style syntax
<br>
Use select to get the columns you want to see
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
# Filter with a SQL-style condition, then keep only the requested columns.
(
    p
    .where('unitsinstock < unitsonorder and categoryid = 2')
    .select('productid','productname', 'unitsinstock', 'unitsonorder', 'unitprice')
    .show()
)
```
</p>
</details>

### JOINs work as expected.

In [None]:
# Two small tables; tab2.parentID refers to tab1.ID.
tab1 = sc.parallelize([
    (1, 'Alpha'),
    (2, 'Beta'),
    (3, 'Delta'),
]).toDF('ID:int, code:string')

tab2 = sc.parallelize([
    (100, 'One', 1),
    (101, 'Two', 2),
    (102, 'Three', 1),
    (103, 'Four', 4),
]).toDF('ID:int, name:string, parentID:int')

# The same join condition with no join type given, then 'left', 'right' and 'full'.
tab1.join(tab2, tab1.ID == tab2.parentID).show()
tab1.join(tab2, tab1.ID == tab2.parentID, how='left').show()
tab1.join(tab2, tab1.ID == tab2.parentID, how='right').show()
tab1.join(tab2, tab1.ID == tab2.parentID, how='full').show()


###  Examples of aggregate functions.

In [None]:
# Five (groupID, amount) rows spread over two groups.
tab3 = sc.parallelize([
    (1, 10),
    (1, 20),
    (1, 30),
    (2, 40),
    (2,50),
]).toDF('groupID:int, amount:int')

# Aggregate methods called directly on the grouped data.
tab3.groupby('groupID').max().show()
tab3.groupby('groupID').sum().show()


In [None]:
# Keep the grouped object so the following cells can reuse it.
# Note: this rebinds x, which earlier in the notebook held an RDD.
x = tab3.groupby('groupID')

# agg with a dict mapping a column name to an aggregate function name.
x.agg(
    {'amount':'sum'}
).show()


In [None]:
from pyspark.sql import functions as F

# agg with function objects from pyspark.sql.functions; several can be passed at once.
x.agg(
    F.sum('amount'),
    F.max('amount'),
).show()


In [None]:
from pyspark.sql.functions import expr

# agg with a SQL expression string; 'as total' gives the result column an alias.
x.agg(
    expr('sum(amount) as total')
).show()


## LAB: ## 
### Join products and categories together, displaying only the product and category IDs and names, sorted by categoryid and productid
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Make sure not to show the common column twice
<br>
Select with python style makes it easier to distinguish which columns you want from a join
<br><br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
# Categories come from the CSV copy with headers, products from JSON.
filename = f'{SOURCE_PATH}/northwind/CSVHeaders/categories'
c = spark.read.load(
    filename,
    format='csv',
    sep=',',
    inferSchema=True,
    header=True,
)
p = spark.read.json(f'{SOURCE_PATH}/northwind/JSON/products')
c.show()
p.show()

# Join on the category id, take the id and name columns from each side using
# DataFrame-qualified references, then sort by category and product.
j = (
    c.join(p, c.CategoryID == p.categoryid)
     .select(c.CategoryID, c.CategoryName, p.productid, p.productname)
     .orderBy('CategoryID', 'productid')
)
j.show()
```
</p>
</details>

### Sometimes you want to just rename a column so here are two ways to accomplish that

In [None]:
# Way 1: withColumnRenamed returns a new DataFrame with the one column renamed.
p.withColumnRenamed('unitprice','listprice').show()

# Way 2: rebuild the DataFrame with a full list of column names.
cols = p.columns # get a list of all the current column names
# Look the column up by name rather than by a hard-coded position: the position of
# 'unitprice' depends on how p was read (a JSON read with an inferred schema returns
# the columns in alphabetical order, so a fixed index can rename the wrong column).
# index() raises ValueError if the column is missing.
cols[cols.index('unitprice')] = 'listprice'
p1 = p.toDF(*cols) # create a new dataframe from the original with a list of column names
p1.show()
