<a href="https://colab.research.google.com/github/roitraining/jpmc_hadoop/blob/master/notebooks/Ch05_DataFrames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Set up the Spark environment.

In [0]:
import sys

# Root folder of the class material. Note the trailing slash: paths built
# from it by plain string formatting rely on it.
rootpath = '/class/'
# Dataset folder under rootpath; this value also ends with a slash.
datapath = f'{rootpath}datasets/'
# Make modules stored under rootpath importable; pyspark_helpers is imported
# on the next line.
sys.path.append(rootpath)
# Star import: initspark is used unqualified below. Later cells also use
# names such as StructType and display before importing them -- presumably
# they come from this module too; TODO confirm.
from pyspark_helpers import *
# initspark() returns three values, bound here to sc, spark and conf.
sc, spark, conf = initspark()


### Turn a simple RDD into a DataFrame. 

In [0]:
# Sample data: an RDD built from two (number, text) tuples.
x = sc.parallelize([(1,'alpha'),(2,'beta')])
# No schema argument is given, so column names and types are left to Spark.
x0 = spark.createDataFrame(x)
x0.show()
# Print whatever collect() returns for the DataFrame.
print(x0.collect())
# Convert each collected row to a plain tuple. As the last expression in the
# cell, the resulting list is the cell's output value.
list(map(tuple, x0.collect()))

### Give the DataFrame meaningful column names.

In [0]:
# Same RDD as before, this time with a list of column names as the schema.
x1 = spark.createDataFrame(x, schema=['ID','Name'])
#x1 = spark.createDataFrame(x, schema='ID, Name') # Does not work
x1.show()
# Print the DataFrame object itself, then its collected rows.
print(x1)
print(x1.collect())

### Give a DataFrame a schema with column names and data types.

In [0]:
# Schema given as a 'name:type' string, supplying both names and data types.
x2 = spark.createDataFrame(x, schema='ID:int, Name:string')
x2.show()
print(x2)

# Work on the DataFrame's underlying RDD: each row is read by field name and
# turned into a tuple of (ID * 10, upper-cased Name). The lambda parameter x
# shadows the global x only inside the lambda.
x3 = x2.rdd.map(lambda x : (x.ID * 10, x.Name.upper()))

# Build a DataFrame from the transformed RDD without a schema...
x4 = spark.createDataFrame(x3)
x4.show()

# ...and again with a schema string, using the RDD's toDF method.
x5 = x3.toDF(schema='ID:int, Name:string')
x5.show()

### You can also use a StructType object to specify the schema.

In [0]:
# Schema object: an integer ID field and a string Name field.
schema = StructType([
    StructField('ID', IntegerType()), 
    StructField('Name', StringType())
])
# Apply the schema object through createDataFrame...
x6 = spark.createDataFrame(x, schema=schema)
display(x6)

# ...and through the RDD's toDF method.
x7 = x.toDF(schema=schema)
display(x7)

### Load a text file into an RDD and clean it up as before.

In [0]:
# Read the regions file from HDFS as an RDD of text lines.
rddRegions = sc.textFile('hdfs://localhost:9000/regions')
# Split every line on commas, then keep the first field as an int and the
# second field unchanged.
rddRegions = (
    rddRegions
    .map(lambda line: line.split(','))
    .map(lambda fields: (int(fields[0]), fields[1]))
)
print(rddRegions.collect())

### Turn the RDD into a DataFrame.

In [0]:
# Convert the cleaned-up RDD to a DataFrame, naming and typing both columns
# with a schema string.
dfRegions = rddRegions.toDF('RegionID: int, RegionName:string')
dfRegions.show()
# Print the DataFrame object, then its schema.
print(dfRegions)
dfRegions.printSchema()

### Better yet, just use the built-in CSV reader to read a file directly into a DataFrame and skip RDDs altogether.

In [0]:
# Path of the territories data under the CSV folder.
territoriesFile = f'{datapath}/northwind/CSV/territories'
# Read with header=True and inferSchema=True.
# NOTE(review): the StructType cell further down reads this same path with
# header=False -- confirm whether the file really has a header row.
df = spark.read.csv(territoriesFile, header=True, inferSchema=True)
# this doesn't work though
# df = spark.read.csv(filename, schema='TerritoryID:int, TerritoryName:string, RegionID:int')
display(df)
print(df)

### To read a CSV file with a specific schema, pass a StructType; the `name:type` string that works with createDataFrame does not work with read.csv.

In [0]:
# Explicit schema for the territories file: two integer columns around a
# string column.
schTerritories = StructType([
    StructField('TerritoryID', IntegerType()), 
    StructField('TerritoryName', StringType()), 
    StructField('RegionID', IntegerType())
])
# Read the same territories path as before, this time with the explicit
# schema and header=False.
dfTerritories = spark.read.csv(territoriesFile, schema = schTerritories, header=False)
dfTerritories.printSchema()
display(dfTerritories)

### ***LAB:*** Read regions from the HDFS folder using read.csv and territories from the /home/student/ROI/Spark/datasets/northwind/CSVHeaders folder. Also read shippers from the CSVHeaders folder into `dfShippers`; the JSON cell that follows uses it.


In [0]:
# Path of the shippers data under the CSVHeaders folder.
shippersFile= f'{datapath}/northwind/CSVHeaders/shippers'
# NOTE: the lines below that start with a long run of '#' are commented out,
# so this cell only defines shippersFile. dfShippers, which the toJSON cell
# after this one uses, is not created until that code is written or
# uncommented.
##############################################################################################schShippers = StructType([
##############################################################################################    StructField('ShipperID', IntegerType()), 
##############################################################################################    StructField('CompanyName', StringType()), 
##############################################################################################    StructField('Phone', StringType())
##############################################################################################])
##############################################################################################dfShippers = spark.read.csv(shippersFile, schema = schShippers, header=True)
##############################################################################################dfShippers.printSchema()
##############################################################################################display(dfShippers)

### Convert a DataFrame into a JSON string.

In [0]:
# Requires dfShippers, which is only defined by the (commented-out) code in
# the lab cell above; without it this line raises NameError.
# Prints up to 10 of the rows rendered by toJSON().
print (dfShippers.toJSON().take(10))

### Read a JSON file into a DataFrame.

In [0]:
# Read the products data from the JSON folder; no schema is passed.
dfProducts = spark.read.json(f'{datapath}/northwind/JSON/products')
dfProducts.printSchema()
display(dfProducts)

### Here is an ORC example.

In [0]:
# Read the order details data from the ORC folder.
dfOrderDetails = spark.read.orc(f'{datapath}/northwind/ORC/orderdetails')
dfOrderDetails.printSchema()
display(dfOrderDetails)

### Here is a Parquet example.

In [0]:
# Read the orders data from the PARQUET folder.
dfOrders = spark.read.parquet(f'{datapath}/northwind/PARQUET/orders')
dfOrders.printSchema()
display(dfOrders)

### Choose particular columns from a DataFrame and create a calculated column.

In [0]:
# Python style
# Columns are referenced as attributes of dfProducts, and the new 'value'
# column is their product written as a Python expression.
df1 = dfProducts.select(dfProducts.unitprice, dfProducts.unitsinstock).withColumn('value', dfProducts.unitprice * dfProducts.unitsinstock)
display(df1)

# SQL style
# Columns are referenced by name, and the calculation is a string handed to
# expr(). df1 is rebound, replacing the result above.
from pyspark.sql.functions import expr
df1 = dfProducts.select('unitprice', 'unitsinstock').withColumn('value', expr('unitprice * unitsinstock'))
display(df1)



### Try a bunch of different transformations.

In [0]:
#display(dfProducts.drop('discontinued', 'reorderlevel'))
#display(dfProducts.withColumnRenamed('discontinued', 'gone'))
# display(dfProducts.select('supplierid').distinct())
# print(dfProducts.select('supplierid').distinct().count())


### Sort a DataFrame. The sort and orderBy methods are different aliases for the exact same method.

In [0]:
#display(dfProducts.sort(dfProducts.unitprice))
#display(dfProducts.sort(dfProducts.unitprice, ascending = False))
#display(dfProducts.orderBy('unitprice', ascending = False))


### Create a new DataFrame with a new calculated column added.

In [0]:
# New DataFrame with an extra 'total' column computed as unitprice * quantity;
# the result is bound to a new name and dfOrderDetails is not rebound.
dfOrderDetails2 = dfOrderDetails.withColumn('total', dfOrderDetails.unitprice * dfOrderDetails.quantity)
display(dfOrderDetails2)

### The filter and where methods can both be used and have alternative ways to represent the condition.

In [0]:
# Short alias for the products DataFrame; this rebinds the global name df.
df = dfProducts
#display(df.filter(df.unitprice < 100))
# filter with the condition written as a string.
print(df.filter('unitprice < 100').count())
# The same equality test twice: as a column expression (==) and as a
# string condition (=).
print(df.where(df.unitprice == 10).count())
print(df.where('unitprice = 10').count())

# A range test twice: two column comparisons combined with &, each wrapped in
# parentheses, and the string form using BETWEEN.
print(df.where((df.unitprice >= 10) & (df.unitprice <=20)).count())
print(df.where('unitprice BETWEEN 10 and 20').count())
# print(df.where(df.Amount < 4000).count())



### ***LAB:*** Using dfOrderDetails


*   Find out how many orders have a line total more than 1000
*   Find the ten largest line totals



In [0]:
#display(dfOrderDetails)
##############################################################################################df = dfOrderDetails
##############################################################################################df1 = df.withColumn('total', expr('unitprice * quantity'))
##############################################################################################df2 = df1.where('total > 1000')
##############################################################################################print (df2.count())

##############################################################################################display(df1.orderBy('total', ascending = False).limit(10))

### JOINs work as expected.

In [0]:
# Parent rows (ID, code) and child rows (childID, name, parentID). Parent 3
# has no child and child 103 points at parent 4, which does not exist, so the
# outer joins below have unmatched rows on both sides.
parents = [(1, 'Alpha'), (2, 'Beta'), (3, 'Delta')]
children = [(100, 'One', 1), (101, 'Two', 2), (102, 'Three', 1), (103, 'Four', 4)]
tab1 = sc.parallelize(parents).toDF('ID:int, code:string')
tab2 = sc.parallelize(children).toDF('childID:int, name:string, parentID:int')
display(tab1)
display(tab2)
# Join with no join type given, then the three outer variants in turn.
display(tab1.join(tab2, tab1.ID == tab2.parentID))
for how in ('left', 'right', 'full'):
    display(tab1.join(tab2, tab1.ID == tab2.parentID, how))


### Examples of aggregate functions.

In [0]:
# Five (groupID, amount) rows: three in group 1 and two in group 2.
tab3 = sc.parallelize([(1, 10), (1, 20), (1, 30), (2, 40), (2,50)]).toDF('groupID:int, amount:int')
display(tab3)
# Shortcut aggregates called directly on the grouped data.
tab3.groupby('groupID').max().show()
tab3.groupby('groupID').sum().show()
# Keep the grouped object under its own name. The global x is the sample RDD
# that the first cells of this notebook pass to createDataFrame; rebinding it
# here would break re-running those cells afterwards.
grouped = tab3.groupby('groupID')

from pyspark.sql import functions as F
#from pyspark.sql.functions import sum, max
# print(dir(F))
# Several aggregates in one pass through agg(); going through the F alias
# leaves the Python built-ins sum and max untouched.
grouped.agg(F.sum('amount'), F.max('amount')).show()

### Examples of reading a CSV directly into a DataFrame using different styles.

In [0]:
# Path of the credit card CSV file; the cells that follow reuse filename.
filename = f'{datapath}/finance/CreditCard.csv'
# Style 1: generic load() with the format and CSV options as keyword
# arguments.
dfCreditCard = spark.read.load(filename, format = 'csv', sep = ',', inferSchema = True, header = True)
dfCreditCard.printSchema()

In [0]:
# Style 2: builder chain -- format(), one option() call per setting (values
# given as strings), then load(). Uses filename from the previous cell.
df4 = spark.read.format('csv').option('header','true').option('inferSchema','true').load(filename)
df4.printSchema()

In [0]:
# Style 3: the csv() reader with keyword options. df4 is rebound, replacing
# the result of the previous cell.
df4 = spark.read.csv(filename, header = True, inferSchema = True)
df4.printSchema()

In [0]:
# Show the DataFrame read from the credit card CSV file.
display(df4)

### Another way of changing column names.

In [0]:
# Take the current column names and overwrite the first one.
# NOTE(review): assumes the first column is the combined city/country
# column -- TODO confirm against the CSV header.
cols = df4.columns
cols[0] = 'CityCountry'
# toDF receives the whole list of names unpacked as separate arguments, one
# per column.
df4 = df4.toDF(*cols)
df4.printSchema()

### ***LAB:*** Read the Products file from the JSON folder and categories from the CSVHeaders folder, then join them displaying just the product and category IDs and names, and sort by categoryID then productID. 


**Hint:** Drop the ambiguous column after the join.
Be aware of case sensitivity.

In [0]:
##############################################################################################dfCategories = spark.read.csv(f'{datapath}/northwind/CSVHeaders/categories', inferSchema = True, header = True)
##############################################################################################display(dfCategories)
##############################################################################################dfProdCat = dfProducts.join(dfCategories, dfProducts.categoryid == dfCategories.CategoryID).drop('categoryid')
##############################################################################################display(dfProdCat)



### Apply a custom UDF to columns to separate the City and Country and convert the Date into a date datatype.

In [0]:
from pyspark.sql.functions import udf, expr
from pyspark.sql.types import *
from pyspark.sql.functions import to_date
import datetime

def city(x):
    """Return the city part of a 'City, Country' string.

    The city is the text before the first comma, with surrounding whitespace
    removed. A value without a comma is returned whole (stripped) rather than
    losing its last character. None is passed through as None so that a null
    input does not raise.
    """
    if x is None:
        return None
    # partition never fails: with no comma the whole string lands in `head`.
    head, _, _ = x.partition(',')
    return head.strip()

# Wrap city with udf(), declaring StringType as the return type; cityUDF is
# applied to the CityCountry column further down.
cityUDF = udf(city, StringType())

def country(x):
    """Return the country part of a 'City, Country' string.

    The country is the text after the first comma, with surrounding
    whitespace removed, so 'Delhi, India' gives 'India' rather than ' India'.
    Returns None when the input is None or contains no comma, because in both
    cases there is no country to report.
    """
    if x is None:
        return None
    _, sep, tail = x.partition(',')
    if not sep:
        # No comma: the value holds no country part.
        return None
    return tail.strip()

@udf(StringType())
def gender(x):
    """Expand a one-letter gender code into a label.

    'M' gives 'Male' and 'F' gives 'Female'. Any other value, including
    None, gives None, so missing or unexpected codes are not reported as
    'Female'.
    """
    return {'M': 'Male', 'F': 'Female'}.get(x)


# Rename the combined column first so that the name 'City' is free for the
# extracted value.
df = dfCreditCard.withColumnRenamed('City', 'CityCountry')
# Add City and Country from the combined column, replace Date with the result
# of to_date using the 'dd-MMM-yy' pattern, replace Gender with the output of
# the gender UDF, then drop the combined column.
df5 = (
    df
    .withColumn('City', cityUDF(df['CityCountry']))
    .withColumn('Country', udf(country, StringType())(df['CityCountry']))
    .withColumn('Date', to_date(df['Date'], 'dd-MMM-yy'))
    .withColumn('Gender', gender(df['Gender']))
    .drop(df['CityCountry'])
)
display(df5)

### DataFrames can be written to a variety of file formats. Here we are writing it to JSON.

In [0]:
# Write df5 as JSON under rootpath, using the 'overwrite' save mode.
# NOTE(review): rootpath already ends with '/', so the formatted path contains
# '//' -- presumably harmless, but confirm.
df5.write.mode('overwrite').json(f'{rootpath}/CreditCard.json')
print('Done')