<a href="https://colab.research.google.com/github/roitraining/SparkforDataEngineers/blob/Development/Ch02_DataFrames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Set up the Spark environment.

In [None]:
import sys

# Root folder of the course material. The trailing slash matters because
# datapath is built from it by plain string interpolation.
rootpath = '/home/student/ROI/Spark/'
datapath = f'{rootpath}datasets/'
# Put the course folder on the import path before importing the helper module.
sys.path.append(rootpath)
from pyspark_helpers import *
# initspark() is provided by pyspark_helpers. Its three results are used below
# as a SparkContext (sc), a SparkSession (spark) and, presumably, the Spark
# configuration (conf) -- TODO confirm against pyspark_helpers.
sc, spark, conf = initspark()


### Turn a simple RDD into a DataFrame. 

In [None]:
# Two (number, word) tuples distributed as an RDD.
x = sc.parallelize([(1, 'alpha'), (2, 'beta')])
# No schema is supplied here, so Spark picks the column names itself.
x0 = spark.createDataFrame(x)
x0.show()
# collect() returns Row objects ...
print(x0.collect())
# ... which can be turned back into plain tuples.
[tuple(row) for row in x0.collect()]

### Give the DataFrame meaningful column names.

In [None]:
# A list of strings names the columns; the data types are still inferred.
x1 = spark.createDataFrame(x, schema=['ID','Name'])
#x1 = spark.createDataFrame(x, schema='ID, Name') # Does not work
x1.show()
# Printing the DataFrame itself shows its column names and types, not its rows.
print(x1)
print(x1.collect())

### Give a DataFrame a schema with column names and data types.

In [None]:
# A 'name:type' string sets both the column names and the data types.
x2 = spark.createDataFrame(x, schema='ID:int, Name:string')
x2.show()
print(x2)

# Drop to the underlying RDD of Rows; fields are reachable by attribute name.
x3 = x2.rdd.map(lambda row: (row.ID * 10, row.Name.upper()))

# The mapped RDD holds plain tuples again, so this DataFrame is built
# without the ID/Name schema.
x4 = spark.createDataFrame(x3)
x4.show()

# toDF on the RDD takes the same kind of schema string as createDataFrame.
x5 = x3.toDF(schema='ID:int, Name:string')
x5.show()

### You could also use a StructType object to specify the schema.

In [None]:
# Same two-column schema, assembled field by field with the builder API.
schema = (
    StructType()
    .add('ID', IntegerType())
    .add('Name', StringType())
)
# The schema object works with createDataFrame ...
x6 = spark.createDataFrame(x, schema=schema)
display(x6)

# ... and with the RDD's own toDF method.
x7 = x.toDF(schema=schema)
display(x7)

### Load a text file into an RDD and clean it up as before.

In [None]:
filename = f'{datapath}/finance/CreditCard.csv'
# Every line of the file becomes one string element of the RDD.
cc = sc.textFile(filename)
# Remember the first line (the column header) and drop every line equal to it.
first = cc.first()
cc = cc.filter(lambda line: line != first)
cc.take(10)


In [None]:
import datetime
# Naive comma split: each line becomes a list of raw string fields.
cc = cc.map(lambda line: line.split(','))
cc.take(10)

In [None]:
def _to_record(fields):
    """Turn one list of split string fields into a typed 7-tuple."""
    # Field 0 loses its first character and field 1 its first and last
    # characters -- presumably the quote/space left over from splitting a
    # quoted "City, Country" value on the comma (verify against the file).
    city_part = fields[0][1:]
    country_part = fields[1][1:-1]
    # Field 2 is text such as 29-Oct-14; keep only the date portion.
    tran_date = datetime.datetime.strptime(fields[2], '%d-%b-%y').date()
    return (city_part, country_part, tran_date,
            fields[3], fields[4], fields[5], float(fields[6]))

cc = cc.map(_to_record)
print(cc.collect())

### Turn the RDD into a DataFrame.

In [None]:
# Name and type the seven tuple positions produced by the cleanup step.
# Later cells in this notebook use this credit-card DataFrame as `df`.
df = cc.toDF('City: string, Country: string, Date: date, CardType: string, TranType: string, Gender: string, Amount: double')
df.show()
print(df)
df.printSchema()

### Better yet, just use the built-in CSV reader to read a file directly into a DataFrame and skip RDDs altogether.

In [None]:
shipperfile = f'{datapath}/northwind/CSV/shippers'
# Bound to its own name rather than `df`: later cells still expect `df` to be
# the credit-card DataFrame (they use df.Amount, df.City, df.Date).
# NOTE(review): the next cell reads this same path with header=False --
# confirm whether these files really carry a header row.
shippers_inferred = spark.read.csv(shipperfile, header=True, inferSchema=True)
# This doesn't work though: the 'name:type' shorthand is not accepted here.
# spark.read.csv expects a StructType or a DDL-formatted string such as
# 'ShipperID STRING, CompanyName STRING, Phone STRING'.
# shippers_inferred = spark.read.csv(shipperfile, schema='ShipperID:string, CompanyName:string, Phone:string')
display(shippers_inferred)
print(shippers_inferred)

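### To read a CSV file with a specific schema, use a StructType; the `name:type` shorthand string that createDataFrame accepts does not work here (spark.read.csv expects a StructType or a DDL-formatted string such as `ShipperID INT, CompanyName STRING, Phone STRING`).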

In [None]:
# Explicit schema: one StructField per CSV column, in file order.
schema = StructType([
    StructField('ShipperID', IntegerType()), 
    StructField('CompanyName', StringType()), 
    StructField('Phone', StringType())
])
# header=False: the first line is read as data and the column names come
# from the schema above.
shippers = spark.read.csv(shipperfile, schema = schema, header=False)
shippers.printSchema()
display(shippers)

### ***LAB:*** Read regions from the HDFS folder using read.csv and territories from the /home/student/ROI/Spark/datasets/northwind/CSVHeaders folder.


In [None]:
# Lab starting point: location of the territories data in the CSVHeaders folder.
territoriesfile= f'{datapath}/northwind/CSVHeaders/territories'

### Convert a DataFrame into a JSON string.

In [None]:
# toJSON() yields one JSON string per row; take(10) fetches at most ten of them.
print (shippers.toJSON().take(10))

### Choose particular columns from a DataFrame and create a calculated column

In [None]:
# Columns can be picked by name ('City') or by attribute (df.Date, df.Amount);
# withColumn then appends a column computed from an existing one.
display(
    df.select('City', df.Date, df.Amount)
      .withColumn('BigAmount', df.Amount * 10)
)


### Try a bunch of different transformations

In [None]:
# Uncomment one line at a time to try each transformation.
# NOTE(review): the DataFrame built with toDF above names its columns
# CardType / TranType, while these examples use 'Card Type' / 'Exp Type' --
# confirm which `df` they are meant to run against.
display(df)
# display(df.select('City', 'Date', 'Card Type', 'Exp Type', 'Amount'))
# display(df.drop('Gender'))
# display(df.withColumnRenamed('Card Type', 'CardType'))
# display(df.select('City').distinct())
# print (df.count(), df.select('City').distinct().count())

### Sort a DataFrame. The sort and orderBy methods are different aliases for the exact same method.

In [None]:
# Ascending sort is the default.
df.sort(df.Amount).show()
df.sort(df.Amount, ascending=False).show()
# orderBy is an alias of sort.
df.select('City', 'Amount').orderBy(df.City).show()
# A computed column can be filtered on like any other. The chain ends in
# show() so that, like the lines above, it actually runs and prints rows
# instead of only building a lazy DataFrame.
(df.withColumn('NewAmount', df.Amount * 10)
   .where('NewAmount >= 1000')
   .where('Amount <= 2000')
   .show())


### Create a new DataFrame with a new calculated column added.

In [None]:
# Add a 3% Discount column, then normalise two column names.
# withColumnRenamed does nothing when the source column is absent, so the
# renames only take effect if `df` still has the spaced names.
# NOTE(review): the toDF cell above already uses CardType / TranType.
df2 = (
    df.withColumn('Discount', df.Amount * .03)
      .withColumnRenamed('Card Type', 'CardType')
      .withColumnRenamed('Exp Type', 'ExpType')
)
df2.show()

### The filter and where methods can both be used and have alternative ways to represent the condition.

In [None]:
# filter and where are interchangeable; each accepts either a Column
# expression or a SQL condition string.
display(df2.filter(df2.Amount < 4000))
print(df2.filter('Amount < 4000').count())
print(df2.where('Amount < 4000').count())
print(df2.where(df2.Amount < 4000).count())

# Column conditions are combined with & and each side needs its own
# parentheses; the SQL-string forms below express the same range.
print (df2.where((df2.Amount >= 3000) & (df2.Amount <= 4000)).count())
print (df2.where('Amount >= 3000 AND Amount <= 4000').count())
print (df2.where('Amount BETWEEN 3000 AND 4000').count())

### ***LAB:*** Using the df2 DataFrame, answer the following questions:


*   How many Platinum card purchases were there with a discount above $100?
*   Find the ten biggest discount amounts earned by women and show just the purchase amount, discount, and date.



In [None]:
# Lab starting point: look at df2 before writing the queries.
display(df2)


### JOINs work as expected.

In [None]:
# Parent table (ID, code) and child table (childID, name, parentID).
# Parent 3 has no children and child 103 points at a parent that is missing,
# so the join types below return different row sets.
tab1 = sc.parallelize([(1, 'Alpha'), (2, 'Beta'), (3, 'Delta')]).toDF('ID:int, code:string')
tab2 = sc.parallelize([(100, 'One', 1), (101, 'Two', 2), (102, 'Three', 1), (103, 'Four', 4)]).toDF('childID:int, name:string, parentID:int')
for table in (tab1, tab2):
    display(table)
# 'inner' is what join uses when no join type is given.
for how in ('inner', 'left', 'right', 'full'):
    display(tab1.join(tab2, tab1.ID == tab2.parentID, how))


### Examples of aggregate functions.

In [None]:
tab3 = sc.parallelize([(1, 10), (1, 20), (1, 30), (2, 40), (2, 50)]).toDF('groupID:int, amount:int')
display(tab3)
# Shortcut aggregates directly on the grouped data.
tab3.groupby('groupID').max().show()
tab3.groupby('groupID').sum().show()
# Named `grouped` rather than `x` so the RDD `x` created at the top of the
# notebook is not overwritten; the early cells can then still be re-run.
grouped = tab3.groupby('groupID')

# Import the functions module under an alias: importing sum and max by name
# (commented line below) would shadow Python's built-ins.
from pyspark.sql import functions as F
#from pyspark.sql.functions import sum, max
# print(dir(F))
# agg() computes several aggregates over the same grouping in one pass.
grouped.agg(F.sum('amount'), F.max('amount')).show()

### Examples of reading a CSV directly into a DataFrame using different styles

In [None]:
filename = f'{datapath}/finance/CreditCard.csv'
# Style 1: generic load() with the format and reader options as keyword arguments.
df4 = spark.read.load(filename, format = 'csv', sep = ',', inferSchema = True, header = True)
df4.printSchema()

In [None]:
# Style 2: builder chain -- format(), one option() per setting, then load().
df4 = spark.read.format('csv').option('header','true').option('inferSchema','true').load(filename)
df4.printSchema()

In [None]:
# Style 3: the csv() convenience method.
df4 = spark.read.csv(filename, header = True, inferSchema = True)
df4.printSchema()

In [None]:
# Show the rows as read straight from the CSV file.
display(df4)

### Another way of changing column names

In [None]:
# Take the current column names, replace the first one, and rebuild the
# DataFrame with the edited list; toDF(*names) renames columns by position.
cols = df4.columns
cols[0] = 'CityCountry'
df4 = df4.toDF(*cols)
df4.printSchema()

### ***LAB:*** Read the Products file from the JSON folder and categories from the CSVHeaders folder, then join them displaying just the product and category IDs and names, and sort by categoryID then productID. 


**Hint:** Drop the ambiguous column after the join.

### Apply a custom UDF to columns to separate the City and Country and convert the Date into a date datatype.

In [None]:
from pyspark.sql.functions import udf, expr
from pyspark.sql.types import *
from pyspark.sql.functions import to_date
import datetime

def city(x):
    """Return the part of a 'City, Country' string before the first comma.

    A value without a comma is returned unchanged, and None (a null cell)
    is passed through as None.
    """
    if x is None:
        return None
    # partition keeps the whole string when there is no comma, whereas
    # slicing with find() == -1 would silently drop the last character.
    head, _, _ = x.partition(',')
    return head

# Wrap the plain Python function as a Spark UDF that returns a string column.
cityUDF = udf(city, StringType())

def country(x):
    """Return the part of a 'City, Country' string after the first comma.

    Surrounding whitespace is stripped so 'Delhi, India' gives 'India'
    rather than ' India'. A value without a comma is returned whole, and
    None (a null cell) is passed through as None.
    """
    if x is None:
        return None
    # find() returns -1 when there is no comma, so the slice starts at 0.
    return x[x.find(',') + 1:].strip()

@udf(StringType())
def gender(x):
    """Expand a gender code: 'M' becomes 'Male', any other value 'Female'.

    None (a null cell) stays None instead of being labelled 'Female'.
    """
    if x is None:
        return None
    return 'Male' if x == 'M' else 'Female'

# Split CityCountry into City and Country, parse Date into a real date,
# expand the Gender code, then drop the combined column.
# The country UDF is created inline, the gender UDF by decorator, and the
# city UDF beforehand -- three ways of applying a UDF.
df5 = (
    df4.withColumn('City', cityUDF(df4.CityCountry))
       .withColumn('Country', udf(country, StringType())(df4.CityCountry))
       .withColumn('Date', to_date(df4.Date, 'dd-MMM-yy'))
       .withColumn('Gender', gender(df4.Gender))
       .drop(df4.CityCountry)
)
display(df5)

### DataFrames can be written to a variety of file formats. Here we are writing it to JSON.

In [None]:
# Write df5 as JSON under rootpath, replacing any previous output.
df5.write.json(f'{rootpath}/CreditCard.json', mode='overwrite')
print('Done')

### Read a JSON file into a DataFrame, but note that we lose the datatypes.

In [None]:
# Read the JSON back with no schema supplied, so Spark infers the column
# types from the JSON values.
df6 = spark.read.json(f'{rootpath}/CreditCard.json')
df6.printSchema()

### Create a schema that can be used to import a file and directly name the columns and convert them to the desired data type.

In [None]:
# Explicit schema for the JSON written above.
# NOTE(review): the field names are presumably the JSON keys, i.e. the
# column names of df5 ('Card Type' and 'Exp Type' contain a space) -- confirm.
schema = StructType([
    StructField('Date', DateType()), 
    StructField('Card Type', StringType()),
    StructField('Exp Type', StringType()),
    StructField('Gender', StringType()),
    StructField('Amount', FloatType()),
    StructField('City', StringType()),
    StructField('Country', StringType())
])
# Supplying the schema makes the reader use these types instead of inferring them.
df6 = spark.read.json(f'{rootpath}/CreditCard.json', schema = schema)
df6.printSchema()
display(df6)