# Data Sources

In [1]:
%run ./Includes/paths.py

In [4]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

from delta import *
import pandas as pd

# start spark
# Hive support (wanted so that we can register UDFs in SQL) has to be requested on the
# builder *before* the session is created: calling enableHiveSupport() on a builder after
# getOrCreate() does not change the session that is already running.
builder = (pyspark.sql.SparkSession.builder.appName("Spark-Course")
                .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
                .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
                .enableHiveSupport())

# configure_spark_with_delta_pip comes from the `delta` package imported above
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# setting log-level to ERROR to decrease verbosity
# log4j log-levels are: OFF, FATAL, ERROR, WARN, INFO, DEBUG, TRACE, ALL
spark.sparkContext.setLogLevel("ERROR")

# last expression of the cell: show the session summary
spark

In [3]:
def display(spark_df, rows=10):
    """Return a small pandas preview of a Spark DataFrame.

    Parameters
    ----------
    spark_df : object exposing ``limit(n)`` and ``toPandas()`` (a Spark DataFrame)
        The frame to preview.
    rows : int, default 10
        Maximum number of rows to include in the preview.

    Returns
    -------
    pandas.DataFrame
        At most ``rows`` rows of ``spark_df``.

    Note: inside the notebook this definition shadows IPython's own ``display``.
    """
    # limit first so that only `rows` rows are converted to pandas
    limited = spark_df.limit(rows)
    preview = limited.toPandas()
    return preview.head(rows)

## Reading Data

In [None]:
# fellowship.csv

# name,race,age
# Frodo,Hobbit,50
# Sam,Hobbit,38
# Merry,Hobbit,36
# Pippin,Hobbit,28
# Gandalf,Maia,2000
# Legolas,Elf,2931
# Gimli,Dwarf,139
# Aragorn,Man,87
# Boromir,Man,40
# -----,

# the last row (`-----,`) is corrupted: it has only two fields instead of three

In [72]:
# Read the CSV in permissive mode and preview it; the corrupt last row is kept.
display(
    spark.read.format('csv')
        .option('path', sample_data_path + 'fellowship.csv')
        .option('header', 'true')
        .option('inferSchema', 'true') # the key must be spelled exactly 'inferSchema'; unknown option keys are silently ignored
        .option('mode', 'permissive') # default mode, allows in the corrupt record
        .load()
)

Unnamed: 0,name,race,age
0,Frodo,Hobbit,50.0
1,Sam,Hobbit,38.0
2,Merry,Hobbit,36.0
3,Pippin,Hobbit,28.0
4,Gandalf,Maia,2000.0
5,Legolas,Elf,2931.0
6,Gimli,Dwarf,139.0
7,Aragorn,Man,87.0
8,Boromir,Man,40.0
9,-----,,


In [74]:
# Read the CSV in dropMalformed mode: rows that cannot be parsed are discarded
# instead of being kept with nulls.
drop_malformed = (
    spark.read.format('csv')
        .option('header', 'true')
        .option('inferSchema', 'true') # the key must be spelled exactly 'inferSchema'; unknown option keys are silently ignored
        .option('mode', 'dropMalformed')
        .load(sample_data_path + 'fellowship.csv') # path inside load(); got rid of option('path', path_to_file)
)

display(drop_malformed)

Unnamed: 0,name,race,age
0,Frodo,Hobbit,50
1,Sam,Hobbit,38
2,Merry,Hobbit,36
3,Pippin,Hobbit,28
4,Gandalf,Maia,2000
5,Legolas,Elf,2931
6,Gimli,Dwarf,139
7,Aragorn,Man,87
8,Boromir,Man,40


In [69]:
# however, the count returns 10 instead of 9
# NOTE(review): count() requests no columns from the CSV source, and since Spark 2.4 a row is
# treated as malformed only when a *requested* column fails to parse, so the short row is not
# dropped here. Setting spark.sql.csv.parser.columnPruning.enabled=false should restore the
# stricter behaviour — confirm against the Spark SQL migration guide.
drop_malformed.count()

10

In [18]:
# and select distinct selects name from the corrupt row
# NOTE(review): presumably because only the `name` column is requested, and the corrupt row
# does have a readable name field ('-----'), dropMalformed does not reject it for this query.
display(drop_malformed.select('name').distinct())

Unnamed: 0,name
0,Boromir
1,-----
2,Aragorn
3,Pippin
4,Merry
5,Gandalf
6,Frodo
7,Sam
8,Legolas
9,Gimli


In [None]:
# failFast fails because of the corrupted row
# (this cell is expected to raise; the exception is the demonstration)

display(
    spark.read
        # no option('path', ...) here: since Spark 3.1 a 'path' option cannot be combined with a
        # path argument, so the read would be rejected before failFast is ever exercised
        .option('header', 'true')
        .option('inferSchema', 'true') # the key must be spelled exactly 'inferSchema'; unknown option keys are silently ignored
        .option('mode', 'failFast')
        .csv(sample_data_path + 'fellowship.csv') # .csv() instead of .load(); got rid of option('path', path_to_file) and load()
)

## Writing Data

In [23]:
# Load the fellowship CSV with inferred column types, then filter out the
# corrupt '-----' row so that only the real members remain.
fellowship = (
    spark.read
        .format('csv')
        .option('header', 'true')
        .option('inferSchema', 'true')
        .option('mode', 'permissive')
        .option('path', sample_data_path + 'fellowship.csv')
        .load()
        .filter('name <> "-----"')
)

display(fellowship)

Unnamed: 0,name,race,age
0,Frodo,Hobbit,50
1,Sam,Hobbit,38
2,Merry,Hobbit,36
3,Pippin,Hobbit,28
4,Gandalf,Maia,2000
5,Legolas,Elf,2931
6,Gimli,Dwarf,139
7,Aragorn,Man,87
8,Boromir,Man,40


In [53]:
# Average age per race over the cleaned fellowship rows.
races = fellowship.groupBy('race').agg(mean('age').alias('avg_age'))

In [33]:
def read_fellowship_agg():
    """Read the aggregated fellowship CSV back from `sample_data_path`.

    Returns
    -------
    Spark DataFrame
        Contents of 'fellowship_agg.csv', read with a header row, inferred
        column types and permissive parsing.
    """
    agg_path = sample_data_path + 'fellowship_agg.csv'
    reader = (
        spark.read
            .format('csv')
            .option('header', 'true')
            .option('inferSchema', 'true')
            .option('mode', 'permissive')
            .option('path', agg_path)
    )
    return reader.load()

In [58]:
# Write the per-race averages as CSV with a header row.
(
    races.write
        .format('csv')
        .mode('overwrite') # replace whatever already exists at the target path
        .option('header', 'true')
        .option('path', sample_data_path + 'fellowship_agg.csv')
        .save()
)

In [59]:
# read the aggregate back from disk to check what the overwrite produced
display(read_fellowship_agg())

Unnamed: 0,race,avg_age
0,Elf,2931.0
1,Man,63.5
2,Maia,2000.0
3,Hobbit,38.0
4,Dwarf,139.0


In [None]:
# Same write again, this time with the default save mode.
# This cell is expected to raise when the target already holds data.
(
    races.write
        .format('csv')
        .mode('errorIfExists') # default; fails if data exists
        .option('header', 'true')
        .option('path', sample_data_path + 'fellowship_agg.csv')
        .save()
)

In [61]:
# Member count per race; rollup also adds a grand-total row whose race is null.
races2 = fellowship.rollup('race').agg(count('name').alias('num_member'))

In [62]:
# Attempt to write the member counts to the path that already holds the averages.
(
    races2.write
        .format('csv')
        .mode('ignore') # does nothing, raises no error, even if data already exists
        .option('header', 'true')
        .option('path', sample_data_path + 'fellowship_agg.csv')
        .save()
)

In [63]:
# re-read after the 'ignore' write: the data on disk should still be the avg_age aggregate
display(read_fellowship_agg())

Unnamed: 0,race,avg_age
0,Elf,2931.0
1,Man,63.5
2,Maia,2000.0
3,Hobbit,38.0
4,Dwarf,139.0


In [64]:
# Add the member counts to the data that is already stored at the target path.
(
    races2.write
        .format('csv')
        .mode('append') # appends below current data
        .option('header', 'true')
        .option('path', sample_data_path + 'fellowship_agg.csv')
        .save()
)

In [65]:
# re-read after the 'append' write; note that display() returns only the first 10 rows by
# default, so not every appended row is necessarily shown
display(read_fellowship_agg())

Unnamed: 0,race,avg_age
0,Elf,2931.0
1,Man,63.5
2,Maia,2000.0
3,Hobbit,38.0
4,Dwarf,139.0
5,Elf,1.0
6,Hobbit,4.0
7,Maia,1.0
8,,9.0
9,Dwarf,1.0


In [75]:
# spark.stop()  # uncomment to shut down the Spark session when you are done