### Spark notebook ###

This notebook will only work in a Jupyter notebook or Jupyter lab session running on the cluster master node in the cloud.

Follow the instructions on the computing resources page to start a cluster and open this notebook.

**Steps**

1. Connect to the Windows server using Windows App.
2. Connect to Kubernetes.
3. Start Jupyter and open this notebook from Jupyter in order to connect to Spark.

In [None]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Constants used to interact with storage using the hdfs command or Spark

# Login name of the current user with anything from the first "@" onwards removed,
# it is used below in the Spark application name and in per-user paths

username = re.sub(r'@.*', '', getpass.getuser())


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.

    Keys and values are converted to strings and HTML-escaped so that
    characters such as ``<`` or ``&`` are shown literally instead of being
    interpreted as markup.

    Args:
        d (dict): mapping to render, one table row per item in iteration order

    Returns:
        str: html for a single table element with one row per item
    """

    # Imported locally because the name html is reused for variables in this notebook
    from html import escape

    rows = ''.join(
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    )

    return f'<table width="100%" style="width:100%; font-family: monospace;">{rows}</table>'


def show_as_html(df, n=20):
    """Show the first rows of a spark dataframe as html using the pandas jupyter integration.

    The dataframe is limited to n rows before it is converted to pandas and
    passed to display.

    Args:
        df: spark dataframe to show
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    local_preview = preview.toPandas()
    display(local_preview)

    
def display_spark():
    """Display an html summary of the Spark session status in the notebook.

    The session counts as active when both the spark and sc globals are
    defined. For an active session the application name, a link to the
    application UI, and the full configuration are shown. Otherwise a
    stopped message with a link to the Spark UI is shown.
    """

    session_active = 'spark' in globals() and 'sc' in globals()

    if not session_active:
        stopped = ''.join((
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            '</ul>',
        ))
        display(HTML(stopped))
        return

    name = sc.getConf().get("spark.app.name")

    # Only the port of the UI url is kept, the link itself always points at localhost
    ui_port = sc.uiWebUrl.split(":")[-1]
    config_table = dict_to_html(dict(sc.getConf().getAll()))

    active = ''.join((
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        f'<li><a href="http://localhost:{ui_port}" target="_blank">Spark Application UI</a></li>',
        '</ul>',
        '<p><b>Config</b></p>',
        config_table,
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ))
    display(HTML(active))


# Functions to start and stop spark

def format_spark_memory(gigabytes):
    """Format a memory size given in gigabytes as a Spark memory string.

    Spark does not accept fractional sizes such as "1.5g", so whole sizes are
    written in gigabytes (2 gives "2g") and any other size is converted to a
    whole number of megabytes (1.5 gives "1536m").

    Args:
        gigabytes (float): memory size in gigabytes

    Returns:
        str: memory size with a g or m suffix
    """

    size = float(gigabytes)

    if size.is_integer():
        return f'{int(size)}g'

    return f'{int(size * 1024)}m'


def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The session status is displayed once the session has been created.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): executor memory in gigabytes (default: 1)
        master_memory (float): driver memory in gigabytes (default: 1)
    """

    global spark
    global sc

    # Total cores across all executors, shuffles use four partitions per core
    cores = executor_instances * executor_cores
    partitions = cores * 4

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", format_spark_memory(master_memory))
        .config("spark.executor.memory", format_spark_memory(worker_memory))
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).

    Nothing is stopped or deleted unless both globals are defined. The session
    status is displayed afterwards in either case.
    """

    global spark
    global sc

    session_defined = all(name in globals() for name in ('spark', 'sc'))

    if session_defined:
        spark.stop()
        del spark, sc

    display_spark()


# Inject css overrides into the notebook to improve spark output readability

css_rules = [
    'pre { white-space: pre !important; }',  # do not wrap preformatted output
    'table.dataframe td { white-space: nowrap !important; }',  # do not wrap dataframe cells
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',  # hide the first header cell and the row headers
]
html = ['<style>', *css_rules, '</style>']
display(HTML(''.join(html)))

### DataFrame API ###

The code below demonstrates some common **transformations**, **actions**, and **functions** in the DataFrame API.

**Sections**

- [Data](#Data)
- [Actions](#Actions)

**Key points**

- The datasets used in these examples are designed to have as much complexity as possible while still being small.
- The examples use `printSchema` and `show_as_html` frequently to show the contents of dataframes as they are transformed.

In [None]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1)

In [None]:
# We only need to import the Row object, the functions module (as F), and the pyspark sql types, everything else we get from the global variables sc or spark

from pyspark.sql import Row, functions as F
from pyspark.sql.types import *

### Data ###

This code creates two datasets, `data` and `department_data`.

**Key points**

- These datasets are designed to have as much complexity as possible while still being small.
- The datasets are constructed in Python using pyspark Row objects, distributed to give an RDD, and then wrapped with a DataFrame = Dataset[Row].
- This code does not load any data from HDFS.

In [None]:
# Create, distribute, and wrap data by hand

# Every column is nullable, the schema is built from (name, type) pairs
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in [
        ("Name", StringType()),
        ("Department", StringType()),
        ("Age", IntegerType()),
        ("Gender", StringType()),
        ("Salary", DoubleType()),
    ]
])

# First, define a list of pyspark row objects (this is just a Python list in memory on the master node)
data_rows = [
    Row("Alpha One", "X", 28, "M", 80000.0),
    Row("Bravo Two", "X", 25, "M", 70000.0),
    Row("Charlie", "X", 23, "M", 80000.0),  # Charlie has no last name, duplicate salary in department X
    Row("Delta Four", "Y", 30, None, 100000.0),  # Gender is none
    Row("Echo Five", "Y", 27, "F", 120000.0),
    Row("Foxtrot Six", "Z", 20, "F", 90000.0),
    Row("Golf Seven", "Z", 20, "F", 50000.0),  # Duplicate age in department Z
    Row("Hotel Eight", "Z", 38, "F", 100000.0),
    Row("Indigo Nine", "Z", 50, "M", 70000.0),
    Row("Juliet Ten", "Z", 18, "F", None),  # Salary is none
]

# Second, take that list of pyspark row objects, distribute them as Spark rows in an RDD[Row]
data_rdd = sc.parallelize(data_rows)

# Finally, wrap the RDD with metadata by creating a DataFrame = Dataset[Row]
data = spark.createDataFrame(data_rdd, schema=schema)

print(type(data))
data.printSchema()
print(data)
show_as_html(data)

In [None]:
# Create, distribute, and wrap additional department data by hand

# All three columns are nullable strings
department_schema = StructType([
    StructField(column, StringType(), True)
    for column in ("Department", "Name", "Campus")
])

# Plain Python list of pyspark row objects, one per department
department_rows = [
    Row("X", "Xray", "U"),
    Row("Y", "Yankee", "V"),
    Row("Z", "Zulu", "W"),
]

# Distribute the rows as an RDD and wrap it with the schema to give a DataFrame
department_rdd = sc.parallelize(department_rows)
department_data = spark.createDataFrame(department_rdd, schema=department_schema)

print(type(department_data))
department_data.printSchema()
print(department_data)
show_as_html(department_data)

### Actions ###

The following actions will trigger evaluation of the DAG.

**Key points**

- Some of these actions are dangerous as they collect data from the worker nodes to the master node which has limited memory.

In [None]:
# Actions (printing the dataframe itself does not trigger evaluation, count, head, and show do)

print(data)
print()

row_count = data.count()
print(row_count)
print()

first_row = data.head()
print(first_row)
print()

data.show(5, truncate=False)

In [None]:
# A dangerous alternative to .head() as every row is collected into a local Python list

rows = data.collect()

print(type(rows), "", len(rows), "", sep="\n")

for row in rows:
    print(row)

In [None]:
# A more dangerous alternative to .collect() as the conversion to a pandas dataframe involves copying

data_local = data.toPandas()

# Show the type and the plain text form first, then the html form
print(type(data_local), "", data_local, "", sep="\n")
display(data_local)

In [None]:
# A safe and convenient use of .toPandas() after .limit()

display(data.limit(5).toPandas())

In [None]:
# A nice helper function that ensures safe and convenient use of .toPandas() after .limit()

help(show_as_html)
print()
show_as_html(data, n=5)

In [None]:
# A safe way to iterate over rows one by one locally

local_rows = data.toLocalIterator()

for row in local_rows:
    print(row)

In [None]:
# Apply a function to each row or partition without having to run .collect()

def rowSink(row):
    """Send a single row to the sink, here the row is just printed."""

    print(row)  # write the output to somewhere external e.g. a database or a message queue


def partitionSink(rows):
    """Send every row of a partition to the sink, one row at a time."""

    for partition_row in rows:
        rowSink(partition_row)

data.foreach(rowSink)
data.foreachPartition(partitionSink)

In [None]:
# DataFrameWriter (note that there are multiple ways to write this but the result is the same)

data_path = "hdfs:///tmp/data/"

# Overwrite any existing output so that this cell can be run again, the default save mode raises an error if the path already exists
data.write.csv(data_path, mode="overwrite")

In [None]:
!hdfs dfs -ls /tmp/data/

In [None]:
# DataFrameWriter (note that there are multiple ways to write this but the result is the same)

data_path = f"hdfs:///user/{username}/data/"

# Configure the writer step by step, nothing is written until save is called
writer = data.write.format("csv")
writer = writer.mode("overwrite")
writer = writer.option("compression", "gzip")
writer.save(data_path)

In [None]:
!hdfs dfs -ls /user/{username}/data/

### Stop Spark ###

In [None]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

stop_spark()