### System Config:

In [1]:
# Initial config work: make the PySpark sources bundled with the local Spark
# installation importable from this notebook.

import glob
import os
import sys

# Root of the Spark installation; fail with an actionable message if unset.
try:
    SPARK_HOME = os.environ['SPARK_HOME']
except KeyError:
    raise KeyError(
        "SPARK_HOME is not set; point it at your Spark installation "
        "before running this notebook") from None

# Add the following paths to the system path. Each entry is inserted at
# position 0, so the entry inserted last is searched first.
_spark_python = os.path.join(SPARK_HOME, "python")
_spark_lib = os.path.join(_spark_python, "lib")
sys.path.insert(0, _spark_python)
sys.path.insert(0, _spark_lib)
sys.path.insert(0, os.path.join(_spark_lib, "pyspark.zip"))
# Discover the bundled py4j archive instead of hard-coding its version, which
# differs between Spark releases.
for _py4j_zip in sorted(glob.glob(os.path.join(_spark_lib, "py4j-*-src.zip"))):
    sys.path.insert(0, _py4j_zip)

### Spark Session Config:

In [2]:
#create spark session
from pyspark.sql import SparkSession

# Configure the session step by step, then create it (or reuse an already
# running one) as a local-mode application named "scratch".
_builder = SparkSession.builder.master("local").appName("scratch")
_builder = _builder.config("spark.executor.memory", "1g")
_builder = _builder.config("spark.cores.max", "2")
spark = _builder.getOrCreate()

In [3]:

# Load the retail CSV with a header row, comma separator and inferred column
# types, then print the resulting schema.
# NOTE(review): the relative path is resolved against the notebook's working
# directory - confirm it points at the intended input-data folder.
_csv_reader = spark.read.options(sep=",", inferSchema=True, header=True)
retail_df = _csv_reader.csv('./../../input-data/test-data/retail.csv')
retail_df.printSchema()

AnalysisException: 'Path does not exist: file:/home/maverick/workspace/personal-workspace/Data-engineerng-use-cases/z-scratch-space/input-data/test-data/retail.csv;'

#### checking if broadcast partition causes a new stage

In [None]:
# Two identical single-column DataFrames holding the even ids 0, 2, ..., 9998.
df1 = spark.range(start=0, end=10000, step=2)
df2 = spark.range(start=0, end=10000, step=2)

In [None]:
# Scale every id by 5 (keeping the column name), then join the scaled ids
# back onto the untouched range on the shared `id` column.
df3 = df1.selectExpr('(id*5) as id')
df4 = df3.join(df2, on='id')

In [None]:
# Redistribute the joined rows across 3 partitions.
df5 = df4.repartition(3)

In [None]:
# Import Spark's aggregate under an alias so it does not shadow Python's
# builtin ``sum`` for every later cell in the notebook.
from pyspark.sql.functions import col, sum as spark_sum

# Aggregate the whole DataFrame down to a single row: the total of all ids.
df6 = df5.agg(spark_sum(col('id')))

In [None]:
# Print at most 1 row of the aggregate with truncation disabled.
df6.show(1,False)

### Basic tests

In [None]:
from pyspark.sql.functions import expr, locate

simpleColors = ["black", "white", "red", "green", "blue"]


def color_locator(column, color_string):
    """Build a boolean column flagging whether ``column`` mentions a color.

    The color name is upper-cased before searching, so only the upper-case
    spelling of the color is matched.

    Args:
        column: Column to search in.
        color_string: Color name, e.g. ``"red"``.

    Returns:
        A Column aliased ``is_<color_string>``.
    """
    position = locate(color_string.upper(), column)
    # locate() gives a 1-based position, or 0 when the substring is absent,
    # so the boolean cast turns "found" into True and "not found" into False.
    return position.cast("boolean").alias("is_" + color_string)


# `df` is never defined in this notebook; the retail data is loaded into
# `retail_df`, so build the flag columns from that DataFrame.
selectedColumns = [color_locator(retail_df.Description, c) for c in simpleColors]



In [None]:
# Display the current contents of the selectedColumns list.
selectedColumns

In [None]:
# Add a `*` expression to the list of columns to select.
# NOTE: re-running this cell appends another `*` entry each time.
selectedColumns.append(expr("*"))

In [None]:
# `df` is never defined in this notebook; select from the loaded `retail_df`.
retail_df.select(selectedColumns)

In [None]:
# Keep only rows flagged as white or red and show the `is_white` flag for the
# first 3 of them. `df` is never defined in this notebook, so the query runs
# against the loaded `retail_df`.
(retail_df.select(selectedColumns)
          .where(expr("is_white OR is_red"))
          .select("is_white")
          .show(3, False))

In [None]:
from pyspark.sql.functions import struct, expr
# Nest Country together with an inner (CustomerId, Description) struct. The
# inner struct is given an explicit field name so it can be addressed by path.
# `df` is never defined in this notebook, so the column is added to `retail_df`.
complexDf = retail_df.withColumn(
    'ComplexCountry',
    struct('Country', struct('CustomerId', 'Description').alias('Customer')))
# Python column strings take a plain dotted path; the Scala-style `$` prefix
# is not valid here.
complexDf.select('ComplexCountry.Customer.CustomerId').show(2, False)

In [None]:
from pyspark.sql.functions import split, explode
# Array of the space-separated words of each description.
descSplits = split(expr('Description'), " ").alias('splits')
# One output row per word; only the outer alias names the resulting column,
# so the alias on the inner split() was redundant and is dropped.
descExplodes = explode(split(expr('Description'), " ")).alias('explodes')
# `df` is never defined in this notebook; query the loaded `retail_df`.
retail_df.select(expr('Description'), descSplits, descExplodes).show(10, False)

In [None]:
# One-row DataFrame whose only column, `jsonString`, holds a JSON literal.
_json_literal_sql = """
'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString"""
jsonDF = spark.range(1).selectExpr(_json_literal_sql)

In [None]:
from pyspark.sql.functions import get_json_object, json_tuple, col
_json_col = col("jsonString")
# Element at index 1 of the nested array, addressed by JSON path.
_array_item = get_json_object(_json_col, "$.myJSONKey.myJSONValue[1]").alias("column")
# Value stored under the top-level key `myJSONKey`.
_top_level_value = json_tuple(_json_col, "myJSONKey")
jsonDF.select(_array_item, _top_level_value).show(2, False)

In [None]:
from pyspark.sql.functions import col, expr

# Show the first 4 retail rows whose Quantity exceeds 3.
_bulk_orders = retail_df.filter(col('Quantity') > 3)
_bulk_orders.show(4)

### Session Stop:

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()