Where to Look for APIs

In [None]:
from pyspark.sql import SparkSession

# Obtain the session through the public builder API. The original imported
# `pyspark.shell`, which is the interactive-shell bootstrap script and only
# creates a session as a side effect of being imported.
spark = SparkSession.builder.getOrCreate()

# Read one day of retail data: the first row is the header and column types
# are inferred from the values.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/datasets/retail-data/by-day/2010-12-01.csv")
)

df.printSchema()

# Register the DataFrame as a temp view so it can be queried with spark.sql().
df.createOrReplaceTempView("dfTable")

In [None]:
# Preview five rows of the "dfTable" temp view through the SQL interface.
(
    spark.sql("SELECT * FROM dfTable LIMIT 5")
    .show()
)

Converting to Spark Types

In [None]:
from pyspark.sql.functions import lit

# lit() wraps plain Python values (int, str, float) as Spark literal columns.
literal_columns = [lit(5), lit("five"), lit(5.0)]
df.select(*literal_columns).show(5)

Working with Booleans

Example 1

In [None]:
from pyspark.sql.functions import col

# Keep rows whose invoice number is not 536365 and print two columns untruncated.
(
    df.where(col("InvoiceNo") != "536365")
    .select("InvoiceNo", "Description")
    .show(5, truncate=False)
)

In [None]:
# The predicate can also be given as a SQL expression string (equality here).
df.filter("InvoiceNo = '536365'").show(5, truncate=False)

Example 2

In [None]:
# SQL-string predicate using the "<>" (not equal) operator.
df.filter("InvoiceNo <> '536365'").show(5, truncate=False)

Example 3

In [None]:
from pyspark.sql.functions import instr

# Two reusable boolean column expressions.
priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1

# Chained where() calls are combined with AND; the two filters above are ORed.
(
    df.where(df.StockCode.isin("DOT"))
    .where(priceFilter | descripFilter)
    .show()
)

Example 4

In [None]:
from pyspark.sql.functions import instr

# Boolean building blocks for the derived "isExpensive" flag.
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1

# Materialize the combined predicate as a column, then filter on that column by name.
(
    df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))
    .where("isExpensive")
    .select("UnitPrice", "isExpensive")
    .show(5)
)

Example 5

In [None]:
from pyspark.sql.functions import expr

# Build the boolean flag from a SQL expression string instead of Column operators.
(
    df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))
    .where("isExpensive")
    .select("Description", "UnitPrice")
    .show(5)
)

Working with Numbers

Example 1

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.functions import expr

# pow/round are reached through the module alias: importing them by name
# would rebind Python's builtin pow() and round() for every later cell.
# Computes (Quantity * UnitPrice)^2 + 5, rounded to two decimal places.
fabricatedQuantity = F.round(F.pow(col("Quantity") * col("UnitPrice"), 2) + 5, 2)
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(5)

Example 2

In [None]:
# Compute realQuantity with a SQL expression passed to selectExpr().
real_quantity_sql = "ROUND(POWER((Quantity * UnitPrice), 2.0) + 5, 2) AS realQuantity"
df.selectExpr("CustomerId", real_quantity_sql).show(5)

Example 3

In [None]:
# Full SQL version of the realQuantity computation, run against the dfTable view.
real_quantity_query = (
    "SELECT CustomerId, "
    "ROUND(POWER((Quantity * UnitPrice), 2.0) + 5, 2) AS realQuantity "
    "FROM dfTable"
)
spark.sql(real_quantity_query).show(5)

Example 4

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.functions import lit

# round/bround are reached through the module alias so that Python's builtin
# round() is not rebound for later cells.
# The literal is numeric (2.5) rather than the string "2.5", so the example
# does not depend on an implicit string-to-number cast.
# Per the PySpark docs, round() uses HALF_UP and bround() uses HALF_EVEN rounding.
df.select(F.round(lit(2.5)), F.bround(lit(2.5))).show(2)

Example 5

In [None]:
# SQL counterpart: compare ROUND and BROUND on the same literal.
(
    spark.sql("SELECT ROUND(2.5), BROUND(2.5)")
    .show(2)
)

Example 6

In [None]:
from pyspark.sql.functions import corr

# df.stat.corr() returns a plain Python value. It is not the last expression
# in the cell, so it must be printed explicitly or the result is discarded.
print(df.stat.corr("UnitPrice", "Quantity"))

# The same correlation computed as an aggregate column expression.
df.select(corr("UnitPrice", "Quantity")).show()

Example 7

In [None]:
# Correlation of Quantity and UnitPrice computed in SQL over the dfTable view.
(
    spark.sql("SELECT corr(Quantity, UnitPrice) FROM dfTable")
    .show()
)

Example 8

In [None]:
# Summary statistics for the DataFrame's columns.
summary_stats = df.describe()
summary_stats.show()

Example 9

In [None]:
colName = "UnitPrice"
quantileProbs = [0.5]  # a single probability: the median
relError = 0.05

# Use colName instead of repeating the "UnitPrice" literal, so that changing
# the variable actually changes the column being analysed.
df.stat.approxQuantile(colName, quantileProbs, relError)  # original author noted: 2.51

Example 10

In [None]:
# Cross-tabulation of StockCode against Country; show the first five rows.
stock_by_country = df.stat.crosstab("StockCode", "Country")
stock_by_country.show(5)

Example 11

In [None]:
# Frequent items for the StockCode and Quantity columns.
frequent_columns = ["StockCode", "Quantity"]
df.stat.freqItems(frequent_columns).show()

Example 12

In [None]:
from pyspark.sql.functions import monotonically_increasing_id

# Generate an id column with monotonically_increasing_id() and show ten values.
row_id = monotonically_increasing_id()
df.select(row_id).show(10)

Working with Strings

Example 1

In [None]:
from pyspark.sql.functions import initcap

# Apply initcap() to the Description column.
capitalized_description = initcap("Description")
df.select(capitalized_description).show(5)

Example 2

In [None]:
from pyspark.sql.functions import lower, upper

# Show the original Description next to its lower- and upper-cased forms.
description_col = col("Description")
df.select(
    description_col,
    lower(description_col),
    upper(description_col),
).show(5)

Example 3

In [None]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim

# One space-padded literal feeds the three trim variants; an unpadded one feeds the pads.
padded_hello = lit("     HELLO     ")
plain_hello = lit("HELLO")

df.select(
    ltrim(padded_hello).alias("ltrim"),
    rtrim(padded_hello).alias("rtrim"),
    trim(padded_hello).alias("trim"),
    # Target length 3 is shorter than the 5-character input.
    lpad(plain_hello, 3, " ").alias("lp"),
    rpad(plain_hello, 10, " ").alias("rp"),
).show(2)

Regular Expressions

Example 1

In [None]:
from pyspark.sql.functions import regexp_replace

# Alternation pattern that matches any of the listed colour words.
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"

# Replace every match with the token "COLOR" and show it beside the original text.
color_clean = regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean")
df.select(color_clean, col("Description")).show(5)

Example 2

In [None]:
from pyspark.sql.functions import translate

# Character-level substitution: characters of "LEET" map positionally onto "1337".
leet_description = translate(col("Description"), "LEET", "1337")
df.select(leet_description, col("Description")).show(5)

Example 3

In [None]:
from pyspark.sql.functions import regexp_extract

# The parentheses form capture group 1, which regexp_extract() is asked to return.
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"

first_color = regexp_extract(col("Description"), extract_str, 1).alias("color_clean")
df.select(first_color, col("Description")).show(5)

Example 4

In [None]:
from pyspark.sql.functions import instr

# Boolean expressions: does Description contain the word BLACK / WHITE?
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1

# Keep rows where either colour is present and print the descriptions untruncated.
(
    df.withColumn("hasSimpleColor", containsBlack | containsWhite)
    .where("hasSimpleColor")
    .select("Description")
    .show(5, truncate=False)
)

Example 5

In [None]:
from pyspark.sql.functions import expr, locate

# Lower-case colour names; each one becomes an "is_<color>" flag column below.
simpleColors = [
    "black",
    "white",
    "red",
    "green",
    "blue",
]

def color_locator(column, color_string):
    """Build a boolean flag column for one colour word.

    The colour name is upper-cased and searched for in ``column`` with
    ``locate``; the resulting position is cast to boolean and the column is
    aliased ``is_<color_string>``.
    """
    position = locate(color_string.upper(), column)
    return position.cast("boolean").alias("is_" + color_string)

# One flag column per simple colour, followed by all original columns.
selectedColumns = [
    color_locator(df.Description, color) for color in simpleColors
] + [expr("*")]  # "*" has to be a Column type, hence expr()

# Keep rows flagged as white or red and print their descriptions untruncated.
(
    df.select(*selectedColumns)
    .where(expr("is_white OR is_red"))
    .select("Description")
    .show(5, truncate=False)
)

Working with Dates and Timestamps

Example 1

In [None]:
from pyspark.sql.functions import current_date, current_timestamp

# Ten-row frame carrying the current date and the current timestamp.
dateDF = (
    spark.range(10)
    .withColumn("today", current_date())
    .withColumn("now", current_timestamp())
)

# Expose it to SQL as "dateTable" and print the resulting schema.
dateDF.createOrReplaceTempView("dateTable")
dateDF.printSchema()

Example 2

In [None]:
from pyspark.sql.functions import date_add, date_sub

# Five days before and five days after the "today" column.
today_col = col("today")
dateDF.select(date_sub(today_col, 5), date_add(today_col, 5)).show(1)

Example 3

In [None]:
from pyspark.sql.functions import datediff, months_between, to_date

# Day difference between a date seven days back and today.
# (date_sub comes from the import in the previous cell.)
(
    dateDF.withColumn("week_ago", date_sub(col("today"), 7))
    .select(datediff(col("week_ago"), col("today")))
    .show(1)
)

# Month difference between two fixed dates parsed from string literals.
date_range = dateDF.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end"),
)
date_range.select(months_between(col("start"), col("end"))).show(1)

Example 4

In [None]:
from pyspark.sql.functions import to_date, lit

# Parse a string literal into a date column, then pass that column through to_date() again.
(
    spark.range(5)
    .withColumn("date", to_date(lit("2017-01-01")))
    .select(to_date(col("date")))
    .show(1)
)

Example 5

In [None]:
from pyspark.sql.functions import to_date

dateFormat = "yyyy-MM-dd"

# Parse two string literals with an explicit format.
# NOTE(review): "2017-20-12" has 20 in the month position of "yyyy-MM-dd", so
# date2 presumably does not parse (null, or an error under ANSI mode).
# Confirm whether that is the intended demonstration or whether the format
# was meant to be "yyyy-dd-MM".
date1 = to_date(lit("2017-12-11"), dateFormat).alias("date1")
date2 = to_date(lit("2017-20-12"), dateFormat).alias("date2")
cleanDateDF = spark.range(1).select(date1, date2)

cleanDateDF.createOrReplaceTempView("cleanDateTable")

# Same conversions written in SQL against the registered view.
cleanDateQuery = """
    SELECT
          to_date(date1, 'yyyy-MM-dd') AS date1,
          to_date(date2, 'yyyy-MM-dd') AS date2,
          to_date(date1)
    FROM cleanDateTable
"""
spark.sql(cleanDateQuery).show()

Example 6

In [None]:
from pyspark.sql.functions import to_timestamp

# Convert date1 to a timestamp using the dateFormat defined in the previous cell.
date_as_timestamp = to_timestamp(col("date1"), dateFormat).alias("dateF")
cleanDateDF.select(date_as_timestamp).show()

Coalesce

In [None]:
from pyspark.sql.functions import coalesce

# First non-null value of Description, then CustomerId, per row.
description_or_customer = coalesce(col("Description"), col("CustomerId"))
df.select(description_or_customer.alias("descriptionCustomer")).show()

Working with Complex Types

Example 1 - Structs

In [None]:
from pyspark.sql.functions import struct

# Pack two columns into a single struct column named "complex".
complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

# select() is lazy, so each variant needs an action to display anything.
# Without .show() the first two results were silently discarded.
# Dot syntax for a single struct field:
complexDF.select("complex.Description").show(5)
# getField() on the struct column:
complexDF.select(col("complex").getField("Description")).show(5)
# Expand every field of the struct into top-level columns:
complexDF.select("complex.*").show(5)

Example 2 - Array Split

In [None]:
from pyspark.sql.functions import split

# Split Description on single spaces into an array column.
description_words = split(col("Description"), " ").alias("NewColumn")
df.select(description_words).show(5)

In [None]:
# Split into an array column, then index its first element with SQL syntax.
(
    df.select(split(col("Description"), " ").alias("array_col"))
    .selectExpr("array_col[0]")
    .show(5)
)

Example 3 - Array Length

In [None]:
from pyspark.sql.functions import size

# Length of the array produced by splitting Description on spaces.
word_count = size(split(col("Description"), " ")).alias("sizeColumn")
df.select(word_count).show(5)

Example 4 - Array Contains

In [None]:
from pyspark.sql.functions import array_contains

# Does the array of Description words contain the element "WHITE"?
has_white = array_contains(split(col("Description"), " "), "WHITE").alias("columnArray")
df.select(has_white).show(5)

Example 5 - Explode

In [None]:
from pyspark.sql.functions import split, explode

# Split Description into an array column, then explode that array into the "exploded" column.
(
    df.withColumn("splitted", split(col("Description"), " "))
    .withColumn("exploded", explode(col("splitted")))
    .select("Description", "InvoiceNo", "exploded")
    .show(5)
)

Example 6 - Maps

In [None]:
from pyspark.sql.functions import create_map

# Build a map column whose key is Description and whose value is InvoiceNo.
description_to_invoice = create_map(col("Description"), col("InvoiceNo")).alias("complex_map")
df.select(description_to_invoice).show(5)

In [None]:
# Look up one key in the map column using SQL bracket syntax.
(
    df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))
    .selectExpr("complex_map['WHITE METAL LANTERN']")
    .show(5)
)

Working with JSON

Example 1

In [None]:
from pyspark.sql.functions import get_json_object, json_tuple

# Single-row frame holding a JSON document as a string column.
jsonDF = spark.range(1).selectExpr("""'{"myJSONKey" : {"myJSONValue" : [1,2,3]}}' as jsonString""")

# Extract a nested array element by JSON path, and a top-level key via json_tuple().
nested_value = get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column")
top_level_key = json_tuple("jsonString", "myJSONKey")
jsonDF.select(nested_value, top_level_key).show(2)

In [None]:
# The same two JSON extractions written as SQL expression strings.
json_path_expr = "get_json_object(jsonString, '$.myJSONKey.myJSONValue[1]') as column"
json_tuple_expr = "json_tuple(jsonString, 'myJSONKey')"
jsonDF.selectExpr(json_path_expr, json_tuple_expr).show(2)

Example 2

In [None]:
from pyspark.sql.functions import to_json

# Build a struct from two columns and serialize it to a JSON string.
# The original cell had no action, so no JSON was ever displayed; show the
# rows untruncated so the whole JSON string is visible.
(
    df.selectExpr("(InvoiceNo, Description) AS myStruct")
    .select(to_json(col("myStruct")))
    .show(5, truncate=False)
)

Example 3

In [None]:
# to_json is imported here as well, so the cell does not depend on the previous
# cell's import. The types are imported explicitly instead of with a wildcard.
from pyspark.sql.functions import from_json, to_json
from pyspark.sql.types import StringType, StructField, StructType

# Schema used to parse the JSON string back into a struct: two nullable string fields.
# The fields are passed as a list, which is the form StructType documents.
parsedSchema = StructType([
    StructField("InvoiceNo", StringType(), True),
    StructField("Description", StringType(), True),
])

# Round trip: struct -> JSON string -> struct parsed with parsedSchema.
(
    df.selectExpr("(InvoiceNo, Description) AS myStruct")
    .select(to_json(col("myStruct")).alias("newJSON"))
    .select(from_json(col("newJSON"), parsedSchema), col("newJSON"))
    .show(2)
)

End