In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("chapter6").getOrCreate()

df = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load("retail-data/by-day/2010-12-10.csv")
df.printSchema()
df.createOrReplaceTempView("dbtable")
spark.sql("select * from dbtable").show(10)

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   538172|    21562|HAWAIIAN GRASS SK...|      12|2010-12-10 09:33:00|     1.25|   15805.0|United Kingdom|
|   538172|    79321|       CHILLI LIGHTS|       8|2010-12-10 09:33:00|     4.95|   15805.0|United Kingdom|
|   538172|    22041|"RECORD FRAME 7""...|      12|2010-12-10 09:33:00|     2.55|   15805.0|United Kingdom|
|   538172|   8

In [35]:
from pyspark.sql.functions import lit, col, instr, round, bround

df.select(lit(5), lit("5"), lit(5.0)).show(1)

df.selectExpr("Description", "CustomerID").where(col("InvoiceNo") == "538172").show(5, False)

priceFilter = col("UnitPrice") > 600
descFilter = instr(df.Description, "POSTAGE") >=1
df.select("Description", "UnitPrice").where(priceFilter | descFilter).show(5)
df.withColumn("criteria", priceFilter | descFilter).where("criteria").show(5)

fabricatedValue = pow((col("Quantity") * col("UnitPrice")), 2) + 5
df.select("*", fabricatedValue.alias("FabricatedValue")).show(5)

df.selectExpr("*", "POWER((Quantity * UnitPrice), 2) + 5").show(2)

df.select(round(lit(2.5)), bround(lit(2.5))).show(1)

df.describe().show(2)

+---+---+---+
|  5|  5|5.0|
+---+---+---+
|  5|  5|5.0|
+---+---+---+
only showing top 1 row

+-------------------------------+----------+
|Description                    |CustomerID|
+-------------------------------+----------+
|HAWAIIAN GRASS SKIRT           |15805.0   |
|CHILLI LIGHTS                  |15805.0   |
|"RECORD FRAME 7"" SINGLE SIZE "|15805.0   |
|3D DOG PICTURE PLAYING CARDS   |15805.0   |
|60 CAKE CASES VINTAGE CHRISTMAS|15805.0   |
+-------------------------------+----------+
only showing top 5 rows

+--------------+---------+
|   Description|UnitPrice|
+--------------+---------+
|       POSTAGE|     18.0|
|DOTCOM POSTAGE|   847.42|
|       POSTAGE|     18.0|
|       POSTAGE|     28.0|
|DOTCOM POSTAGE|   907.47|
+--------------+---------+

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+--------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|criteria|
+---------

In [61]:
from pyspark.sql.functions import initcap, upper, lower, regexp_replace, translate, regexp_extract

df.select(initcap("Description"), col("Description"), upper(col("Description")), lower(col("Description"))).show(5, False)

regex_string = "BLACK|WHITE|RED|BLUE|GREEN"
df.select("Description", regexp_replace(df.Description, regex_string, "COLOR").alias("Replaced")).where(col("Replaced").contains("COLOR")).show(30, False)


df.select("Description", translate(col("Description"), "LEET", "3774")).show(10, False)

extract_string = "(BLACK|WHITE|RED|BLUE|GREEN)"
df.select("Description", regexp_extract(col("Description"), extract_string, 1).alias("extracted")).where(col("extracted") != '').show(10, False)

hasWhite = instr(col("Description"), "WHITE") >= 1
hasBlack = instr(col("Description"), "BLACK") >= 1
df.withColumn("basicColor", hasWhite | hasBlack).where("basicColor").select("Description", "basicColor").show(10, False)

+-------------------------------+-------------------------------+-------------------------------+-------------------------------+
|initcap(Description)           |Description                    |upper(Description)             |lower(Description)             |
+-------------------------------+-------------------------------+-------------------------------+-------------------------------+
|Hawaiian Grass Skirt           |HAWAIIAN GRASS SKIRT           |HAWAIIAN GRASS SKIRT           |hawaiian grass skirt           |
|Chilli Lights                  |CHILLI LIGHTS                  |CHILLI LIGHTS                  |chilli lights                  |
|"record Frame 7"" Single Size "|"RECORD FRAME 7"" SINGLE SIZE "|"RECORD FRAME 7"" SINGLE SIZE "|"record frame 7"" single size "|
|3d Dog Picture Playing Cards   |3D DOG PICTURE PLAYING CARDS   |3D DOG PICTURE PLAYING CARDS   |3d dog picture playing cards   |
|60 Cake Cases Vintage Christmas|60 CAKE CASES VINTAGE CHRISTMAS|60 CAKE CASES VINTAGE CHR

In [82]:
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, to_date

spark.range(10).withColumn("current date", current_date()).withColumn("current timestamp", current_timestamp()).show(1, False)

spark.range(1).withColumn("date1", date_add(current_date(), 7)).withColumn("date2", date_sub(current_date(), 7)).select("date1", "date2", datediff("date1", "date2")).show(1, False)

spark.range(1).withColumn("start", to_date(lit("2017-01-01"))).withColumn("end", to_date(lit("2020-01-01"))).select("start", "end", datediff("end", "start")).show(1, False)

dateformat = "YYYY-dd-MM"
spark.range(1).select(to_date(lit('2017-12-12')).alias("date1"), to_date(lit('2020-11-11')).alias("date2")).select("date2", "date1", datediff(col("date2"), col("date1"))).show(1, False)

+---+------------+-----------------------+
|id |current date|current timestamp      |
+---+------------+-----------------------+
|0  |2021-07-03  |2021-07-03 21:23:13.304|
+---+------------+-----------------------+
only showing top 1 row

+----------+----------+----------------------+
|date1     |date2     |datediff(date1, date2)|
+----------+----------+----------------------+
|2021-07-10|2021-06-26|14                    |
+----------+----------+----------------------+

+----------+----------+--------------------+
|start     |end       |datediff(end, start)|
+----------+----------+--------------------+
|2017-01-01|2020-01-01|1095                |
+----------+----------+--------------------+

+----------+----------+----------------------+
|date2     |date1     |datediff(date2, date1)|
+----------+----------+----------------------+
|2020-11-11|2017-12-12|1065                  |
+----------+----------+----------------------+



In [90]:
from pyspark.sql.functions import coalesce

df.select(coalesce(col("Description")),col("CustomerID")).show(3, False)

print(df.count())

df.na.drop("all")

print(df.count())

df.na.drop("any")

print(df.count())

df.na.fill("all", ["StockCode"])

df.select("StockCode").where(col("StockCode") == "all").show(10, False)

+-------------------------------+----------+
|coalesce(Description)          |CustomerID|
+-------------------------------+----------+
|HAWAIIAN GRASS SKIRT           |15805.0   |
|CHILLI LIGHTS                  |15805.0   |
|"RECORD FRAME 7"" SINGLE SIZE "|15805.0   |
+-------------------------------+----------+
only showing top 3 rows

2758
2758
2758
+---------+
|StockCode|
+---------+
+---------+



In [125]:
from pyspark.sql.functions import split, size, array_contains, explode, create_map

complexDF = df.selectExpr("struct(Description, CustomerID) as complex")

complexDF.select("complex.Description", "complex.CustomerID").show(1, False)

df.select(split(col("Description"), " ").alias("splitted")).show(1, False)

df.select(split(col("Description"), " ").alias("splitted")).selectExpr("splitted[0]").show(1, False)

df.select(split(col("Description"), " ").alias("splitted")).select("splitted", size(col("splitted"))).show(5, False)

df.select(split(col("Description"), " ").alias("splitted")).select("splitted", array_contains(col("splitted"), "GRASS")).show(5, False)

df.select(split(col("Description"), " ").alias("splitted")).select("splitted", explode(col("splitted")).alias("exploded")).show(10, False)

df.select(create_map(col("CustomerID"), col("Description")).alias("createdMap")).selectExpr( "createdMap", "createdMap['15805.0']").show(2, False)

+---------------------+----------+
|Description          |CustomerID|
+---------------------+----------+
|HAWAIIAN GRASS SKIRT |15805.0   |
+---------------------+----------+
only showing top 1 row

+--------------------------+
|splitted                  |
+--------------------------+
|[HAWAIIAN, GRASS, SKIRT, ]|
+--------------------------+
only showing top 1 row

+-----------+
|splitted[0]|
+-----------+
|HAWAIIAN   |
+-----------+
only showing top 1 row

+--------------------------------------+--------------+
|splitted                              |size(splitted)|
+--------------------------------------+--------------+
|[HAWAIIAN, GRASS, SKIRT, ]            |4             |
|[CHILLI, LIGHTS]                      |2             |
|["RECORD, FRAME, 7"", SINGLE, SIZE, "]|6             |
|[3D, DOG, PICTURE, PLAYING, CARDS]    |5             |
|[60, CAKE, CASES, VINTAGE, CHRISTMAS] |5             |
+--------------------------------------+--------------+
only showing top 5 rows

+--------