In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that every later cell relies on.
session_builder = SparkSession.builder.appName("CSV_Read")
spark = session_builder.getOrCreate()

# Configure the CSV reader: the first row supplies the column names and the
# column types are inferred from the data rather than declared.
csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
df = csv_reader.load("/content/2010-12-01.csv")

# Show the inferred column names and types.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [None]:
# Number of rows in df; as the last expression it becomes the cell output.
df.count()

3108

In [None]:
# Register df as a temporary view named "dfTable" so spark.sql() queries can refer to it.
df.createOrReplaceTempView("dfTable")

In [None]:
# Query the temp view through SQL, then print the first 10 rows.
all_rows = spark.sql("select * from dfTable")
all_rows.show(10)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

In [None]:
from pyspark.sql.functions import lit,col

# lit() turns a Python constant into a Column; the three literals are selected
# next to two real columns so every row repeats the same constant values.
literal_cols = [lit(5), lit("five"), lit(5.0)]
invoice_cols = [col("InvoiceNo"), col("StockCode")]
df.select(*literal_cols, *invoice_cols).show(5)

+---+----+---+---------+---------+
|  5|five|5.0|InvoiceNo|StockCode|
+---+----+---+---------+---------+
|  5|five|5.0|   536365|   85123A|
|  5|five|5.0|   536365|    71053|
|  5|five|5.0|   536365|   84406B|
|  5|five|5.0|   536365|   84029G|
|  5|five|5.0|   536365|   84029E|
+---+----+---+---------+---------+
only showing top 5 rows


In [None]:
# Print the schema again as a reminder of the column types used in the sections that follow.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



# Working with Booleans

In [None]:
from pyspark.sql.functions import col

# InvoiceNo is a string column (see printSchema), so it is compared against a
# string literal. Comparing it with the integer 536365 makes Spark coerce the
# column to a number; ids that are not purely numeric (the describe() output
# shows a max starting with "C") then fail to cast and are dropped from the
# result, or raise an error when ANSI mode is enabled.
#
# Equivalent where-then-select form:
#   df.where(col("InvoiceNo") != "536365")
#     .select("InvoiceNo", "Description")
#     .show(50, False)
df.selectExpr("InvoiceNo", "Description", "Description", "Quantity")\
.where(col("InvoiceNo") != "536365")\
.show(10)



+---------+--------------------+--------------------+--------+
|InvoiceNo|         Description|         Description|Quantity|
+---------+--------------------+--------------------+--------+
|   536366|HAND WARMER UNION...|HAND WARMER UNION...|       6|
|   536366|HAND WARMER RED P...|HAND WARMER RED P...|       6|
|   536367|ASSORTED COLOUR B...|ASSORTED COLOUR B...|      32|
|   536367|POPPY'S PLAYHOUSE...|POPPY'S PLAYHOUSE...|       6|
|   536367|POPPY'S PLAYHOUSE...|POPPY'S PLAYHOUSE...|       6|
|   536367|FELTCRAFT PRINCES...|FELTCRAFT PRINCES...|       8|
|   536367|IVORY KNITTED MUG...|IVORY KNITTED MUG...|       6|
|   536367|BOX OF 6 ASSORTED...|BOX OF 6 ASSORTED...|       6|
|   536367|BOX OF VINTAGE JI...|BOX OF VINTAGE JI...|       3|
|   536367|BOX OF VINTAGE AL...|BOX OF VINTAGE AL...|       2|
+---------+--------------------+--------------------+--------+
only showing top 10 rows


In [None]:
# The filter written as a SQL string predicate. InvoiceNo is a string column,
# so the literal is quoted to keep this a string comparison; an unquoted
# 536365 would force an implicit numeric cast of the column, which cannot
# handle ids that are not purely numeric.
df.selectExpr("InvoiceNo", "Description", "Description", "Quantity")\
.where("InvoiceNo <> '536365'")\
.show(10)

+---------+--------------------+--------------------+--------+
|InvoiceNo|         Description|         Description|Quantity|
+---------+--------------------+--------------------+--------+
|   536366|HAND WARMER UNION...|HAND WARMER UNION...|       6|
|   536366|HAND WARMER RED P...|HAND WARMER RED P...|       6|
|   536367|ASSORTED COLOUR B...|ASSORTED COLOUR B...|      32|
|   536367|POPPY'S PLAYHOUSE...|POPPY'S PLAYHOUSE...|       6|
|   536367|POPPY'S PLAYHOUSE...|POPPY'S PLAYHOUSE...|       6|
|   536367|FELTCRAFT PRINCES...|FELTCRAFT PRINCES...|       8|
|   536367|IVORY KNITTED MUG...|IVORY KNITTED MUG...|       6|
|   536367|BOX OF 6 ASSORTED...|BOX OF 6 ASSORTED...|       6|
|   536367|BOX OF VINTAGE JI...|BOX OF VINTAGE JI...|       3|
|   536367|BOX OF VINTAGE AL...|BOX OF VINTAGE AL...|       2|
+---------+--------------------+--------------------+--------+
only showing top 10 rows


In [None]:
# Keep only the rows of invoice 536366 (string comparison against a quoted literal).
invoice_536366 = (
    df.selectExpr("InvoiceNo", "Description", "Description", "Quantity")
    .where("InvoiceNo = '536366'")
)
invoice_536366.show()

+---------+--------------------+--------------------+--------+
|InvoiceNo|         Description|         Description|Quantity|
+---------+--------------------+--------------------+--------+
|   536366|HAND WARMER UNION...|HAND WARMER UNION...|       6|
|   536366|HAND WARMER RED P...|HAND WARMER RED P...|       6|
+---------+--------------------+--------------------+--------+



In [None]:
from pyspark.sql.functions import instr

# Boolean Column expressions, kept in variables so they can be reused
# (the next cell combines them differently).
priceFilter = col("UnitPrice") > 600
# instr() gives the 1-based position of the substring, 0 when it is absent.
descripFilter = instr(df.Description, "POSTAGE") >= 1

# Chained where() calls are ANDed: StockCode must be DOT, and then either the
# price condition or the description condition must hold.
dot_rows = df.where(df.StockCode.isin("DOT"))
dot_rows.where(priceFilter | descripFilter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      NULL|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      NULL|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [None]:
# DOT rows where BOTH filters hold (AND instead of OR).
# Uses priceFilter / descripFilter defined in the previous cell.
dot_rows_strict = df.where(df.StockCode.isin("DOT") & priceFilter & descripFilter)
dot_rows_strict.show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      NULL|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



# Working with Numbers

In [None]:
from pyspark.sql.functions import expr, pow,round

# NOTE: this import rebinds the names pow and round to the Spark column
# functions, hiding Python's built-ins of the same name from here on.

# (Quantity * UnitPrice) squared, plus 5, expressed with Column arithmetic.
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5

shown_columns = [
    expr(name) for name in ("CustomerId", "Quantity", "UnitPrice", "Description")
]
df.select(*shown_columns, round(fabricatedQuantity, 2).alias("realQuantity")).show(2)

+----------+--------+---------+--------------------+------------+
|CustomerId|Quantity|UnitPrice|         Description|realQuantity|
+----------+--------+---------+--------------------+------------+
|   17850.0|       6|     2.55|WHITE HANGING HEA...|      239.09|
|   17850.0|       6|     3.39| WHITE METAL LANTERN|      418.72|
+----------+--------+---------+--------------------+------------+
only showing top 2 rows


In [None]:
# NOTE(review): this cell looks like an intentional failure demo. It passes a
# Column object to selectExpr(), and the recorded output is a PySparkTypeError.
# Use select() for Column objects, or pass the expression as a SQL string.
fabricatedQuantity = round((pow(col("Quantity") * col("UnitPrice"), 2) + 5),2)
df.selectExpr("CustomerId","Quantity","UnitPrice","Description", fabricatedQuantity.alias("realQuantity")).show(2)
#Note : selectExpr() accepts ONLY strings (SQL expressions)

PySparkTypeError: [NOT_ITERABLE] Column is not iterable.

In [None]:
# The (Quantity * UnitPrice)^2 + 5 calculation written as a SQL expression
# string, which is the form selectExpr() accepts. No rounding is applied here,
# so the result is shown at full double precision.
df.selectExpr(
    "CustomerId", "Quantity", "UnitPrice", "Description",
    "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)

# Equivalent SQL against the temp view (kept as a comment so it is not echoed
# as the cell's output value):
#   SELECT customerId, (POWER((Quantity * UnitPrice), 2.0) + 5) AS realQuantity
#   FROM dfTable

+----------+--------+---------+--------------------+------------------+
|CustomerId|Quantity|UnitPrice|         Description|      realQuantity|
+----------+--------+---------+--------------------+------------------+
|   17850.0|       6|     2.55|WHITE HANGING HEA...|239.08999999999997|
|   17850.0|       6|     3.39| WHITE METAL LANTERN|          418.7156|
+----------+--------+---------+--------------------+------------------+
only showing top 2 rows


'\nin SQL\nSELECT customerId, (POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity\nFROM dfTabl\n'

In [None]:
from pyspark.sql.functions import corr

# Pearson correlation coefficient between Quantity and UnitPrice, two ways.
# 1) The DataFrame.stat helper. It is not the last expression in the cell,
#    so its value is computed but not displayed.
df.stat.corr("Quantity", "UnitPrice")
# 2) The corr() aggregate inside select(), printed as a one-row table.
pearson_table = df.select(corr("Quantity", "UnitPrice"))
pearson_table.show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



In [None]:
 #common task is to compute summary statistics for a column or set of columns.
# describe() builds a summary frame (count, mean, stddev, min, max per column).
summary_stats = df.describe()
summary_stats.show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                NULL| 8.627413127413128| 4.151946589446603|15661.388719512195|          NULL|
| stddev|72.89447869788873|17407.897548583845|                NULL|26.371821677029203|15.638659854603892|1854.4496996893627|          NULL|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

#  Working with Strings

In [None]:
from pyspark.sql.functions import initcap, col

# initcap() capitalises the first letter of each word and lower-cases the
# rest, e.g. "WHITE METAL LANTERN" -> "White Metal Lantern".
capitalised = initcap(col("Description"))
df.select(capitalised).show(5)

+--------------------+
|initcap(Description)|
+--------------------+
|White Hanging Hea...|
| White Metal Lantern|
|Cream Cupid Heart...|
|Knitted Union Fla...|
|Red Woolly Hottie...|
+--------------------+
only showing top 5 rows


In [None]:
from pyspark.sql.functions import col, lower, upper

# Original text, its lower-cased form, and the lower-cased form upper-cased again.
description = col("Description")
lowered = lower(description)
df.select(description, lowered, upper(lowered)).show(2)

+--------------------+--------------------+-------------------------+
|         Description|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
+--------------------+--------------------+-------------------------+
only showing top 2 rows


In [None]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim

# Literal inputs: one surrounded by four spaces on each side, one bare.
padded_hello = lit("    HELLO    ")
plain_hello = lit("HELLO")

df.select(
    ltrim(padded_hello).alias("ltrim"),    # strip leading spaces only
    rtrim(padded_hello).alias("rtrim"),    # strip trailing spaces only
    trim(padded_hello).alias("trim"),      # strip both sides
    # Target length 3 is shorter than "HELLO", so the value is cut to "HEL".
    lpad(plain_hello, 3, " ").alias("lp"),
    # Target length 10 is longer, so spaces are appended on the right.
    rpad(plain_hello, 10, " ").alias("rp"),
).show(2)

+---------+---------+-----+---+----------+
|    ltrim|    rtrim| trim| lp|        rp|
+---------+---------+-----+---+----------+
|HELLO    |    HELLO|HELLO|HEL|HELLO     |
|HELLO    |    HELLO|HELLO|HEL|HELLO     |
+---------+---------+-----+---+----------+
only showing top 2 rows


# Regular Expressions

Regular expressions let you specify a set of rules for either extracting values from a string or replacing them with other values.

There are two key functions in Spark for regular-expression tasks: **regexp_extract** and **regexp_replace**.

These functions extract values and replace values, respectively.

In [None]:
from pyspark.sql.functions import regexp_replace

# Alternation pattern: matches any one of these colour names.
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"

# Replace every matched colour name with the token "COLOR" and show the
# cleaned text beside the original for comparison.
color_clean = regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean")
df.select(color_clean, col("Description")).show(10)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
|CREAM CUPID HEART...|CREAM CUPID HEART...|
|KNITTED UNION FLA...|KNITTED UNION FLA...|
|COLOR WOOLLY HOTT...|RED WOOLLY HOTTIE...|
|SET 7 BABUSHKA NE...|SET 7 BABUSHKA NE...|
|GLASS STAR FROSTE...|GLASS STAR FROSTE...|
|HAND WARMER UNION...|HAND WARMER UNION...|
|HAND WARMER COLOR...|HAND WARMER RED P...|
|ASSORTED COLOUR B...|ASSORTED COLOUR B...|
+--------------------+--------------------+
only showing top 10 rows


In [None]:
from pyspark.sql.functions import translate

# translate() substitutes character by character: each character of "LEET"
# is replaced by the character at the same index of "1337" (L->1, E->3, T->7).
leet_description = translate(col("Description"), "LEET", "1337")
df.select(leet_description, col("Description")).show(2)

# SQL equivalent:
#   SELECT translate(Description, 'LEET', '1337'), Description FROM dfTable

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows


# Working with Dates and Timestamps

In [None]:
from pyspark.sql.functions import current_date, current_timestamp

# Ten-row frame (ids 0-9) stamped with the current date and timestamp.
id_rows = spark.range(10)
dateDF = (
    id_rows
    .withColumn("today", current_date())
    .withColumn("now", current_timestamp())
)

# Expose it to SQL as "dateTable" and confirm the column types.
dateDF.createOrReplaceTempView("dateTable")
dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [None]:
# Read the date frame back through its temp view.
date_rows = spark.sql("select * from dateTable")
date_rows.show()

+---+----------+--------------------+
| id|     today|                 now|
+---+----------+--------------------+
|  0|2025-12-26|2025-12-26 18:42:...|
|  1|2025-12-26|2025-12-26 18:42:...|
|  2|2025-12-26|2025-12-26 18:42:...|
|  3|2025-12-26|2025-12-26 18:42:...|
|  4|2025-12-26|2025-12-26 18:42:...|
|  5|2025-12-26|2025-12-26 18:42:...|
|  6|2025-12-26|2025-12-26 18:42:...|
|  7|2025-12-26|2025-12-26 18:42:...|
|  8|2025-12-26|2025-12-26 18:42:...|
|  9|2025-12-26|2025-12-26 18:42:...|
+---+----------+--------------------+



In [None]:
from pyspark.sql.functions import date_add, date_sub

# Shift the "today" column five days back and five days forward.
today_col = col("today")
five_days_back = date_sub(today_col, 5)
five_days_ahead = date_add(today_col, 5)
dateDF.select(five_days_back, five_days_ahead).show(10)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2025-12-21|        2025-12-31|
|        2025-12-21|        2025-12-31|
|        2025-12-21|        2025-12-31|
|        2025-12-21|        2025-12-31|
|        2025-12-21|        2025-12-31|
|        2025-12-21|        2025-12-31|
|        2025-12-21|        2025-12-31|
|        2025-12-21|        2025-12-31|
|        2025-12-21|        2025-12-31|
|        2025-12-21|        2025-12-31|
+------------------+------------------+



In [None]:
from pyspark.sql.functions import datediff, months_between, to_date

# datediff(end, start) counts days from start to end; here "end" is the
# earlier date (a week ago), so the result is negative.
with_week_ago = dateDF.withColumn("week_ago", date_sub(col("today"), 7))
with_week_ago.select(datediff(col("week_ago"), col("today"))).show(1)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row


In [None]:
# Number of months between two literal dates (2026-01-01 to 2027-05-01).
date_range = dateDF.select(
    to_date(lit("2026-01-01")).alias("start"),
    to_date(lit("2027-05-01")).alias("end"),
)
date_range.select(months_between(col("end"), col("start"))).show(1)

# SQL equivalent:
#   SELECT months_between(to_date('2027-05-01'), to_date('2026-01-01'))
#   FROM dateTable

+--------------------------------+
|months_between(end, start, true)|
+--------------------------------+
|                            16.0|
+--------------------------------+
only showing top 1 row


In [None]:
from pyspark.sql.functions import to_date

# The pattern is year-DAY-month, so "2017-12-11" is read as 12 November 2017
# and "2017-20-12" as 20 December 2017.
dateFormat = "yyyy-dd-MM"
first_date = to_date(lit("2017-12-11"), dateFormat).alias("date")
second_date = to_date(lit("2017-20-12"), dateFormat).alias("date2")
cleanDateDF = spark.range(1).select(first_date, second_date)

cleanDateDF.createOrReplaceTempView("dateTable2")

spark.sql("select * from dateTable2").show()

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



# Working with Nulls in Data

The primary way of interacting with null values at DataFrame scale is the `.na` accessor on a DataFrame.

There are two things you can do with null values:
*   explicitly drop them
*   fill them with a value (globally or on a per-column basis)




In [None]:
# coalesce() returns, for each row, the first of its arguments that is not null.
from pyspark.sql.functions import col, coalesce

# Fall back to the customer id (cast to string so both arguments share a type)
# whenever Description is null.
customer_as_text = col("CustomerId").cast("string")
final_value = coalesce(col("Description"), customer_as_text).alias("final_value")
df.select(final_value).show()

+--------------------+
|         final_value|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|CREAM CUPID HEART...|
|KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|
|SET 7 BABUSHKA NE...|
|GLASS STAR FROSTE...|
|HAND WARMER UNION...|
|HAND WARMER RED P...|
|ASSORTED COLOUR B...|
|POPPY'S PLAYHOUSE...|
|POPPY'S PLAYHOUSE...|
|FELTCRAFT PRINCES...|
|IVORY KNITTED MUG...|
|BOX OF 6 ASSORTED...|
|BOX OF VINTAGE JI...|
|BOX OF VINTAGE AL...|
|HOME BUILDING BLO...|
|LOVE BUILDING BLO...|
|RECIPE BOX WITH M...|
+--------------------+
only showing top 20 rows


  *   ifnull - returns the second value if the first is null; otherwise returns the first.

  *   nullif - returns null if the two values are equal; otherwise returns the first value.

  *   nvl - returns the second value if the first is null; otherwise returns the first.
  *   nvl2 - returns the second value if the first is not null; otherwise returns the last specified value.

**drop**

 The simplest function is drop, which removes rows that contain nulls. The default is to drop any row in which any value is null:
 *   df.na.drop()
 *   df.na.drop("any")

Specifying "any" as an argument drops a row if any of the values are null. Using “all” drops the row only if all values are null or NaN for that row:
*   df.na.drop("all")

In [None]:
# Drop a row only when both StockCode and InvoiceNo are null. The result is a
# new DataFrame that is displayed but not assigned, so df itself is unchanged.
df.na.drop(how="all", subset=["StockCode", "InvoiceNo"])

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

**fill**


 Using the fill function, we can fill one or more columns with a set of values. This can be done by specifying a map—that is a particular value and a set of columns.

 For example, to fill all null values in columns of type String, we might specify the following:
 df.na.fill("All Null values become this string")


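 **replace**

 Replaces specific values in a column with other values. For example, to turn empty-string descriptions into "UNKNOWN":

  df.na.replace([""], ["UNKNOWN"], "Description")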

In [None]:
# Swap empty-string values in Description for "UNKNOWN", then display the result.
descriptions_filled = df.na.replace([""], ["UNKNOWN"], "Description")
descriptions_filled.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

In [None]:
# Value-for-value replacement in the Country column: "United Kingdom" -> "India".
country_replaced = df.na.replace(["United Kingdom"], ["India"], "Country")
country_replaced.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+-------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|  India|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|  India|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|  India|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|  India|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|  India|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|  India|
|   536365|    21730|GLASS STAR FROSTE...|       6|2010-12-01 08:26:00|     4.25|   17850.0

**Ordering**


 Use asc_nulls_first, desc_nulls_first, asc_nulls_last, or desc_nulls_last to specify where null values should appear in an ordered DataFrame.

# Working with Complex Types

There are three kinds of complex types:
*   Structs
*   Arrays
*   Maps

In [4]:
from pyspark.sql.functions import struct

# Pack Description and InvoiceNo into a single struct column named "complex".
complex_col = struct("Description", "InvoiceNo").alias("complex")
complexDF = df.select(complex_col)
# Make the struct frame queryable from SQL under the same name.
complexDF.createOrReplaceTempView("complexDF")

In [6]:
# Read the struct column back through the temp view.
complex_rows = spark.sql("select * from complexDF")
complex_rows.show(2)

+--------------------+
|             complex|
+--------------------+
|{WHITE HANGING HE...|
|{WHITE METAL LANT...|
+--------------------+
only showing top 2 rows


In [7]:
# Show the struct frame directly through the DataFrame API.
complexDF.show(2)

+--------------------+
|             complex|
+--------------------+
|{WHITE HANGING HE...|
|{WHITE METAL LANT...|
+--------------------+
only showing top 2 rows


**Split**

In [13]:
from pyspark.sql.functions import split,col,size

# Split each description on single spaces, producing an array column of words.
description_words = split(col("Description"), " ")
df.select(description_words).show(2)

+-------------------------+
|split(Description,  , -1)|
+-------------------------+
|     [WHITE, HANGING, ...|
|     [WHITE, METAL, LA...|
+-------------------------+
only showing top 2 rows


In [10]:
# Name the word array so it can be indexed from a SQL expression; [0] is the first word.
word_arrays = df.select(split(col("Description"), " ").alias("array_col"))
word_arrays.selectExpr("array_col[0]").show(2)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows


In [14]:
# Word count per description: shows 5 and 3 for the first two rows.
word_count = size(split(col("Description"), " "))
df.select(word_count).show(2)

+-------------------------------+
|size(split(Description,  , -1))|
+-------------------------------+
|                              5|
|                              3|
+-------------------------------+
only showing top 2 rows


In [15]:
from pyspark.sql.functions import array_contains

# True when the word array contains an element equal to "WHITE".
description_words = split(col("Description"), " ")
has_white = array_contains(description_words, "WHITE")
df.select(has_white).show(2)

+------------------------------------------------+
|array_contains(split(Description,  , -1), WHITE)|
+------------------------------------------------+
|                                            true|
|                                            true|
+------------------------------------------------+
only showing top 2 rows


**Explode**

The explode function takes a column that consists of arrays and creates one row per value in the array.

In [17]:
from pyspark.sql.functions import explode

# Step 1: add the word array. Step 2: explode it so each word gets its own
# row, with the other columns repeated alongside.
with_words = df.withColumn("splitted", split(col("Description"), " "))
one_row_per_word = with_words.withColumn("exploded", explode(col("splitted")))
one_row_per_word.select("Description", "InvoiceNo", "exploded").show(2)

+--------------------+---------+--------+
|         Description|InvoiceNo|exploded|
+--------------------+---------+--------+
|WHITE HANGING HEA...|   536365|   WHITE|
|WHITE HANGING HEA...|   536365| HANGING|
+--------------------+---------+--------+
only showing top 2 rows


 **Maps**

In Spark, a Map is a data type that stores key–value pairs
just like a dictionary in Python


# JSON support

In [18]:
from pyspark.sql.functions import from_json,to_json
from pyspark.sql.types import *

# Schema used to parse the JSON text back into a struct: two nullable strings.
invoice_field = StructField("InvoiceNo", StringType(), True)
description_field = StructField("Description", StringType(), True)
parseSchema = StructType((invoice_field, description_field))

# Round trip: build a struct, serialise it to a JSON string, then parse that
# string back with the schema and show both side by side.
as_struct = df.selectExpr("(InvoiceNo, Description) as myStruct")
as_json = as_struct.select(to_json(col("myStruct")).alias("newJSON"))
as_json.select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2)

+--------------------+--------------------+
|  from_json(newJSON)|             newJSON|
+--------------------+--------------------+
|{536365, WHITE HA...|{"InvoiceNo":"536...|
|{536365, WHITE ME...|{"InvoiceNo":"536...|
+--------------------+--------------------+
only showing top 2 rows


# User Defined Function(UDF)

You can write your own custom transformations in Python or Scala, and even use external libraries. UDFs are just functions that operate on the data, record by record. By default, these functions are registered as temporary functions to be used in that specific SparkSession or Context.

There are performance considerations that you should be aware of: for example, a Python UDF requires data to be passed between the JVM and a Python worker process.


In [21]:
 #Step 1:
def power3(double_value):
  """Return ``double_value`` raised to the third power."""
  exponent = 3
  cubed = double_value ** exponent
  return cubed

In [22]:
# Five-row frame whose single column is renamed to "num".
udfExampleDF = spark.range(5).toDF("num")
# Quick local check of the plain Python function before wrapping it as a UDF.
power3(2.0)

8.0

In [23]:
# Display the input rows the UDF will be applied to.
udfExampleDF.show()

+---+
|num|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [24]:
#Step 2:
# Register the function with Spark so that it can be used on all worker
# machines: Spark serializes the function on the driver and transfers it over
# the network to the executor processes.
#
# udf() defaults to a string return type, which would hand the cubes back as
# strings. Declaring the return type keeps the result column numeric. "long"
# matches the integer column produced by spark.range(); if the UDF is applied
# to floating-point columns, declare "double" instead, because a Python value
# that does not match the declared type may come back as null.

from pyspark.sql.functions import udf
power3udf = udf(power3, returnType="long")


In [26]:
 #Step 3:
 #we can use it in our DataFrame code:

from pyspark.sql.functions import col

# Apply the registered UDF to the "num" column like any built-in column function.
cubed_column = power3udf(col("num"))
udfExampleDF.select(cubed_column).show(5)

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
|          8|
|         27|
|         64|
+-----------+

