## Working with Different Types of Data

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import regexp_replace, col

# Start (or reuse) the SparkSession that the cells below use through `spark`.
spark = (
    SparkSession.builder
    .appName("PySparkPractice")
    .getOrCreate()
)

In [2]:
# Repeats the session setup from the first cell; getOrCreate() hands back the
# already-running session when one exists, so `spark` ends up bound to that session.
_session_builder = SparkSession.builder.appName("PySparkPractice")
spark = _session_builder.getOrCreate()

In [3]:
# Directory holding the per-day retail CSV files read by this notebook.
# Set the RETAIL_DATA_DIR environment variable to point at your own copy of the
# data; when it is unset, the original hard-coded location is used, so existing
# behaviour is unchanged.
_DEFAULT_RETAIL_DATA_DIR = "C:/Users/srima/Documents/learning/Technology/Data Engineering/PySpark/Spark-The-Definitive-Guide-master/data/retail-data/by-day"
# Trailing separators are stripped because the load cell builds f"{path}/<file>.csv".
path = os.environ.get("RETAIL_DATA_DIR", _DEFAULT_RETAIL_DATA_DIR).rstrip("/\\")

In [4]:
# Create schema: explicit (name, type) pairs for the retail CSV; every column is nullable.
# NOTE(review): InvoiceNo is typed as an integer; the describe() output further
# down counts fewer InvoiceNo values than rows, which suggests some non-numeric
# invoice numbers are being read as NULL -- confirm against the raw file.
retail_fields = [
    ("InvoiceNo", IntegerType()),
    ("StockCode", StringType()),
    ("Description", StringType()),
    ("Quantity", IntegerType()),
    ("InvoiceDate", TimestampType()),
    ("UnitPrice", DoubleType()),
    ("CustomerID", StringType()),
    ("Country", StringType()),
]
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in retail_fields]
)

# Read a single day's file with the schema above; the first line is a header row.
csv_file = f"{path}/2010-12-01.csv"
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(schema)
    .load(csv_file)
)
df.show(5)
df.printSchema()
# Register the frame under the name "dfTable" as a temporary view.
df.createOrReplaceTempView("dfTable")


+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows
root

### Converting to Spark Types

In [5]:
#df.select(lit(5), lit("five"), lit(5.0))

### Working with Booleans

In [6]:
# Column-expression form of the filter: rows whose InvoiceNo differs from 536365.
not_first_invoice = col("InvoiceNo") != 536365
(
    df.where(not_first_invoice)
    .select("InvoiceNo", "Description")
    .show(5, False)  # False -> do not truncate long strings
)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows


In [7]:
# SQL-string form of the same inequality filter, this time showing every column.
other_invoices = df.where("InvoiceNo <> 536365")
other_invoices.show(5, False)

+---------+---------+-----------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                  |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------+--------+-------------------+---------+----------+--------------+
|536366   |22633    |HAND WARMER UNION JACK       |6       |2010-12-01 08:28:00|1.85     |17850.0   |United Kingdom|
|536366   |22632    |HAND WARMER RED POLKA DOT    |6       |2010-12-01 08:28:00|1.85     |17850.0   |United Kingdom|
|536367   |84879    |ASSORTED COLOUR BIRD ORNAMENT|32      |2010-12-01 08:34:00|1.69     |13047.0   |United Kingdom|
|536367   |22745    |POPPY'S PLAYHOUSE BEDROOM    |6       |2010-12-01 08:34:00|2.1      |13047.0   |United Kingdom|
|536367   |22748    |POPPY'S PLAYHOUSE KITCHEN    |6       |2010-12-01 08:34:00|2.1      |13047.0   |United Kingdom|
+---------+---------+-----------------------------+--------+----

In [8]:
# Two reusable boolean Column expressions (priceFilter is reused by the next cell).
priceFilter = col("UnitPrice") > 600
descripFilter = col("Description").contains("POSTAGE")

# Chained where() calls combine as AND: StockCode must be "DOT", and the row must
# additionally satisfy the price filter or the description filter.
dot_rows = df.where(df.StockCode.isin("DOT"))
dot_rows.where(priceFilter | descripFilter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      NULL|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      NULL|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [9]:
# Build the boolean as a real column so it can be both filtered on and displayed.
DOTCodeFilter = col("StockCode") == "DOT"
# Rebinds descripFilter from the previous cell with an instr()-based test.
descripFilter = instr(col("Description"), "POSTAGE") >= 1

# priceFilter comes from the previous cell.
is_expensive = DOTCodeFilter & (priceFilter | descripFilter)
flagged = df.withColumn("isExpensive", is_expensive)
flagged.where("isExpensive").select("UnitPrice", "isExpensive" ).show(5, False)

+---------+-----------+
|UnitPrice|isExpensive|
+---------+-----------+
|569.77   |true       |
|607.49   |true       |
+---------+-----------+



In [10]:
# Same idea expressed as a SQL expression string: flag rows priced above 250.
pricey = df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))
pricey.where("isExpensive").select("UnitPrice", "Description").show(5, False)


+---------+--------------+
|UnitPrice|Description   |
+---------+--------------+
|569.77   |DOTCOM POSTAGE|
|607.49   |DOTCOM POSTAGE|
+---------+--------------+



### Working with numbers

In [11]:
# (Quantity * UnitPrice) squared, plus 5 -- built from Column expressions.
# `pow` here is the pyspark.sql.functions version brought in by the star import.
line_total = col("Quantity") * col("UnitPrice")
fabricatedQuantity = pow(line_total, 2) + 5
df.select(
    expr("CustomerID"),
    fabricatedQuantity.alias("realQuentity"),
).show(2)

+----------+------------------+
|CustomerID|      realQuentity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows


In [12]:
# The same calculation written as a SQL expression string.
real_quantity_sql = "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuentity"
df.selectExpr("CustomerId", real_quantity_sql).show(2)

+----------+------------------+
|CustomerId|      realQuentity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows


In [13]:
#df.select(round(lit("2.5")), bround(lit("2.5"))).show(2)

In [14]:
# Correlation between unit price and quantity, computed two ways.
# DataFrame.stat.corr returns a plain Python number rather than a DataFrame. A
# notebook only echoes the last expression of a cell, so the value is printed
# explicitly here instead of being computed and silently discarded.
price_quantity_corr = df.stat.corr("UnitPrice", "Quantity")
print(price_quantity_corr)

# The same statistic as a column aggregate, rendered as a one-row table.
df.select(corr("UnitPrice", "Quantity")).show()

+-------------------------+
|corr(UnitPrice, Quantity)|
+-------------------------+
|     -0.04112314436835552|
+-------------------------+



In [15]:
# Summary statistics (count / mean / stddev / min / max) for the columns of df.
summary_stats = df.describe()
summary_stats.show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3082|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                NULL| 8.627413127413128| 4.151946589446603|15661.388719512195|          NULL|
| stddev|72.89447869788873|17407.897548583845|                NULL|26.371821677029203|15.638659854603892|1854.4496996893627|          NULL|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|           

In [16]:
# colName = "UnitPrice"
# quantileProbs = [0.5]
# relError = 0.05
# df.stat.approxQuantile("UnitPrice", quantileProbs, relError)

### Working with Strings

In [17]:
# Capitalise the first letter of every word in Description.
title_cased = initcap(col("Description"))
df.select(title_cased).show()

+--------------------+
|initcap(Description)|
+--------------------+
|White Hanging Hea...|
| White Metal Lantern|
|Cream Cupid Heart...|
|Knitted Union Fla...|
|Red Woolly Hottie...|
|Set 7 Babushka Ne...|
|Glass Star Froste...|
|Hand Warmer Union...|
|Hand Warmer Red P...|
|Assorted Colour B...|
|Poppy's Playhouse...|
|Poppy's Playhouse...|
|Feltcraft Princes...|
|Ivory Knitted Mug...|
|Box Of 6 Assorted...|
|Box Of Vintage Ji...|
|Box Of Vintage Al...|
|Home Building Blo...|
|Love Building Blo...|
|Recipe Box With M...|
+--------------------+
only showing top 20 rows


In [18]:
# Original, lower-cased and upper-cased description side by side.
description = col("Description")
df.select(description, lower(description), upper(description)).show(2)

+--------------------+--------------------+--------------------+
|         Description|  lower(Description)|  upper(Description)|
+--------------------+--------------------+--------------------+
|WHITE HANGING HEA...|white hanging hea...|WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern| WHITE METAL LANTERN|
+--------------------+--------------------+--------------------+
only showing top 2 rows


### Regular Expressions

In [19]:
# Alternation pattern: any one of these colour words.
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
# Replace every match with the literal word COLOR and show it next to the original.
color_clean = regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean")
df.select(color_clean, col("Description")).show(2)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows


In [20]:
# Substring tests built with instr(); each is a boolean Column.
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1

# Keep descriptions that mention either colour.
has_simple_color = containsBlack | containsWhite
(
    df.withColumn("hasSimpleColor", has_simple_color)
    .where("hasSimpleColor")
    .select("Description")
    .show(3, False)
)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows


In [21]:
# sampleColor = ["black", "white", "red", "green", "blue"]
# selectedColumns = [color_locator(df.Description, c) for c in sampleColor]

# def color_locator(column, color_string):
#     return locate(color_string.upper(), column)\
#         .cast("boolean").alias("is_"+ c)
# selectedColumns.append(expr("*"))

# df.select(*selectedColumns).where(expr("is_white OR is_red"))\
# .select("Description").show(3, False)

### Working with Dates and Timestamps

In [22]:
# Ten-row frame (ids 0-9) carrying the current date and the current timestamp.
ten_ids = spark.range(10)
dateDF = ten_ids.withColumn("today", current_date()).withColumn("now", current_timestamp())

# Register the frame under the name "dateTable" as a temporary view.
dateDF.createOrReplaceTempView("dateTable")

dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [23]:
# Shift `today` five days back and five days forward.
five_days_back = date_sub(col("today"), 5)
five_days_ahead = date_add(col("today"), 5)
dateDF.select(five_days_back, five_days_ahead).show(2)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2025-07-18|        2025-07-28|
|        2025-07-18|        2025-07-28|
+------------------+------------------+
only showing top 2 rows


In [24]:
# 1) Difference between a date one week ago and today (first argument is the
#    earlier date here, so the result is negative).
with_week_ago = dateDF.withColumn("week_ago", date_sub(col("today"), 7))
with_week_ago.select(datediff(col("week_ago"), col("today"))).show(1)

# 2) today vs. yesterday. dateDF has no "yesterday" column, so col("yesterday")
#    refers to the alias defined earlier in this same select.
today_col = to_date(current_date()).alias("today")
yesterday_col = to_date(current_date()-1).alias("yesterday")
today_minus_yesterday = datediff(col("today"), col("yesterday")).alias("dif_days")
dateDF.select(today_col, yesterday_col, today_minus_yesterday).show(2, False)

# 3) Two literal dates parsed with to_date; start_date is passed first, so the
#    difference comes out negative.
start_col = to_date(lit("2016-01-01")).alias("start_date")
end_col = to_date(lit("2025-07-23")).alias("end_date")
start_minus_end = datediff(col("start_date"), col("end_date")).alias("dif_days")
dateDF.select(start_col, end_col, start_minus_end).show(2, False)


+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row
+----------+----------+--------+
|today     |yesterday |dif_days|
+----------+----------+--------+
|2025-07-23|2025-07-22|1       |
|2025-07-23|2025-07-22|1       |
+----------+----------+--------+
only showing top 2 rows
+----------+----------+--------+
|start_date|end_date  |dif_days|
+----------+----------+--------+
|2016-01-01|2025-07-23|-3491   |
|2016-01-01|2025-07-23|-3491   |
+----------+----------+--------+
only showing top 2 rows


In [25]:
# Year-day-month pattern: with it, "2016-11-12" is read as 11 December 2016.
dateformat = "yyyy-dd-MM"

start_date = to_date(lit("2016-11-12"), dateformat).alias("start_date")
end_date = to_date(lit("2025-23-07"), dateformat).alias("end_date")
dateDF.select(start_date, end_date).show(2, False)

+----------+----------+
|start_date|end_date  |
+----------+----------+
|2016-12-11|2025-07-23|
|2016-12-11|2025-07-23|
+----------+----------+
only showing top 2 rows


### Working with Nulls in Data

#### Coalesce

In [26]:
# Print the schema of df: each column's name, type and nullability.
df.printSchema()

root
 |-- InvoiceNo: integer (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



In [27]:
# Per row: Description when it is non-null, otherwise CustomerID.
first_non_null = coalesce(col("Description"), col("CustomerID"))
df.select(first_non_null).show()

+---------------------------------+
|coalesce(Description, CustomerID)|
+---------------------------------+
|             WHITE HANGING HEA...|
|              WHITE METAL LANTERN|
|             CREAM CUPID HEART...|
|             KNITTED UNION FLA...|
|             RED WOOLLY HOTTIE...|
|             SET 7 BABUSHKA NE...|
|             GLASS STAR FROSTE...|
|             HAND WARMER UNION...|
|             HAND WARMER RED P...|
|             ASSORTED COLOUR B...|
|             POPPY'S PLAYHOUSE...|
|             POPPY'S PLAYHOUSE...|
|             FELTCRAFT PRINCES...|
|             IVORY KNITTED MUG...|
|             BOX OF 6 ASSORTED...|
|             BOX OF VINTAGE JI...|
|             BOX OF VINTAGE AL...|
|             HOME BUILDING BLO...|
|             LOVE BUILDING BLO...|
|             RECIPE BOX WITH M...|
+---------------------------------+
only showing top 20 rows


In [28]:
# Row count before dropping anything. It is printed explicitly: a notebook only
# echoes a cell's last expression, so a bare df.count() here would run a job and
# throw the answer away.
print(df.count())

# na.drop("all") removes a row only when every one of its columns is null.
newdf = df.na.drop("all")
newdf.show()



+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

##### Display only rows with null values in the specified columns

In [29]:
# Rows where both Description and CustomerID are missing.
description_missing = col("Description").isNull()
customer_missing = col("CustomerID").isNull()
nullDf = df.filter(description_missing & customer_missing)
nullDf.show()

+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|   536414|    22139|       NULL|      56|2010-12-01 11:52:00|      0.0|      NULL|United Kingdom|
|   536545|    21134|       NULL|       1|2010-12-01 14:32:00|      0.0|      NULL|United Kingdom|
|   536546|    22145|       NULL|       1|2010-12-01 14:33:00|      0.0|      NULL|United Kingdom|
|   536547|    37509|       NULL|       1|2010-12-01 14:33:00|      0.0|      NULL|United Kingdom|
|   536549|   85226A|       NULL|       1|2010-12-01 14:34:00|      0.0|      NULL|United Kingdom|
|   536550|    85044|       NULL|       1|2010-12-01 14:34:00|      0.0|      NULL|United Kingdom|
|   536552|    20950|       NULL|       1|2010-12-01 14:34:00|      0.0|      NULL|United Kingdom|
|   536553

In [30]:
# Number of rows in nullDf; the bare name on the last line is what the cell displays.
null_row_count = nullDf.count()
null_row_count

10

#### Replace

In [31]:
# na.replace swaps matching values: empty-string descriptions become "N/A".
# NOTE(review): the rows in nullDf hold NULL descriptions, not "", and the saved
# output still shows NULL -- na.fill is presumably what is wanted for nulls; confirm.
descriptions_replaced = nullDf.na.replace([""], ["N/A"], "Description")
descriptions_replaced.show()


+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|   536414|    22139|       NULL|      56|2010-12-01 11:52:00|      0.0|      NULL|United Kingdom|
|   536545|    21134|       NULL|       1|2010-12-01 14:32:00|      0.0|      NULL|United Kingdom|
|   536546|    22145|       NULL|       1|2010-12-01 14:33:00|      0.0|      NULL|United Kingdom|
|   536547|    37509|       NULL|       1|2010-12-01 14:33:00|      0.0|      NULL|United Kingdom|
|   536549|   85226A|       NULL|       1|2010-12-01 14:34:00|      0.0|      NULL|United Kingdom|
|   536550|    85044|       NULL|       1|2010-12-01 14:34:00|      0.0|      NULL|United Kingdom|
|   536552|    20950|       NULL|       1|2010-12-01 14:34:00|      0.0|      NULL|United Kingdom|
|   536553

In [44]:
# Flat "-"-separated string holding repeating (id, name, age) triples.
ld = "1-A-12-2-B-23-3-C-34-4-D-45"
lstData = ld.split("-")
print(lstData)

# Cut the flat token list into consecutive groups of three, then convert the
# id and age tokens to integers.
triples = [lstData[start:start + 3] for start in range(0, len(lstData), 3)]
data = [(int(rec_id), rec_name, int(rec_age)) for rec_id, rec_name, rec_age in triples]
print(data)

# Rebinds `schema` (previously the retail CSV schema) to the three-column layout
# of the parsed records: id, name, age -- all nullable.
person_fields = [
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
]
schema = StructType(person_fields)

# Turn the list of tuples built above into a DataFrame and display it.
mdf = spark.createDataFrame(data, schema)
mdf.show()



['1', 'A', '12', '2', 'B', '23', '3', 'C', '34', '4', 'D', '45']
[(1, 'A', 12), (2, 'B', 23), (3, 'C', 34), (4, 'D', 45)]
+---+----+---+
| id|name|age|
+---+----+---+
|  1|   A| 12|
|  2|   B| 23|
|  3|   C| 34|
|  4|   D| 45|
+---+----+---+



In [47]:
# Flat comma-separated string holding repeating groups of four tokens.
nld = "1,A,12,M,2,B,23,F,3,C,34,F,4,D,45,M"

nlst = nld.split(",")
print(nlst)

# Cut the token list into consecutive groups of four; the first and third token
# of each group become integers, the second and fourth stay strings.
quads = [nlst[start:start + 4] for start in range(0, len(nlst), 4)]
data = [(int(first), str(second), int(third), str(fourth)) for first, second, third, fourth in quads]
print(data)






['1', 'A', '12', 'M', '2', 'B', '23', 'F', '3', 'C', '34', 'F', '4', 'D', '45', 'M']
[(1, 'A', 12, 'M'), (2, 'B', 23, 'F'), (3, 'C', 34, 'F'), (4, 'D', 45, 'M')]
