<a href="https://colab.research.google.com/github/simantinip04/Data-Engineering/blob/main/PySpark/3June_Sales.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark



In [None]:
# Colab helper module used to expose Google Drive inside the runtime.
from google.colab import drive
# Mount Drive at /content/drive; the CSV path read later in this notebook lives under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that the rest of the notebook works with.
spark = (
    SparkSession.builder
    .appName("SalesDataset")
    .getOrCreate()
)

# Read the sales CSV from the mounted Drive: first row is the header,
# and column types are inferred from the data.
file_path = "/content/drive/MyDrive/Sales_Dataset__500_Records_.csv"
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

In [None]:
# View schema: print each column's name, inferred type and nullability as a tree.
df.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- ProductCategory: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- DeliveryStatus: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- City: string (nullable = true)
 |-- PaymentMode: string (nullable = true)
 |-- CustomerSince: date (nullable = true)



In [None]:
# Show sample data: render the first 5 rows as a text table.
df.show(5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

In [None]:
# Show sample data: the last 5 rows, returned as a Python list of Row objects
# (unlike show(), nothing is printed as a table).
df.tail(5)

[Row(OrderID=2930, CustomerName='Jaime Harris', ProductCategory='Fashion', Amount=680.0, OrderDate=datetime.date(2025, 2, 16), DeliveryStatus='Returned', Discount=0.2, City='Robertville', PaymentMode='Cash', CustomerSince=datetime.date(2021, 1, 28)),
 Row(OrderID=7980, CustomerName='Dawn Wyatt', ProductCategory='Fashion', Amount=285.32, OrderDate=datetime.date(2024, 4, 26), DeliveryStatus='Cancelled', Discount=0.06, City='Cherylfurt', PaymentMode='UPI', CustomerSince=datetime.date(2021, 4, 14)),
 Row(OrderID=7770, CustomerName='Kristin White', ProductCategory='Groceries', Amount=792.11, OrderDate=datetime.date(2024, 8, 10), DeliveryStatus='Returned', Discount=0.07, City='Kingport', PaymentMode='UPI', CustomerSince=datetime.date(2022, 11, 16)),
 Row(OrderID=6641, CustomerName='Jennifer Taylor', ProductCategory='Toys', Amount=578.49, OrderDate=datetime.date(2025, 4, 3), DeliveryStatus='Delivered', Discount=0.1, City='Lake Jerryburgh', PaymentMode='Cash', CustomerSince=datetime.date(2020,

In [None]:
# Show data types as a list of (column name, type name) pairs.
df.dtypes

[('OrderID', 'int'),
 ('CustomerName', 'string'),
 ('ProductCategory', 'string'),
 ('Amount', 'double'),
 ('OrderDate', 'date'),
 ('DeliveryStatus', 'string'),
 ('Discount', 'double'),
 ('City', 'string'),
 ('PaymentMode', 'string'),
 ('CustomerSince', 'date')]

In [None]:
#2 Selection, Renaming, and Filtering
from pyspark.sql.functions import col

# Project the frame down to the order id, customer name and amount.
df_sel = df.select(col("OrderID"), col("CustomerName"), col("Amount"))
df_sel.show(n=5)

+-------+--------------+------+
|OrderID|  CustomerName|Amount|
+-------+--------------+------+
|   2824| Donald Walker|783.04|
|   7912|  Brandon Hall| 905.0|
|   4611|  Donald Booth|657.96|
|   3547|Phillip Garcia|606.89|
|   8527|  Valerie Gray| 77.87|
+-------+--------------+------+
only showing top 5 rows



In [None]:
# Re-select every column, aliasing "Amount" to "OrderAmount" and leaving the rest as-is.
df_renamed = df_sel.select(
    *[col(name).alias("OrderAmount" if name == "Amount" else name) for name in df_sel.columns]
)
df_renamed.show(n=5)

+-------+--------------+-----------+
|OrderID|  CustomerName|OrderAmount|
+-------+--------------+-----------+
|   2824| Donald Walker|     783.04|
|   7912|  Brandon Hall|      905.0|
|   4611|  Donald Booth|     657.96|
|   3547|Phillip Garcia|     606.89|
|   8527|  Valerie Gray|      77.87|
+-------+--------------+-----------+
only showing top 5 rows



In [None]:
# Keep only the orders whose Amount is strictly greater than 500.
df_filtered = df.where("Amount > 500")
df_filtered.show(n=5)

+-------+------------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+------------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   6155|Jonathan Wilkerson|        Fashion|882.68|2024-10-14|     Cancelled|    0.27|    

In [None]:
# Filter by city using .filter() with an exact, case-sensitive match.
# NOTE: the output recorded below this cell is an empty table, i.e. no row matched.
df_city = df.filter(df.City == "New York")
df_city.show(n=5)

+-------+------------+---------------+------+---------+--------------+--------+----+-----------+-------------+
|OrderID|CustomerName|ProductCategory|Amount|OrderDate|DeliveryStatus|Discount|City|PaymentMode|CustomerSince|
+-------+------------+---------------+------+---------+--------------+--------+----+-----------+-------------+
+-------+------------+---------------+------+---------+--------------+--------+----+-----------+-------------+



In [None]:
#3: Data Manipulation
from pyspark.sql.functions import expr

# Remove CustomerSince by selecting every other column, in the original order.
df_drop = df.select([name for name in df.columns if name != "CustomerSince"])
df_drop.show(n=5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----

In [None]:
# FinalAmount = Amount minus the discounted portion (Amount * Discount).
# The arithmetic order is kept as-is so floating-point results do not change.
df_final = df.withColumn(
    "FinalAmount",
    df["Amount"] - (df["Amount"] * df["Discount"]),
)
df_final.show(n=5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+-----------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|      FinalAmount|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+-----------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|          665.584|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|           877.85|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|         651.3804|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|         51

In [None]:
# Largest FinalAmount first.
df_sorted = df_final.sort("FinalAmount", ascending=False)
df_sorted.show(n=5)

+-------+--------------+---------------+------+----------+--------------+--------+------------+-----------+-------------+-----------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|        City|PaymentMode|CustomerSince|      FinalAmount|
+-------+--------------+---------------+------+----------+--------------+--------+------------+-----------+-------------+-----------------+
|   5573|Jordan Frazier|          Books|981.05|2025-03-19|     Cancelled|    0.02| Sheilaville|       Cash|   2021-07-12|          961.429|
|   8474|   Heidi Brown|    Electronics|968.91|2023-11-23|     Cancelled|    0.02|  Riverafort|       Cash|   2023-03-19|         949.5318|
|   8889|   Karen Garza|          Books| 998.3|2024-10-17|     Cancelled|    0.06|  Johnsonton|Credit Card|   2020-12-17|938.4019999999999|
|   2127|  Jaclyn Moore|      Groceries|933.32|2025-03-11|      Returned|    0.01| Cherylhaven|       Cash|   2020-06-14|         923.9868|
|   9806| Samantha G

In [None]:
# Rewrite the exact value 'Cancelled' as 'Order Cancelled', touching only DeliveryStatus.
df_replaced = df.replace("Cancelled", "Order Cancelled", subset=["DeliveryStatus"])
df_replaced.show(n=5)

+-------+--------------+---------------+------+----------+---------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate| DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+---------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|       Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|Order Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|       Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|       Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|      Delivered|    0.17|       Mariastad|       Cash

In [None]:
#4: Aggregations and GroupBy
from pyspark.sql.functions import avg, count, sum

# Number of orders per delivery status.
(
    df.groupBy("DeliveryStatus")
    .count()
    .withColumnRenamed("count", "OrderCount")
    .show()
)

+--------------+----------+
|DeliveryStatus|OrderCount|
+--------------+----------+
|      Returned|       117|
|     Cancelled|       149|
|     Delivered|       119|
|       Pending|       115|
+--------------+----------+



In [None]:
# Mean order Amount within each product category.
(
    df.groupBy("ProductCategory")
    .agg(expr("avg(Amount) AS AvgAmount"))
    .show()
)

+---------------+------------------+
|ProductCategory|         AvgAmount|
+---------------+------------------+
|        Fashion| 500.6308235294116|
|      Groceries|459.51786407766957|
|    Electronics|           551.745|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+



In [None]:
# Summed order Amount per city.
(
    df.groupBy("City")
    .agg(expr("sum(Amount) AS TotalSales"))
    .show()
)

+----------------+----------+
|            City|TotalSales|
+----------------+----------+
|     Ramseymouth|    761.06|
|East Edwardshire|    291.26|
|      Thomasberg|    882.68|
|     Laurenville|    383.26|
| South Colinstad|    786.27|
|    Lake Douglas|    975.09|
|   Williamsmouth|     10.78|
|      Gordonport|    514.99|
|  West Dawnmouth|      12.8|
|        Seanbury|    814.39|
|     Sheilaville|    981.05|
|       Mollybury|    222.02|
|       Lisaville|     45.69|
| Lake Jerrymouth|    404.01|
|       Perezfort|    917.55|
|Port Nicoleshire|    133.78|
|  South Samantha|    229.46|
|     Port Willie|    788.13|
|     Waltersfort|    552.81|
|       Youngbury|    372.95|
+----------------+----------+
only showing top 20 rows



In [None]:
#5: Null Handling & Update
from pyspark.sql.functions import when

# Inject nulls manually
from pyspark.sql.functions import lit

# Rows whose City is anything other than "New York" keep their value; the rest become null.
# NOTE(review): the earlier City == "New York" filter in this notebook printed no rows,
# so this probably nulls nothing for this dataset — confirm.
df_with_nulls = df.withColumn(
    "City",
    when(col("City") != "New York", col("City")).otherwise(lit(None)),
)
df_with_nulls.show(n=5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

In [None]:
# Two ways of dealing with a null City:
#   df_filled  - substitute the placeholder "Unknown"
#   df_dropped - discard the row entirely
df_filled = df_with_nulls.na.fill("Unknown", subset=["City"])
df_dropped = df_with_nulls.na.drop(subset=["City"])
df_filled.show(n=5)
df_dropped.show(n=5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

In [None]:
# Label each row "High Value" when its Amount exceeds 800, "Regular" otherwise.
df_tagged = df.withColumn(
    "CustomerType",
    expr("CASE WHEN Amount > 800 THEN 'High Value' ELSE 'Regular' END"),
)
df_tagged.show(n=5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|CustomerType|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|     Regular|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|  High Value|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|     Regular|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|     Regular|
|   8527|  Valerie Gray|   

In [None]:
#6: Date & Time Functions
from pyspark.sql.functions import year, month, datediff, current_date

# Append the calendar year and month of OrderDate as two new trailing columns.
df_dates = df.select(
    "*",
    year("OrderDate").alias("Year"),
    month("OrderDate").alias("Month"),
)
df_dates.show(n=5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+----+-----+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|Year|Month|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+----+-----+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|2024|   12|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|2024|    9|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|2025|    1|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|2024|    3|
|   8527|  Valerie Gray|           Toys| 

In [None]:
# Loyalty in years: days between today and CustomerSince, divided by a fixed 365-day year.
df_loyalty = df.withColumn(
    "LoyaltyYears",
    expr("datediff(current_date(), CustomerSince) / 365"),
)
df_loyalty.show(n=5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+------------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|      LoyaltyYears|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+------------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15| 4.635616438356164|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15| 3.221917808219178|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07| 3.824657534246575|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08| 4.82

In [None]:
# 7: Joins and Unions
# Small in-memory lookup table mapping a city to its region.
schema = ["City", "Region"]
city_region_data = [
    ("New York", "East"),
    ("Los Angeles", "West"),
    ("Chicago", "Midwest"),
]
region_df = spark.createDataFrame(data=city_region_data, schema=schema)
region_df.show()

+-----------+-------+
|       City| Region|
+-----------+-------+
|   New York|   East|
|Los Angeles|   West|
|    Chicago|Midwest|
+-----------+-------+



In [None]:
# Join the orders to the region lookup on City:
#   inner - only orders whose city appears in the lookup
#   left  - every order, with Region null where the city has no match
df_inner = df.join(region_df, ["City"], "inner")
df_left = df.join(region_df, ["City"], "left")
df_inner.show(n=5)
df_left.show(n=5)

+----+-------+------------+---------------+------+---------+--------------+--------+-----------+-------------+------+
|City|OrderID|CustomerName|ProductCategory|Amount|OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince|Region|
+----+-------+------------+---------------+------+---------+--------------+--------+-----------+-------------+------+
+----+-------+------------+---------------+------+---------+--------------+--------+-----------+-------------+------+

+----------------+-------+--------------+---------------+------+----------+--------------+--------+-----------+-------------+------+
|            City|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince|Region|
+----------------+-------+--------------+---------------+------+----------+--------------+--------+-----------+-------------+------+
| Port Jesseville|   4150|   Amber Perez|          Books|352.37|2024-01-13|     Cancelled|    0.24|       Cash|   2022-01-13|  N

In [None]:
# Split the orders by order year, then stack the 2023 rows on top of the 2024 rows.
df_2023 = df.filter("year(OrderDate) = 2023")
df_2024 = df.filter("year(OrderDate) = 2024")
df_union = df_2023.union(df_2024)
df_union.show(n=5)

+-------+--------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+
|   2169|Carolyn Daniel|    Electronics| 14.09|2023-10-07|     Delivered|    0.25|         Grayside|Credit Card|   2021-05-09|
|   6313|   Patty Perez|      Groceries| 79.83|2023-06-27|     Cancelled|    0.12|      Richardland|Credit Card|   2021-04-25|
|   2040| Kyle Mcdonald|           Toys|327.52|2023-12-15|      Returned|    0.06|Lake Jenniferside|     Wallet|   2021-07-21|
|   6038| David Bradley|        Fashion|348.51|2023-08-03|      Returned|    0.23|    Lake Toddland|        UPI|   2022-09-07|
|   3060|   John Pierce|           Toys|362.09|2023-12-25|      Returned|    0.03|       Brandtside|       Cash

In [None]:
#8: Complex JSON Simulation
from pyspark.sql.functions import to_json, struct, from_json, explode, get_json_object
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Pack all existing columns into a struct and serialise it into a JSON string column.
df_json = df.withColumn("json_data", to_json(struct(*df.columns)))
df_json.show(n=5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+--------------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|           json_data|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+--------------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|{"OrderID":2824,"...|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|{"OrderID":7912,"...|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|{"OrderID":4611,"...|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020

In [None]:
# Parse the JSON strings back into columns, reusing the original frame's schema.
schema = df.schema
df_loaded = (
    df_json
    .select(from_json("json_data", schema).alias("data"))
    .select("data.*")
)
df_loaded.show(n=5)

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

In [None]:
from pyspark.sql.functions import array, lit, explode

# Attach the same three placeholder items to every order as an array column.
df_array = df.withColumn(
    "PurchasedItems",
    array(*[lit(item) for item in ("ItemA", "ItemB", "ItemC")]),
)

# Produce one output row per array element, exposed in a new "Item" column.
df_exploded = df_array.select("*", explode(col("PurchasedItems")).alias("Item"))

# Display the first 10 exploded rows without truncating cell contents.
df_exploded.select("OrderID", "CustomerName", "Item").show(10, truncate=False)

+-------+--------------+-----+
|OrderID|CustomerName  |Item |
+-------+--------------+-----+
|2824   |Donald Walker |ItemA|
|2824   |Donald Walker |ItemB|
|2824   |Donald Walker |ItemC|
|7912   |Brandon Hall  |ItemA|
|7912   |Brandon Hall  |ItemB|
|7912   |Brandon Hall  |ItemC|
|4611   |Donald Booth  |ItemA|
|4611   |Donald Booth  |ItemB|
|4611   |Donald Booth  |ItemC|
|3547   |Phillip Garcia|ItemA|
+-------+--------------+-----+
only showing top 10 rows



In [None]:
from pyspark.sql.functions import to_json, struct, get_json_object

# Serialise three fields of each row into a JSON string column.
df_json = df.withColumn(
    "json_data",
    to_json(struct(col("CustomerName"), col("OrderID"), col("Amount"))),
)
df_json.select("json_data").show(5, truncate=False)

# Pull the CustomerName field back out of the JSON text with a JSON path.
df_extracted = df_json.withColumn(
    "ExtractedName",
    get_json_object("json_data", "$.CustomerName"),
)
df_extracted.select("json_data", "ExtractedName").show(5, truncate=False)


+----------------------------------------------------------------+
|json_data                                                       |
+----------------------------------------------------------------+
|{"CustomerName":"Donald Walker","OrderID":2824,"Amount":783.04} |
|{"CustomerName":"Brandon Hall","OrderID":7912,"Amount":905.0}   |
|{"CustomerName":"Donald Booth","OrderID":4611,"Amount":657.96}  |
|{"CustomerName":"Phillip Garcia","OrderID":3547,"Amount":606.89}|
|{"CustomerName":"Valerie Gray","OrderID":8527,"Amount":77.87}   |
+----------------------------------------------------------------+
only showing top 5 rows

+----------------------------------------------------------------+--------------+
|json_data                                                       |ExtractedName |
+----------------------------------------------------------------+--------------+
|{"CustomerName":"Donald Walker","OrderID":2824,"Amount":783.04} |Donald Walker |
|{"CustomerName":"Brandon Hall","OrderID":79

In [None]:
#9: Applying Functions
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# UDF to tag orders
def order_size(amount):
    """Classify an order amount as "Big", "Medium" or "Small".

    Args:
        amount: Numeric order amount, or None for a missing value.

    Returns:
        "Big" when amount >= 800, "Medium" when amount >= 300, "Small"
        otherwise. None is returned unchanged for a missing amount.
    """
    # The Amount column is nullable, and Spark hands SQL NULL to a Python UDF
    # as None; comparing None with a number raises TypeError in Python 3, so
    # missing amounts are passed through instead of failing the whole job.
    if amount is None:
        return None
    if amount >= 800:
        return "Big"
    if amount >= 300:
        return "Medium"
    return "Small"

# Wrap the Python classifier as a Spark UDF that returns a string.
order_udf = udf(order_size, returnType=StringType())
# Label every order by size, then display id, amount and label.
df_tagged = df.withColumn("OrderSize", order_udf("Amount"))
df_tagged.select("OrderID", "Amount", "OrderSize").show()

+-------+------+---------+
|OrderID|Amount|OrderSize|
+-------+------+---------+
|   2824|783.04|   Medium|
|   7912| 905.0|      Big|
|   4611|657.96|   Medium|
|   3547|606.89|   Medium|
|   8527| 77.87|    Small|
|   4150|352.37|   Medium|
|   5554|148.33|    Small|
|   2169| 14.09|    Small|
|   6313| 79.83|    Small|
|   6155|882.68|      Big|
|   9830|870.55|      Big|
|   9085|921.73|      Big|
|   2040|327.52|   Medium|
|   6573|676.02|   Medium|
|   2743| 47.06|    Small|
|   9837| 46.15|    Small|
|   6038|348.51|   Medium|
|   3060|362.09|   Medium|
|   4295|684.26|   Medium|
|   5061|251.89|    Small|
+-------+------+---------+
only showing top 20 rows

