# Ex-2100 Project - data cleanup

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import col, regexp_extract, to_date, when, current_date

# Start (or reuse) the Spark session used by every cell in this notebook
spark = (
    SparkSession.builder
    .appName("DataCleaningExample")
    .getOrCreate()
)

# Bronze-layer schema: every column is ingested as a nullable string.
# Raw_Date and Amount are typed by the later cleaning cells.
bronze_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("ID", "Raw_Date", "Customer_Name", "Amount", "Category")
])

# Sample Bronze rows, deliberately including dirty values:
#   002 -> non-date text and a non-numeric amount
#   004 -> date-shaped text that is not a real calendar date
#   006 -> negative amount
raw_data = [
    ("001", "23-04-15", "John Doe", "100", "Electronics"),
    ("002", "invalid_date", "Jane Smith", "abc", "Clothing"),
    ("003", "24-05-01", "Robert Brown", "300", "Books"),
    ("004", "99-99-99", "Alice Johnson", "250", "Gaming"),
    ("005", "25-12-30", "Michael Lee", "500", "Electronics"),
    ("006", "26-01-01", "Emma Clark", "-50", "Clothing"),
]

# Build the Bronze DataFrame from the sample rows
df_bronze = spark.createDataFrame(data=raw_data, schema=bronze_schema)

df_bronze.show(truncate=False)

+---+------------+-------------+------+-----------+
|ID |Raw_Date    |Customer_Name|Amount|Category   |
+---+------------+-------------+------+-----------+
|001|23-04-15    |John Doe     |100   |Electronics|
|002|invalid_date|Jane Smith   |abc   |Clothing   |
|003|24-05-01    |Robert Brown |300   |Books      |
|004|99-99-99    |Alice Johnson|250   |Gaming     |
|005|25-12-30    |Michael Lee  |500   |Electronics|
|006|26-01-01    |Emma Clark   |-50   |Clothing   |
+---+------------+-------------+------+-----------+



In [2]:
# Regular expression for a date-shaped token: exactly NN-NN-NN.
# The digit lookarounds stop the pattern from matching *inside* a longer
# digit run (e.g. "2023-04-15" must not yield "23-04-15", and
# "23-04-150" must not yield "23-04-15"), while still allowing the token
# to be pulled out of surrounding non-digit text.
date_pattern = r"(?<!\d)(\d{2}-\d{2}-\d{2})(?!\d)"

# Extract the date-shaped token (capture group 1) from Raw_Date.
# regexp_extract returns an empty string when the pattern does not match.
# This step only checks the *shape*; calendar validity (e.g. "99-99-99")
# is decided when the string is parsed into a date afterwards.
df_silver = df_bronze.withColumn(
    "Clean_Date",
    regexp_extract(col("Raw_Date"), date_pattern, 1),
)
df_silver.show()

+---+------------+-------------+------+-----------+----------+
| ID|    Raw_Date|Customer_Name|Amount|   Category|Clean_Date|
+---+------------+-------------+------+-----------+----------+
|001|    23-04-15|     John Doe|   100|Electronics|  23-04-15|
|002|invalid_date|   Jane Smith|   abc|   Clothing|          |
|003|    24-05-01| Robert Brown|   300|      Books|  24-05-01|
|004|    99-99-99|Alice Johnson|   250|     Gaming|  99-99-99|
|005|    25-12-30|  Michael Lee|   500|Electronics|  25-12-30|
|006|    26-01-01|   Emma Clark|   -50|   Clothing|  26-01-01|
+---+------------+-------------+------+-----------+----------+



In [3]:
# Parse the extracted "yy-MM-dd" string into a real date column.
# In the recorded run below, empty strings and impossible dates such as
# "99-99-99" come out as NULL.
# NOTE(review): with ANSI mode enabled, to_date may raise instead of
# returning NULL for unparseable input - confirm the session setting.
parsed_clean_date = to_date(col("Clean_Date"), "yy-MM-dd")
df_silver = df_silver.withColumn("Clean_Date", parsed_clean_date)
df_silver.show()

+---+------------+-------------+------+-----------+----------+
| ID|    Raw_Date|Customer_Name|Amount|   Category|Clean_Date|
+---+------------+-------------+------+-----------+----------+
|001|    23-04-15|     John Doe|   100|Electronics|2023-04-15|
|002|invalid_date|   Jane Smith|   abc|   Clothing|      NULL|
|003|    24-05-01| Robert Brown|   300|      Books|2024-05-01|
|004|    99-99-99|Alice Johnson|   250|     Gaming|      NULL|
|005|    25-12-30|  Michael Lee|   500|Electronics|2025-12-30|
|006|    26-01-01|   Emma Clark|   -50|   Clothing|2026-01-01|
+---+------------+-------------+------+-----------+----------+



In [4]:
# Cast `Amount` from string to integer. In the recorded run below, the
# non-numeric value ("abc") becomes NULL and "-50" is kept as -50.
amount_as_int = col("Amount").cast("int")
df_silver = df_silver.withColumn("Amount", amount_as_int)
df_silver.show(truncate=False)

+---+------------+-------------+------+-----------+----------+
|ID |Raw_Date    |Customer_Name|Amount|Category   |Clean_Date|
+---+------------+-------------+------+-----------+----------+
|001|23-04-15    |John Doe     |100   |Electronics|2023-04-15|
|002|invalid_date|Jane Smith   |NULL  |Clothing   |NULL      |
|003|24-05-01    |Robert Brown |300   |Books      |2024-05-01|
|004|99-99-99    |Alice Johnson|250   |Gaming     |NULL      |
|005|25-12-30    |Michael Lee  |500   |Electronics|2025-12-30|
|006|26-01-01    |Emma Clark   |-50   |Clothing   |2026-01-01|
+---+------------+-------------+------+-----------+----------+



In [5]:
# Keep only rows whose date is not in the future and whose amount is
# strictly positive. Rows with a NULL Clean_Date or NULL Amount are dropped
# as well, because a comparison against NULL is never true.
# The result depends on the day the cell is run (current_date()).
date_not_in_future = col("Clean_Date") <= current_date()
amount_is_positive = col("Amount") > 0
df_silver = df_silver.filter(date_not_in_future & amount_is_positive)
df_silver.show(truncate=False)

+---+--------+-------------+------+-----------+----------+
|ID |Raw_Date|Customer_Name|Amount|Category   |Clean_Date|
+---+--------+-------------+------+-----------+----------+
|001|23-04-15|John Doe     |100   |Electronics|2023-04-15|
|003|24-05-01|Robert Brown |300   |Books      |2024-05-01|
+---+--------+-------------+------+-----------+----------+

