In [0]:
"""
The silver zone is responsible for cleaning, standardizing, and applying business rules to the bronze data.
Auto Optimize & Small File Compaction (Delta Optimization)
Memory & Shuffle Optimizations (if transformations occur)
Data Deduplication (Removing Duplicates)
Data Type Optimization (Ensuring efficient storage & performance)

"""

In [0]:
# Importing necessary libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql.functions import substring
from delta.tables import DeltaTable


# Define paths for bronze and silver zones
bronze_path_day0 = "/mnt/Prajwal/Capstone_Project/bronze/bankcustomer_source1_day0"
bronze_path_day1 = "/mnt/Prajwal/Capstone_Project/bronze/bankcustomer_day1"
silver_path = "/mnt/Prajwal/Capstone_Project/silver/bankcustomer_source"

# Check if day1 file exists in the bronze layer
if dbutils.fs.ls("/mnt/Prajwal/Capstone_Project/bronze/"):
    files = [file.name for file in dbutils.fs.ls("/mnt/Prajwal/Capstone_Project/bronze/")]
    if "bankcustomer_day11/" in files:
        # Implement SCD Type 2 logic
        from pyspark.sql.functions import lit, current_timestamp

        # The incremental (day1) load versions rows of an existing silver table, so
        # fail early with a clear message when that table is missing instead of
        # falling back to a schema taken from a variable this branch never defines.
        if not DeltaTable.isDeltaTable(spark, silver_path):
            raise RuntimeError(
                f"Silver Delta table not found at {silver_path}; run the day0 initial load first."
            )
        df_existing = spark.read.format("delta").load(silver_path)

        # Load the day1 increment from the bronze layer.
        df_new = spark.read.format("parquet").load(bronze_path_day1)

        # Strip stray whitespace from every column name.
        trimmed_names = [column_name.strip() for column_name in df_new.columns]
        df_new = df_new.toDF(*trimmed_names)

        # Map the source column names onto the silver naming convention.
        day1_renames = [
            ("CustomerID", "customer_id"),
            ("CustomerName", "name"),
            ("City", "city"),
            ("PhoneNo", "phone_no"),
            ("MaritalStatus", "maritial_status"),
            ("Gender", "gender"),
            ("EmailAddress", "email"),
        ]
        for source_name, silver_name in day1_renames:
            df_new = df_new.withColumnRenamed(source_name, silver_name)

        # Substitute placeholder values for missing descriptive attributes.
        null_defaults = {
            'name': 'Unknown',
            'city': 'Unknown',
            'phone_no': '000-000-0000',
            'maritial_status': 'Unknown',
            'gender': 'Unknown',
            'email': 'noemail@example.com'
        }
        df_new = df_new.fillna(null_defaults)

        # Extract the first run of three digits in the phone number as the area code.
        # The result must be assigned back to df_new, otherwise the column is lost.
        df_new = df_new.withColumn("area_code", regexp_extract(col("phone_no"), r"(\d{3})", 1))

        # Strip every character that is not a letter, digit, '@', '.', '_' or '-' from the email.
        df_new = df_new.withColumn('email', regexp_replace('email', r'[^a-zA-Z0-9@._-]', ''))

        # Format the ingestion timestamp as yyyy-MM-dd HH:mm:ss.
        # Use an existing "ingestion_time" column when the input has one; otherwise fall
        # back to the raw "ingest_time" column that the other bronze sources provide.
        has_ingestion_time = any(c.lower() == "ingestion_time" for c in df_new.columns)
        raw_time_col = "ingestion_time" if has_ingestion_time else "ingest_time"
        df_new = df_new.withColumn('ingestion_time', date_format(col(raw_time_col), 'yyyy-MM-dd HH:mm:ss'))

        # Keep a single row per customer_id (which duplicate survives is not specified).
        df_new = df_new.dropDuplicates(["customer_id"])

        # Drop the raw ingestion time column (no-op when it is absent).
        df_new = df_new.drop("ingest_time")

        # Imports kept from the DOB-fixing UDF; the UDF itself is not applied on this branch.
        from pyspark.sql.functions import udf
        from datetime import datetime

        # Age in whole years, derived from DOB exactly as delivered by the day1 input.
        # NOTE(review): DOB is not re-parsed here (unlike the day0 sources) — confirm the
        # day1 input already holds a date or an ISO-formatted string.
        days_since_birth = datediff(current_date(), col("DOB"))
        df_new = df_new.withColumn("age", floor(days_since_birth / 365.25))

        # Stamp every row with the time of this run, formatted as text.
        run_timestamp_text = date_format(current_timestamp(), "yyyy-MM-dd HH:mm:ss")
        df_new = df_new.withColumn("ingestion_time_formatted", run_timestamp_text)

        # SCD Type 2 bookkeeping: incoming rows start as the open (current) version.
        scd_columns = [
            ("is_current", lit(True)),
            ("start_date", current_timestamp()),
            ("end_date", lit(None).cast("timestamp")),
        ]
        for scd_name, scd_value in scd_columns:
            df_new = df_new.withColumn(scd_name, scd_value)

        # Business attributes whose change opens a new SCD2 version. This is the same
        # column set the Delta merge condition compares. Run-dependent columns
        # (ingestion_time, ingestion_time_formatted, age) are excluded: they differ on
        # every run and would flag every matched customer as changed.
        tracked_columns = ["name", "city", "phone_no", "maritial_status", "gender", "DOB", "email"]

        # Compare only against the open version of each customer; closed history rows
        # would otherwise produce duplicate and stale matches.
        df_existing_current = df_existing.filter(col("is_current"))

        # Left join keeps brand-new customers in df_joined; they drop out of df_changed
        # because every comparison against a missing existing row evaluates to NULL.
        df_joined = df_new.join(df_existing_current, "customer_id", "left")

        change_condition = None
        for tracked_column in tracked_columns:
            column_differs = df_new[tracked_column] != df_existing_current[tracked_column]
            change_condition = column_differs if change_condition is None else (change_condition | column_differs)

        # Incoming rows whose tracked attributes differ from the open silver version.
        df_changed = df_joined.filter(change_condition).select(df_new["*"])

        # Preview of the open silver rows that are superseded by a changed incoming row
        # (left_semi keeps exactly the existing rows that have a changed counterpart).
        df_existing_updated = df_existing_current.join(df_changed, "customer_id", "left_semi") \
                                                 .withColumn("is_current", lit(False)) \
                                                 .withColumn("end_date", current_timestamp())

        # Handle on the silver Delta table for in-place updates.
        deltaTable = DeltaTable.forPath(spark, silver_path)

        # Close the open version of every customer whose tracked attributes changed.
        # The match is restricted to is_current = true so that versions closed by an
        # earlier run keep their original end_date instead of being rewritten.
        # df_final receives whatever execute() returns; it is not the merged table.
        df_final = deltaTable.alias("existing").merge(
            df_new.alias("new"),
            "existing.customer_id = new.customer_id AND existing.is_current = true"
        ).whenMatchedUpdate(
            condition="existing.name != new.name OR "
                      "existing.city != new.city OR "
                      "existing.phone_no != new.phone_no OR "
                      "existing.maritial_status != new.maritial_status OR "
                      "existing.gender != new.gender OR "
                      "existing.DOB != new.DOB OR "
                      "existing.email != new.email",
            set={
                "is_current": lit(False),
                "end_date": current_timestamp(),
            }
        ).execute()

        # Rows to append: incoming customers that have no open version in silver after
        # the merge. That covers brand-new keys and customers whose open version was
        # just closed because their attributes changed.
        updated_target_df = deltaTable.toDF().filter("is_current = true").select("customer_id")
        insert_df = df_new.join(updated_target_df, on="customer_id", how="left_anti")

        # Compute the counts BEFORE the append: insert_df is a lazy plan over the table
        # being written to, so evaluating it afterwards would see the appended rows.
        existing_keys_df = deltaTable.toDF().select("customer_id").distinct()
        total_appended_count = insert_df.count()
        # Keys that never existed in silver are genuinely new customers ...
        newly_inserted_count = insert_df.join(existing_keys_df, on="customer_id", how="left_anti").count()
        # ... the remainder are new versions of existing customers.
        updated_count = total_appended_count - newly_inserted_count

        # Append the new versions to the silver table.
        insert_df.write.mode("append").format("delta").partitionBy("ingestion_time_formatted").option("mergeSchema", "true").save(silver_path)

        print("data is updated and inserted in silver path")

        # Display the counts
        print(f"Updated record count: {updated_count}")
        print(f"Newly inserted record count: {newly_inserted_count}")
    else:
        # Read source 1 of the initial (day0) load from bronze. Parquet files carry
        # their own schema, so no schema-inference option is needed.
        df = spark.read.format("parquet").load(bronze_path_day0)

        # Drop the last 2 rows of the DataFrame. The row count is clamped at zero so an
        # input with fewer than two rows yields an empty frame instead of an invalid limit.
        # NOTE(review): limit() keeps the first N rows in whatever order Spark reads them;
        # confirm that the two rows to discard really are the last ones read.
        rows_to_keep = max(df.count() - 2, 0)
        df = df.limit(rows_to_keep)

        # Trim spaces from column names
        df = df.toDF(*[c.strip() for c in df.columns])

        # Rename columns for consistency and readability
        df = df.withColumnRenamed("_Customerid", "customer_id") \
               .withColumnRenamed("C1ustomer Name", "name") \
               .withColumnRenamed("City", "city") \
               .withColumnRenamed("Phoneno", "phone_no") \
               .withColumnRenamed("Maritial_Status", "maritial_status") \
               .withColumnRenamed("Gender", "gender") \
               .withColumnRenamed("EmailAddress", "email")

        # Fill null values with default values
        df = df.fillna({
            'name': 'Unknown',
            'city': 'Unknown',
            'phone_no': '000-000-0000',
            'maritial_status': 'Unknown',
            'gender': 'Unknown',
            'email': 'noemail@example.com'
        })

        # Extract the first run of three digits in the phone number as the area code
        df = df.withColumn("area_code", regexp_extract(col("phone_no"), r"(\d{3})", 1))

        # Strip every character that is not a letter, digit, '@', '.', '_' or '-' from the email
        df = df.withColumn('email', regexp_replace('email', r'[^a-zA-Z0-9@._-]', ''))

        # Format the raw ingest_time as yyyy-MM-dd HH:mm:ss
        df = df.withColumn('ingestion_time', date_format(col('ingest_time'), 'yyyy-MM-dd HH:mm:ss'))

        # Keep a single row per customer_id (which duplicate survives is not specified)
        df = df.dropDuplicates(["customer_id"])

        # Drop the original ingestion time column
        df = df.drop("ingest_time")

        # Define a UDF to fix date of birth (DOB) format
        from pyspark.sql.functions import udf
        from datetime import datetime

        def fix_dob(dob_str):
            """Parse a 'dd-Mon-yy' date-of-birth string into a date.

            Two-digit years are expanded by ``strptime``; when the parsed date lies in
            the future it cannot be a birth date, so it is moved back one century.

            Args:
                dob_str: Date of birth such as '15-Jan-90'. May be None.

            Returns:
                A ``datetime.date``, or None when the input is missing, is not in the
                expected format, or cannot be shifted back by a century.
            """
            try:
                dob = datetime.strptime(dob_str, '%d-%b-%y')
            except (TypeError, ValueError):
                # TypeError: input is None / not a string; ValueError: wrong format.
                return None
            # Compare full dates, not just years, so a date later in the current
            # year is also recognised as being in the future.
            if dob.date() > datetime.today().date():
                try:
                    dob = dob.replace(year=dob.year - 100)
                except ValueError:
                    # 29 February with no counterpart in the target year.
                    return None
            # Return a date (not a datetime) to match the DateType the UDF declares.
            return dob.date()

        # Wrap fix_dob as a Spark UDF that yields DateType values.
        fix_dob_udf = udf(f=fix_dob, returnType=DateType())

        # Replace the raw DOB with its parsed form, then derive the age in whole
        # years from the number of days between DOB and today.
        parsed_dob = fix_dob_udf(col("DOB"))
        df = df.withColumn("DOB", parsed_dob)
        elapsed_days = datediff(current_date(), col("DOB"))
        df_src1 = df.withColumn("age", floor(elapsed_days / 365.25))

        # Define paths for bronze source2
        bronze_path = "/mnt/Prajwal/Capstone_Project/bronze/bankcustomer_source2_day0"

        # Read data from the bronze path
        df_src2 = spark.read.format("parquet").load(bronze_path)

        # Trim spaces from column names FIRST, so the renames below still match
        # columns that arrive with leading or trailing whitespace.
        df_src2 = df_src2.toDF(*[c.strip() for c in df_src2.columns])

        # Rename columns for consistency and readability
        df_src2 = df_src2.withColumnRenamed("CustomerID", "customer_id") \
               .withColumnRenamed("CustomerName", "name") \
               .withColumnRenamed("City", "city") \
               .withColumnRenamed("Phoneno", "phone_no") \
               .withColumnRenamed("MaritalStatus", "maritial_status") \
               .withColumnRenamed("Gender", "gender") \
               .withColumnRenamed("EmailAddress", "email")

        # Fill null values with the same defaults used for source 1, so both
        # sources are standardised identically before they are combined.
        df_src2 = df_src2.fillna({
            'name': 'Unknown',
            'city': 'Unknown',
            'phone_no': '000-000-0000',
            'maritial_status': 'Unknown',
            'gender': 'Unknown',
            'email': 'noemail@example.com'
        })

        # Extract the three leading digits of the phone number as the area code
        df_src2 = df_src2.withColumn("area_code", regexp_extract(col("phone_no"), r"^(\d{3})", 1))

        # Strip every character that is not a letter, digit, '@', '.', '_' or '-' from the email
        df_src2 = df_src2.withColumn('email', regexp_replace('email', r'[^a-zA-Z0-9@._-]', ''))

        # Format the raw ingest_time as yyyy-MM-dd HH:mm:ss
        df_src2 = df_src2.withColumn('ingestion_time', date_format(col('ingest_time'), 'yyyy-MM-dd HH:mm:ss'))

        # Drop the original ingestion time column
        df_src2 = df_src2.drop("ingest_time")

        # Keep a single row per customer_id (which duplicate survives is not specified)
        df_src2 = df_src2.dropDuplicates(["customer_id"])

        # Parse DOB from its yyyy-MM-dd text form into a date
        df_src2 = df_src2.withColumn("DOB", to_date(col("DOB"), "yyyy-MM-dd"))

        # Age in whole years, from the number of days between DOB and today
        df_src2 = df_src2.withColumn("age", floor(datediff(current_date(), col("DOB")) / 365.25))

        # Keep only the silver columns, in a fixed order
        df_src2 = df_src2.select('customer_id',
         'name',
         'city',
         'phone_no',
         'maritial_status',
         'gender',
         'DOB',
         'email',
         'area_code',
         'ingestion_time',
         'age')

        # Combine both day0 sources. unionByName aligns columns by name, so a
        # difference in column order between the sources cannot silently put values
        # into the wrong column (a positional union would).
        df = df_src1.unionByName(df_src2)

        # Stamp every row with the time of this run, formatted as text.
        df = df.withColumn("ingestion_time_formatted", date_format(current_timestamp(), "yyyy-MM-dd HH:mm:ss"))

        # Initial (day0) load: every row becomes the open SCD Type 2 version.
        from pyspark.sql.functions import lit, current_timestamp

        # Add SCD Type 2 columns to the new data
        df_new = df.withColumn("is_current", lit(True)) \
                   .withColumn("start_date", current_timestamp()) \
                   .withColumn("end_date", lit(None).cast("timestamp"))

        # Replace the silver table with the day0 snapshot. No merge is run first:
        # this overwrite discards whatever a merge would have changed, and opening the
        # table with DeltaTable.forPath fails on the very first load, before it exists.
        df_new.write.mode("overwrite").format("delta").partitionBy("ingestion_time_formatted").option("overwriteSchema", "true").save(silver_path)

        # Record the outcome of this run: source path, target path, status and time.
        log_data = [(bronze_path_day0, silver_path, "success", datetime.now())]
        log_schema = ["bronze_path", "silver_path", "status", "timestamp"]
        log_df = spark.createDataFrame(log_data, log_schema)

        # Append to the log table so entries from earlier runs are kept; overwriting
        # would leave only the most recent run in the log.
        # NOTE(review): this path uses "Silver" while silver_path uses "silver" — confirm
        # the different casing is intended.
        log_df.write.format("delta").mode("append").save("/mnt/Prajwal/Capstone_Project/Silver/Silver_logs")

In [0]:

# Load the day1 bronze input and count its rows.
df_bronze = spark.read.parquet(bronze_path_day1)
bronze_count = df_bronze.count()

# Load the silver table and count its rows.
df_silver = spark.read.format("delta").load(silver_path)
silver_count = df_silver.count()

# Report both counts side by side.
print(f"Bronze record count: {bronze_count}")
print(f"Silver record count: {silver_count}")

# Render the silver table.
display(df_silver)

In [0]:
# Reload the silver table so the view reflects its latest state, then render it.
silver_reader = spark.read.format("delta")
df_silver = silver_reader.load(silver_path)
display(df_silver)

In [0]:
# Spot-check the silver rows of a handful of customers.
customer_ids_to_inspect = [
    '7f97ade2-4720-42c6-ab28-9ebcea043cf2',
    '633a55b8-0678-49ea-aaa2-6a6a40beb308',
    '901db134-d9d8-4a55-ae1f-3b879fe8a3c3',
    '52c44179-17c5-44cb-b86e-3b98a9a98230',
]
selected_customers = df_silver.filter(col("customer_id").isin(customer_ids_to_inspect))
display(selected_customers)

In [0]:
%run ../Gold/Capstone_Gold_Customerdetails_initialload

# Implementing SCD 2 

In [0]:
# if DeltaTable.isDeltaTable(spark, silver_path) else df_new.write.format('delta').mode('overwrite').save(silver_path)

In [0]:
# # Import necessary libraries for SCD Type 2
# from pyspark.sql.functions import lit, current_timestamp

# # Read existing data from the silver path if it exists
# try:
#     df_existing = spark.read.format("delta").load(silver_path)
# except:
#     df_existing = spark.createDataFrame([], df.schema)
# # 
# # Add SCD Type 2 columns to the new data
# df_new = df.withColumn("is_current", lit(True)) \
#            .withColumn("start_date", current_timestamp()) \
#            .withColumn("end_date", lit(None).cast("timestamp"))

# # Merge new data with existing data to identify changes and insert new records
# df_final = df_existing.alias("existing").merge(
#     df_new.alias("new"),
#     "existing.customer_id = new.customer_id"
# ).whenMatchedUpdate(
#     set={
#         "is_current": lit(False),
#         "end_date": current_timestamp()
#     }
# ).whenNotMatchedInsert(
#     values={
#         "customer_id": "new.customer_id",
#         "name": "new.name",
#         "city": "new.city",
#         "phone_no": "new.phone_no",
#         "maritial_status": "new.maritial_status",
#         "gender": "new.gender",
#         "email": "new.email",
#         "DOB": "new.DOB",
#         "age": "new.age",
#         "is_current": lit(True),
#         "start_date": current_timestamp(),
#         "end_date": lit(None).cast("timestamp")
#     }
# ).execute()

# # Write the final DataFrame to the silver path
# df_final.write.mode("overwrite").format("delta").partitionBy("ingestion_time").save(silver_path)

# # Count updated and newly inserted records
# updated_count = df_final.filter(df_final["is_current"] == False).count()
# newly_inserted_count = df_final.filter(df_final["is_current"] == True).count()

# # Display the counts
# print(f"Updated record count: {updated_count}")
# print(f"Newly inserted record count: {newly_inserted_count}")

In [0]:
# # Importing necessary libraries
# from pyspark.sql.functions import *
# from pyspark.sql.types import *
# from pyspark.sql.window import Window

# # Define paths for bronze and silver zones
# bronze_path = "/mnt/Prajwal/Capstone_Project/bronze/bankcustomer_source1_day0"
# silver_path = "/mnt/Prajwal/Capstone_Project/silver/Bankcustomer_source"

# # Read data from the bronze path
# df = spark.read.format("parquet").load(bronze_path)

# # Drop the last 2 rows of the DataFrame
# df = df.limit(df.count() - 2)

# # Trim spaces from column names
# df = df.toDF(*[c.strip() for c in df.columns])

# # Rename columns for consistency and readability
# df = df.withColumnRenamed("_Customerid", "customer_id") \
#        .withColumnRenamed("C1ustomer Name", "name") \
#        .withColumnRenamed("City", "city") \
#        .withColumnRenamed("Phoneno", "phone_no") \
#        .withColumnRenamed("Maritial_Status", "maritial_status") \
#        .withColumnRenamed("Gender", "gender") \
#        .withColumnRenamed("EmailAddress", "email")

# # Fill null values with default values
# df = df.fillna({
#     'name': 'Unknown',
#     'city': 'Unknown',
#     'phone_no': '000-000-0000',
#     'maritial_status': 'Unknown',
#     'gender': 'Unknown',
#     'email': 'noemail@example.com'
# })

# # Extract area code from phone number
# df = df.withColumn("area_code", regexp_extract(col("phone_no"), r"(\d{3})", 1))

# # Ensure email is correctly formatted
# df = df.withColumn('email', regexp_replace('email', r'[^a-zA-Z0-9@._-]', ''))

# # Change ingestion time to the format of yyyy-mm-dd hh:mm:ss
# df = df.withColumn('ingestion_time', date_format(col('ingest_time'), 'yyyy-MM-dd HH:mm:ss'))

# # Drop duplicate records based on customer_id
# df = df.dropDuplicates(["customer_id"])

# # Drop the original ingestion time column
# df = df.drop("ingest_time")

# # Define a UDF to fix date of birth (DOB) format
# from pyspark.sql.functions import udf
# from datetime import datetime

# def fix_dob(dob_str):
#     try:
#         dob = datetime.strptime(dob_str, '%d-%b-%y')
#         if dob.year > datetime.today().year:
#             dob = dob.replace(year=dob.year - 100)
#         return dob
#     except:
#         return None

# fix_dob_udf = udf(fix_dob, DateType())

# # Apply the UDF to fix DOB and calculate age
# df = df.withColumn("DOB", fix_dob_udf(col("DOB")))
# df_src1 = df.withColumn("age", floor(datediff(current_date(), col("DOB")) / 365.25))

# # Define paths for bronze source2 
# bronze_path = "/mnt/Prajwal/Capstone_Project/bronze/bankcustomer_source2_day0"

# # Read data from the bronze path
# df_src2 = spark.read.format("parquet").load(bronze_path)

# # Rename columns for consistency and readability
# df_src2 = df_src2.withColumnRenamed("CustomerID", "customer_id") \
#        .withColumnRenamed("CustomerName", "name") \
#        .withColumnRenamed("City", "city") \
#        .withColumnRenamed("Phoneno", "phone_no") \
#        .withColumnRenamed("MaritalStatus", "maritial_status") \
#        .withColumnRenamed("Gender", "gender") \
#        .withColumnRenamed("EmailAddress", "email")

# # Trim spaces from column names
# df_src2 = df_src2.toDF(*[c.strip() for c in df_src2.columns])

# # Extract area code from phone number
# df_src2 = df_src2.withColumn("area_code", regexp_extract(col("phone_no"), r"^(\d{3})", 1))

# # Ensure email is correctly formatted
# df_src2 = df_src2.withColumn('email', regexp_replace('email', r'[^a-zA-Z0-9@._-]', ''))

# # Change ingestion time to the format of yyyy-mm-dd hh:mm:ss
# df_src2 = df_src2.withColumn('ingestion_time', date_format(col('ingest_time'), 'yyyy-MM-dd HH:mm:ss'))

# # Drop the original ingestion time column
# df_src2 = df_src2.drop("ingest_time")

# # Drop duplicate records based on customer_id
# df_src2 = df_src2.dropDuplicates(["customer_id"])

# # Cast DOB to proper format
# df_src2 = df_src2.withColumn("DOB", to_date(col("DOB"), "yyyy-MM-dd"))

# # Calculate age
# df_src2 = df_src2.withColumn("age", floor(datediff(current_date(), col("DOB")) / 365.25))

# df_src2 = df_src2.select('customer_id',
#  'name',
#  'city',
#  'phone_no',
#  'maritial_status',
#  'gender',
#  'DOB',
#  'email',
#  'area_code',
#  'ingestion_time',
#  'age')


# df = df_src1.union(df_src2)

# df = df.withColumn("ingestion_time_formatted", date_format(current_timestamp(), "yyyy-MM-dd HH:mm:ss"))

# # Read existing data from the silver path if it exists
# try:
#     df_existing = spark.read.format("delta").load(silver_path)
# except:
#     df_existing = spark.createDataFrame([], df.schema)

# # Add SCD Type 2 columns to the new data
# df_new = df.withColumn("is_current", lit(True)) \
#            .withColumn("start_date", current_timestamp()) \
#            .withColumn("end_date", lit(None).cast("timestamp"))


# df_new.columns