In [0]:
# Import required libraries
from pyspark.sql.functions import col, regexp_extract, regexp_replace, floor, datediff, current_date, to_date, lit, current_timestamp
from pyspark.sql.types import DateType
from delta.tables import DeltaTable
from pyspark.sql.functions import date_format
from delta.tables import DeltaTable
from pyspark.sql.functions import col, regexp_extract, regexp_replace, date_format, current_timestamp, lit, floor, datediff, to_date
from pyspark.sql.types import DateType


In [0]:
# Input: day-0 bronze extract for source 1; it is loaded as Parquet further down.
bronze_path_day0 = "/mnt/Prajwal/Capstone_Project/bronze/bankcustomer_source1_day0"
# Output: silver location the combined customer data is written to in Delta format.
silver_path = "/mnt/Prajwal/Capstone_Project/silver/bankcustomer_source_clone1"

In [0]:
# --- Source 1: load the day-0 bronze extract and standardise it ---
src1_reader = spark.read.format("parquet").option("inferSchema", "true")
df = src1_reader.load(bronze_path_day0)

# Keep all rows except two: the total row count is taken first and used as the limit.
# NOTE(review): no ordering is applied before limit(), so which two rows are
# left out is not pinned down by this code - confirm the intent.
src1_row_total = df.count()
df = df.limit(src1_row_total - 2)

# Headers may carry stray whitespace; strip it so the renames below can match.
df = df.toDF(*(header.strip() for header in df.columns))

# Source header -> standardised snake_case column name.
src1_column_renames = {
    "_Customerid": "customer_id",
    "C1ustomer Name": "name",
    "City": "city",
    "Phoneno": "phone_no",
    "Maritial_Status": "maritial_status",
    "Gender": "gender",
    "EmailAddress": "email",
}
for source_header, standard_name in src1_column_renames.items():
    df = df.withColumnRenamed(source_header, standard_name)

# Placeholder values substituted for nulls in the descriptive columns.
src1_null_defaults = {
    'name': 'Unknown',
    'city': 'Unknown',
    'phone_no': '000-000-0000',
    'maritial_status': 'Unknown',
    'gender': 'Unknown',
    'email': 'noemail@example.com',
}
df = df.fillna(src1_null_defaults)

df = (
    df
    # area_code: the first run of three consecutive digits found in phone_no.
    .withColumn("area_code", regexp_extract(col("phone_no"), r"(\d{3})", 1))
    # email: remove every character other than letters, digits, '@', '.', '_' and '-'.
    .withColumn('email', regexp_replace('email', r'[^a-zA-Z0-9@._-]', ''))
    # ingestion_time: ingest_time rendered as a 'yyyy-MM-dd HH:mm:ss' string.
    .withColumn('ingestion_time', date_format(col('ingest_time'), 'yyyy-MM-dd HH:mm:ss'))
    # One row per customer_id.
    .dropDuplicates(["customer_id"])
    # The raw ingest_time is superseded by ingestion_time.
    .drop("ingest_time")
)

# Define a UDF to fix date of birth (DOB) format
from pyspark.sql.functions import udf
from datetime import datetime

def fix_dob(dob_str):
    """Parse a ``dd-Mon-yy`` date-of-birth string, pushing future dates back a century.

    ``%y`` expands a two-digit year to a four-digit one, which can produce a
    date in the future; a birth date cannot be in the future, so such a
    result is moved back by 100 years.

    Args:
        dob_str: Date text in ``%d-%b-%y`` form, e.g. ``"15-Mar-85"``.

    Returns:
        A ``datetime`` (time part at midnight) for the parsed date, or
        ``None`` when ``dob_str`` is not a string or does not match the
        expected format.
    """
    try:
        dob = datetime.strptime(dob_str, '%d-%b-%y')
    except (TypeError, ValueError):
        # TypeError: non-string input such as None.
        # ValueError: text that does not match the expected format.
        return None

    # Compare the full date rather than only the year, so that a date later
    # in the current year is also recognised as lying in the future.
    if dob > datetime.today():
        try:
            dob = dob.replace(year=dob.year - 100)
        except ValueError:
            # 29 February has no counterpart when the earlier year is not a leap year.
            return None
    return dob

# Wrap fix_dob as a Spark UDF whose declared result type is DateType.
fix_dob_udf = udf(fix_dob, returnType=DateType())

# Replace the raw DOB text with its parsed value.
df = df.withColumn("DOB", fix_dob_udf(col("DOB")))

# Age in whole years: days elapsed since DOB divided by 365.25, rounded down.
# This is an approximation based on the average year length.
days_since_birth = datediff(current_date(), col("DOB"))
df_src1 = df.withColumn("age", floor(days_since_birth / 365.25))

# --- Source 2: load the day-0 bronze extract and standardise it ---
bronze_path = "/mnt/Prajwal/Capstone_Project/bronze/bankcustomer_source2_day0"

# Read data from the bronze path
df_src2 = spark.read.format("parquet").load(bronze_path)

# Trim spaces from column names BEFORE renaming. withColumnRenamed does nothing
# when the given name is not present, so a header with surrounding whitespace
# would otherwise never be renamed and later column references would fail.
df_src2 = df_src2.toDF(*[c.strip() for c in df_src2.columns])

# Rename columns for consistency and readability
df_src2 = df_src2.withColumnRenamed("CustomerID", "customer_id") \
       .withColumnRenamed("CustomerName", "name") \
       .withColumnRenamed("City", "city") \
       .withColumnRenamed("Phoneno", "phone_no") \
       .withColumnRenamed("MaritalStatus", "maritial_status") \
       .withColumnRenamed("Gender", "gender") \
       .withColumnRenamed("EmailAddress", "email")

# Fill null values with the same defaults that are applied to source 1, so
# both sources carry identical placeholders once they are combined.
df_src2 = df_src2.fillna({
    'name': 'Unknown',
    'city': 'Unknown',
    'phone_no': '000-000-0000',
    'maritial_status': 'Unknown',
    'gender': 'Unknown',
    'email': 'noemail@example.com'
})

# Extract area code: three digits at the very start of the phone number
df_src2 = df_src2.withColumn("area_code", regexp_extract(col("phone_no"), r"^(\d{3})", 1))

# Remove every email character other than letters, digits, '@', '.', '_' and '-'
df_src2 = df_src2.withColumn('email', regexp_replace('email', r'[^a-zA-Z0-9@._-]', ''))

# Render ingest_time as a 'yyyy-MM-dd HH:mm:ss' string
df_src2 = df_src2.withColumn('ingestion_time', date_format(col('ingest_time'), 'yyyy-MM-dd HH:mm:ss'))

# Drop the original ingestion time column
df_src2 = df_src2.drop("ingest_time")

# Keep one row per customer_id
df_src2 = df_src2.dropDuplicates(["customer_id"])

# Parse DOB, expected in yyyy-MM-dd form, into a date
df_src2 = df_src2.withColumn("DOB", to_date(col("DOB"), "yyyy-MM-dd"))

# Age in whole years: days elapsed since DOB divided by 365.25, rounded down
df_src2 = df_src2.withColumn("age", floor(datediff(current_date(), col("DOB")) / 365.25))

# Fix the output column set and order explicitly.
df_src2 = df_src2.select('customer_id',
 'name',
 'city',
 'phone_no',
 'maritial_status',
 'gender',
 'DOB',
 'email',
 'area_code',
 'ingestion_time',
 'age')

# Combine both sources, matching columns by NAME. Plain union() matches by
# position; df_src1 never has its column order fixed with a select() (it keeps
# whatever order its source file had), so a positional union would silently
# place values under the wrong column whenever that order differs from df_src2.
df = df_src1.unionByName(df_src2)

# Stamp every row with the time this notebook processed it.
df = df.withColumn("ingestion_time_formatted", date_format(current_timestamp(), "yyyy-MM-dd HH:mm:ss"))

# NOTE(review): append mode adds these rows on every run, so re-running this
# day-0 load writes the same customers again - confirm that is intended.
# NOTE(review): overwriteSchema is documented for overwrite mode; presumably it
# has no effect with append - confirm whether overwrite or mergeSchema was meant.
df.write.mode("append").format("delta").option("overwriteSchema", "true").save(silver_path)

In [0]:
# Reload the data at the silver path in Delta format and render it.
df_new = spark.read.load(silver_path, format="delta")
df_new.display()