In [0]:
from pyspark.sql.functions import *
# Read the raw Zoom Car customer JSON files from DBFS and preview a few rows.

file_location_zoom_customers = "/FileStore/zoom_car_customer_data/"

zoom_car_customer_data = spark.read.json(file_location_zoom_customers)
zoom_car_customer_data.show(5)

# For the current-date code, refer to the Zoom Car booking analysis notebook.

# Collect the records that are missing any critical field
# (customer_id, name, email); a row qualifies if at least one of them is null.
_critical_fields = ('customer_id', 'name', 'email')
_missing_critical_field = col(_critical_fields[0]).isNull()
for _field_name in _critical_fields[1:]:
    _missing_critical_field = _missing_critical_field | col(_field_name).isNull()

zoom_car_customer_bad_data = zoom_car_customer_data.filter(_missing_critical_field)
zoom_car_customer_bad_data.show()

# Append the rejected records to the shared error table.
(
    zoom_car_customer_bad_data.write
    .format("delta")
    .mode("append")
    .saveAsTable("dev_databricks_projects.zoom_data.error_customers")
)

+-----------+--------------------+--------------+--------------------+-----------+--------+
|customer_id|               email|          name|        phone_number|signup_date|  status|
+-----------+--------------------+--------------+--------------------+-----------+--------+
|  C00427221|                xxxx|          test|             2222222| 2024-04-18|inactive|
|  C00492265|                xxxx|          test|             2222222| 2024-05-18|  active|
|  C00466409|danieljohnson@yah...|Daniel Johnson|  (736)436-6369x9720| 2024-06-18|inactive|
|  C00474994| scottcook@yahoo.com|    Scott Cook|001-898-859-1057x...| 2024-01-10|  active|
|  C00428433|                xxxx|          test|             2222222| 2024-04-18|inactive|
+-----------+--------------------+--------------+--------------------+-----------+--------+
only showing top 5 rows

+-----------+-----+----+------------+-----------+------+
|customer_id|email|name|phone_number|signup_date|status|
+-----------+-----+----+---------

In [0]:
import re
from pyspark.sql.functions import *
# validation of Email formats. 
# store the invalid email formats in a table with append mode and do not proceed them further

def validate_email_formats(email):
    """Classify an email address as "VALID" or "INVALID".

    An address is "VALID" only when all of the following hold:
      * it is a string containing exactly one '@';
      * the part after the '@' is one of the allow-listed domains;
      * the part before the '@' is non-empty and purely alphanumeric
        (so addresses such as "john.doe@gmail.com" are rejected).

    Args:
        email: The raw email value; may be None or a non-string.

    Returns:
        The string "VALID" or "INVALID".
    """
    # Guard against nulls / non-strings so the UDF never raises on bad rows.
    if not isinstance(email, str):
        return "INVALID"

    split_email = email.split('@')
    if len(split_email) != 2:
        return "INVALID"
    local_part, domain = split_email

    domain_names = ['gmail.com','yahoo.com','hotmail.com','outlook.com','aol.com','mail.com','live.com']
    pattern=r'^[a-zA-Z0-9]+$'

    if domain not in domain_names:
        return "INVALID"
    # re.match returns a Match object or None (never False), so the check
    # must be `is None`; comparing with `== False` silently accepts everything.
    if re.match(pattern, local_part) is None:
        return "INVALID"

    return "VALID"

# Wrap the Python validator as a Spark UDF (returns the strings "VALID"/"INVALID").
valid_email_udf = udf(validate_email_formats)

# Keep only rows where every critical field is populated.
zoom_car_customer_good_data = zoom_car_customer_data.filter(
    col('customer_id').isNotNull()
    & col('name').isNotNull()
    & col('email').isNotNull()
)

# Tag each row with its email verdict once, then split on that tag.
_email_checked_data = zoom_car_customer_good_data.withColumn(
    'is_email_valid', valid_email_udf(col('email'))
)

# Rows that pass email validation continue down the pipeline.
zoom_car_valid_email_data = _email_checked_data.filter(col('is_email_valid') == 'VALID')
zoom_car_valid_email_data.show(5)

# Rows that fail are reduced to the original columns for the error table.
zoom_car_invalid_email_data = (
    _email_checked_data
    .filter(col('is_email_valid') == 'INVALID')
    .select('customer_id', 'email', 'name', 'phone_number', 'signup_date', 'status')
)
zoom_car_invalid_email_data.show(5)

# Append the invalid-email records to the shared error table.
(
    zoom_car_invalid_email_data.write
    .format("delta")
    .mode("append")
    .saveAsTable("dev_databricks_projects.zoom_data.error_customers")
)




        

    



+-----------+--------------------+--------------+--------------------+-----------+--------+--------------+
|customer_id|               email|          name|        phone_number|signup_date|  status|is_email_valid|
+-----------+--------------------+--------------+--------------------+-----------+--------+--------------+
|  C00466409|danieljohnson@yah...|Daniel Johnson|  (736)436-6369x9720| 2024-06-18|inactive|         VALID|
|  C00474994| scottcook@yahoo.com|    Scott Cook|001-898-859-1057x...| 2024-01-10|  active|         VALID|
|  C00447623|johnpeterson@yaho...| John Peterson|   377.921.2360x6621| 2024-05-18|inactive|         VALID|
|  C00433281|  anaperez@yahoo.com|     Ana Perez|001-650-331-9625x302| 2024-06-18|  active|         VALID|
|  C00471055|williekelly@hotma...|  Willie Kelly|    337-284-7668x881| 2024-04-25|inactive|         VALID|
+-----------+--------------------+--------------+--------------------+-----------+--------+--------------+
only showing top 5 rows

+-----------

In [0]:
# Status validation: a customer's status must be one of the pre-defined values.
# Approach: build a small lookup DataFrame of allowed statuses and left-join the
# customer data to it; rows with no match (null def_status) are routed to the
# error table, rows with a match carry on.

valid_status_data = [{'def_status': 'active'}, {'def_status': 'inactive'}]

valid_data_df = spark.createDataFrame(valid_status_data)
valid_data_df.show()

# Join once and derive both the matched and unmatched subsets from the result.
_status_join_condition = valid_data_df['def_status'] == zoom_car_valid_email_data['status']
_status_checked_data = zoom_car_valid_email_data.join(
    valid_data_df,
    on=_status_join_condition,
    how='left',
)

valid_status_zoom_car_customer_data = _status_checked_data.filter(col('def_status').isNotNull())

invalid_status_zoom_car_customer_data = _status_checked_data.filter(col('def_status').isNull())

# Append the customers with an unrecognised status to the error table,
# keeping only the original source columns.
(
    invalid_status_zoom_car_customer_data
    .select('customer_id', 'email', 'name', 'phone_number', 'signup_date', 'status')
    .write
    .format("delta")
    .mode("append")
    .saveAsTable("dev_databricks_projects.zoom_data.error_customers")
)


+----------+
|def_status|
+----------+
|    active|
|  inactive|
+----------+



In [0]:
# Append the validated customer records to the dev staging table.
# NOTE(review): the frame still carries the helper columns is_email_valid and
# def_status at this point — confirm the staging table is meant to hold them.
(
    valid_status_zoom_car_customer_data.write
    .format("delta")
    .mode("append")
    .saveAsTable("dev_databricks_projects.zoom_data.staging_customers_data")
)


In [0]:
# Application of transformations:
#   1. normalize phone numbers to the standard NNN-NNN-NNNN format
#   2. compute customer tenure (days since signup) for active customers

valid_status_zoom_car_customer_data.show(5)

# Digits-only copy of the raw phone number. Kept unchanged for backward
# compatibility; note that it still contains any extension digits.
valid_status_zoom_car_customer_tmp_phone_modified_data = valid_status_zoom_car_customer_data.withColumn(
    'tmp_standard_phone_number',
    regexp_replace('phone_number', r"\D", ""),
)

# Build the standard number from the raw value:
#   - drop a trailing extension such as "x9720" so its digits are not mixed in;
#   - strip every remaining non-digit;
#   - keep the LAST ten digits, so a leading country/trunk prefix
#     ("001-", "+1-") is discarded instead of being treated as the area code.
_phone_without_extension = regexp_replace(col('phone_number'), r"[xX]\d*$", "")
_phone_digits = regexp_replace(_phone_without_extension, r"\D", "")
_national_number = regexp_extract(_phone_digits, r"(\d{10})$", 1)

# regexp_extract yields "" when fewer than ten digits are available; in that
# case standard_phone_number is left null rather than emitting a malformed value.
valid_status_zoom_car_customer_phone_modified_data = valid_status_zoom_car_customer_tmp_phone_modified_data.withColumn(
    'standard_phone_number',
    when(
        _national_number != "",
        regexp_replace(_national_number, r"^(\d{3})(\d{3})(\d{4})$", "$1-$2-$3"),
    ),
)

# Customer tenure from signup date, for active users only.
# To compute it for all users, simply remove the status filter.

current_dt = spark.sql("select current_date() as dt")
current_dtt = current_dt.collect()[0][0]
customer_tenure = (
    valid_status_zoom_car_customer_phone_modified_data
    .filter(col('status') == 'active')
    .withColumn('current_date_calc', to_date(lit(current_dtt)))
    .withColumn('tenure', date_diff(col('current_date_calc'), col('signup_date')))
    .select('signup_date', 'current_date_calc', 'tenure')
)

valid_status_zoom_car_customer_phone_modified_data.select('standard_phone_number').show(5)
customer_tenure.show(5)


+-----------+--------------------+--------------+--------------------+-----------+------+--------------+----------+
|customer_id|               email|          name|        phone_number|signup_date|status|is_email_valid|def_status|
+-----------+--------------------+--------------+--------------------+-----------+------+--------------+----------+
|  C00449138|dawnbryant@hotmai...|   Dawn Bryant|  (372)433-9737x7105| 2024-06-18|active|         VALID|    active|
|  C00479438|nicholasdoyle@out...|Nicholas Doyle|   (805)263-9518x437| 2024-01-10|active|         VALID|    active|
|  C00451562|edwardmiller@gmai...| Edward Miller|  (235)824-0661x9203| 2024-02-12|active|         VALID|    active|
|  C00455400|candicewhite@gmai...| Candice White| (935)209-6798x81073| 2024-05-18|active|         VALID|    active|
|  C00444997|sherryward@gmail.com|   Sherry Ward|001-495-420-0175x...| 2024-01-10|active|         VALID|    active|
+-----------+--------------------+--------------+--------------------+--