#### 1. Remove records with NULL customer_id.
#### 2. Remove exact duplicate records.
#### 3. Remove duplicate records based on created_timestamp (keep the latest record per customer_id).
#### 4. Cast the columns to the correct datatype.
#### 5. Write transformed data to Silver schema.   

In [0]:
# Load every customer record from the Bronze table; this DataFrame is the
# starting point for all the cleaning steps below.
df_customers_raw = spark.sql('SELECT * FROM gizmobox.bronze.py_customers')
display(df_customers_raw)

#### 1, 2. Remove records with NULL customer_id and remove exact duplicate records.


In [0]:
# Keep only the rows that carry a customer_id, then collapse rows that are
# identical across every column into a single row.
df_customers_filtered = (
    df_customers_raw
    .where(df_customers_raw["customer_id"].isNotNull())
    .dropDuplicates()
)
display(df_customers_filtered)

#### 3. Remove duplicate records based on created_timestamp (keep the latest record per customer_id).

In [0]:
# Import the functions module under an alias rather than
# `from pyspark.sql.functions import max`, which would rebind Python's
# built-in max() to the Spark aggregate for every later cell in the kernel.
from pyspark.sql import functions as F

# One row per customer_id, carrying that customer's greatest created_timestamp.
# NOTE(review): created_timestamp is only cast to timestamp in step 4, so this
# max() compares the values as stored in Bronze — confirm that ordering matches
# chronological order.
df_customers_max = (
    df_customers_filtered
    .groupBy("customer_id")
    .agg(F.max("created_timestamp").alias("max_timestamp"))
)
display(df_customers_max)

In [0]:
# For each customer keep only the row(s) whose created_timestamp equals that
# customer's maximum. The final select keeps just the columns of
# df_customers_filtered, dropping the helper columns from df_customers_max.
same_customer = df_customers_filtered.customer_id == df_customers_max.customer_id
is_latest_record = df_customers_filtered.created_timestamp == df_customers_max.max_timestamp

df_distinct_customer = (
    df_customers_filtered
    .join(df_customers_max, on=same_customer & is_latest_record, how='inner')
    .select(df_customers_filtered['*'])
)
display(df_distinct_customer)


#### 4. Cast the columns to the correct datatype.

In [0]:
# Project the seven Silver columns in their final order. date_of_birth and
# member_since are cast to date, created_timestamp to timestamp; the other
# columns pass through unchanged.
df_customers_final = df_distinct_customer.select(
    df_distinct_customer["customer_id"],
    df_distinct_customer["customer_name"],
    df_distinct_customer["date_of_birth"].cast('date'),
    df_distinct_customer["email"],
    df_distinct_customer["member_since"].cast('date'),
    df_distinct_customer["telephone"],
    df_distinct_customer["created_timestamp"].cast('timestamp'),
)
display(df_customers_final)

#### 5. Write transformed data to Silver schema.

In [0]:
# Persist the cleaned customers as the Silver table, creating it if it does
# not exist and replacing it if it does.
(
    df_customers_final
    .writeTo("gizmobox.silver.py_customers")
    .createOrReplace()
)


In [0]:
# Read the Silver table back to check what was written.
df_customers_silver = spark.sql('SELECT * FROM gizmobox.silver.py_customers')
display(df_customers_silver)