In [0]:
spark.conf.set(
"fs.azure.account.key.myrgstore.dfs.core.windows.net",
"Cdhaz7zz+7n======================================================================")

In [0]:
display(dbutils.fs.ls("abfss://raw@myrgstore.dfs.core.windows.net"))

path,name,size,modificationTime
abfss://raw@myrgstore.dfs.core.windows.net/accounts.csv,accounts.csv,237,1733357495000
abfss://raw@myrgstore.dfs.core.windows.net/customers.csv,customers.csv,629,1733357495000
abfss://raw@myrgstore.dfs.core.windows.net/loan_payments.csv,loan_payments.csv,277,1733357495000
abfss://raw@myrgstore.dfs.core.windows.net/loans.csv,loans.csv,271,1733357495000
abfss://raw@myrgstore.dfs.core.windows.net/transactions.csv,transactions.csv,408,1733357495000


In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("DataCleaning").getOrCreate()

# Read data from raw container
accounts_df = spark.read.csv("abfss://raw@myrgstore.dfs.core.windows.net/accounts.csv", header=True)
customers_df = spark.read.csv("abfss://raw@myrgstore.dfs.core.windows.net/customers.csv", header=True)
loan_payments_df = spark.read.csv("abfss://raw@myrgstore.dfs.core.windows.net/loan_payments.csv", header=True)
loans_df = spark.read.csv("abfss://raw@myrgstore.dfs.core.windows.net/loans.csv", header=True)
transactions_df = spark.read.csv("abfss://raw@myrgstore.dfs.core.windows.net/transactions.csv", header=True)

accounts_df.show(5)
customers_df.show(5)
loan_payments_df.show(5)
loans_df.show(5)
transactions_df.show(5)

+----------+-----------+------------+-------+
|account_id|customer_id|account_type|balance|
+----------+-----------+------------+-------+
|         1|          1|    Checking|   1000|
|         2|          1|     Savings|   5000|
|         3|          2|    Checking|   1500|
|         4|          2|  Investment|   7500|
|         5|          3|     Savings|   2000|
+----------+-----------+------------+-------+
only showing top 5 rows

+-----------+----------+---------+------------+-----------+-----+-----+
|customer_id|first_name|last_name|     address|       city|state|  zip|
+-----------+----------+---------+------------+-----------+-----+-----+
|          1|      John|      Doe|  123 Elm St|Springfield|   IL|62701|
|          2|      Jane|    Smith|  456 Oak St|    Chicago|   IL|60614|
|          3|     Emily|  Johnson| 789 Pine St|     Dallas|   TX|75201|
|          4|   Michael| Williams|101 Maple St|    Seattle|   WA|98101|
|          5|     Sarah|    Brown|202 Birch St|   New Yor

In [0]:
# Remove null values
accounts_df = accounts_df.dropna()
customers_df = customers_df.dropna()
loan_payments_df = loan_payments_df.dropna()
loans_df = loans_df.dropna()
transactions_df = transactions_df.dropna()

In [0]:
# Remove Duplicates
accounts_df = accounts_df.dropDuplicates()
customers_df = customers_df.dropDuplicates()
loan_payments_df = loan_payments_df.dropDuplicates()
loans_df = loans_df.dropDuplicates()
transactions_df = transactions_df.dropDuplicates()

In [0]:
# Renaming the column name from zip to postal_code from customer file.
customers_df = customers_df.withColumnRenamed("zip", "postal_code")

In [0]:
# Change data types for customer table Converting 'customer_id' and 'postal_code' to Integer

from pyspark.sql import functions as F

# Change data types for customer table Converting 'customer_id' and 'postal_code' to Integer

customers_df = (customers_df
.withColumn("customer_id", F.col("customer_id").cast("int"))
.withColumn("postal_code", F.col("postal_code").cast("int"))
)

# Display the updated schema to verify changes

customers_df.printSchema()

# Display the updated schema to verify changes

customers_df.show(5)

root
 |-- customer_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: integer (nullable = true)

+-----------+----------+---------+------------+-----------+-----+-----------+
|customer_id|first_name|last_name|     address|       city|state|postal_code|
+-----------+----------+---------+------------+-----------+-----+-----------+
|          9|    Olivia|    Davis|  606 Fir St|     Boston|   MA|       2101|
|          6|     David|    Jones|303 Cedar St|Los Angeles|   CA|      90001|
|          5|     Sarah|    Brown|202 Birch St|   New York|   NY|      10001|
|          2|      Jane|    Smith|  456 Oak St|    Chicago|   IL|      60614|
|          3|     Emily|  Johnson| 789 Pine St|     Dallas|   TX|      75201|
+-----------+----------+---------+------------+-----------+-----+-----------+
only show

In [0]:
# Change data types for accounts table Converting 'customer_id', 'account_id' to integer and 'balance' to double

accounts_df = (accounts_df
.withColumn("account_id", F.col("account_id").cast("int"))
.withColumn("customer_id", F.col("customer_id").cast("int"))
.withColumn("balance", F.col("balance").cast("double"))
)

# Display the updated schema to verify changes

accounts_df.printSchema()

# Display the updated schema to verify changes

accounts_df.show(5)

root
 |-- account_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- account_type: string (nullable = true)
 |-- balance: double (nullable = true)

+----------+-----------+------------+-------+
|account_id|customer_id|account_type|balance|
+----------+-----------+------------+-------+
|         9|          7|  Investment| 8000.0|
|         8|          6|     Savings| 6000.0|
|         5|          3|     Savings| 2000.0|
|         4|          2|  Investment| 7500.0|
|         1|          1|    Checking| 1000.0|
+----------+-----------+------------+-------+
only showing top 5 rows



In [0]:
# Change data types for loan_payments table converting ‘payment_id’, ‘loan_id’ to Int and ‘payment_date’ to date and ‘payment_amount’ to double.

loan_payments_df = (loan_payments_df
.withColumn("payment_id", F.col("payment_id").cast("int"))
.withColumn("loan_id", F.col("loan_id").cast("int"))
.withColumn("payment_date", F.to_date(F.col("payment_date"), "yyyy-MM-dd"))

# Adjust date format as needed

.withColumn("payment_amount", F.col("payment_amount").cast("double")) 
)

# Display the updated schema to verify changes

loan_payments_df.printSchema()

# Display the updated schema to verify changes

loan_payments_df.show(5)

root
 |-- payment_id: integer (nullable = true)
 |-- loan_id: integer (nullable = true)
 |-- payment_date: date (nullable = true)
 |-- payment_amount: double (nullable = true)

+----------+-------+------------+--------------+
|payment_id|loan_id|payment_date|payment_amount|
+----------+-------+------------+--------------+
|         9|      5|  2024-05-10|         400.0|
|         3|      2|  2024-01-20|         150.0|
|         8|      4|  2024-04-01|         200.0|
|         1|      1|  2024-01-15|         250.0|
|         5|      3|  2024-01-25|         400.0|
+----------+-------+------------+--------------+
only showing top 5 rows



In [0]:
# Change data types for loans table converting ‘loan_id’, ‘customer_id’ and ‘loan_term’ to Int and ‘loan_amount’, ‘interest_rate’ to double

loans_df = (loans_df
.withColumn("loan_id", F.col("loan_id").cast("int"))
.withColumn("customer_id", F.col("customer_id").cast("int"))
.withColumn("loan_amount", F.col("loan_amount").cast("double"))
.withColumn("interest_rate", F.col("interest_rate").cast("double"))
.withColumn("loan_term", F.col("loan_term").cast("int"))
)

# Display the updated schema to verify changes

loans_df.printSchema()

# Display the updated schema to verify changes

loans_df.show(5)

root
 |-- loan_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- loan_amount: double (nullable = true)
 |-- interest_rate: double (nullable = true)
 |-- loan_term: integer (nullable = true)

+-------+-----------+-----------+-------------+---------+
|loan_id|customer_id|loan_amount|interest_rate|loan_term|
+-------+-----------+-----------+-------------+---------+
|      6|          6|     8000.0|          3.9|       24|
|      7|          7|     9500.0|          4.1|       30|
|      5|          5|    12000.0|          4.5|       48|
|      3|          3|     6000.0|         3.75|       18|
|      1|          1|     5000.0|          3.5|       12|
+-------+-----------+-----------+-------------+---------+
only showing top 5 rows



In [0]:
# Change data types for transactions table converting ‘transaction_id’, ‘account_id’ and ‘transaction_date’ to Date and ‘trannsaction_amount’’ to double

transactions_df = (transactions_df
.withColumn("transaction_id", F.col("transaction_id").cast("int"))
.withColumn("account_id", F.col("account_id").cast("int"))
.withColumn("transaction_date", F.to_date(F.col("transaction_date"),
"yyyy-MM-dd"))
.withColumn("transaction_amount",
F.col("transaction_amount").cast("double"))
)

# Display the updated schema to verify changes

transactions_df.printSchema()

# Display the updated schema to verify changes

transactions_df.show(5)

root
 |-- transaction_id: integer (nullable = true)
 |-- account_id: integer (nullable = true)
 |-- transaction_date: date (nullable = true)
 |-- transaction_amount: double (nullable = true)
 |-- transaction_type: string (nullable = true)

+--------------+----------+----------------+------------------+----------------+
|transaction_id|account_id|transaction_date|transaction_amount|transaction_type|
+--------------+----------+----------------+------------------+----------------+
|             4|         2|      2024-09-04|             -50.0|      Withdrawal|
|             8|         6|      2024-09-08|            -300.0|      Withdrawal|
|            10|         8|      2024-09-10|            -150.0|      Withdrawal|
|             7|         5|      2024-09-07|             250.0|         Deposit|
|             5|         3|      2024-09-05|             150.0|         Deposit|
+--------------+----------+----------------+------------------+----------------+
only showing top 5 rows



In [0]:
# Define the paths to the Silver container

silver_accounts = "abfss://silver@myrgstore.dfs.core.windows.net/delta/accounts_delta"

silver_customers = "abfss://silver@myrgstore.dfs.core.windows.net/delta/customers_delta"

silver_loan_payments = "abfss://silver@myrgstore.dfs.core.windows.net/delta/loan_payments_delta"

silver_loans = "abfss://silver@myrgstore.dfs.core.windows.net/delta/loans_delta"

silver_transactions = "abfss://silver@myrgstore.dfs.core.windows.net/delta/transactions_delta"

In [0]:
# Write the cleaned data back to the Silver container

accounts_df.write.format("delta").mode("overwrite").save(silver_accounts)
customers_df.write.format("delta").mode("overwrite").save(silver_customers)
loan_payments_df.write.format("delta").mode("overwrite").save(silver_loan_payments)
loans_df.write.format("delta").mode("overwrite").save(silver_loans)
transactions_df.write.format("delta").mode("overwrite").save(silver_transactions)

In [0]:
# Define the paths to the Silver container

silver_accounts = "abfss://silver@myrgstore.dfs.core.windows.net/delta/accounts_delta"

silver_customers = "abfss://silver@myrgstore.dfs.core.windows.net/delta/customers_delta"

silver_loan_payments = "abfss://silver@myrgstore.dfs.core.windows.net/delta/loan_payments_delta"

silver_loans = "abfss://silver@myrgstore.dfs.core.windows.net/delta/loans_delta"

silver_transactions = "abfss://silver@myrgstore.dfs.core.windows.net/delta/transactions_delta"

silver_delta = "abfss://silver@myrgstore.dfs.core.windows.net/delta/silver_delta"