### Imported Libraries

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

### Read Data

In [0]:
# Load the raw transactions CSV from the bronze location.
# The first row supplies the column names; column types are inferred from the data.
transactions_df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("s3://end-to-end-banking-data-pipeline/bronze/raw_transactions")
)

In [0]:
# Print the inferred schema (column names, types, nullability) to verify the load
transactions_df.printSchema()

root
 |-- txn_id: string (nullable = true)
 |-- account_id: string (nullable = true)
 |-- txn_date: timestamp (nullable = true)
 |-- txn_type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- channel: string (nullable = true)
 |-- date: date (nullable = true)



### Transformations

In [0]:
# Replace null values with 0.
# NOTE(review): a numeric fill value is only applied to numeric columns, so
# nulls in string/timestamp/date columns are left untouched — confirm that
# zero-filling just the numeric fields is the intended handling.
transactions_df = transactions_df.na.fill(0)

In [0]:
# Remove rows that are exact duplicates across every column
transactions_df = transactions_df.drop_duplicates()

In [0]:
# Derive the calendar date of each transaction from its timestamp.
# withColumn replaces the column if one named "date" is already present.
transactions_df = transactions_df.withColumn(
    "date",
    to_date("txn_date"),
)

In [0]:
# Normalise every column name to lowercase.
# toDF renames columns positionally, so a header containing a dot or other
# special character is handled as a plain name instead of being parsed as a
# column expression (which is what col(name) would do, and fail on).
transactions_df = transactions_df.toDF(
    *[name.lower() for name in transactions_df.columns]
)

In [0]:
# Persist the cleaned transactions as CSV (with a header row) to the silver
# location, replacing any output already stored at that path.
(
    transactions_df.write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save("s3://end-to-end-banking-data-pipeline/silver/cleaned_transactions")
)