In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window


In [0]:

# Session-level Cosmos DB connector settings: account endpoint, account key and
# database. The endpoint and key values are blank in this notebook.
for cosmos_conf_key, cosmos_conf_value in (
    ("spark.cosmos.accountEndpoint", ""),
    ("spark.cosmos.accountKey", ""),
    ("spark.cosmos.database", "BankDB"),
):
    spark.conf.set(cosmos_conf_key, cosmos_conf_value)


In [0]:
# Cosmos DB account settings (endpoint and key are blank in this notebook).
cosmos_endpoint = ""
cosmos_key = ""
database_name = "BankDB"

# One Cosmos DB container per feed.
container_name = "ATMTransactions"
container_name2 = "UPIEvents"
container_name3 = "FraudAlerts"

# ATM transactions container
df = (
    spark.read.format("cosmos.oltp")
    .option("spark.cosmos.accountEndpoint", cosmos_endpoint)
    .option("spark.cosmos.accountKey", cosmos_key)
    .option("spark.cosmos.database", database_name)
    .option("spark.cosmos.container", container_name)
    .load()
)

# UPI events container
upi_df = (
    spark.read.format("cosmos.oltp")
    .option("spark.cosmos.accountEndpoint", cosmos_endpoint)
    .option("spark.cosmos.accountKey", cosmos_key)
    .option("spark.cosmos.database", database_name)
    .option("spark.cosmos.container", container_name2)
    .load()
)

# Fraud alerts container
fraud_df = (
    spark.read.format("cosmos.oltp")
    .option("spark.cosmos.accountEndpoint", cosmos_endpoint)
    .option("spark.cosmos.accountKey", cosmos_key)
    .option("spark.cosmos.database", database_name)
    .option("spark.cosmos.container", container_name3)
    .load()
)




In [0]:
# ADLS Gen2 storage account and the container used for each layer.
storage_account = "teststoreshamm"
bronze_container, silver_container, gold_container = "bronze", "silver", "gold"

# Bronze-layer folders, one per source feed.
bronze_path_atm = "abfss://{}@{}.dfs.core.windows.net/atm/".format(bronze_container, storage_account)
bronze_path_upi = "abfss://{}@{}.dfs.core.windows.net/upi/".format(bronze_container, storage_account)
bronze_path_fraud = "abfss://{}@{}.dfs.core.windows.net/fraud/".format(bronze_container, storage_account)



In [0]:
# --- Cosmos DB connection (endpoint and key are blank in this notebook) ---
cosmos_endpoint = ""
cosmos_key = ""
database_name = "BankDB"

# --- Cosmos DB source containers ---
container_atm = "ATMTransactions"
container_upi = "UPIEvents"
container_fraud = "FraudAlerts"

# --- ADLS storage configuration (direct account-key method) ---
storage_account_name = "teststoreshamm"
storage_account_key = ""
adls_root = f"abfss://bronze@{storage_account_name}.dfs.core.windows.net/"

# Register the storage account key on the Spark session under the
# account-specific configuration name.
spark.conf.set(
    "fs.azure.account.key.{}.dfs.core.windows.net".format(storage_account_name),
    storage_account_key,
)

# --- Delta Lake paths ---
# Bronze folders sit directly under the bronze container root.
bronze_path_atm = adls_root + "atm/"
bronze_path_upi = adls_root + "upi/"
bronze_path_fraud = adls_root + "fraud/"

# Silver and gold each live in their own container.
silver_path = "abfss://silver@{}.dfs.core.windows.net/transactions/".format(storage_account_name)
gold_path = "abfss://gold@{}.dfs.core.windows.net/fact_transactions/".format(storage_account_name)


In [0]:
# ATM transactions container
atm_df = (
    spark.read.format("cosmos.oltp")
    .option("spark.cosmos.accountEndpoint", cosmos_endpoint)
    .option("spark.cosmos.accountKey", cosmos_key)
    .option("spark.cosmos.database", database_name)
    .option("spark.cosmos.container", container_atm)
    .load()
)

# UPI events container
upi_df = (
    spark.read.format("cosmos.oltp")
    .option("spark.cosmos.accountEndpoint", cosmos_endpoint)
    .option("spark.cosmos.accountKey", cosmos_key)
    .option("spark.cosmos.database", database_name)
    .option("spark.cosmos.container", container_upi)
    .load()
)

# Fraud alerts container
fraud_df = (
    spark.read.format("cosmos.oltp")
    .option("spark.cosmos.accountEndpoint", cosmos_endpoint)
    .option("spark.cosmos.accountKey", cosmos_key)
    .option("spark.cosmos.database", database_name)
    .option("spark.cosmos.container", container_fraud)
    .load()
)


In [0]:
# Write each raw Cosmos DB extract to its bronze Delta path, overwriting
# whatever is already stored there.
for raw_frame, bronze_target in (
    (atm_df, bronze_path_atm),
    (upi_df, bronze_path_upi),
    (fraud_df, bronze_path_fraud),
):
    raw_frame.write.format("delta").mode("overwrite").save(bronze_target)


In [0]:
from pyspark.sql.functions import col, to_timestamp, upper

# ATM: upper-case the transaction type and parse TransactionTime into txn_time.
atm_clean = atm_df.withColumn(
    "txn_type", upper(col("txn_type"))
).withColumn(
    "txn_time", to_timestamp(col("TransactionTime"), format="yyyy-MM-dd HH:mm:ss")
)

# UPI: same treatment, with TxnTimestamp as the source of txn_time.
upi_clean = upi_df.withColumn(
    "txn_type", upper(col("txn_type"))
).withColumn(
    "txn_time", to_timestamp(col("TxnTimestamp"), format="yyyy-MM-dd HH:mm:ss")
)

# Fraud alerts: upper-case alertType and parse alertTime into alert_time.
fraud_clean = fraud_df.withColumn(
    "alertType", upper(col("alertType"))
).withColumn(
    "alert_time", to_timestamp(col("alertTime"), format="yyyy-MM-dd HH:mm:ss")
)

In [0]:
from pyspark.sql.functions import lit

# Tag every row with the feed it came from so the origin survives the merge.
atm_fact = atm_clean.withColumn("source", lit("ATM"))
upi_fact = upi_clean.withColumn("source", lit("UPI"))

# Stack both feeds into one fact table (columns that exist in only one feed are
# allowed via allowMissingColumns=True), then left-join the fraud alerts so
# every transaction is kept whether or not an alert matches it.
#
# Each side of the join is given a DataFrame alias and the key is referenced
# through that alias. An unqualified `col('txn_id') == col('txn_id')` compares a
# column with itself, so it cannot express "transaction key equals alert key".
# NOTE(review): assumes the fraud alerts carry the transaction key in a column
# named `txn_id` -- confirm against the FraudAlerts schema and adjust if needed.
fact_transactions = (
    atm_fact.unionByName(upi_fact, allowMissingColumns=True)
    .alias("txn")
    .join(
        fraud_clean.alias("fraud"),
        col("txn.txn_id") == col("fraud.txn_id"),
        how="left",
    )
)


In [0]:
# Ensure all column names are unique by appending a suffix to duplicates
def make_unique_columns(df):
    """Return ``df`` with repeated column names renamed so every name is unique.

    The first occurrence of a name is kept unchanged. Each later occurrence is
    renamed to ``<name>_<k>``, where ``k`` starts at 1 for that name and grows
    with every repeat. A suffix is skipped when the resulting name is already
    used by another column (for example, columns ``a, a, a_1`` become
    ``a, a_2, a_1``), so the output never contains duplicates.

    Args:
        df: Object exposing ``columns`` (ordered column names) and
            ``toDF(*names)`` (positional rename), e.g. a Spark DataFrame.

    Returns:
        The result of ``df.toDF(*unique_names)``; column order is preserved.
    """
    original_names = df.columns
    # First occurrences keep their name, so every original name is reserved
    # up front; generated names must avoid all of them.
    taken = set(original_names)
    last_suffix = {}  # name -> highest suffix handed out so far (0 = only seen once)
    unique_names = []
    for name in original_names:
        if name not in last_suffix:
            last_suffix[name] = 0
            unique_names.append(name)
            continue
        suffix = last_suffix[name] + 1
        candidate = f"{name}_{suffix}"
        # Skip suffixes that would clash with an existing or generated column.
        while candidate in taken:
            suffix += 1
            candidate = f"{name}_{suffix}"
        last_suffix[name] = suffix
        taken.add(candidate)
        unique_names.append(candidate)
    return df.toDF(*unique_names)

# Rename repeated column names (the join keeps columns from both sides) before
# the frame is displayed and saved.
fact_transactions = make_unique_columns(fact_transactions)

# Render the frame in the notebook.
display(fact_transactions)

# Save the fact table to the silver Delta path, overwriting existing data.
(
    fact_transactions.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path)
)

In [0]:
from pyspark.sql.functions import count, to_date

# Gold aggregate: number of rows per transaction date (date part of txn_time)
# and transaction type.
gold_df = (
    fact_transactions
    .groupBy(to_date(col("txn_time")).alias("txn_date"), col("txn_type"))
    .agg(count("*").alias("total_txn"))
)

# Save the aggregate to the gold Delta path, overwriting existing data.
(
    gold_df.write
    .format("delta")
    .mode("overwrite")
    .save(gold_path)
)