In [42]:
from os import truncate

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, trim, to_date, sum as spark_sum, max as spark_max, min as spark_min

spark = SparkSession.builder.appName("OnlineBankingAnalysis").getOrCreate()

loan_df = spark.read.option("header", "true").csv("/content/loan.csv", inferSchema=True)
credit_df = spark.read.option("header", "true").csv("/content/credit card.csv", inferSchema=True)
txn_df = spark.read.option("header", "true").csv("/content/txn.csv", inferSchema=True)

loan_df = loan_df.selectExpr([f"`{c}` as `{c.strip().replace(' ', '_')}`" for c in loan_df.columns])
credit_df = credit_df.selectExpr([f"`{c}` as `{c.strip().replace(' ', '_')}`" for c in credit_df.columns])
txn_df = txn_df.selectExpr([f"`{c}` as `{c.strip().replace(' ', '_')}`" for c in txn_df.columns])

loan_df = loan_df.withColumn("Loan_Amount_CLEAN", regexp_replace(trim(col("Loan_Amount")), ",", "").cast("double"))

txn_df = txn_df.withColumn("WITHDRAWAL_AMT_CLEAN", regexp_replace(trim(col("WITHDRAWAL_AMT")), ",", "").cast("double"))
txn_df = txn_df.withColumn("DEPOSIT_AMT_CLEAN", regexp_replace(trim(col("DEPOSIT_AMT")), ",", "").cast("double"))
txn_df = txn_df.withColumn("BALANCE_CLEAN", regexp_replace(trim(col("BALANCE_AMT")), ",", "").cast("double"))
txn_df = txn_df.withColumn("DATE", to_date("VALUE_DATE", "dd-MM-yyyy"))
'''
# ---------------- LOAN DATA ----------------

print("1.Number of loans in each category")
loan_df.groupBy("Loan_Category").count().show()


print("2.People who took loan > 1 lakh")
filtered_loan_1 = loan_df.filter(col("Loan_Amount_CLEAN") > 100000)
print("Count:", filtered_loan_1.count())
filtered_loan_1.show(truncate=False )

print("\n3. People with income > 60000")
filtered_income = loan_df.filter(col("Income") > 60000)
print("Count:", filtered_income.count())
filtered_income.show()

print("\n4. People with 2+ returned cheques and income < 50000")
filtered_returned_1 = loan_df.filter((col("Returned_Cheque") >= 2) & (col("Income") < 50000))
print("Count:", filtered_returned_1.count())
filtered_returned_1.show()

print("\n5. People with 2+ returned cheques and Single")
filtered_returned_2 = loan_df.filter((col("Returned_Cheque") >= 2) & (col("Marital_Status") == "SINGLE"))
print("Count:", filtered_returned_2.count())
filtered_returned_2.show()

print("\n6. People with monthly expenses > 50000")
filtered_expense = loan_df.filter(col("Expenditure") > 50000)
print("Count:", filtered_expense.count())
filtered_expense.show()


# ---------------- CREDIT CARD DATA ----------------

print("7. Credit card users in Spain")
spain_credit_users = credit_df.filter(col("Geography") == "Spain")
print("Count:", spain_credit_users.count())
spain_credit_users.show(truncate=False)


# ---------------- TRANSACTION DATA ----------------

print("8.Maximum withdrawal amount")
txn_df.select(spark_max("WITHDRAWAL_AMT_CLEAN").alias("Max_Withdrawal")).show()

print("9.Minimum withdrawal amount")
txn_df.select(spark_min("WITHDRAWAL_AMT_CLEAN").alias("Min_Withdrawal")).show()

print("10.Maximum deposit amount")
txn_df.select(spark_max("DEPOSIT_AMT_CLEAN").alias("Max_Deposit")).show()

print("11.Minimum deposit amount")
txn_df.select(spark_min("DEPOSIT_AMT_CLEAN").alias("Min_Deposit")).show()

print("12.Total balance in each account")
txn_df.groupBy("Account_No").agg(spark_sum("BALANCE_CLEAN").alias("Total_Balance")).show(truncate = False)
'''
print("\n13. Customers with withdrawal > 1 lakh")
high_withdrawal = txn_df.filter(col("WITHDRAWAL_AMT_CLEAN") > 100000).select("Account_No", "WITHDRAWAL_AMT_CLEAN").distinct()
print("Count:", high_withdrawal.count())
high_withdrawal.show(truncate=False)



13. Customers with withdrawal > 1 lakh
Count: 10058
+-------------+--------------------+
|Account_No   |WITHDRAWAL_AMT_CLEAN|
+-------------+--------------------+
|409000611074'|274600.0            |
|409000493201'|1500000.0           |
|409000493201'|199604.27           |
|409000438620'|186604.0            |
|409000438620'|3.6675558E7         |
|1196711'     |7530283.0           |
|1196428'     |812361.0            |
|1196428'     |6348768.0           |
|1196428'     |3043151.63          |
|409000362497'|576954.0            |
|409000362497'|3423962.0           |
|409000362497'|3.144482503E7       |
|1196428'     |4441827.47          |
|409000611074'|145450.0            |
|409000493201'|119401.28           |
|1196711'     |628945.0            |
|1196428'     |289670.04           |
|409000362497'|3.483281361E7       |
|409000362497'|4.289763641E7       |
|409000362497'|2.678162613E7       |
+-------------+--------------------+
only showing top 20 rows

