In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Build (or reuse) a Spark session that runs locally under the app name "Bank SG".
# NOTE(review): "spark.some.config.option" looks like a placeholder setting --
# confirm whether it is actually needed.
session_builder = SparkSession.builder.master("local").appName("Bank SG")
session_builder = session_builder.config("spark.some.config.option", "some-value")
spark = session_builder.getOrCreate()

In [3]:
def _load_semicolon_csv(path):
    """Read a ';'-separated CSV that has a header row, letting Spark infer column types."""
    reader = (
        spark.read.format("csv")
        .option("sep", ";")
        .option("inferSchema", "true")
        .option("header", "true")
    )
    return reader.load(path)


# Customer
cus = _load_semicolon_csv("customer")
# Account
acc = _load_semicolon_csv("Account")
# Product Rule
prodRule = _load_semicolon_csv("Product Rules")
# Statement
stat = _load_semicolon_csv("Statement")

In [4]:
# Print each loaded table in turn: customer, account, product rules, statement.
for loaded_frame in (cus, acc, prodRule, stat):
    loaded_frame.show()

+---+------+-----------+--------------+----------+----+
| id| accNo|phoneNumber|       address|       PAN|Name|
+---+------+-----------+--------------+----------+----+
|  1|ac0001| 9999999999|Taj Mahal,Agra|ARSPP****7| Ram|
+---+------+-----------+--------------+----------+----+

+------+--------------------+------+----------+------+
|  acNo|              branch|  type|    openOn|status|
+------+--------------------+------+----------+------+
|ac0001|Pritech 11 Block,...|saving|10-01-2001|active|
+------+--------------------+------+----------+------+

+-------------+-------+------------+
|      Product|   rule|      amount|
+-------------+-------+------------+
|     car loan|dti<0.5| salary*12*3|
|personal loan|dti<0.7| salary*12*5|
|    home loan|dti<0.3|salary*12*10|
+-------------+-------+------------+

+------+--------+-------+-------------+
| accNo|credited|debited|       source|
+------+--------+-------+-------------+
|ac0001|    1000|   null|       salary|
|ac0001|    null|    10

In [5]:
# Mark the customer DataFrame to be persisted -- presumably because it is used
# again in later cells (the join and the schema inspection). As the cell's last
# expression, the returned DataFrame is echoed as the cell output.
cus.persist()

DataFrame[id: int, accNo: string, phoneNumber: bigint, address: string, PAN: string, Name: string]

In [None]:
# Finding DTI:



In [5]:
# Finding DTI (presumably debt-to-income): total debits divided by total
# salary credits, per account.

# Treat a missing amount as 0 in BOTH amount columns. Previously only
# `credited` was filled, so an account whose debits were all null ended up
# with a null totDebited and therefore a null DTI.
stat1 = stat
for amount_col in ("credited", "debited"):
    stat1 = stat1.withColumn(
        amount_col,
        when(col(amount_col).isNull(), 0).otherwise(col(amount_col)),
    )

# NOTE: `sum` here is pyspark.sql.functions.sum (brought in by the wildcard
# import), not Python's builtin.
# Total debited per account, across every statement row.
statSumDeb = stat1.groupBy("accNo").agg(sum("debited").alias("totDebited"))

# Total credited per account, counting only rows whose source is "salary".
statSumCre = (
    stat1.filter(stat1.source == "salary")
    .groupBy("accNo")
    .agg(sum("credited").alias("totCredited"))
)

# Inner join on the account number: accounts with no salary row drop out here.
statDebCre = statSumCre.join(statSumDeb, "accNo")

# Only divide when there is a non-zero salary total. With no `otherwise`, the
# DTI is null for a zero total instead of relying on Spark's divide-by-zero
# behaviour, which raises an error when ANSI mode is enabled.
DTI_DF = statDebCre.withColumn(
    "DTI",
    when(col("totCredited") != 0, col("totDebited") / col("totCredited")),
)

In [6]:
# Print the per-account salary totals, then the frame with the computed DTI.
for dti_frame in (statSumCre, DTI_DF):
    dti_frame.show()

+------+-----------+
| accNo|totCredited|
+------+-----------+
|ac0001|       1000|
+------+-----------+

+------+-----------+----------+----+
| accNo|totCredited|totDebited| DTI|
+------+-----------+----------+----+
|ac0001|       1000|       350|0.35|
+------+-----------+----------+----+



In [7]:
#Final output

# Struct returned by the loan UDF: the product name and the loan amount.
loan_name_field = StructField("loan", StringType())
loan_amount_field = StructField("loanAmt", IntegerType())
schema = StructType([loan_name_field, loan_amount_field])

def prodAndLoan(a, b):
    """Pick a loan product and amount from a DTI ratio and a salary total.

    Args:
        a: DTI ratio for the account; None when Spark passes a SQL null.
        b: Total salary credited for the account; None when Spark passes a
            SQL null.

    Returns:
        A ``(loan, loanAmt)`` tuple: ``("Personal loan", b * 12 * 5)`` when
        ``a`` is strictly greater than 0.30, otherwise ``("No loan", 0)``.
        A None in either input also yields ``("No loan", 0)``.
    """
    # Spark hands SQL nulls to a Python UDF as None. Comparing or multiplying
    # None raises TypeError and would fail the whole job, so treat a missing
    # value as "no loan".
    if a is None or b is None:
        return ("No loan", 0)

    # NOTE(review): the Product Rules data lists personal loan with the rule
    # "dti<0.7"; this threshold and direction do not obviously match it --
    # confirm the intended rule.
    if a > 0.30:
        return ("Personal loan", b * 12 * 5)
    return ("No loan", 0)
    
# Wrap the Python rule as a Spark UDF that returns the (loan, loanAmt) struct.
prod_prop = udf(prodAndLoan, schema)

# Attach each customer's name to the DTI figures via the shared account number.
a = (
    cus.join(DTI_DF, "accNo")
    .select("accNo", "Name", "totCredited", "DTI")
)

# Add the UDF result as a struct column named "newCol".
b = a.withColumn("newCol", prod_prop(a["DTI"], a["totCredited"]))

# Show the result without truncating cell contents.
b.show(truncate=False)
    

+------+----+-----------+----+----------------------+
|accNo |Name|totCredited|DTI |newCol                |
+------+----+-----------+----+----------------------+
|ac0001|Ram |1000       |0.35|[Personal loan, 60000]|
+------+----+-----------+----+----------------------+



In [8]:
# Flatten the loan struct into top-level columns for the final report.
# Both struct fields are now referenced through "newCol" with its exact casing;
# the previous "newcol.loanAmt" only resolved because Spark's column resolution
# is case-insensitive by default.
b.select(
    b.accNo.alias("customer ACC"),
    "Name",
    "DTI",
    month(current_timestamp()),
    "newCol.loan",
    "newCol.loanAmt",
).show()

+------------+----+----+--------------------------+-------------+-------+
|customer ACC|Name| DTI|month(current_timestamp())|         loan|loanAmt|
+------------+----+----+--------------------------+-------------+-------+
|      ac0001| Ram|0.35|                        10|Personal loan|  60000|
+------------+----+----+--------------------------+-------------+-------+



In [9]:
# Display the joined customer/DTI frame (the one without the loan struct column).
a.show()

+------+----+-----------+----+
| accNo|Name|totCredited| DTI|
+------+----+-----------+----+
|ac0001| Ram|       1000|0.35|
+------+----+-----------+----+



In [10]:
# List the column names in the customer DataFrame's schema.
cus.schema.fieldNames()

['id', 'accNo', 'phoneNumber', 'address', 'PAN', 'Name']