In [47]:
from pyspark.sql import SparkSession
import getpass
# Login name of the user running the kernel; used to build the per-user
# Hive warehouse directory below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
# - spark.ui.port=0: presumably lets the UI bind to any free port so several
#   sessions can coexist on a shared gateway -- TODO confirm for this cluster.
# - spark.shuffle.useOldFetchProtocol=true: Spark's compatibility switch for
#   fetching shuffle blocks from an older external shuffle service.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)


In [48]:
# Bare expression: lets the notebook render the session object as a sanity check.
spark

In [49]:
# Exploratory first read of the raw repayments CSV: take column names from the
# header row and let Spark infer the column types.
# NOTE(review): the path hard-codes one user's HDFS home (itv017244) while the
# session config derives the home from getpass.getuser() -- confirm intended.
loans_repay_raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("/user/itv017244/lendingclubproject/raw/loans_repayments_csv")
)

In [50]:
# Preview the raw frame (column names still come straight from the CSV header).
loans_repay_raw_df

loan_id,total_rec_prncp,total_rec_int,total_rec_late_fee,total_pymnt,last_pymnt_amnt,last_pymnt_d,next_pymnt_d
56633077,3000.0,376.21,0.0,3376.205975527,93.74,Aug-2018,
55927518,15600.0,1956.32,0.0,17556.320693408998,487.9,Aug-2018,
56473345,20000.0,2408.94,0.0,22408.9398096902,9677.72,May-2017,
56463188,11200.0,5231.01,0.0,16431.0146429476,7475.86,Feb-2018,
56473316,5215.47,6513.51,0.0,13237.07,432.77,Nov-2017,
56663266,6477.26,4221.77,0.0,12544.05,458.45,Jun-2017,
56483027,10000.0,2062.03,0.0,12062.026276051,335.38,Aug-2018,
56613385,13932.74,11678.23,0.0,25610.97,609.46,Feb-2019,Apr-2019
56643620,16000.0,1031.67,0.0,17031.673055266598,8363.28,Mar-2017,
56533114,3478.66,3815.07,0.0,12460.86,1111.52,Mar-2016,


In [51]:
# Print the schema of the exploratory read; the recorded output lists every
# column as string, which motivates the explicit schema defined next.
loans_repay_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_rec_prncp: string (nullable = true)
 |-- total_rec_int: string (nullable = true)
 |-- total_rec_late_fee: string (nullable = true)
 |-- total_pymnt: string (nullable = true)
 |-- last_pymnt_amnt: string (nullable = true)
 |-- last_pymnt_d: string (nullable = true)
 |-- next_pymnt_d: string (nullable = true)



In [52]:
# Explicit DDL schema for the repayments CSV. Columns are listed in the same
# order as the file's header and get descriptive names; amounts are read as
# float, the two date-like columns stay strings.
# NOTE(review): float is 32-bit, so long decimals in the raw file are rounded
# on read (compare total_pymnt before/after) -- confirm that is acceptable.
loans_repay_schema = ",".join((
    "loan_id string",
    "total_principal_received float",
    "total_interest_received float",
    "total_late_fee_received float",
    "total_payment_received float",
    "last_payment_amount float",
    "last_payment_date string",
    "next_payment_date string",
))

In [53]:
# Re-read the same CSV, this time applying the explicit schema instead of
# inference. This rebinds loans_repay_raw_df to the typed, renamed frame.
loans_repay_raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(loans_repay_schema)
    .load("/user/itv017244/lendingclubproject/raw/loans_repayments_csv")
)

In [54]:
# Verify that the explicit schema (renamed columns, float amounts) was applied.
loans_repay_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_principal_received: float (nullable = true)
 |-- total_interest_received: float (nullable = true)
 |-- total_late_fee_received: float (nullable = true)
 |-- total_payment_received: float (nullable = true)
 |-- last_payment_amount: float (nullable = true)
 |-- last_payment_date: string (nullable = true)
 |-- next_payment_date: string (nullable = true)



In [55]:
from pyspark.sql.functions import current_timestamp

In [56]:
# Append an ingest_date audit column holding the query's current timestamp.
# The value is resolved when a query executes, so separate actions on this
# frame may record slightly different timestamps.
loan_repay_df_ingestd = loans_repay_raw_df.withColumn(
    "ingest_date",
    current_timestamp(),
)

In [57]:
# Preview the frame with the new ingest_date column.
loan_repay_df_ingestd

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
56633077,3000.0,376.21,0.0,3376.206,93.74,Aug-2018,,2025-03-18 14:54:...
55927518,15600.0,1956.32,0.0,17556.32,487.9,Aug-2018,,2025-03-18 14:54:...
56473345,20000.0,2408.94,0.0,22408.94,9677.72,May-2017,,2025-03-18 14:54:...
56463188,11200.0,5231.01,0.0,16431.014,7475.86,Feb-2018,,2025-03-18 14:54:...
56473316,5215.47,6513.51,0.0,13237.07,432.77,Nov-2017,,2025-03-18 14:54:...
56663266,6477.26,4221.77,0.0,12544.05,458.45,Jun-2017,,2025-03-18 14:54:...
56483027,10000.0,2062.03,0.0,12062.026,335.38,Aug-2018,,2025-03-18 14:54:...
56613385,13932.74,11678.23,0.0,25610.97,609.46,Feb-2019,Apr-2019,2025-03-18 14:54:...
56643620,16000.0,1031.67,0.0,17031.674,8363.28,Mar-2017,,2025-03-18 14:54:...
56533114,3478.66,3815.07,0.0,12460.86,1111.52,Mar-2016,,2025-03-18 14:54:...


In [58]:
# Expose the ingested frame to Spark SQL under the view name "loan_repayment".
loan_repay_df_ingestd.createOrReplaceTempView("loan_repayment")

In [59]:
# Count rows whose total_payment_received is null (missing in the file or not
# parseable as float under the explicit schema).
spark.sql("select count(*) from loan_repayment where total_payment_received is null")

count(1)
94


In [60]:
# Total row count before any cleaning; baseline for the drop below.
loan_repay_df_ingestd.count()

2260701

In [61]:
# Numeric amount columns that must all be non-null for a row to be kept.
columns_to_check = [
    "total_principal_received",
    "total_interest_received",
    "total_late_fee_received",
    "total_payment_received",
    "last_payment_amount",
]

In [62]:
# Drop every row that has a null in any of the amount columns
# (dropna is the DataFrame-level alias of na.drop; default how="any").
loans_repay_filtered_df = loan_repay_df_ingestd.dropna(subset=columns_to_check)

In [63]:
# Row count after dropping rows with null amounts; compare with the baseline
# count of the ingested frame.
loans_repay_filtered_df.count()

2260498

In [64]:
# Re-register "loan_repayment" so the view now points at the null-filtered
# frame (this replaces the earlier view of the same name).
loans_repay_filtered_df.createOrReplaceTempView("loan_repayment")

In [65]:
# Inspect inconsistent rows: total payment is zero although some principal was
# received.
# NOTE(review): in the recorded output these rows carry 0.0 in
# last_payment_date and a month-year string in next_payment_date, which looks
# like the source fields are shifted for these records -- confirm against the
# raw file.
spark.sql("select * from loan_repayment where total_payment_received= 0.0  and total_principal_received != 0.0")

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
1064185,11600.98,11600.98,10000.0,0.0,0.0,0.0,Dec-2014,2025-03-18 14:54:...
516382,21890.229,21856.03,16000.0,0.0,0.0,0.0,Mar-2014,2025-03-18 14:54:...
528899,3045.0364,3019.64,2500.0,0.0,0.0,0.0,Jan-2013,2025-03-18 14:54:...
527598,2398.9092,2220.51,2200.0,0.0,0.0,0.0,Jul-2011,2025-03-18 14:54:...
525697,21797.86,19894.9,15750.0,0.0,0.0,0.0,Jun-2015,2025-03-18 14:54:...
522641,3146.8193,3146.82,3000.0,0.0,0.0,0.0,Sep-2011,2025-03-18 14:54:...
515655,29938.576,29905.75,22800.0,0.0,0.0,0.0,May-2013,2025-03-18 14:54:...
501234,15219.313,15155.9,12000.0,0.0,0.0,0.0,May-2013,2025-03-18 14:54:...
498194,11642.714,11031.47,10000.0,0.0,0.0,0.0,Jan-2013,2025-03-18 14:54:...
495171,11138.843,10024.96,10000.0,0.0,0.0,0.0,Apr-2013,2025-03-18 14:54:...


In [67]:
from pyspark.sql.functions import when, col

In [70]:
# Repair total_payment_received: where it is 0.0 although principal was
# received, recompute it as principal + interest + late fee; otherwise keep
# the stored value.
# NOTE(review): the affected rows look column-shifted in the inspection output
# above, so this sum may not be a true payment total -- confirm the rule.
loans_payment_fixed_df = loans_repay_filtered_df.withColumn(
    "total_payment_received",
    when(
        (col("total_payment_received") == 0.0)
        & (col("total_principal_received") != 0.0),
        col("total_principal_received")
        + col("total_interest_received")
        + col("total_late_fee_received"),
    ).otherwise(col("total_payment_received")),
)

In [79]:
# How many rows still have a zero total payment after the repair?
loans_payment_fixed_df.where("total_payment_received == 0.0").count()

949

In [80]:
# Keep only rows with a non-zero total payment; the remaining zero-payment
# rows are discarded.
loans_payment_fixed_df2 = loans_payment_fixed_df.where("total_payment_received != 0.0")

In [81]:
# Sanity check: no zero-total rows should remain (expected count: 0).
loans_payment_fixed_df2.where("total_payment_received == 0.0").count()

0

In [83]:
# Count rows where last_payment_date holds the numeric sentinel 0.0 instead of
# a date. The column is a string, so this relies on Spark's implicit
# string-to-number comparison.
loans_payment_fixed_df2.where("last_payment_date == 0.0").count()

48

In [84]:
# Count rows where next_payment_date holds the numeric sentinel 0.0 instead of
# a date (string column compared against a number via implicit cast).
loans_payment_fixed_df2.where("next_payment_date == 0.0").count()

24

In [85]:
# Count rows that have no last_payment_date at all.
loans_payment_fixed_df2.where("last_payment_date is null").count()

1477

In [86]:
# Count rows that have no next_payment_date at all.
loans_payment_fixed_df2.where("next_payment_date is null").count()

1344240

In [89]:
# Replace the 0.0 sentinel in last_payment_date with null; every other value
# (including existing nulls) is carried over unchanged.
loans_payment_ldate_fixed_df = loans_payment_fixed_df2.withColumn(
    "last_payment_date",
    when(col("last_payment_date") == 0.0, None).otherwise(col("last_payment_date")),
)

In [90]:
# Same sentinel clean-up for next_payment_date: 0.0 becomes null, all other
# values are kept as they are.
loans_payment_ndate_fixed_df = loans_payment_ldate_fixed_df.withColumn(
    "next_payment_date",
    when(col("next_payment_date") == 0.0, None).otherwise(col("next_payment_date")),
)

In [93]:
# Sanity check: no 0.0 sentinel should remain in last_payment_date (expected 0).
loans_payment_ndate_fixed_df.where("last_payment_date == 0.0").count()

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date


In [94]:
# Sanity check: no 0.0 sentinel should remain in next_payment_date (expected 0).
loans_payment_ndate_fixed_df.where("next_payment_date == 0.0").count()

0

In [95]:
# Persist the cleaned frame as CSV with a header row, replacing any previous
# output at the target path.
(
    loans_payment_ndate_fixed_df.write
    .option("header", True)
    .format("csv")
    .mode("overwrite")
    .option("path", "/user/itv017244/lendingclubproject/cleaned/loan_repayment_csv")
    .save()
)

In [96]:
# Persist the same cleaned frame as Parquet, replacing any previous output at
# the target path.
(
    loans_payment_ndate_fixed_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv017244/lendingclubproject/cleaned/loan_repayment_parquet")
    .save()
)

In [None]:
# Shut down the SparkSession once both outputs have been written.
spark.stop()