In [40]:
from pyspark.sql import SparkSession
import getpass
# Current OS user; used below to build a per-user warehouse directory.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    # NOTE(review): '0' presumably lets Spark pick any free UI port - confirm.
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)


In [41]:
# Bare expression: the notebook shows the session object as this cell's output.
spark

In [42]:
# Exploratory load of the raw defaulters CSV, letting Spark infer column types.
# NOTE(review): the variable name is spelled "deafulter"; it is kept because
# later cells refer to it by this name.
# NOTE(review): the path hard-codes a user directory even though `username`
# is available - confirm whether it should be derived from it.
loans_deafulter_raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("/user/itv017244/lendingclubproject/raw/loans_defaulters_csv")
)

In [43]:
# Bare expression: display the schema-inferred frame as the cell output.
loans_deafulter_raw_df

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
bfcec8da7b3f1fe83...,1.0,0.0,0.0,0.0,2.0,0.0,11.0,
36399458db4295868...,0.0,0.0,1.0,1.0,0.0,0.0,,63.0
e4912b650878ca941...,0.0,0.0,0.0,0.0,0.0,0.0,67.0,
3d6f35ad2e2be4572...,0.0,0.0,1.0,1.0,0.0,0.0,,66.0
478e1cd37e35c20f7...,1.0,0.0,1.0,1.0,0.0,0.0,15.0,108.0
a5d42e35cc0be2789...,0.0,0.0,1.0,1.0,1.0,0.0,,83.0
7f3ef55c784bf6b16...,0.0,0.0,0.0,0.0,0.0,0.0,41.0,
beb30abaeca08f49b...,2.0,0.0,0.0,0.0,0.0,0.0,14.0,
233e5e6797dd0a68b...,0.0,0.0,1.0,1.0,1.0,0.0,58.0,85.0
c4b178ffaf80ed472...,1.0,0.0,0.0,0.0,0.0,0.0,17.0,


In [44]:
# Print the inferred column types. In the recorded output several
# numeric-looking columns (e.g. delinq_2yrs, pub_rec) come back as string.
loans_deafulter_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: string (nullable = true)
 |-- delinq_amnt: double (nullable = true)
 |-- pub_rec: string (nullable = true)
 |-- pub_rec_bankruptcies: double (nullable = true)
 |-- inq_last_6mths: string (nullable = true)
 |-- total_rec_late_fee: string (nullable = true)
 |-- mths_since_last_delinq: string (nullable = true)
 |-- mths_since_last_record: string (nullable = true)



In [45]:
# Register the schema-inferred frame as the SQL view "loans_defaulters".
# Later cells register other frames under this same view name.
loans_deafulter_raw_df.createOrReplaceTempView("loans_defaulters")

In [46]:
# List the distinct delinq_2yrs values; the recorded output contains
# non-numeric text alongside numbers.
spark.sql(
    "select distinct(delinq_2yrs) from loans_defaulters"
)

delinq_2yrs
1.0
271 monthly payme...
I bike to work on...
183xx
VISA and AMEX cre...
etc. and I feel t...
AZ
017xx
923xx
446xx


In [47]:
# Row count per delinq_2yrs value, most frequent first.
spark.sql(
    "select delinq_2yrs, count(*) as total from loans_defaulters group by delinq_2yrs order by total desc"
)

delinq_2yrs,total
0.0,1838878
1.0,281335
2.0,81285
3.0,29539
4.0,13179
5.0,6599
6.0,3717
7.0,2062
8.0,1223
9.0,818


In [48]:
# Explicit DDL-style schema: member_id is a string, every other column a float.
# Adjacent string literals are concatenated into one comma-separated string.
loans_defaulter_schema = (
    'member_id string,'
    'delinq_2yrs float,'
    'delinq_amnt float,'
    'pub_rec float,'
    'pub_rec_bankruptcies float,'
    'inq_last_6mths float,'
    'total_rec_late_fee float,'
    'mths_since_last_delinq float,'
    'mths_since_last_record float'
)

In [49]:
# Re-read the same raw CSV, this time applying the explicit schema instead of
# inferring types.
loans_defaulter_raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(loans_defaulter_schema)
    .load("/user/itv017244/lendingclubproject/raw/loans_defaulters_csv")
)

In [50]:
# Bare expression: display the explicitly-typed frame as the cell output.
loans_defaulter_raw_df

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
bfcec8da7b3f1fe83...,1.0,0.0,0.0,0.0,2.0,0.0,11.0,
36399458db4295868...,0.0,0.0,1.0,1.0,0.0,0.0,,63.0
e4912b650878ca941...,0.0,0.0,0.0,0.0,0.0,0.0,67.0,
3d6f35ad2e2be4572...,0.0,0.0,1.0,1.0,0.0,0.0,,66.0
478e1cd37e35c20f7...,1.0,0.0,1.0,1.0,0.0,0.0,15.0,108.0
a5d42e35cc0be2789...,0.0,0.0,1.0,1.0,1.0,0.0,,83.0
7f3ef55c784bf6b16...,0.0,0.0,0.0,0.0,0.0,0.0,41.0,
beb30abaeca08f49b...,2.0,0.0,0.0,0.0,0.0,0.0,14.0,
233e5e6797dd0a68b...,0.0,0.0,1.0,1.0,1.0,0.0,58.0,85.0
c4b178ffaf80ed472...,1.0,0.0,0.0,0.0,0.0,0.0,17.0,


In [51]:
# Point the "loans_defaulters" view at the explicitly-typed frame, replacing
# the earlier registration under the same name.
loans_defaulter_raw_df.createOrReplaceTempView("loans_defaulters")

In [52]:
# Same per-value count as before, now against the explicitly-typed view.
spark.sql(
    "select delinq_2yrs, count(*) as total from loans_defaulters group by delinq_2yrs order by total desc"
)

delinq_2yrs,total
0.0,1838878
1.0,281335
2.0,81285
3.0,29539
4.0,13179
5.0,6599
6.0,3717
7.0,2062
8.0,1223
9.0,818


In [53]:
from pyspark.sql.functions import col

In [54]:
# Convert delinq_2yrs from float to integer, then replace nulls in that
# column with 0.
delinq_2yrs_as_int = col("delinq_2yrs").cast("integer")
loans_def_processed_df = (
    loans_defaulter_raw_df
    .withColumn("delinq_2yrs", delinq_2yrs_as_int)
    .fillna(0, subset=["delinq_2yrs"])
)

In [55]:
# Point the "loans_defaulters" view at the frame with integer delinq_2yrs.
loans_def_processed_df.createOrReplaceTempView("loans_defaulters")

In [56]:
# Re-check the distribution after the integer cast and null fill.
spark.sql(
    "select delinq_2yrs, count(*) as total from loans_defaulters group by delinq_2yrs order by total desc"
)

delinq_2yrs,total
0,1839141
1,281337
2,81285
3,29545
4,13180
5,6601
6,3719
7,2063
8,1226
9,821


In [57]:
# Preview the first two rows of the current "loans_defaulters" view.
preview_df = spark.sql("select * from loans_defaulters")
preview_df.show(2)

+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
|           member_id|delinq_2yrs|delinq_amnt|pub_rec|pub_rec_bankruptcies|inq_last_6mths|total_rec_late_fee|mths_since_last_delinq|mths_since_last_record|
+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
|bfcec8da7b3f1fe83...|          1|        0.0|    0.0|                 0.0|           2.0|               0.0|                  11.0|                  null|
|36399458db4295868...|          0|        0.0|    1.0|                 1.0|           0.0|               0.0|                  null|                  63.0|
+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
only showing top 2 rows



In [58]:
# Members with a delinquency signal: at least one delinquency in delinq_2yrs
# or a positive mths_since_last_delinq.
# The integer-cast column is aliased explicitly so the name written to the
# CSV header and parquet schema does not rely on Spark's automatic naming of
# an unaliased expression.
loans_def_delinq_df = spark.sql(
    "select member_id, delinq_2yrs, delinq_amnt, "
    "int(mths_since_last_delinq) as mths_since_last_delinq "
    "from loans_defaulters "
    "where delinq_2yrs > 0 or mths_since_last_delinq > 0"
)

In [59]:
# Bare expression: display the delinquency subset as the cell output.
loans_def_delinq_df

member_id,delinq_2yrs,delinq_amnt,mths_since_last_delinq
bfcec8da7b3f1fe83...,1,0.0,11
e4912b650878ca941...,0,0.0,67
478e1cd37e35c20f7...,1,0.0,15
7f3ef55c784bf6b16...,0,0.0,41
beb30abaeca08f49b...,2,0.0,14
233e5e6797dd0a68b...,0,0.0,58
c4b178ffaf80ed472...,1,0.0,17
7f45445bc792b29c7...,0,0.0,38
f67ba01e72eb5f109...,0,0.0,74
47196375e82e575cb...,1,0.0,21


In [60]:
# Preview the first two rows of the view again before building the next subset.
preview_df = spark.sql("select * from loans_defaulters")
preview_df.show(2)

+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
|           member_id|delinq_2yrs|delinq_amnt|pub_rec|pub_rec_bankruptcies|inq_last_6mths|total_rec_late_fee|mths_since_last_delinq|mths_since_last_record|
+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
|bfcec8da7b3f1fe83...|          1|        0.0|    0.0|                 0.0|           2.0|               0.0|                  11.0|                  null|
|36399458db4295868...|          0|        0.0|    1.0|                 1.0|           0.0|               0.0|                  null|                  63.0|
+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
only showing top 2 rows



In [61]:
# Member ids with any public record, any public-record bankruptcy, or any
# enquiry in inq_last_6mths.
loans_def_records_enq_df = spark.sql(
    "select member_id from loans_defaulters where pub_rec>0.0 or pub_rec_bankruptcies>0.0 or inq_last_6mths>0.0 "
)

In [62]:
# Bare expression: display the records/enquiries member ids as the cell output.
loans_def_records_enq_df

member_id
bfcec8da7b3f1fe83...
36399458db4295868...
3d6f35ad2e2be4572...
478e1cd37e35c20f7...
a5d42e35cc0be2789...
233e5e6797dd0a68b...
07e81491adcc0e7c3...
412a63a98577abf03...
6f45755c48f1b4e98...
229e078da68df21b4...


In [63]:
# Write the delinquency subset as CSV with a header row, overwriting any
# existing output at the path.
# NOTE(review): this directory is spelled "deafulters" while the parquet
# output uses "defaulters" - confirm which spelling readers expect before
# renaming.
(
    loans_def_delinq_df.write
    .option("header", True)
    .format("csv")
    .mode("overwrite")
    .option("path", "/user/itv017244/lendingclubproject/cleaned/loan_deafulters_delinq_csv")
    .save()
)

In [64]:
# Write the delinquency subset as parquet, overwriting any existing output.
(
    loans_def_delinq_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv017244/lendingclubproject/cleaned/loan_defaulters_delinq_parquet")
    .save()
)

In [65]:
# Write the records/enquiries member ids as CSV with a header row,
# overwriting any existing output at the path.
# NOTE(review): this directory is spelled "deafulters" while the parquet
# output uses "defaulters" - confirm which spelling readers expect before
# renaming.
(
    loans_def_records_enq_df.write
    .option("header", True)
    .format("csv")
    .mode("overwrite")
    .option("path", "/user/itv017244/lendingclubproject/cleaned/loan_deafulters_records_enq_csv")
    .save()
)

In [66]:
# Write the records/enquiries member ids as parquet, overwriting any existing
# output.
(
    loans_def_records_enq_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv017244/lendingclubproject/cleaned/loan_defaulters_records_enq_parquet")
    .save()
)

In [67]:
# Convert pub_rec to integer and replace nulls in that column with 0.
pub_rec_as_int = col("pub_rec").cast("integer")
loans_def_p_pub_rec_df = (
    loans_def_processed_df
    .withColumn("pub_rec", pub_rec_as_int)
    .fillna(0, subset=["pub_rec"])
)

In [68]:
# Convert pub_rec_bankruptcies to integer and replace nulls in that column
# with 0.
pub_rec_bankruptcies_as_int = col("pub_rec_bankruptcies").cast("integer")
loans_def_p_pub_rec_bankruptcies_df = (
    loans_def_p_pub_rec_df
    .withColumn("pub_rec_bankruptcies", pub_rec_bankruptcies_as_int)
    .fillna(0, subset=["pub_rec_bankruptcies"])
)

In [69]:
# Convert inq_last_6mths to integer and replace nulls in that column with 0.
inq_last_6mths_as_int = col("inq_last_6mths").cast("integer")
loans_def_p_inq_last_6mths_df = (
    loans_def_p_pub_rec_bankruptcies_df
    .withColumn("inq_last_6mths", inq_last_6mths_as_int)
    .fillna(0, subset=["inq_last_6mths"])
)

In [70]:
# Point the "loans_defaulters" view at the frame whose pub_rec,
# pub_rec_bankruptcies and inq_last_6mths columns are now integers.
loans_def_p_inq_last_6mths_df.createOrReplaceTempView("loans_defaulters")

In [71]:
# Per-member public-record, bankruptcy and enquiry columns for every row of
# the view (no filter).
loans_def_detail_records_enq_df = spark.sql(
    "select member_id,pub_rec,pub_rec_bankruptcies,inq_last_6mths from loans_defaulters"
)

In [72]:
# Write the detail records/enquiries frame as CSV with a header row,
# overwriting any existing output at the path.
# NOTE(review): this directory is spelled "deafulters" while the parquet
# output uses "defaulters" - confirm which spelling readers expect before
# renaming.
(
    loans_def_detail_records_enq_df.write
    .option("header", True)
    .format("csv")
    .mode("overwrite")
    .option("path", "/user/itv017244/lendingclubproject/cleaned/loan_deafulters_detail_records_enq_csv")
    .save()
)

In [73]:
# Write the detail records/enquiries frame as parquet, overwriting any
# existing output.
(
    loans_def_detail_records_enq_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv017244/lendingclubproject/cleaned/loan_defaulters_detail_records_enq_parquet")
    .save()
)

In [None]:
# Shut down the Spark session once all outputs have been written.
spark.stop()