In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
    builder. \
    config('spark.ui.port','0'). \
    config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
    config('spark.shuffle.useOldFetchProtocol','true'). \
    enableHiveSupport(). \
    master('yarn'). \
    getOrCreate()


In [2]:
spark

In [3]:
loans_schema = ('loan_id string, member_id string, loan_amount float, funded_amount float, loan_term_month string, interest_rate float, monthly_installment float,issue_date string, loan_status string, loan_purpose string, loan_title string')

In [4]:
loans_raw_df = spark.read.format("csv").option("header",True).schema(loans_schema).load("/user/itv017244/lendingclubproject/raw/loans_data_csv")

In [5]:
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_month: string (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



In [6]:
from pyspark.sql.functions import current_timestamp

In [7]:
loans_df_ingestd=loans_raw_df.withColumn("ingest_date", current_timestamp())

In [8]:
loans_df_ingestd

loan_id,member_id,loan_amount,funded_amount,loan_term_month,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
14408468,5d7676571dee53d06...,5000.0,5000.0,36 months,14.16,171.28,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:07:...
14520567,31fd1ef036c5caf26...,11000.0,11000.0,36 months,9.67,353.24,Apr-2014,other,Other,,2025-03-18 12:07:...
14708730,9b48253ca5848fa78...,25000.0,25000.0,36 months,11.99,830.24,May-2014,credit_card,Credit card refin...,,2025-03-18 12:07:...
14491003,d6b8f2e32be148721...,1500.0,1500.0,36 months,23.43,58.41,Apr-2014,renewable_energy,Green loan,,2025-03-18 12:07:...
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,36 months,12.99,336.9,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:07:...
14177845,843b995d39160ddab...,13200.0,13200.0,60 months,12.99,300.28,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:07:...
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,60 months,17.57,251.61,Apr-2014,other,Other,,2025-03-18 12:07:...
12905731,3e9a89fa96988af68...,18000.0,18000.0,36 months,18.25,653.01,May-2014,home_improvement,Home improvement,,2025-03-18 12:07:...
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,36 months,9.67,385.35,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:07:...
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,36 months,20.49,119.73,Apr-2014,credit_card,Credit card refin...,,2025-03-18 12:07:...


In [9]:
loans_df_ingestd.createOrReplaceTempView("loans")

In [10]:
spark.sql("select count(*) from loans")

count(1)
2260701


In [11]:
spark.sql("select count(*) from loans where loan_amount is null")

count(1)
33


In [12]:
columns_to_check = ["loan_amount","funded_amount","loan_term_month","interest_rate","monthly_installment","issue_date","loan_status","loan_purpose"]

In [13]:
loans_filtered_df = loans_df_ingestd.na.drop(subset=columns_to_check)

In [14]:
loans_filtered_df.count()

2237341

In [15]:
loans_filtered_df

loan_id,member_id,loan_amount,funded_amount,loan_term_month,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
14408468,5d7676571dee53d06...,5000.0,5000.0,36 months,14.16,171.28,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:08:...
14520567,31fd1ef036c5caf26...,11000.0,11000.0,36 months,9.67,353.24,Apr-2014,other,Other,,2025-03-18 12:08:...
14708730,9b48253ca5848fa78...,25000.0,25000.0,36 months,11.99,830.24,May-2014,credit_card,Credit card refin...,,2025-03-18 12:08:...
14491003,d6b8f2e32be148721...,1500.0,1500.0,36 months,23.43,58.41,Apr-2014,renewable_energy,Green loan,,2025-03-18 12:08:...
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,36 months,12.99,336.9,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:08:...
14177845,843b995d39160ddab...,13200.0,13200.0,60 months,12.99,300.28,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:08:...
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,60 months,17.57,251.61,Apr-2014,other,Other,,2025-03-18 12:08:...
12905731,3e9a89fa96988af68...,18000.0,18000.0,36 months,18.25,653.01,May-2014,home_improvement,Home improvement,,2025-03-18 12:08:...
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,36 months,9.67,385.35,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:08:...
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,36 months,20.49,119.73,Apr-2014,credit_card,Credit card refin...,,2025-03-18 12:08:...


In [16]:
loans_filtered_df.createOrReplaceTempView("loans")

In [17]:
from pyspark.sql.functions import regexp_replace, col

In [18]:
loans_term_modified_df = loans_filtered_df \
.withColumn("loan_term_month",(regexp_replace(col("loan_term_month")," months", "") \
.cast("int")/12) \
.cast("int")) \
.withColumnRenamed("loan_term_month","loan_term_year")

In [19]:
loans_term_modified_df

loan_id,member_id,loan_amount,funded_amount,loan_term_year,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
14408468,5d7676571dee53d06...,5000.0,5000.0,3,14.16,171.28,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:08:...
14520567,31fd1ef036c5caf26...,11000.0,11000.0,3,9.67,353.24,Apr-2014,other,Other,,2025-03-18 12:08:...
14708730,9b48253ca5848fa78...,25000.0,25000.0,3,11.99,830.24,May-2014,credit_card,Credit card refin...,,2025-03-18 12:08:...
14491003,d6b8f2e32be148721...,1500.0,1500.0,3,23.43,58.41,Apr-2014,renewable_energy,Green loan,,2025-03-18 12:08:...
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,3,12.99,336.9,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:08:...
14177845,843b995d39160ddab...,13200.0,13200.0,5,12.99,300.28,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:08:...
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,5,17.57,251.61,Apr-2014,other,Other,,2025-03-18 12:08:...
12905731,3e9a89fa96988af68...,18000.0,18000.0,3,18.25,653.01,May-2014,home_improvement,Home improvement,,2025-03-18 12:08:...
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,3,9.67,385.35,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:08:...
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,3,20.49,119.73,Apr-2014,credit_card,Credit card refin...,,2025-03-18 12:08:...


In [20]:
loans_term_modified_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_year: integer (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [21]:
loans_term_modified_df.createOrReplaceTempView("loans")

In [22]:
spark.sql("select * from loans")

loan_id,member_id,loan_amount,funded_amount,loan_term_year,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
14408468,5d7676571dee53d06...,5000.0,5000.0,3,14.16,171.28,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:08:...
14520567,31fd1ef036c5caf26...,11000.0,11000.0,3,9.67,353.24,Apr-2014,other,Other,,2025-03-18 12:08:...
14708730,9b48253ca5848fa78...,25000.0,25000.0,3,11.99,830.24,May-2014,credit_card,Credit card refin...,,2025-03-18 12:08:...
14491003,d6b8f2e32be148721...,1500.0,1500.0,3,23.43,58.41,Apr-2014,renewable_energy,Green loan,,2025-03-18 12:08:...
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,3,12.99,336.9,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:08:...
14177845,843b995d39160ddab...,13200.0,13200.0,5,12.99,300.28,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:08:...
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,5,17.57,251.61,Apr-2014,other,Other,,2025-03-18 12:08:...
12905731,3e9a89fa96988af68...,18000.0,18000.0,3,18.25,653.01,May-2014,home_improvement,Home improvement,,2025-03-18 12:08:...
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,3,9.67,385.35,Apr-2014,debt_consolidation,Debt consolidation,,2025-03-18 12:08:...
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,3,20.49,119.73,Apr-2014,credit_card,Credit card refin...,,2025-03-18 12:08:...


In [23]:
spark.sql("select distinct loan_purpose from loans")

loan_purpose
Debit Free
MY LOAN
My consolidation ...
payoff current lo...
KickDebt
Financial Needs
Business vehicle ...
Future Saver
travel
Debt consolidatio...


In [24]:
spark.sql("select loan_purpose, count(*) as total_desc from (select lower(upper(loan_purpose)) as loan_purpose from loans) group by loan_purpose order by total_desc desc ").show()

+--------------------+----------+
|        loan_purpose|total_desc|
+--------------------+----------+
|  debt consolidation|   1175801|
|credit card refin...|    470107|
|    home improvement|    140185|
|               other|    127812|
|      major purchase|     45153|
|    medical expenses|     25502|
|            business|     20922|
|       car financing|     20553|
|            vacation|     14652|
|moving and reloca...|     13809|
|         home buying|     12733|
|       consolidation|      8142|
|debt consolidatio...|      4823|
|credit card conso...|      3660|
|       personal loan|      3414|
|  consolidation loan|      2689|
|  credit card payoff|      2573|
|credit card refin...|      2539|
|         consolidate|      2190|
|            personal|      2151|
+--------------------+----------+
only showing top 20 rows



In [25]:
loan_purpose_lookup = ["debt consolidation","credit card","home improvement",
                       "other","major purchase","medical","small_business","car",
                       "vacation","moving","house","wedding","renewable_energy","educational"]

In [26]:
from pyspark.sql.functions import when

In [27]:
loans_purpose_modified = loans_term_modified_df.withColumn("loan_purpose",when(col("loan_purpose").isin(loan_purpose_lookup),col("loan_purpose")).otherwise("other"))

In [28]:
loans_purpose_modified

loan_id,member_id,loan_amount,funded_amount,loan_term_year,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
14408468,5d7676571dee53d06...,5000.0,5000.0,3,14.16,171.28,Apr-2014,debt_consolidation,other,,2025-03-18 12:08:...
14520567,31fd1ef036c5caf26...,11000.0,11000.0,3,9.67,353.24,Apr-2014,other,other,,2025-03-18 12:08:...
14708730,9b48253ca5848fa78...,25000.0,25000.0,3,11.99,830.24,May-2014,credit_card,other,,2025-03-18 12:08:...
14491003,d6b8f2e32be148721...,1500.0,1500.0,3,23.43,58.41,Apr-2014,renewable_energy,other,,2025-03-18 12:08:...
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,3,12.99,336.9,Apr-2014,debt_consolidation,other,,2025-03-18 12:08:...
14177845,843b995d39160ddab...,13200.0,13200.0,5,12.99,300.28,Apr-2014,debt_consolidation,other,,2025-03-18 12:08:...
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,5,17.57,251.61,Apr-2014,other,other,,2025-03-18 12:08:...
12905731,3e9a89fa96988af68...,18000.0,18000.0,3,18.25,653.01,May-2014,home_improvement,other,,2025-03-18 12:08:...
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,3,9.67,385.35,Apr-2014,debt_consolidation,other,,2025-03-18 12:08:...
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,3,20.49,119.73,Apr-2014,credit_card,other,,2025-03-18 12:08:...


In [29]:
loans_purpose_modified.createOrReplaceTempView("loans")

In [30]:
spark.sql("select loan_purpose, count(*) as total_desc from (select lower(upper(loan_purpose)) as loan_purpose from loans) group by loan_purpose order by total_desc desc ").show()

+------------------+----------+
|      loan_purpose|total_desc|
+------------------+----------+
|             other|   2230382|
|debt consolidation|      5085|
|  home improvement|       717|
|       credit card|       401|
|          vacation|       177|
|           wedding|       161|
|           medical|        98|
|    major purchase|        90|
|               car|        80|
|            moving|        58|
|             house|        52|
|    small_business|        37|
|       educational|         2|
|  renewable_energy|         1|
+------------------+----------+



In [33]:
from pyspark.sql.functions import count

In [35]:
loans_purpose_modified \
.groupBy("loan_purpose") \
.agg(count("*") \
.alias("total")) \
.orderBy(col("total").desc())

loan_purpose,total
other,2230382
debt consolidation,5085
home improvement,717
credit card,401
vacation,177
wedding,161
medical,98
major purchase,90
car,80
moving,58


In [36]:
loans_purpose_modified

loan_id,member_id,loan_amount,funded_amount,loan_term_year,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
14408468,5d7676571dee53d06...,5000.0,5000.0,3,14.16,171.28,Apr-2014,debt_consolidation,other,,2025-03-18 12:17:...
14520567,31fd1ef036c5caf26...,11000.0,11000.0,3,9.67,353.24,Apr-2014,other,other,,2025-03-18 12:17:...
14708730,9b48253ca5848fa78...,25000.0,25000.0,3,11.99,830.24,May-2014,credit_card,other,,2025-03-18 12:17:...
14491003,d6b8f2e32be148721...,1500.0,1500.0,3,23.43,58.41,Apr-2014,renewable_energy,other,,2025-03-18 12:17:...
14510981,0a81ad556e20bcdb9...,10000.0,10000.0,3,12.99,336.9,Apr-2014,debt_consolidation,other,,2025-03-18 12:17:...
14177845,843b995d39160ddab...,13200.0,13200.0,5,12.99,300.28,Apr-2014,debt_consolidation,other,,2025-03-18 12:17:...
13947687,04a8bd7c4e1407ba8...,10000.0,10000.0,5,17.57,251.61,Apr-2014,other,other,,2025-03-18 12:17:...
12905731,3e9a89fa96988af68...,18000.0,18000.0,3,18.25,653.01,May-2014,home_improvement,other,,2025-03-18 12:17:...
14137736,394a8db0bd3bcf7de...,12000.0,12000.0,3,9.67,385.35,Apr-2014,debt_consolidation,other,,2025-03-18 12:17:...
13207083,09cea8d2b6f1272f7...,3200.0,3200.0,3,20.49,119.73,Apr-2014,credit_card,other,,2025-03-18 12:17:...


In [38]:
loans_purpose_modified.write \
.option("header",True) \
.format("csv") \
.mode("overwrite") \
.option("path","/user/itv017244/lendingclubproject/cleaned/loans_csv").save()

In [39]:
loans_purpose_modified.write \
.option("header",True) \
.format("parquet") \
.mode("overwrite") \
.option("path","/user/itv017244/lendingclubproject/cleaned/loans_parquet").save()

In [None]:
spark.stop()