In [0]:
from pyspark.sql.functions import *

df = spark.read.format("delta").load("/FileStore/lending/silver/delta")

In [0]:
# Q1 - Loan Approval Rate
total = df.count()
approved = df.filter(col("loan_status") == "Fully Paid").count()
print(f"Q1 - Approval Rate: {approved/total:.2%}")


Q1 - Approval Rate: 47.63%


In [0]:
# Q2 - Average loan amount by grade
df.groupBy("grade").agg(avg("loan_amnt").alias("AvgLoan")).orderBy("grade").show()

+-----+------------------+
|grade|           AvgLoan|
+-----+------------------+
|    A|14603.343209545825|
|    B| 14173.33819852703|
|    C|15038.083317821778|
|    D| 15711.98300680591|
|    E|17453.078391907933|
|    F| 19124.64653110048|
|    G|20383.988740959896|
+-----+------------------+



In [0]:
# Q3 - Charged off vs Fully paid
df.groupBy("loan_status").count().filter(col("loan_status").isin("Charged Off", "Fully Paid")).show()

+-----------+-------+
|loan_status|  count|
+-----------+-------+
| Fully Paid|1076751|
|Charged Off| 268558|
+-----------+-------+



In [0]:
# Q4 - Loan purpose distribution
df.groupBy("purpose").count().orderBy(desc("count")).show()

+--------------------+-------+
|             purpose|  count|
+--------------------+-------+
|  debt_consolidation|1277790|
|         credit_card| 516926|
|    home_improvement| 150440|
|               other| 139413|
|      major_purchase|  50429|
|             medical|  27481|
|      small_business|  24659|
|                 car|  24009|
|            vacation|  15525|
|              moving|  15402|
|               house|  14131|
|             wedding|   2351|
|    renewable_energy|   1445|
|         educational|    412|
| I have eliminate...|      1|
| my cell is on a ...|      1|
| MD""... approx. ...|      1|
| and have never h...|      1|
| which was a swin...|      1|
| usually doubling...|      1|
+--------------------+-------+
only showing top 20 rows



In [0]:
# Q5 - Unique states
print("Q5 - Unique States:", df.select("addr_state").distinct().count())

Q5 - Unique States: 271


In [0]:
# Q6 - Average interest rate by sub-grade
df.groupBy("sub_grade").agg(avg("int_rate").alias("AvgInterest")).orderBy("sub_grade").show()

+---------+------------------+
|sub_grade|       AvgInterest|
+---------+------------------+
|       A1| 5.600265353152358|
|       A2| 6.552336764326663|
|       A3| 7.094534597727951|
|       A4|  7.56023906377143|
|       A5| 8.195006086398775|
|       B1|  9.07855442353105|
|       B2| 9.974970186619458|
|       B3| 10.70501452316554|
|       B4|11.372777749955029|
|       B5|12.013544779307304|
|       C1|12.783782992806444|
|       C2|13.537631867967175|
|       C3|14.104219423653143|
|       C4| 14.87805884435167|
|       C5|15.768242893616625|
|       D1|  16.6581692689528|
|       D2|17.599961179165266|
|       D3| 18.38845569971858|
|       D4| 19.07287050056509|
|       D5|20.063590779420135|
+---------+------------------+
only showing top 20 rows



In [0]:
# Q7 - Correlation between loan amount and interest rate
print("Q7 - Correlation:", df.stat.corr("loan_amnt", "int_rate"))

Q7 - Correlation: 0.09808178013492619


In [0]:

# Q8 - Average annual income
df.select(avg("annual_inc").alias("AvgAnnualIncome")).show()

+-----------------+
|  AvgAnnualIncome|
+-----------------+
|77992.44637764242|
+-----------------+



In [0]:

# Q9 - Default rate by employment length
df.withColumn("default", when(col("loan_status") == "Charged Off", 1).otherwise(0))\
  .groupBy("emp_length").agg(avg("default").alias("DefaultRate")).orderBy("emp_length").show()

+----------+-------------------+
|emp_length|        DefaultRate|
+----------+-------------------+
|      NULL|0.12674660364023005|
|         2|0.11838900999626858|
|         3|0.11886939635856666|
|         4|0.11639398265070824|
|         5|0.11807613566407536|
|         6|0.11829130451728573|
|         7| 0.1253573547656292|
|         8|0.13164479839850296|
|         9|0.12767806536935575|
|        10|0.11103802782066965|
+----------+-------------------+



In [0]:

# Q10 - Home ownership distribution
df.groupBy("home_ownership").count().orderBy(desc("count")).show()

+--------------+-------+
|home_ownership|  count|
+--------------+-------+
|      MORTGAGE|1111449|
|          RENT| 894929|
|           OWN| 253057|
|           ANY|    996|
|         OTHER|    182|
|          NONE|     54|
|       2 years|      1|
+--------------+-------+



In [0]:
# Q11 - Average debt-to-income ratio by grade
df.groupBy("grade").agg(avg("dti").alias("AvgDTI")).orderBy("grade").show()

+-----+------------------+
|grade|            AvgDTI|
+-----+------------------+
|    A|16.239339044863325|
|    B|17.967428521872705|
|    C|19.552627592025633|
|    D|20.930404344941337|
|    E|21.550460016086568|
|    F| 21.67774207451392|
|    G|22.434566935616655|
+-----+------------------+



In [0]:
# Q12 - Default rate by term (36 vs 60 months)
df.withColumn("default", when(col("loan_status") == "Charged Off", 1).otherwise(0))\
  .groupBy("term").agg(avg("default").alias("DefaultRate")).show()


+----------+-------------------+
|      term|        DefaultRate|
+----------+-------------------+
| 36 months| 0.1014142533579665|
| 60 months|0.16178174075223456|
+----------+-------------------+



In [0]:
# Q13 - Grade with highest loss given default
df.filter(col("loan_status") == "Charged Off")\
  .groupBy("grade").agg(avg("loan_amnt").alias("AvgLoss")).orderBy(desc("AvgLoss")).show(1)

+-----+-----------------+
|grade|          AvgLoss|
+-----+-----------------+
|    G|20495.47149122807|
+-----+-----------------+
only showing top 1 row



In [0]:
# Q14 - Credit history length and approval
df.withColumn("credit_history_length", datediff(current_date(), col("earliest_cr_line")) / 365.25)\
  .groupBy("loan_status").agg(avg("credit_history_length").alias("AvgCreditHistoryLength")).show()

+--------------------+----------------------+
|         loan_status|AvgCreditHistoryLength|
+--------------------+----------------------+
|          Fully Paid|                  NULL|
|             Default|                  NULL|
|     In Grace Period|                  NULL|
|Does not meet the...|                  NULL|
|         Charged Off|    15.485284052019164|
|  Late (31-120 days)|                  NULL|
|             Current|                  NULL|
|Does not meet the...|                  NULL|
|   Late (16-30 days)|                  NULL|
|            Oct-2015|                  NULL|
+--------------------+----------------------+



In [0]:
# Q15 - Most common loan purpose in high-risk loans
df.filter(col("loan_status") == "Charged Off")\
  .groupBy("purpose").count().orderBy(desc("count")).show(1)

+------------------+------+
|           purpose| count|
+------------------+------+
|debt_consolidation|165005|
+------------------+------+
only showing top 1 row



In [0]:
# Q16 - Interest rate by purpose and term
df.groupBy("purpose", "term").agg(avg("int_rate").alias("AvgInterest")).orderBy("purpose").show()

+--------------------+----------+-----------+
|             purpose|      term|AvgInterest|
+--------------------+----------+-----------+
|                NULL| 60 months|      10.99|
|      After gradu...| 60 months|      17.56|
|  (Citi Bank) whi...| 36 months|      14.96|
|  Hilal Khalil Ho...| 36 months|      11.54|
|  I have requeste...| 36 months|      13.85|
|  I realize that ...| 36 months|      13.79|
|  I'll work on th...| 36 months|       8.94|
|  and another one...| 36 months|      15.65|
|  but a bit too m...| 36 months|       9.45|
| 20 foot Yamaha s...| 36 months|        8.0|
| 5 years).  I req...| 36 months|       7.51|
|                 768| 36 months|       7.88|
| BUT I LOVE THE C...| 36 months|      19.41|
| Bank of America ...| 36 months|      16.32|
| Butler PA. It is...| 36 months|       9.32|
| CA for 13 years....| 36 months|       9.38|
| CB Radio or Poli...| 36 months|      14.42|
|         Credit Card| 36 months|       9.45|
| I accidentally c...| 36 months| 

In [0]:
# Q17 - Trend in number of loans over the years
df.withColumn("year", substring(col("issue_d"), -4, 4))\
  .groupBy("year").count().orderBy("year").show()

+----+------+
|year| count|
+----+------+
|2007|   603|
|2008|  2393|
|2009|  5281|
|2010| 12537|
|2011| 21721|
|2012| 53367|
|2013|134814|
|2014|235629|
|2015|421094|
|2016|434407|
|2017|443579|
|2018|495242|
|fied|     1|
+----+------+



In [0]:
# Q18 - Loans funded but not accepted (use proxy if available)
df.filter(col("loan_status").like("Does not meet%")).count()

2749

In [0]:
# Q19 - Average open credit lines
df.select(avg("open_acc").alias("AvgOpenCredit")).show()


+------------------+
|     AvgOpenCredit|
+------------------+
|11.624544599950363|
+------------------+



In [0]:
# Q20 - Top 5 employment titles among defaulters
df.filter(col("loan_status") == "Charged Off")\
  .groupBy("emp_title").count().orderBy(desc("count")).show(5)

+---------+-----+
|emp_title|count|
+---------+-----+
|     NULL|22461|
|  Manager| 4090|
|  Teacher| 3950|
|    Owner| 2696|
|   Driver| 2116|
+---------+-----+
only showing top 5 rows



In [0]:
# Q21 - Average revolving balance
df.select(avg("revol_bal").alias("AvgRevolvingBalance")).show()

+-------------------+
|AvgRevolvingBalance|
+-------------------+
|  16657.35474391925|
+-------------------+



In [0]:
# Q22 - Borrowers with no delinquencies
df.filter(col("delinq_2yrs") == 0).count()

1838880

In [0]:
# Q23 - Effect of verification status on loan defaults
df.withColumn("default", when(col("loan_status") == "Charged Off", 1).otherwise(0))\
  .groupBy("verification_status").agg(avg("default").alias("DefaultRate")).show()

+-------------------+-------------------+
|verification_status|        DefaultRate|
+-------------------+-------------------+
|           Verified|  0.158492831515602|
|    Source Verified|0.12325355720298342|
|       Not Verified|0.07993356659317997|
|            38000.0|                0.0|
+-------------------+-------------------+



In [0]:
# Q24 - Charge-off rate by loan amount buckets
df.withColumn("bucket", (floor(col("loan_amnt")/5000)*5000))\
  .withColumn("default", when(col("loan_status") == "Charged Off", 1).otherwise(0))\
  .groupBy("bucket").agg(avg("default").alias("ChargeOffRate")).orderBy("bucket").show()

+------+-------------------+
|bucket|      ChargeOffRate|
+------+-------------------+
|     0|0.09602713810424686|
|  5000|0.10861329485018821|
| 10000| 0.1211712959624748|
| 15000|0.13309742080849735|
| 20000| 0.1309787315946492|
| 25000|0.12498709664099758|
| 30000|0.12193161748655568|
| 35000|0.13302418900797428|
| 40000| 0.0422260848717334|
+------+-------------------+



In [0]:
# Q25 - Average funded amount vs requested
df.select(avg("funded_amnt"), avg("loan_amnt")).show()

+------------------+------------------+
|  avg(funded_amnt)|    avg(loan_amnt)|
+------------------+------------------+
|15041.664056818605|15046.931227849467|
+------------------+------------------+



In [0]:
# Q26 - Proportion of jointly owned loans
df.groupBy("application_type").count().show()

+-----------------+------+
| application_type| count|
+-----------------+------+
|         Oct-2016|     1|
|              1.0|    57|
|            675.0|     2|
|         Jan-2012|     1|
|         May-2010|     1|
|         Jul-2010|     1|
|        Joint App|120710|
|         Jun-2011|     1|
|            710.0|     2|
|            660.0|     1|
|8541.663712072801|     1|
|         Feb-2011|     1|
|            645.0|     1|
|            509.0|     1|
|         May-2011|     1|
|             25.0|     1|
|         14153.43|     1|
|7393.367160475102|     1|
|             NULL|    55|
|                f|     1|
+-----------------+------+
only showing top 20 rows



In [0]:
# Q27 - Inquiries in last 6 months vs default
df.withColumn("default", when(col("loan_status") == "Charged Off", 1).otherwise(0))\
  .groupBy("inq_last_6mths").agg(avg("default").alias("DefaultRate")).orderBy("inq_last_6mths").show()

+--------------------+-------------------+
|      inq_last_6mths|        DefaultRate|
+--------------------+-------------------+
|                NULL|0.03225806451612903|
|  I am currently ...|                0.0|
| $300.00 each.  P...|                0.0|
| AND QUIETLY.<br/...|                1.0|
| I have CreditSec...|                0.0|
| I need the reduc...|                0.0|
| I opted to partn...|                0.0|
| I will be able t...|                0.0|
| Marketing Materials|                0.0|
| and plenty of bu...|                0.0|
| and therefore re...|                0.0|
|                auto|                0.0|
| cancelling these...|                0.0|
| community and gr...|                0.0|
| household expens...|                0.0|
| however the rate...|                0.0|
| nearly all inves...|                1.0|
| or in full upon ...|                1.0|
|                rent|                0.0|
| so we know exact...|                0.0|
+----------

In [0]:
# Q28 - Common purposes for rejected loans
df_rejected = df.filter(col("loan_status") == "Rejected")
display(
    df_rejected.groupBy("purpose")
               .count()
               .orderBy("count", ascending=False)
)

purpose,count


In [0]:
# Q29 - Average installment by loan term
df.groupBy("term").agg(avg("installment").alias("AvgInstallment")).show()

+----------+-----------------+
|      term|   AvgInstallment|
+----------+-----------------+
| 36 months|422.1533858216925|
| 60 months|504.3033599369498|
+----------+-----------------+



In [0]:
# Q30 - Top state in terms of loan origination

from pyspark.sql.functions import count

display(
    df.groupBy("addr_state")
      .agg(count("*").alias("LoanCount"))
      .orderBy("LoanCount", ascending=False)
      .limit(1)
)

addr_state,LoanCount
CA,314481


In [0]:
# Q31 - Average income-to-loan ratio
from pyspark.sql.functions import col, avg

display(
    df.withColumn("income_to_loan_ratio", col("annual_inc") / col("loan_amnt"))
      .agg(avg("income_to_loan_ratio").alias("AvgIncomeToLoanRatio"))
)

AvgIncomeToLoanRatio
7.197082543862327


In [0]:
# Q32 - Default distribution among top 10 zip codes

from pyspark.sql.functions import count

top_zip_codes = (
    df.groupBy("zip_code")
      .agg(count("*").alias("LoanCount"))
      .orderBy("LoanCount", ascending=False)
      .limit(10)
      .select("zip_code")
)

default_distribution = (
    df.join(top_zip_codes, on="zip_code", how="inner")
      .groupBy("zip_code", "loan_status")
      .agg(count("*").alias("Count"))
      .orderBy("zip_code", "loan_status")
)

display(default_distribution)

zip_code,loan_status,Count
070xx,Charged Off,2247
070xx,Current,7241
070xx,Does not meet the credit policy. Status:Charged Off,3
070xx,Does not meet the credit policy. Status:Fully Paid,27
070xx,Fully Paid,8509
070xx,In Grace Period,85
070xx,Late (16-30 days),35
070xx,Late (31-120 days),168
100xx,Charged Off,1870
100xx,Current,6157


In [0]:
# Q33 - Loans with 0 credit lines in last 2 years (proxy using `pub_rec`)
loans_with_zero_credit_lines = df.filter(col("pub_rec") == 0)
display(loans_with_zero_credit_lines)

id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
491699,,7000.0,7000.0,6975.0,36 months,12.73,234.97,C,C1,Crothall Services Group,3.0,RENT,28685.0,Not Verified,Mar-2010,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=491699,"Borrower added on 03/05/10 > Straight to the point... Citibank increased my APR from 12.99% to 29.99% for no reason.  Borrower added on 03/05/10 > This is loan going to serve as a both a consolidation for 2 high rate credit cards and a refinance. As you see in my credit report, I've never been late or missed a payment; and I will continue to value that status. Securing this loan will just make my life easier and save me a lot of interest. Thank you",debt_consolidation,Noelle's debt consol.,104xx,NY,22.72,0.0,Mar-2005,695.0,699.0,2.0,,,13.0,0.0,15489.0,64.0,14.0,f,0.0,0.0,8444.942493933,8414.78,7000.0,1444.94,0.0,0.0,0.0,Dec-2012,954.65,,Dec-2014,724.0,720.0,0.0,,1.0,Individual,,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
491685,,15000.0,15000.0,14975.0,36 months,10.62,488.4,B,B3,PharMerica,,RENT,111000.0,Not Verified,Mar-2010,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=491685,,other,Education,757xx,TX,21.95,0.0,Aug-1994,730.0,734.0,1.0,,,19.0,0.0,31678.0,46.9,57.0,f,0.0,0.0,17583.584829786698,17554.28,15000.0,2583.58,0.0,0.0,0.0,Mar-2013,517.23,,Mar-2019,769.0,765.0,0.0,,1.0,Individual,,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
491667,,6400.0,6400.0,6375.0,36 months,7.88,200.2,A,A5,Robert Half International Inc.,3.0,RENT,38000.0,Not Verified,Mar-2010,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=491667,,debt_consolidation,Make money easily,956xx,CA,17.94,0.0,Aug-1989,725.0,729.0,0.0,33.0,,7.0,0.0,9443.0,40.7,12.0,f,0.0,0.0,7207.5090884128,7179.35,6400.0,807.51,0.0,0.0,0.0,Mar-2013,209.97,,Mar-2013,689.0,685.0,0.0,,1.0,Individual,,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
491160,,4000.0,4000.0,4000.0,36 months,14.59,137.86,D,D1,walmart,2.0,RENT,17000.0,Not Verified,Mar-2010,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=491160,,wedding,wedding expenses,130xx,NY,5.08,0.0,Sep-2006,680.0,684.0,0.0,,,5.0,0.0,1889.0,78.7,5.0,f,0.0,0.0,4963.3064239472,4963.31,4000.0,963.31,0.0,0.0,0.0,Mar-2013,144.87,,Mar-2013,714.0,710.0,0.0,,1.0,Individual,,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
491675,,20000.0,20000.0,19950.0,36 months,13.85,682.08,C,C4,Delloite,10.0,MORTGAGE,500000.0,Not Verified,Mar-2010,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=491675,,other,Short Term Tax Loan,300xx,GA,8.17,0.0,Sep-1991,700.0,704.0,2.0,,,14.0,0.0,59777.0,86.3,47.0,f,0.0,0.0,22886.9924287214,22829.78,20000.0,2887.0,0.0,0.0,0.0,Jul-2011,16.27,,Jul-2011,719.0,715.0,0.0,,1.0,Individual,,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
491668,,6000.0,6000.0,6000.0,36 months,11.36,197.47,B,B5,Hightower Investment group,3.0,OWN,65000.0,Not Verified,Mar-2010,Charged Off,n,https://lendingclub.com/browse/loanDetail.action?loan_id=491668,,debt_consolidation,GREAT BORROWER --DEBT CONSOLIDATION,100xx,NY,6.02,0.0,Jun-2000,700.0,704.0,0.0,,,7.0,0.0,6370.0,92.3,14.0,f,0.0,0.0,2957.37,2957.37,1773.57,591.33,0.0,592.47,105.6006,Mar-2011,394.94,,Oct-2016,709.0,705.0,0.0,,1.0,Individual,,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
491663,,5500.0,5500.0,5500.0,36 months,11.36,181.02,B,B5,"double barrel environmental services, in",2.0,MORTGAGE,48000.0,Not Verified,Mar-2010,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=491663,"Borrower added on 03/07/10 > I have had this goal of paying off my credit cards for about 5 years now, it has been hard keeping up with the increased interest rates. I'm almost there, hopefully by this October I can reach my goal. I am a single mother with 3 teenagers, and I'm trying to payoff all these cards to save aggressivly for my first college bound teen. She has dreams on attending either USC or WSU, I need to help her with this goal, and it is my dream to see my children succeed. The credit cards I hope to pay off with this consolidation are: 1)Chase -$3,417.75 @ 31.99% 2)Chase- $1,619.81 @ 29.99% 3)Bank of America $1078.43 @ 25.74% At Double Barrel Environmental I have many responsibilites. I was hired with them as the Office Manager charged with all the administrative functions, such as payroll, billing, compliance, vehicle maintenance, and all HR functions. Recently I have been tasked with building a data base system, and also with outside sales and New Business Accounts.",credit_card,Road to Success,933xx,CA,11.95,0.0,Apr-1999,690.0,694.0,0.0,,,6.0,0.0,5090.0,74.9,31.0,f,0.0,0.0,5735.4,5735.4,5500.0,235.4,0.0,0.0,0.0,Sep-2010,2147.02,,Mar-2019,709.0,705.0,0.0,,1.0,Individual,,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
491632,,10000.0,10000.0,9996.622267112662,36 months,15.7,350.11,D,D4,e-Dialog,3.0,MORTGAGE,68500.0,Not Verified,Mar-2010,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=491632,Borrower added on 03/05/10 > I recently paid off most of my debt with my tax return but want to continue to be debt free in 2010. With this loan I can consolidate what is left into one bill and pay off within 3 years. Thanks for your consideration.,credit_card,Finish Paying off Dept in 2010,021xx,MA,19.15,0.0,Aug-1998,660.0,664.0,1.0,77.0,,21.0,0.0,16171.0,52.5,48.0,f,0.0,0.0,11466.3124714841,11461.09,10000.0,1466.31,0.0,0.0,0.0,Apr-2011,7274.73,,Mar-2019,694.0,690.0,0.0,,1.0,Individual,,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
491618,,25000.0,25000.0,24946.587588819868,36 months,15.33,870.71,D,D3,City of Hialeah,10.0,MORTGAGE,80000.0,Not Verified,Mar-2010,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=491618,"Borrower added on 03/16/10 > I have just over $25,000 in credit card debt. Once I get funded completely I will pay off all the cards. It will take me the next 3 years to pay off this loan. Once paid off they are all gone. With your help I will increace my credit score even higher. I have never been late making payments. In my job it is very important to pay all your bills. I will be retiring from my department in two years and applying for another Police job. They due an intensive background check when you apply and credit is one of the most important things they check. So if want another police job I have to get rid of all these debts. My score is good and no deliquents ever, just want to be a top canidate in these very competivie times. I do have a substatnial retirements saved up, but due to tax penalties I cannot touch any of it until I am 59 and half. That is 13 years from now. I assure you I am a good investment and thank you all for investing in me I won't let you down.  Borrower added on 03/17/10 > Time is running out, I don't want to come back in 6 months to get the rest. I have never been late with any payments. You will make a good smart choice to invest in me. Together we can help each other in these difficult times. Thank you all for helping me. I only have 1 day and twenty hours left to be fully funded, I won't be able to pay off all my accounts until I reach $25,000.",debt_consolidation,Pay off my high interest credit cards.,331xx,FL,22.57,0.0,Nov-1985,700.0,704.0,0.0,,,7.0,0.0,14918.0,89.3,36.0,f,0.0,0.0,27085.737419132,27026.34,24999.99,2085.75,0.0,0.0,0.0,Oct-2010,21866.75,,Nov-2012,694.0,690.0,0.0,,1.0,Individual,,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
491622,,25000.0,25000.0,24975.0,36 months,16.07,879.85,D,D5,AARP Services,4.0,MORTGAGE,120000.0,Verified,Mar-2010,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=491622,"Borrower added on 03/06/10 > My loan request is to pay off my revolving credit card debt. Please note that there is a 60K Home Equity Line of credit in my overall debt. I am not attempting to pay off my Home Equity Line, just the credit card debt. Thanks for your consideration.",debt_consolidation,Debt Pay Off,201xx,VA,21.05,0.0,Dec-1993,675.0,679.0,1.0,,,16.0,0.0,92764.0,71.9,44.0,f,0.0,0.0,38079.8091525031,38041.73,25000.0,13079.81,0.0,0.0,0.0,Feb-2015,7912.97,,Jul-2018,754.0,750.0,0.0,,1.0,Individual,,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [0]:
# Q34 - Employment length distribution in defaulted loans
employment_length_distribution = (
    df.filter(col("loan_status") == "Default")
      .groupBy("emp_length")
      .agg(count("*").alias("Count"))
      .orderBy("emp_length")
)

display(employment_length_distribution)

emp_length,Count
,10
2.0,8
3.0,5
4.0,2
6.0,2
8.0,3
10.0,10
