In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
    builder. \
    config('spark.ui.port','0'). \
    config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
    config('spark.shuffle.useOldFetchProtocol','true'). \
    enableHiveSupport(). \
    master('yarn'). \
    getOrCreate()


In [2]:
spark

In [3]:
customers_schema = ('member_id string, emp_title string, emp_length string, home_ownership string, annual_inc float, addr_state string, zip_code string,country string, grade string, sub_grade string, verification_status string, tot_hi_cred_lim float, application_type string, annual_inc_joint float, verification_status_joint string')

In [4]:
customers_raw_df = spark.read.format("csv").option("header",True).schema(customers_schema).load("/user/itv017244/lendingclubproject/raw/customer_data_csv")

In [5]:
customers_raw_df

member_id,emp_title,emp_length,home_ownership,annual_inc,addr_state,zip_code,country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,annual_inc_joint,verification_status_joint
d86a1c7244ace602c...,Admin,3 years,RENT,31000.0,OR,970xx,USA,B,B3,Source Verified,49481.0,Individual,,
25b8cd9ff47e50eea...,Teacher,10+ years,MORTGAGE,55983.0,TX,780xx,USA,C,C2,Source Verified,210551.0,Individual,,
f9773942305b2dc9c...,Behavioral Asst,10+ years,MORTGAGE,40000.0,VA,201xx,USA,F,F1,Source Verified,272608.0,Individual,,
b22f99c198f3738a7...,owner,< 1 year,RENT,20000.0,NC,274xx,USA,C,C3,Source Verified,58521.0,Individual,,
4b3b7068c15b0e046...,Supervisor,< 1 year,RENT,45000.0,NY,117xx,USA,C,C5,Source Verified,25500.0,Individual,,
0c7a9f8a6d5b209d2...,case manager,1 year,RENT,35000.0,NV,891xx,USA,B,B1,Verified,37588.0,Individual,,
f1cc7e6ae9c735148...,,,MORTGAGE,90000.0,AZ,851xx,USA,B,B5,Verified,303921.0,Individual,,
8f8faa7eeb2a1ee41...,Merchandiser,6 years,RENT,22000.0,NY,145xx,USA,D,D1,Source Verified,17650.0,Individual,,
dad3f5ea8788186cb...,executive assistant,1 year,RENT,41000.0,WA,981xx,USA,C,C5,Verified,29975.0,Individual,,
a20867c0985126202...,Industrial Tech,10+ years,MORTGAGE,75000.0,KS,676xx,USA,B,B4,Not Verified,68593.0,Individual,,


In [6]:
customers_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



In [7]:
customer_df_renamed = customers_raw_df.withColumnRenamed("annual_inc","annual_income") \
.withColumnRenamed("addr_state","address_state") \
.withColumnRenamed("zip_code","address_zipCode") \
.withColumnRenamed("country","address_country") \
.withColumnRenamed("tot_hi_cred_lim","total_high_credit_limit") \
.withColumnRenamed("annual_inc_join","joint_annual_income")

In [8]:
customer_df_renamed

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipCode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_inc_joint,verification_status_joint
d86a1c7244ace602c...,Admin,3 years,RENT,31000.0,OR,970xx,USA,B,B3,Source Verified,49481.0,Individual,,
25b8cd9ff47e50eea...,Teacher,10+ years,MORTGAGE,55983.0,TX,780xx,USA,C,C2,Source Verified,210551.0,Individual,,
f9773942305b2dc9c...,Behavioral Asst,10+ years,MORTGAGE,40000.0,VA,201xx,USA,F,F1,Source Verified,272608.0,Individual,,
b22f99c198f3738a7...,owner,< 1 year,RENT,20000.0,NC,274xx,USA,C,C3,Source Verified,58521.0,Individual,,
4b3b7068c15b0e046...,Supervisor,< 1 year,RENT,45000.0,NY,117xx,USA,C,C5,Source Verified,25500.0,Individual,,
0c7a9f8a6d5b209d2...,case manager,1 year,RENT,35000.0,NV,891xx,USA,B,B1,Verified,37588.0,Individual,,
f1cc7e6ae9c735148...,,,MORTGAGE,90000.0,AZ,851xx,USA,B,B5,Verified,303921.0,Individual,,
8f8faa7eeb2a1ee41...,Merchandiser,6 years,RENT,22000.0,NY,145xx,USA,D,D1,Source Verified,17650.0,Individual,,
dad3f5ea8788186cb...,executive assistant,1 year,RENT,41000.0,WA,981xx,USA,C,C5,Verified,29975.0,Individual,,
a20867c0985126202...,Industrial Tech,10+ years,MORTGAGE,75000.0,KS,676xx,USA,B,B4,Not Verified,68593.0,Individual,,


In [9]:
from pyspark.sql.functions import current_timestamp

In [10]:
customer_df_ingestd=customer_df_renamed.withColumn("ingest_date",current_timestamp())

In [11]:
customer_df_ingestd

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipCode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_inc_joint,verification_status_joint,ingest_date
d86a1c7244ace602c...,Admin,3 years,RENT,31000.0,OR,970xx,USA,B,B3,Source Verified,49481.0,Individual,,,2025-03-18 16:39:...
25b8cd9ff47e50eea...,Teacher,10+ years,MORTGAGE,55983.0,TX,780xx,USA,C,C2,Source Verified,210551.0,Individual,,,2025-03-18 16:39:...
f9773942305b2dc9c...,Behavioral Asst,10+ years,MORTGAGE,40000.0,VA,201xx,USA,F,F1,Source Verified,272608.0,Individual,,,2025-03-18 16:39:...
b22f99c198f3738a7...,owner,< 1 year,RENT,20000.0,NC,274xx,USA,C,C3,Source Verified,58521.0,Individual,,,2025-03-18 16:39:...
4b3b7068c15b0e046...,Supervisor,< 1 year,RENT,45000.0,NY,117xx,USA,C,C5,Source Verified,25500.0,Individual,,,2025-03-18 16:39:...
0c7a9f8a6d5b209d2...,case manager,1 year,RENT,35000.0,NV,891xx,USA,B,B1,Verified,37588.0,Individual,,,2025-03-18 16:39:...
f1cc7e6ae9c735148...,,,MORTGAGE,90000.0,AZ,851xx,USA,B,B5,Verified,303921.0,Individual,,,2025-03-18 16:39:...
8f8faa7eeb2a1ee41...,Merchandiser,6 years,RENT,22000.0,NY,145xx,USA,D,D1,Source Verified,17650.0,Individual,,,2025-03-18 16:39:...
dad3f5ea8788186cb...,executive assistant,1 year,RENT,41000.0,WA,981xx,USA,C,C5,Verified,29975.0,Individual,,,2025-03-18 16:39:...
a20867c0985126202...,Industrial Tech,10+ years,MORTGAGE,75000.0,KS,676xx,USA,B,B4,Not Verified,68593.0,Individual,,,2025-03-18 16:39:...


In [12]:
customer_df_ingestd.count()

2260701

In [13]:
customer_distinct = customer_df_ingestd.distinct()

In [14]:
customer_distinct.count()

2260638

In [15]:
customer_distinct.createOrReplaceTempView("customers")

In [16]:
spark.sql("select * from customers")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipCode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_inc_joint,verification_status_joint,ingest_date
6f41ba80743de831a...,Compliance Manager,4 years,RENT,105000.0,NY,103xx,USA,B,B5,Source Verified,76200.0,Individual,,,2025-03-18 16:39:...
cb34ff22c771a490a...,Program Manager,10+ years,RENT,50000.0,CO,802xx,USA,B,B2,Verified,128949.0,Individual,,,2025-03-18 16:39:...
f799ea321cb9c012a...,Production Manager,10+ years,MORTGAGE,117000.0,GA,300xx,USA,D,D2,Not Verified,363998.0,Individual,,,2025-03-18 16:39:...
031159a48dc448521...,certified ophthal...,2 years,MORTGAGE,24600.0,FL,344xx,USA,C,C3,Source Verified,146050.0,Individual,,,2025-03-18 16:39:...
7d5805c6cb069a2e7...,Optometrist,1 year,MORTGAGE,86000.0,MT,591xx,USA,D,D5,Verified,343671.0,Individual,,,2025-03-18 16:39:...
568a2f661ddb0165e...,fuel truck driver,2 years,RENT,40000.0,OH,439xx,USA,B,B1,Verified,24632.0,Individual,,,2025-03-18 16:39:...
3a23f414ce21d7e33...,daytime server,10+ years,MORTGAGE,40000.0,NY,144xx,USA,B,B3,Source Verified,55200.0,Individual,,,2025-03-18 16:39:...
0cc3c0aef89cccc7e...,Cook,10+ years,RENT,27000.0,TN,380xx,USA,B,B4,Verified,32636.0,Individual,,,2025-03-18 16:39:...
f2a3507d760ad9b0e...,Sales Manager,7 years,RENT,61992.0,UT,841xx,USA,B,B5,Not Verified,83870.0,Individual,,,2025-03-18 16:39:...
866871c33c28b03f3...,supervisor,10+ years,MORTGAGE,55000.0,MA,027xx,USA,B,B3,Not Verified,328380.0,Individual,,,2025-03-18 16:39:...


In [17]:
spark.sql("select count(*) from customers where annual_income is null")

count(1)
5


In [18]:
customers_income_filtered = spark.sql("select * from customers where annual_income is not null")

In [19]:
customers_income_filtered.createOrReplaceTempView("customers")

In [20]:
spark.sql("select distinct(emp_length) from customers").show()

+----------+
|emp_length|
+----------+
|   9 years|
|   5 years|
|      null|
|    1 year|
|   2 years|
|   7 years|
|   8 years|
|   4 years|
|   6 years|
|   3 years|
| 10+ years|
|  < 1 year|
+----------+



In [21]:
from pyspark.sql.functions import regexp_replace, col

In [22]:
customers_employement_length_cleaned = customers_income_filtered.withColumn("emp_length",regexp_replace(col("emp_length"),"(\D)",""))

In [23]:
customers_employement_length_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipCode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_inc_joint,verification_status_joint,ingest_date
c6d019775fa5cd969...,Director Client S...,3,MORTGAGE,115000.0,CA,925xx,USA,B,B5,Not Verified,503470.0,Individual,,,2025-03-18 16:40:...
21183d2c7db249daa...,QA Data Analyst,1,RENT,80000.0,TN,372xx,USA,F,F1,Verified,90937.0,Individual,,,2025-03-18 16:40:...
9c9d0d405a4a4f946...,Head Chef,3,OWN,42000.0,NY,144xx,USA,D,D1,Source Verified,35860.0,Individual,,,2025-03-18 16:40:...
4417c1dd92ca00cb8...,Water Utility Man...,10,MORTGAGE,88000.0,WI,549xx,USA,B,B2,Source Verified,415437.0,Joint App,109360.0,Not Verified,2025-03-18 16:40:...
cbacde71545fbf19f...,Production associate,2,MORTGAGE,50000.0,OH,433xx,USA,D,D5,Source Verified,200460.0,Individual,,,2025-03-18 16:40:...
bf7f61644080059b0...,Customer service ...,1,RENT,21600.0,NC,276xx,USA,D,D2,Verified,35499.0,Individual,,,2025-03-18 16:40:...
daed3153c617945a4...,mail carrier,10,MORTGAGE,64000.0,CA,925xx,USA,C,C4,Source Verified,323637.0,Individual,,,2025-03-18 16:40:...
a526496511decc037...,QA Supervisor,10,RENT,65000.0,OR,975xx,USA,A,A1,Source Verified,6000.0,Individual,,,2025-03-18 16:40:...
9eccca78cb1e4e54a...,National Sales Ex...,1,RENT,175000.0,TN,370xx,USA,C,C4,Source Verified,208937.0,Individual,,,2025-03-18 16:40:...
f097799d08f359b9e...,Director,5,RENT,120000.0,CA,954xx,USA,C,C2,Verified,113835.0,Individual,,,2025-03-18 16:40:...


In [24]:
customers_employement_length_cleaned.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipCode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [25]:
customers_employement_length_casted = customers_employement_length_cleaned.withColumn("emp_length",customers_employement_length_cleaned.emp_length.cast('int'))

In [26]:
customers_employement_length_casted

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipCode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_inc_joint,verification_status_joint,ingest_date
47d9fb70fc3f7dbe9...,Senior Dental NCO,10.0,RENT,80000.0,CO,809xx,USA,D,D1,Verified,135120.0,Individual,,,2025-03-18 16:40:...
178fcd21a0680e314...,owner operator,10.0,RENT,75000.0,CA,906xx,USA,A,A5,Source Verified,9400.0,Individual,,,2025-03-18 16:40:...
8f06c25aeb784e844...,Welder,10.0,OWN,70000.0,MI,481xx,USA,C,C5,Not Verified,181446.0,Individual,,,2025-03-18 16:40:...
8e12c66923ea4491f...,Special Ed Teacher,9.0,MORTGAGE,49627.0,AR,721xx,USA,A,A1,Verified,231785.0,Joint App,76627.0,Not Verified,2025-03-18 16:40:...
b91b8986ace47f3aa...,Director of Revenue,2.0,MORTGAGE,80000.0,WV,253xx,USA,B,B1,Source Verified,85453.0,Individual,,,2025-03-18 16:40:...
4d583cc2d828b1b29...,Senior Master Ser...,10.0,MORTGAGE,84700.0,TX,782xx,USA,A,A2,Source Verified,267628.0,Joint App,87200.0,Not Verified,2025-03-18 16:40:...
b58d5191c15ec7b39...,,,RENT,40000.0,GA,305xx,USA,A,A4,Not Verified,64190.0,Joint App,70000.0,Not Verified,2025-03-18 16:40:...
7582023dc8766cdf1...,Branch Manager II,2.0,RENT,70000.0,OR,974xx,USA,C,C2,Not Verified,117758.0,Individual,,,2025-03-18 16:40:...
5ab27cfffd327070c...,Director,10.0,RENT,80313.0,WI,547xx,USA,B,B4,Not Verified,149053.0,Individual,,,2025-03-18 16:40:...
465c31e6f936ffd59...,Document Review A...,1.0,RENT,75000.0,TX,774xx,USA,D,D5,Source Verified,252743.0,Individual,,,2025-03-18 16:40:...


In [27]:
customers_employement_length_casted.filter("emp_length is null").count()

146903

In [28]:
customers_employement_length_casted.createOrReplaceTempView("customers")

In [None]:
average_emp_length = spark.sql("select floor(avg(emp_length)) as average_emp_length from customers").collect()

In [None]:
print(average_emp_length)

In [None]:
avg_emp_duration = average_emp_length[0][0]

In [None]:
print(avg_emp_duration)

In [None]:
customers_emp_length_replaced = customers_employement_length_casted.na.fill(avg_emp_duration,subset=['emp_length'])

In [None]:
customers_emp_length_replaced

In [None]:
customers_emp_length_replaced.filter("emp_length is null").count()

In [None]:
customers_emp_length_replaced.createOrReplaceTempView("customers")

In [None]:
spark.sql("select distinct address_state from customers")

In [None]:
spark.sql("select  count(address_state) from customers where length(address_state)>2")

In [None]:
from pyspark.sql.functions import length, when, col

In [None]:
customers_state_cleaned = customers_emp_length_replaced.withColumn("address_state",when(length(col("address_state"))>2, "NA" ).otherwise(col("address_state")))

In [None]:
customers_state_cleaned.select("address_state").distinct()

In [None]:
customers_state_cleaned.write.format("parquet").mode("overwrite").option("path","/user/itv017244/lendingclubproject/cleaned/customers_parquet").save()

In [198]:
customers_state_cleaned.createOrReplaceTempView("customers")

In [200]:
spark.sql("""select * from customers
""").write.option("header",True).format("parquet").mode("overwrite"). \
option("path","/user/itv017244/lendingclubproject/cleaned/customers_parquet").save()

In [201]:
spark.sql("""select * from customers
""").write.option("header",True).format("csv").mode("overwrite"). \
option("path","/user/itv017244/lendingclubproject/cleaned/customers_csv").save()

In [197]:
customers_state_cleaned.write.option("header",True).format("csv").mode("overwrite").option("path","/user/itv017244/lendingclubproject/cleaned/customers_csv").save()

In [None]:
spark.stop()