In [21]:
from pyspark.sql import SparkSession
import getpass
# OS-level user name; interpolated into the Hive warehouse directory below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
# NOTE(review): spark.ui.port '0' presumably lets Spark choose a free UI port — confirm against the Spark config docs.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [22]:
# Explicit DDL-style schema for the raw customers CSV; it is handed to the reader via .schema(...).
# The three monetary columns (annual_inc, tot_hi_cred_lim, annual_inc_joint) are float; all others are string.
customer_schema = """member_id string, emp_title string, emp_length string, home_ownership string, 
annual_inc float, addr_state string, zip_code string, country string, grade string, sub_grade string, 
verification_status string, tot_hi_cred_lim float, application_type string, annual_inc_joint float, 
verification_status_joint string"""

In [23]:
# Configure a CSV reader that treats the first line as a header and applies the explicit schema.
customer_csv_reader = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(customer_schema)
)

# Load the raw customers extract.
customer_raw_df = customer_csv_reader.load("/public/trendytech/lendingclubproject/raw/customers_data_csv")

In [24]:
# Confirm the column names and types of the freshly loaded raw frame.
customer_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



In [25]:
# (raw column name, descriptive column name) pairs applied to the raw frame.
column_renames = (
    ("annual_inc", "Annual_income"),
    ("addr_state", "address_state"),
    ("zip_code", "address_zipcode"),
    ("country", "address_country"),
    ("tot_hi_cred_lim", "total_high_credit_limit"),
    ("annual_inc_joint", "join_annual_income"),
)

# Start from the raw frame every time so re-running this cell gives the same result.
customer_df_renamed = customer_raw_df
for raw_name, descriptive_name in column_renames:
    customer_df_renamed = customer_df_renamed.withColumnRenamed(raw_name, descriptive_name)

In [26]:
# Preview the renamed frame (the notebook renders the trailing expression).
customer_df_renamed

member_id,emp_title,emp_length,home_ownership,Annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,


In [27]:
from pyspark.sql.functions import *

In [28]:
# Add an ingest_date column holding the timestamp at which the data is processed.
ingest_timestamp = current_timestamp()
customer_df_ingest = customer_df_renamed.withColumn(
    "ingest_date",
    ingest_timestamp,
)

In [29]:
# Row count before de-duplication.
customer_df_ingest.count()

2260701

In [30]:
# Remove rows that are duplicated across every column.
# NOTE(review): the name is misspelled ("disctinct"); it is kept because later cells reference it.
customer_disctinct = customer_df_ingest.distinct()

In [31]:
# Row count after de-duplication, for comparison with the count taken before it.
customer_disctinct.count()

2260638

In [32]:
# Expose the de-duplicated frame to Spark SQL as the temp view "customers".
customer_disctinct.createOrReplaceTempView("customers")

In [33]:
# Keep only rows that have an annual income.
# NOTE(review): the column is named "Annual_income"; the lowercase reference presumably resolves because
# Spark SQL is case-insensitive by default — confirm spark.sql.caseSensitive is not enabled.
customers_income_filtered_df = spark.sql("select * from customers where annual_income is not null")

In [34]:
# Re-point the "customers" temp view at the income-filtered frame (replaces the previous registration).
customers_income_filtered_df.createOrReplaceTempView("customers")

In [35]:
# Sanity check: no row in the current "customers" view should have a null annual income.
spark.sql("select count(*) from customers where annual_income is null")

count(1)
0


In [36]:
# Preview the current contents of the "customers" view.
spark.sql("select * from customers")

member_id,emp_title,emp_length,home_ownership,Annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
8d248cb98540c2328...,Server,6 years,RENT,45000.0,IL,601xx,USA,A,A2,Not Verified,95723.0,Individual,,,2025-03-30 05:49:...
14f6f68226731dd5f...,AVP-MANAGER CD/IRA,10+ years,MORTGAGE,78000.0,PA,161xx,USA,A,A1,Source Verified,222610.0,Individual,,,2025-03-30 05:49:...
ed48e5bdbc3935fc9...,dental hygienist,2 years,OWN,75000.0,NY,104xx,USA,A,A2,Not Verified,65501.0,Individual,,,2025-03-30 05:49:...
c8d1c8c4eb457b075...,Director,4 years,MORTGAGE,450000.0,IL,606xx,USA,A,A4,Source Verified,1421095.0,Individual,,,2025-03-30 05:49:...
9979a29daa7bdd637...,,,MORTGAGE,0.0,OH,452xx,USA,B,B4,Not Verified,186919.0,Joint App,90000.0,Not Verified,2025-03-30 05:49:...
8fc7f262d9fc48246...,Directing Manager...,3 years,MORTGAGE,163650.0,TX,770xx,USA,D,D5,Not Verified,195023.0,Individual,,,2025-03-30 05:49:...
bd8d3ffaa056370c1...,Scan clerk,10+ years,OWN,42000.0,CO,809xx,USA,B,B1,Source Verified,96835.0,Individual,,,2025-03-30 05:49:...
657644fb3bcb71bfb...,,< 1 year,MORTGAGE,109000.0,HI,967xx,USA,F,F1,Verified,12800.0,Individual,,,2025-03-30 05:49:...
8cb9698af6bfa16ae...,,,MORTGAGE,75000.0,WI,530xx,USA,A,A2,Verified,224737.0,Individual,,,2025-03-30 05:49:...
f03c53d83906bbbed...,LINE SERVICE,10+ years,MORTGAGE,50000.0,IL,622xx,USA,D,D1,Verified,88100.0,Individual,,,2025-03-30 05:49:...


In [37]:
# Inspect the distinct raw emp_length values before cleaning them.
spark.sql("select distinct(emp_length) from customers")

emp_length
5 years
9 years
""
1 year
2 years
7 years
8 years
4 years
6 years
3 years


In [41]:
# Reduce emp_length to its digits only, e.g. "10+ years" -> "10" and "< 1 year" -> "1";
# values that contain no digit become the empty string.
#
# Build on customers_income_filtered_df rather than customer_disctinct: starting from the
# unfiltered frame would silently bring back the null-income rows removed by the earlier filter.
# The pattern is a raw string so that \D reaches the regex engine without relying on Python
# passing an invalid escape sequence through unchanged.
customer_emplength_length = customers_income_filtered_df.withColumn(
    "emp_length",
    regexp_replace(col("emp_length"), r"(\D)", ""),
)

In [42]:
# emp_length is still a string column at this point; only its text was rewritten.
customer_emplength_length.printSchema()


root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- Annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [43]:
# Preview the frame after stripping non-digit characters from emp_length.
customer_emplength_length


member_id,emp_title,emp_length,home_ownership,Annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
e624456cd66acaa74...,Senior Engineerin...,10,MORTGAGE,83500.0,MI,480xx,USA,C,C5,Source Verified,375333.0,Individual,,,2025-03-30 05:53:...
3e6c80cab4ce4c9b0...,Applications Cons...,1,MORTGAGE,105000.0,KY,402xx,USA,C,C5,Source Verified,149094.0,Individual,,,2025-03-30 05:53:...
ef28e06a456f0743f...,Sales,9,RENT,194000.0,SC,296xx,USA,B,B2,Not Verified,222847.0,Individual,,,2025-03-30 05:53:...
38804b3293c8738fd...,SECRETARY,1,RENT,20000.0,NY,113xx,USA,C,C4,Verified,32200.0,Individual,,,2025-03-30 05:53:...
eac38e2d6be5cb037...,Manager,4,MORTGAGE,66000.0,TX,780xx,USA,E,E2,Verified,183128.0,Individual,,,2025-03-30 05:53:...
b4f58989a784359da...,owner,10,MORTGAGE,100000.0,MA,025xx,USA,C,C5,Source Verified,144064.0,Individual,,,2025-03-30 05:53:...
5a6d2472853f60991...,Server,2,RENT,40000.0,MN,551xx,USA,C,C2,Verified,66675.0,Individual,,,2025-03-30 05:53:...
b30d0c9dab5c0888a...,SR Clerk,3,MORTGAGE,70000.0,IL,629xx,USA,B,B1,Not Verified,36923.0,Individual,,,2025-03-30 05:53:...
991d10ffa2d232fa4...,Architect,1,MORTGAGE,91000.0,MD,212xx,USA,B,B2,Source Verified,304592.0,Individual,,,2025-03-30 05:53:...
6a1ba52bd4a106abf...,Personal Banker,3,RENT,34000.0,ID,837xx,USA,C,C2,Verified,94307.0,Individual,,,2025-03-30 05:53:...


In [45]:
# Convert the digits-only emp_length text into an integer column.
emp_length_as_int = col("emp_length").cast('int')
customer_cast = customer_emplength_length.withColumn(
    "emp_length",
    emp_length_as_int,
)

In [47]:
# Verify that emp_length is now an integer column.
customer_cast.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- Annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [48]:
# Count the rows whose emp_length is null after the integer cast.
customer_cast.filter("emp_length is null").count()

146905

In [50]:
# Re-point the "customers" temp view at the frame with integer emp_length.
customer_cast.createOrReplaceTempView("customers")

In [52]:
# Average emp_length over the "customers" view, floored to a whole number.
# collect() returns the single-row result as a list of Row objects.
avg_empg_length=spark.sql("select floor(avg(emp_length)) as avg_emp_length from customers").collect()

In [53]:
# Show the collected result (a one-element list of Row).
avg_empg_length

[Row(avg_emp_length=6)]

In [54]:
# Unwrap the scalar: first column of the first Row.
avg_emp_duration = avg_empg_length[0][0]

In [55]:
# Show the value that will be used to fill missing emp_length entries.
avg_emp_duration

6

In [56]:
# Replace null emp_length values with the floored average; no other column is touched.
customers_emplength_replaced = customer_cast.na.fill(
    value=avg_emp_duration,
    subset=["emp_length"],
)

In [57]:
# Verify that no null emp_length remains after the fill.
customers_emplength_replaced.filter("emp_length is null").count()

0

In [58]:
# Re-point the "customers" temp view at the frame with emp_length filled in.
customers_emplength_replaced.createOrReplaceTempView("customers")

In [59]:
# Inspect the distinct address_state values; the output includes free text that is not a state code.
spark.sql("select distinct(address_state) from customers")

address_state
Helping Kenya's D...
175 (total projec...
223xx
AZ
SC
I am 56 yrs. old ...
"so Plan """"C"""" is ..."
financially I mad...
but no one will l...
LA


In [60]:
# Count the rows whose address_state is longer than two characters.
spark.sql("select count(address_state) from customers where length(address_state) > 2")

count(address_state)
255


In [61]:
from pyspark.sql.functions import when, length, col

# An address_state longer than two characters is replaced by the placeholder "NA";
# every other value is kept as it is.
state_is_too_long = length(col("address_state")) > 2
customers_state_cleaned_df = customers_emplength_replaced.withColumn(
    "address_state",
    when(state_is_too_long, "NA").otherwise(col("address_state")),
)

In [62]:
# Preview the frame after cleaning address_state.
customers_state_cleaned_df

member_id,emp_title,emp_length,home_ownership,Annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
08beada3202f9f8ff...,Carpenter,1,RENT,60000.0,AZ,853xx,USA,C,C3,Not Verified,34993.0,Individual,,,2025-03-30 06:09:...
845077c0d067b22f8...,Owner/Manager,2,RENT,35935.0,UT,843xx,USA,D,D2,Not Verified,24632.0,Joint App,47661.0,Not Verified,2025-03-30 06:09:...
10c474f6b01026129...,Mechanic Assistant,1,MORTGAGE,35000.0,WI,539xx,USA,C,C5,Verified,93476.0,Individual,,,2025-03-30 06:09:...
8b548ec93c15bfb4e...,Transportation se...,5,RENT,60000.0,FL,333xx,USA,E,E5,Source Verified,74398.0,Individual,,,2025-03-30 06:09:...
a9ed1518687532b62...,Network Administr...,2,MORTGAGE,65000.0,MO,647xx,USA,A,A4,Source Verified,175276.0,Individual,,,2025-03-30 06:09:...
32f072a9f3d3f1b37...,,6,MORTGAGE,32832.0,OR,973xx,USA,C,C5,Not Verified,167818.0,Individual,,,2025-03-30 06:09:...
f7e86d9665a1d642e...,Machinist,10,RENT,60000.0,MI,494xx,USA,C,C3,Not Verified,21600.0,Individual,,,2025-03-30 06:09:...
9fe7168bb112ef8d4...,Registered Nurse,10,MORTGAGE,46800.0,MO,630xx,USA,D,D5,Not Verified,270147.0,Joint App,166800.0,Not Verified,2025-03-30 06:09:...
db1c31fb75c9276ac...,Clinician,2,MORTGAGE,50100.0,CA,958xx,USA,A,A5,Source Verified,322634.0,Individual,,,2025-03-30 06:09:...
91c18c805d4a0bbed...,Vice President,10,MORTGAGE,210000.0,MA,021xx,USA,C,C4,Source Verified,469937.0,Individual,,,2025-03-30 06:09:...


In [63]:
# Write the cleaned customers as Parquet, overwriting any earlier output at the same path.
# The target is built from `username` (resolved with getpass when the session was created)
# instead of a hard-coded account name, so the notebook writes under the HDFS home of
# whoever runs it, matching how the warehouse directory is configured.
cleaned_customers_parquet_path = f"/user/{username}/lendingclubproject/cleaned/customers_parquet"
(
    customers_state_cleaned_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", cleaned_customers_parquet_path)
    .save()
)

In [64]:
# Write the cleaned customers as CSV with a header row, overwriting any earlier output at the same path.
# The target is built from `username` (resolved with getpass when the session was created)
# instead of a hard-coded account name, so the notebook writes under the HDFS home of
# whoever runs it, matching how the warehouse directory is configured.
cleaned_customers_csv_path = f"/user/{username}/lendingclubproject/cleaned/customers_csv"
(
    customers_state_cleaned_df.write
    .option("header", True)
    .format("csv")
    .mode("overwrite")
    .option("path", cleaned_customers_csv_path)
    .save()
)

In [65]:
# Stop the SparkSession now that both outputs have been written.
spark.stop()