In [67]:
from pyspark.sql import SparkSession
import getpass
# Current OS user; interpolated into the per-user warehouse directory below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [68]:
# DDL-style schema for the raw customers CSV, one field per line.
# The pieces are joined by implicit string concatenation; the irregular
# spacing after some commas is kept so the resulting string is unchanged.
customers_schema = (
    "member_id string, "
    "emp_title string, "
    "emp_length string,"
    "home_ownership string, "
    "annual_inc float,"
    "addr_state string,"
    "zip_code string,"
    "country string,"
    "grade string,"
    "sub_grade string,"
    "verification_status string,"
    "tot_hi_cred_lim float,"
    "application_type string,"
    "annual_inc_joint float,"
    "verification_status_joint string"
)

In [69]:
# Read the raw customers CSV (first line is a header) using the explicit
# schema declared in customers_schema.
customers_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(customers_schema)
    .load("/public/trendytech/lendingclubproject/raw/customers_data_csv")
)

In [70]:
# Print the schema of the freshly loaded DataFrame to check that the declared
# column names and types were applied.
customers_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



In [71]:
# Rename the abbreviated source columns to more descriptive names.
# The chain is wrapped in parentheses instead of backslash continuations: the
# previous version ended with a dangling "\" after the last call, which would
# silently glue any following line onto this expression.
# NOTE(review): "join_annual_income" looks like a typo for
# "joint_annual_income"; it is kept as-is because consumers of the written
# data may already depend on this name. Confirm before changing.
customers_df_renamed = (
    customers_raw_df
    .withColumnRenamed("annual_inc", "annual_income")
    .withColumnRenamed("addr_state", "address_state")
    .withColumnRenamed("zip_code", "address_zipcode")
    .withColumnRenamed("country", "address_country")
    .withColumnRenamed("tot_hi_cred_lim", "total_high_credit_limit")
    .withColumnRenamed("annual_inc_joint", "join_annual_income")
)

In [72]:
# Display the renamed DataFrame to check the new column names.
customers_df_renamed

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,


In [73]:
from pyspark.sql.functions import current_timestamp

In [74]:
# Add an "ingest_date" column holding the timestamp at which the query runs.
customers_df_ingestd = customers_df_renamed.withColumn(
    "ingest_date",
    current_timestamp(),
)

In [75]:
# Display the DataFrame to confirm the ingest_date column was appended.
customers_df_ingestd

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,,2023-10-03 12:05:...
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,,2023-10-03 12:05:...
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,,2023-10-03 12:05:...
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,,2023-10-03 12:05:...
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,,2023-10-03 12:05:...
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,,2023-10-03 12:05:...
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,,2023-10-03 12:05:...
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,,2023-10-03 12:05:...
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,,2023-10-03 12:05:...
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,,2023-10-03 12:05:...


In [76]:
# Total row count before duplicates are removed (baseline for comparison).
customers_df_ingestd.count()

2260701

In [77]:
# Remove rows that are exact duplicates across all columns.
customers_distinct = customers_df_ingestd.dropDuplicates()

In [12]:
# Row count after de-duplication; compare with customers_df_ingestd.count().
customers_distinct.count()

2260638

In [78]:
# Register the de-duplicated DataFrame as the temp view "customers" so it can
# be queried through spark.sql.
customers_distinct.createOrReplaceTempView("customers")

In [79]:
# Display rows from the "customers" view to confirm it is queryable.
spark.sql("select * from customers")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
131655ec48594de93...,Tailor,4 years,MORTGAGE,60000.0,MI,481xx,USA,A,A1,Not Verified,232055.0,Individual,,,2023-10-03 12:06:...
c774147f6e1564b42...,Guide,< 1 year,MORTGAGE,43300.0,IN,471xx,USA,B,B2,Not Verified,248220.0,Individual,,,2023-10-03 12:06:...
77d8693c20af906f4...,Customer Service,2 years,RENT,56000.0,TX,765xx,USA,C,C5,Source Verified,72042.0,Joint App,144000.0,Source Verified,2023-10-03 12:06:...
f8dbff88f39ed0d17...,Detective,10+ years,RENT,80000.0,NY,112xx,USA,A,A4,Source Verified,428901.0,Individual,,,2023-10-03 12:06:...
7960f3751183b5980...,Coordinator,< 1 year,RENT,32000.0,FL,330xx,USA,C,C5,Not Verified,58574.0,Individual,,,2023-10-03 12:06:...
ec2ba71e322075587...,Owner,10+ years,MORTGAGE,46000.0,NC,273xx,USA,C,C4,Not Verified,22200.0,Individual,,,2023-10-03 12:06:...
7b8de938abf0e2a2b...,Project Skills Co...,4 years,RENT,26000.0,SD,572xx,USA,C,C1,Source Verified,16800.0,Individual,,,2023-10-03 12:06:...
f682277b9c259b98a...,,,RENT,21060.0,WA,992xx,USA,C,C2,Verified,16900.0,Individual,,,2023-10-03 12:06:...
3c58d7a13ce9e0500...,Production Associ...,2 years,MORTGAGE,60000.0,MO,641xx,USA,A,A5,Not Verified,130709.0,Individual,,,2023-10-03 12:06:...
d89b83693bd259e26...,Teacher,10+ years,MORTGAGE,70106.0,TX,774xx,USA,A,A3,Not Verified,334025.0,Joint App,131106.0,,2023-10-03 12:06:...


In [80]:
# Count the rows whose annual_income is null.
spark.sql("select count(*) from customers where annual_income is null")

count(1)
5


In [81]:
# Keep only the rows that have an annual_income value.
customers_income_filtered = spark.sql("select * from customers where annual_income is not null")

In [82]:
# Re-point the "customers" temp view at the income-filtered DataFrame.
# NOTE: the same view name is reused throughout the notebook, so the result of
# any spark.sql cell depends on which registration ran last.
customers_income_filtered.createOrReplaceTempView("customers")

In [83]:
# List the distinct emp_length values to see the text formats present.
spark.sql("select distinct(emp_length) from customers")

emp_length
5 years
9 years
""
1 year
2 years
7 years
8 years
4 years
6 years
3 years


In [84]:
from pyspark.sql.functions import regexp_replace,col

In [85]:
# Strip every non-digit character from emp_length so that values such as
# "10+ years" or "< 1 year" are reduced to their numeric part.
# The pattern is a raw string: "\D" inside a normal string literal is an
# invalid Python escape sequence and triggers a warning on recent versions.
customers_emplength_cleaned = customers_income_filtered.withColumn(
    "emp_length",
    regexp_replace(col("emp_length"), r"(\D)", ""),
)

In [86]:
# Display the DataFrame to inspect the cleaned emp_length values.
customers_emplength_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
727a008bbced1f5c1...,registered nurse,5.0,MORTGAGE,65000.0,MI,483xx,USA,D,D1,Not Verified,174912.0,Individual,,,2023-10-03 12:07:...
26a2f8ef8fd8a7ecc...,engineering foreman,7.0,MORTGAGE,85000.0,IN,461xx,USA,C,C3,Source Verified,93851.0,Individual,,,2023-10-03 12:07:...
705733fa70ee752d8...,Shipping Manager,10.0,MORTGAGE,75000.0,WI,546xx,USA,B,B4,Source Verified,216092.0,Individual,,,2023-10-03 12:07:...
ade39b68392a8d548...,Office Manager,1.0,RENT,26400.0,AZ,852xx,USA,B,B5,Source Verified,55869.0,Individual,,,2023-10-03 12:07:...
b7448a5cf926ff698...,Plumber,10.0,MORTGAGE,91000.0,IL,629xx,USA,D,D3,Not Verified,324610.0,Individual,,,2023-10-03 12:07:...
8934849e3360a49b0...,Delivery Driver,6.0,RENT,50001.0,CA,925xx,USA,C,C1,Verified,79500.0,Individual,,,2023-10-03 12:07:...
4b03f181613c1fdff...,,,MORTGAGE,52215.0,IL,606xx,USA,C,C5,Verified,256101.0,Individual,,,2023-10-03 12:07:...
4d4237ffb7a0b25d5...,owner,10.0,OWN,485000.0,KS,661xx,USA,D,D5,Source Verified,302335.0,Individual,,,2023-10-03 12:07:...
f252e35f2bc54bc3e...,Systems Administr...,5.0,RENT,48975.0,VA,240xx,USA,A,A5,Source Verified,35510.0,Individual,,,2023-10-03 12:07:...
bfd7c2b84c62820f9...,"Civil Engineer, T...",1.0,RENT,56200.0,VA,235xx,USA,B,B4,Not Verified,120262.0,Individual,,,2023-10-03 12:07:...


In [87]:
# Print the schema; emp_length is still a string column at this point.
customers_emplength_cleaned.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [88]:
# Cast the digit-only emp_length strings to integers.
# NOTE(review): empty strings presumably become null here; the null count is
# checked in a later cell.
customers_emplength_casted = customers_emplength_cleaned.withColumn(
    "emp_length",
    customers_emplength_cleaned["emp_length"].cast("int"),
)

In [89]:
# Print the schema to confirm emp_length is now an integer column.
customers_emplength_casted.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [90]:
# Count the rows where emp_length is null after the cast.
customers_emplength_casted.filter("emp_length is null").count()

146903

In [91]:
# Re-register the "customers" view so SQL queries see the integer emp_length.
customers_emplength_casted.createOrReplaceTempView("customers")

In [92]:
# Compute the floored average emp_length over the "customers" view and collect
# the result to the driver; collect() returns a list of Row objects.
avg_emp_length = spark.sql("select floor(avg(emp_length)) as avg_emp_length from customers").collect()

In [93]:
# Show the collected result (a list containing a single Row).
print(avg_emp_length)

[Row(avg_emp_length=6)]


In [94]:
# Extract the scalar value: first collected row, first column.
avg_emp_duration = avg_emp_length[0][0]

In [95]:
# Show the extracted average that will be used as the fill value.
print(avg_emp_duration)

6


In [96]:
# Fill null emp_length values with the floored average held in
# avg_emp_duration; only the emp_length column is affected.
customers_emplength_replaced = customers_emplength_casted.na.fill(
    avg_emp_duration,
    subset=["emp_length"],
)

In [97]:
# Display the DataFrame after the null emp_length values were filled.
customers_emplength_replaced

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
f9db62ff779480102...,OWNER,10,MORTGAGE,125000.0,MA,018xx,USA,A,A1,Not Verified,456900.0,Individual,,,2023-10-03 12:09:...
a1fed2473444ddd14...,Director,10,RENT,105000.0,NJ,074xx,USA,B,B2,Source Verified,157418.0,Individual,,,2023-10-03 12:09:...
a637f9903cda5a177...,Application Devel...,4,MORTGAGE,62000.0,FL,330xx,USA,B,B4,Source Verified,265060.0,Individual,,,2023-10-03 12:09:...
3407259d06d62b74b...,Vice President,3,RENT,112000.0,CA,917xx,USA,A,A5,Source Verified,62967.0,Individual,,,2023-10-03 12:09:...
9002eeb9fe71d818e...,Social Worker,10,MORTGAGE,58000.0,WA,992xx,USA,C,C4,Not Verified,213175.0,Individual,,,2023-10-03 12:09:...
49ea6b9b2b2f56631...,SR PLUMBER,10,MORTGAGE,102000.0,CA,907xx,USA,A,A2,Source Verified,534700.0,Individual,,,2023-10-03 12:09:...
40bf9fe614c381df4...,Teller,3,RENT,21600.0,FL,330xx,USA,C,C4,Verified,25107.0,Individual,,,2023-10-03 12:09:...
9425a471dc415585c...,MANAGER,10,RENT,135000.0,CA,918xx,USA,A,A3,Verified,50500.0,Joint App,225000.0,,2023-10-03 12:09:...
e4bb4d59585cf694b...,Bulk Driver,5,RENT,44000.0,FL,342xx,USA,A,A1,Not Verified,76298.0,Individual,,,2023-10-03 12:09:...
812dbc189ef0fde95...,Store Manager,5,RENT,46000.0,NJ,070xx,USA,B,B2,Not Verified,34900.0,Individual,,,2023-10-03 12:09:...


In [98]:
# Re-register the "customers" view on the DataFrame with filled emp_length.
customers_emplength_replaced.createOrReplaceTempView("customers")

In [99]:
# List the distinct address_state values to spot entries that are not state codes.
spark.sql("select distinct(address_state) from customers")

address_state
Helping Kenya's D...
223xx
175 (total projec...
SC
AZ
"so Plan """"C"""" is ..."
I am 56 yrs. old ...
financially I mad...
but no one will l...
LA


In [100]:
# Count the address_state values that are longer than two characters.
spark.sql("select count(address_state) from customers where length(address_state)>2")

count(address_state)
254


In [101]:
from pyspark.sql.functions import when,col,length

In [102]:
# Replace any address_state longer than two characters with the placeholder
# "NA"; all other values are passed through unchanged.
customers_state_cleaned = customers_emplength_replaced.withColumn(
    "address_state",
    when(length(col("address_state")) > 2, "NA").otherwise(col("address_state")),
)

In [103]:
# Display the DataFrame after the address_state clean-up.
customers_state_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
b895d43edc496516a...,VP Design,6,RENT,200000.0,NY,100xx,USA,B,B1,Source Verified,335526.0,Individual,,,2023-10-03 12:10:...
564dff9647aa54bb6...,Airport Tower Coo...,10,MORTGAGE,60000.0,MN,551xx,USA,A,A3,Not Verified,225324.0,Individual,,,2023-10-03 12:10:...
41c49e3681b5e7e20...,Sales rep,10,MORTGAGE,62000.0,FL,336xx,USA,B,B4,Not Verified,369038.0,Individual,,,2023-10-03 12:10:...
d76b563f7d02d640e...,Manager,10,MORTGAGE,85000.0,FL,327xx,USA,A,A5,Not Verified,626670.0,Individual,,,2023-10-03 12:10:...
c36f29f3e8a7995e1...,Store Manager,3,RENT,66187.0,FL,333xx,USA,B,B4,Source Verified,71417.0,Individual,,,2023-10-03 12:10:...
a4892bc5d9439bd80...,Owner,10,RENT,150000.0,NJ,080xx,USA,B,B5,Not Verified,113024.0,Individual,,,2023-10-03 12:10:...
1b2c22fdcecd18c06...,Senior Auditor,10,MORTGAGE,65000.0,MO,651xx,USA,B,B2,Source Verified,155412.0,Individual,,,2023-10-03 12:10:...
005918e15acf4a054...,Executive Director,1,MORTGAGE,64000.0,WI,548xx,USA,B,B5,Source Verified,191321.0,Individual,,,2023-10-03 12:10:...
e8b54bcfe7df4811e...,Sr. Customer Succ...,5,MORTGAGE,190000.0,NJ,080xx,USA,B,B2,Not Verified,106470.0,Individual,,,2023-10-03 12:10:...
c83cdc1cfd813c33a...,electrician,10,MORTGAGE,80000.0,CA,949xx,USA,C,C3,Verified,36203.0,Individual,,,2023-10-03 12:10:...


In [104]:
# Persist the cleaned customers data as Parquet, overwriting any existing
# output at the target path.
# The path is built from `username` (the same value used for the warehouse
# directory in the session config) rather than a hard-coded account name, so
# the notebook writes to the home directory of whoever runs it.
(
    customers_state_cleaned.write
    .format("parquet")
    .mode("overwrite")
    .option("path", f"/user/{username}/myproject/cleaned/customers_parquet")
    .save()
)

In [66]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()