In [1]:
from pyspark.sql import SparkSession
import getpass 
# Current OS user name; used below to place the warehouse dir under /user/<username>/warehouse.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session with YARN as the master.
spark = (
    SparkSession.builder
    .master('yarn')
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# DDL-style schema for the raw customers CSV: comma-separated "<column> <type>" pairs.
# Adjacent string literals are concatenated by Python, so the value is still one
# single-line string with no embedded whitespace between entries.
customers_schema = (
    'member_id string,'
    'emp_title string,'
    'emp_length string,'
    'home_ownership string,'
    'annual_income float,'
    'addr_state string,'
    'zip_code string,'
    'country string,'
    'grade string,'
    'sub_grade string,'
    'verification_status string,'
    'tot_hi_cred_lim float,'
    'application_type string,'
    'annual_income_joint float,'
    'verification_status_joint string'
)

In [4]:
# Load the raw customers CSV with the explicit schema (no inference);
# the first line of each file is treated as a header.
customers_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(customers_schema)
    .load("/public/trendytech/lendingclubproject/raw/customers_data_csv")
)

In [5]:
# Preview the raw DataFrame to sanity-check the load.
customers_raw_df

member_id,emp_title,emp_length,home_ownership,annual_income,addr_state,zip_code,country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,annual_income_joint,verification_status_joint
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,


In [6]:
# Print the column names and types of the raw DataFrame.
customers_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_income_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



In [7]:
from pyspark.sql.functions import *

In [8]:
# Give the abbreviated raw column names more descriptive ones.
#
# The schema applied when reading the CSV already names the income columns
# `annual_income` and `annual_income_joint`, so the previous renames from
# "annual_inc" and "annual_inc_joint" matched no column. withColumnRenamed is a
# silent no-op when the source column is absent, so they never took effect and
# have been removed; the resulting schema is exactly the same as before.
# NOTE(review): if the joint-income column is really meant to be renamed, the
# source column to target is `annual_income_joint` - confirm with downstream
# consumers before changing the output schema.
customers_df_renamed = (
    customers_raw_df
    .withColumnRenamed("addr_state", "address_state")
    .withColumnRenamed("zip_code", "address_zipcode")
    .withColumnRenamed("country", "address_country")
    .withColumnRenamed("tot_hi_cred_lim", "total_high_credit_limit")
)

In [9]:
from pyspark.sql.functions import current_timestamp

In [10]:
# Add an `ingest_date` column holding Spark's current timestamp.
customers_df_ingestd = customers_df_renamed.withColumn(
    colName="ingest_date",
    col=current_timestamp(),
)

In [11]:
# Preview the renamed DataFrame, now including the ingest_date column.
customers_df_ingestd

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_income_joint,verification_status_joint,ingest_date
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,,2024-01-12 04:34:...
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,,2024-01-12 04:34:...
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,,2024-01-12 04:34:...
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,,2024-01-12 04:34:...
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,,2024-01-12 04:34:...
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,,2024-01-12 04:34:...
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,,2024-01-12 04:34:...
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,,2024-01-12 04:34:...
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,,2024-01-12 04:34:...
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,,2024-01-12 04:34:...


In [12]:
# Total row count before de-duplication (triggers a full Spark job).
customers_df_ingestd.count()

2260701

In [13]:
# Drop rows that are exact duplicates across all columns.
customers_distinct = customers_df_ingestd.distinct()

In [14]:
# Row count after de-duplication, for comparison with the count taken before it.
customers_distinct.count()

2260638

In [15]:
# Expose the de-duplicated data to Spark SQL as the temp view "customers".
# The same view name is re-registered by later cells, so its contents depend on
# which cells have been run so far.
customers_distinct.createOrReplaceTempView("customers")

In [16]:
# Preview the contents of the "customers" temp view.
spark.sql("select * from customers")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_income_joint,verification_status_joint,ingest_date
2d0dc0e55b652b9ff...,Director,3 years,RENT,90000.0,NY,115xx,USA,C,C1,Source Verified,85400.0,Individual,,,2024-01-12 04:34:...
f95ea23110bdb84a4...,general manager,2 years,RENT,60000.0,WA,989xx,USA,D,D2,Not Verified,188108.0,Individual,,,2024-01-12 04:34:...
8fdb5ad97390d53dd...,Marketing/clinica...,10+ years,OWN,75000.0,PA,152xx,USA,F,F1,Source Verified,135299.0,Individual,,,2024-01-12 04:34:...
3f1fb051290565450...,contractor,1 year,RENT,62500.0,WI,537xx,USA,E,E1,Source Verified,52599.0,Individual,,,2024-01-12 04:34:...
689fd5a0d7f6f7019...,Beauty Manager,10+ years,RENT,50000.0,DE,198xx,USA,A,A5,Source Verified,36235.0,Individual,,,2024-01-12 04:34:...
cea6551f90d2889e1...,accounting,5 years,RENT,55000.0,NY,117xx,USA,A,A4,Verified,62561.0,Individual,,,2024-01-12 04:34:...
135ea69ca0921ced0...,Police Officer,8 years,RENT,80000.0,CT,063xx,USA,C,C2,Source Verified,109601.0,Individual,,,2024-01-12 04:34:...
4b1338f8f4ac233b4...,Executive assistant,3 years,MORTGAGE,200000.0,IL,604xx,USA,B,B1,Source Verified,418565.0,Individual,,,2024-01-12 04:34:...
07db457415c319199...,Hydrologist 2,8 years,MORTGAGE,59400.0,MN,551xx,USA,D,D3,Source Verified,238624.0,Individual,,,2024-01-12 04:34:...
986522d55e77aec75...,CEO,10+ years,RENT,100000.0,TX,782xx,USA,A,A5,Source Verified,70200.0,Individual,,,2024-01-12 04:34:...


In [17]:
# Count the rows whose annual_income is missing.
spark.sql("select count(*) from customers where annual_income is null")

count(1)
5


In [18]:
# Keep only the rows of the "customers" view that have an annual income.
customers_income_filtered = spark.sql(
    "select * from customers where annual_income is not null"
)

In [19]:
# Re-point the "customers" temp view at the income-filtered data.
customers_income_filtered.createOrReplaceTempView("customers")

In [20]:
# List the distinct raw emp_length values to see which formats need cleaning.
spark.sql("select distinct(emp_length) from customers").show()

+----------+
|emp_length|
+----------+
|   5 years|
|   9 years|
|      null|
|    1 year|
|   2 years|
|   7 years|
|   8 years|
|   4 years|
|   6 years|
|   3 years|
| 10+ years|
|  < 1 year|
+----------+



In [21]:
from pyspark.sql.functions import regexp_replace, col

In [22]:
# Strip every non-digit character from emp_length, so e.g. "10+ years" becomes "10".
# The pattern is a raw string: "\D" is not a valid Python escape sequence, so a
# plain literal triggers an invalid-escape warning; the regex passed to Spark
# is unchanged.
# NOTE(review): "< 1 year" and "1 year" both collapse to "1" under this rule -
# confirm that conflating them is acceptable.
customers_emplength_cleaned = customers_income_filtered.withColumn(
    "emp_length",
    regexp_replace(col("emp_length"), r"(\D)", ""),
)

In [23]:
# Preview the DataFrame after stripping non-digits from emp_length.
customers_emplength_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_income_joint,verification_status_joint,ingest_date
ff4f2ad5a1ad5ac49...,Business Developm...,6.0,MORTGAGE,47000.0,TX,750xx,USA,A,A1,Not Verified,332077.0,Individual,,,2024-01-12 04:34:...
b2afaa03adab721a0...,"Partner, CPA",7.0,MORTGAGE,165000.0,MN,554xx,USA,B,B2,Source Verified,108415.0,Individual,,,2024-01-12 04:34:...
e398fac2fd2d2a84b...,Sales Manager,10.0,MORTGAGE,175000.0,WA,992xx,USA,C,C4,Not Verified,376040.0,Individual,,,2024-01-12 04:34:...
7028ec77194adcf54...,Firefighter/Param...,10.0,MORTGAGE,80000.0,CO,801xx,USA,B,B2,Verified,471071.0,Individual,,,2024-01-12 04:34:...
24ca95380aeae4c82...,dispath,10.0,MORTGAGE,33300.0,ND,581xx,USA,B,B3,Verified,75250.0,Individual,,,2024-01-12 04:34:...
95a61937adf485d09...,Server,9.0,MORTGAGE,44000.0,HI,967xx,USA,D,D3,Source Verified,319400.0,Individual,,,2024-01-12 04:34:...
ddeaffd921b90d9e7...,Patient Access Rep,4.0,OWN,30000.0,KS,674xx,USA,C,C5,Verified,101828.0,Individual,,,2024-01-12 04:34:...
1dfe61396fb1a3a01...,Creative Services...,2.0,OWN,95000.0,FL,334xx,USA,B,B3,Not Verified,305149.0,Individual,,,2024-01-12 04:34:...
3abccd1b031ddea79...,AR,8.0,OWN,48000.0,OH,452xx,USA,C,C2,Source Verified,38992.0,Individual,,,2024-01-12 04:34:...
66338ab2e5b3bc4ad...,manager,10.0,MORTGAGE,106000.0,OH,434xx,USA,E,E2,Not Verified,446418.0,Individual,,,2024-01-12 04:34:...


In [24]:
# Check the schema: emp_length has only been string-cleaned here, not cast.
customers_emplength_cleaned.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_income_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [25]:
# Convert the digits-only emp_length strings to integers.
customers_emplength_casted = customers_emplength_cleaned.withColumn(
    "emp_length",
    customers_emplength_cleaned["emp_length"].cast('int'),
)

In [26]:
# Preview the DataFrame after casting emp_length.
customers_emplength_casted

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_income_joint,verification_status_joint,ingest_date
b9dfcdce8c51c4029...,ramp service,10.0,OWN,55000.0,CA,940xx,USA,C,C1,Source Verified,18000.0,Individual,,,2024-01-12 04:34:...
b4a8cf8e8cf9a9a9d...,Business Initiati...,10.0,MORTGAGE,200000.0,NV,891xx,USA,A,A2,Verified,452732.0,Individual,,,2024-01-12 04:34:...
4b92caa0af7c8eb29...,Sales,10.0,MORTGAGE,300000.0,NY,117xx,USA,A,A1,Source Verified,1016410.0,Individual,,,2024-01-12 04:34:...
ce03f2b325792ff5d...,,,OWN,25200.0,OH,441xx,USA,B,B1,Verified,27100.0,Individual,,,2024-01-12 04:34:...
5a8fd1d60e3f843e2...,Leadman,10.0,MORTGAGE,72000.0,UT,840xx,USA,D,D4,Source Verified,284759.0,Individual,,,2024-01-12 04:34:...
94b0f7558c47043e3...,IT Consultant,10.0,RENT,67000.0,NC,282xx,USA,F,F5,Source Verified,93951.0,Individual,,,2024-01-12 04:34:...
10ebe7c3172d31f4a...,Weave Technical S...,2.0,RENT,66300.0,SC,296xx,USA,B,B1,Not Verified,220799.0,Individual,,,2024-01-12 04:34:...
49b712a41ae2c1220...,,,MORTGAGE,57000.0,AL,369xx,USA,D,D1,Source Verified,154912.0,Individual,,,2024-01-12 04:34:...
2bea91148ec3a133d...,mechanic,6.0,MORTGAGE,36400.0,VA,224xx,USA,G,G3,Source Verified,41300.0,Individual,,,2024-01-12 04:34:...
e680723a4baaef7b1...,Management,10.0,MORTGAGE,70000.0,IL,615xx,USA,D,D2,Verified,184422.0,Individual,,,2024-01-12 04:34:...


In [27]:
# Confirm that the emp_length cast took effect in the schema.
customers_emplength_casted.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_income_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [28]:
# Count the rows that still have no emp_length after the cast.
customers_emplength_casted.filter("emp_length is null").count()

146903

In [29]:
# Re-point the "customers" temp view at the DataFrame with integer emp_length.
customers_emplength_casted.createOrReplaceTempView("customers")

In [30]:
# Compute the floored average of emp_length over the "customers" view and
# bring the result back to the driver as a list of Row objects.
avg_emp_length = spark.sql(
    "select floor(avg(emp_length)) as avg_emp_length from customers"
).collect()

In [31]:
# Show the collected result (a list of Row objects).
print(avg_emp_length)

[Row(avg_emp_length=6)]


In [32]:
# Unwrap the scalar from the single-row, single-column result; the field name
# comes from the `as avg_emp_length` alias in the query.
avg_emp_duration = avg_emp_length[0]["avg_emp_length"]

In [33]:
# Show the scalar that will be used to fill missing emp_length values.
print(avg_emp_duration)

6


In [34]:
# Replace missing emp_length values with the floored average computed earlier;
# only the emp_length column is touched.
customers_emplength_replaced = customers_emplength_casted.fillna(
    value=avg_emp_duration,
    subset=['emp_length'],
)

In [35]:
# Preview the DataFrame after filling missing emp_length values.
customers_emplength_replaced

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_income_joint,verification_status_joint,ingest_date
50616b8d1d05cb02a...,,6,RENT,50000.0,CA,945xx,USA,B,B1,Not Verified,28126.0,Individual,,,2024-01-12 04:35:...
e263a58a071b70bb6...,Health care taker,3,RENT,60000.0,CA,946xx,USA,D,D3,Source Verified,49393.0,Individual,,,2024-01-12 04:35:...
4c50808962d0000c1...,Accountant IV,7,MORTGAGE,59000.0,TX,787xx,USA,A,A4,Not Verified,241108.0,Individual,,,2024-01-12 04:35:...
47d978089877621c1...,,6,MORTGAGE,140000.0,CA,946xx,USA,A,A1,Not Verified,322500.0,Individual,,,2024-01-12 04:35:...
f9152e117555f8f6b...,Registered nurse,1,RENT,63000.0,PA,150xx,USA,D,D3,Source Verified,41898.0,Individual,,,2024-01-12 04:35:...
dfe7ad1db0b1f7b4f...,Business Analyst,10,MORTGAGE,136000.0,NJ,074xx,USA,C,C2,Source Verified,447623.0,Individual,,,2024-01-12 04:35:...
74b985ed89ec2ba85...,Firefighter Lieut...,10,MORTGAGE,90000.0,FL,321xx,USA,B,B1,Source Verified,176611.0,Individual,,,2024-01-12 04:35:...
6a8470e9c8f117da0...,Office Services S...,10,MORTGAGE,75000.0,MI,480xx,USA,D,D1,Source Verified,226136.0,Individual,,,2024-01-12 04:35:...
8a0f0716629843dff...,Banker,2,RENT,55000.0,FL,329xx,USA,E,E5,Source Verified,106574.0,Individual,,,2024-01-12 04:35:...
d5d3a37ac3f891def...,SWITCHBOARD OPERATOR,10,MORTGAGE,67000.0,MA,010xx,USA,A,A2,Source Verified,153320.0,Individual,,,2024-01-12 04:35:...


In [36]:
# Verify that no null emp_length values remain after the fill.
customers_emplength_replaced.filter("emp_length is null").count()

0

In [37]:
# Re-point the "customers" temp view at the DataFrame with emp_length filled.
customers_emplength_replaced.createOrReplaceTempView("customers")

In [38]:
# Inspect the distinct address_state values to spot entries that are not state codes.
spark.sql("select distinct(address_state) from customers")

address_state
Helping Kenya's D...
223xx
175 (total projec...
SC
AZ
"so Plan """"C"""" is ..."
I am 56 yrs. old ...
financially I mad...
but no one will l...
LA


In [39]:
# Count the address_state values longer than two characters.
spark.sql("select count(address_state) from customers where length(address_state)>2")

count(address_state)
254


In [40]:
from pyspark.sql.functions import when, col, length

In [41]:
# Replace any address_state longer than two characters with the placeholder "NA";
# every other value is passed through unchanged.
_state = col("address_state")
customers_state_cleaned = customers_emplength_replaced.withColumn(
    "address_state",
    when(length(_state) > 2, "NA").otherwise(_state),
)

In [42]:
# Preview the DataFrame after cleaning address_state.
customers_state_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,annual_income_joint,verification_status_joint,ingest_date
15961ebaba399446d...,University Southe...,10,RENT,55000.0,CA,900xx,USA,C,C2,Not Verified,,Individual,,,2024-01-12 04:35:...
a1840ed66533184ea...,AliceBlue,8,OWN,125000.0,VA,232xx,USA,A,A3,Verified,,Individual,,,2024-01-12 04:35:...
3a4ed694a8d63bfb2...,Green Sustainable...,2,RENT,42000.0,IL,605xx,USA,A,A5,Not Verified,,Individual,,,2024-01-12 04:35:...
715f9c1a9c5879f8a...,,6,OWN,30000.0,FL,322xx,USA,C,C2,Verified,,Individual,,,2024-01-12 04:35:...
decfb36a7c2a8f60f...,Front Row Sales &...,9,RENT,59500.0,MO,633xx,USA,B,B5,Not Verified,,Individual,,,2024-01-12 04:35:...
6dc86c0bdeea4e490...,Brittany Manor,10,OWN,24000.0,MI,486xx,USA,C,C1,Not Verified,,Individual,,,2024-01-12 04:35:...
d27d90e980a1f305d...,Student Business ...,7,RENT,35000.0,FL,326xx,USA,B,B4,Not Verified,,Individual,,,2024-01-12 04:35:...
8f05720a96ae6b437...,Orbital Science,2,MORTGAGE,85000.0,MD,209xx,USA,D,D5,Verified,,Individual,,,2024-01-12 04:35:...
8eabab81fbfabd3ae...,Aspenn Environmen...,5,RENT,33185.0,NJ,089xx,USA,B,B3,Not Verified,,Individual,,,2024-01-12 04:35:...
92b60ace6caf96e10...,San Diego Zoo,3,RENT,13000.0,CA,921xx,USA,D,D1,Not Verified,,Individual,,,2024-01-12 04:35:...


In [43]:
# Re-check the distinct address_state values after cleaning.
customers_state_cleaned.select("address_state").distinct()

address_state
SC
AZ
LA
MN
NJ
DC
OR
""
VA
""


In [45]:
# Persist the cleaned customers data as Parquet, replacing any previous output.
# NOTE(review): the output path has no leading "/", unlike the absolute input
# path used when reading the raw CSV, so it is resolved relative to the
# filesystem's working directory - confirm this is the intended location.
(
    customers_state_cleaned.write
    .mode("overwrite")
    .format("parquet")
    .option("path", "public/trendytech/datasets/lendingclubproject/cleaned/customers_parquet")
    .save()
)

In [46]:
# Persist the cleaned customers data as CSV with a header row, replacing any
# previous output.
# NOTE(review): the output path has no leading "/", unlike the absolute input
# path used when reading the raw CSV, so it is resolved relative to the
# filesystem's working directory - confirm this is the intended location.
(
    customers_state_cleaned.write
    .mode("overwrite")
    .format("csv")
    .option("header", True)
    .option("path", "public/trendytech/datasets/lendingclubproject/cleaned/customers_csv")
    .save()
)