In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [36]:
# Show every column when a DataFrame is displayed (no "..." truncation).
pd.set_option("display.max_columns", None)

In [37]:
# Load the cleaned dataset; the first spreadsheet column becomes the index.
df = pd.read_excel(
    io="../data/data_cleaned.xlsx",
    index_col=0,
)

In [38]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,title,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address
0,10000,36 months,11.44,329.48,B,B4,marketing,10+ years,RENT,117000.0,Not Verified,2015-01-01,Fully Paid,vacation,vacation,26.24,1990-06-01,16,0,36369,41.8,25,w,INDIVIDUAL,0,0,"0174 michelle gateway\nmendozaberg, ok 22690"
1,8000,36 months,11.99,265.68,B,B5,credit analyst,4 years,MORTGAGE,65000.0,Not Verified,2015-01-01,Fully Paid,debt_consolidation,debt consolidation,22.05,2004-07-01,17,0,20131,53.3,27,f,INDIVIDUAL,3,0,"1076 carney fort apt. 347\nloganmouth, sd 05113"
2,15600,36 months,10.49,506.97,B,B3,statistician,< 1 year,RENT,43057.0,Source Verified,2015-01-01,Fully Paid,credit_card,credit card refinancing,12.79,2007-08-01,13,0,11987,92.2,26,f,INDIVIDUAL,0,0,"87025 mark dale apt. 269\nnew sabrina, wv 05113"
3,7200,36 months,6.49,220.65,A,A2,client advocate,6 years,RENT,54000.0,Not Verified,2014-11-01,Fully Paid,credit_card,credit card refinancing,2.6,2006-09-01,6,0,5472,21.5,13,f,INDIVIDUAL,0,0,"823 reid ford\ndelacruzside, ma 00813"
4,24375,60 months,17.27,609.33,C,C5,destiny management inc.,9 years,MORTGAGE,55000.0,Verified,2013-04-01,Charged Off,credit_card,credit card refinance,33.95,1999-03-01,13,0,24584,69.8,43,f,INDIVIDUAL,1,0,"679 luna roads\ngreggshire, va 11650"


In [39]:
# Print column dtypes and non-null counts before any feature engineering.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 314084 entries, 0 to 396029
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   loan_amnt             314084 non-null  int64         
 1   term                  314084 non-null  object        
 2   int_rate              314084 non-null  float64       
 3   installment           314084 non-null  float64       
 4   grade                 314084 non-null  object        
 5   sub_grade             314084 non-null  object        
 6   emp_title             314084 non-null  object        
 7   emp_length            314084 non-null  object        
 8   home_ownership        314084 non-null  object        
 9   annual_inc            314084 non-null  float64       
 10  verification_status   314084 non-null  object        
 11  issue_d               314084 non-null  datetime64[ns]
 12  loan_status           314084 non-null  object        
 13  purp

In [40]:
# Split `issue_d` into month and year features, and reduce
# `earliest_cr_line` to its year (this overwrites the datetime column).
issue_dates = df["issue_d"].dt
df = df.assign(
    issue_month=issue_dates.month,
    issue_year=issue_dates.year,
    earliest_cr_line=df["earliest_cr_line"].dt.year,
)

In [41]:
# Remove the raw `issue_d` column from the frame.
df = df.drop(columns=["issue_d"])

In [42]:
# Convert the loan term label to an integer number of months.
# The lookup keys are written with a leading space; matching on stripped
# labels keeps the mapping working if the padding in the source differs.
term_values = {' 36 months': 36, ' 60 months': 60}
term_lookup = {label.strip(): months for label, months in term_values.items()}
term_months = df['term'].str.strip().map(term_lookup)

# Series.map yields NaN for labels missing from the lookup; fail loudly here
# rather than carrying NaNs into the saved feature file.
unmapped_terms = df.loc[term_months.isna(), 'term'].unique()
if len(unmapped_terms):
    raise ValueError(f"Unexpected term values: {unmapped_terms.tolist()}")

df['term'] = term_months.astype('int64')

In [43]:
# Inspect the distinct values of initial_list_status before encoding them.
df['initial_list_status'].unique()

array(['w', 'f'], dtype=object)

In [44]:
# Binary-encode the listing status: 'w' -> 0, 'f' -> 1.
list_status = {'w': 0, 'f': 1}
encoded_status = df['initial_list_status'].map(list_status)
df['initial_list_status'] = encoded_status

In [45]:
# One-hot encode the listed categorical columns. drop_first=True omits the
# first level of each feature, so k categories become k-1 indicator columns.
dummies = [
    'grade',
    'verification_status',
    'home_ownership',
    "application_type",
]
df = pd.get_dummies(data=df, columns=dummies, drop_first=True)

In [46]:
# Re-check the schema now that the categorical columns have been expanded.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 314084 entries, 0 to 396029
Data columns (total 39 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   loan_amnt                            314084 non-null  int64  
 1   term                                 314084 non-null  int64  
 2   int_rate                             314084 non-null  float64
 3   installment                          314084 non-null  float64
 4   sub_grade                            314084 non-null  object 
 5   emp_title                            314084 non-null  object 
 6   emp_length                           314084 non-null  object 
 7   annual_inc                           314084 non-null  float64
 8   loan_status                          314084 non-null  object 
 9   purpose                              314084 non-null  object 
 10  title                                314084 non-null  object 
 11  dti               

In [47]:
# The zip code is the last five characters of the address string
# (e.g. "...\nmendozaberg, ok 22690" -> "22690"). Slicing from position 8
# onward would keep most of the street/city text and yield a near-unique
# value per row, so take the trailing five characters instead.
df['zip_code'] = df['address'].str[-5:]

In [48]:
# Remove the free-text `address` column from the frame.
df = df.drop(columns=["address"])

In [49]:
# Encode the target variable: 'Fully Paid' -> 0, 'Charged Off' -> 1.
status_codes = {'Fully Paid': 0, 'Charged Off': 1}
df['loan_status'] = df['loan_status'].map(status_codes)

In [50]:
# Frequency of each raw emp_length label, to see which labels need parsing.
df["emp_length"].value_counts()

emp_length
10+ years    100408
2 years       30432
< 1 year      26856
3 years       26824
5 years       22632
1 year        22042
4 years       20298
6 years       17851
7 years       17679
8 years       16168
9 years       12894
Name: count, dtype: int64

In [51]:
# Keep only the leading token of each label ("10+ years" -> "10+",
# "< 1 year" -> "<"), then turn the two non-numeric tokens into integers.
# The remaining tokens are still digit strings after this cell.
def _first_token(label):
    return label.split()[0]


emp_tokens = df["emp_length"].map(_first_token)
df["emp_length"] = emp_tokens.replace({"<": 0, "10+": 10})

In [52]:
# Confirm the parsed emp_length values and that the counts are unchanged.
df["emp_length"].value_counts()

emp_length
10    100408
2      30432
0      26856
3      26824
5      22632
1      22042
4      20298
6      17851
7      17679
8      16168
9      12894
Name: count, dtype: int64

In [53]:
# Cast every boolean column to integer in a single astype call.
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype({name: int for name in bool_columns})

In [54]:
# Cast emp_length to an integer dtype.
df = df.astype({"emp_length": int})

In [55]:
# Check dtypes after the integer casts; columns still listed as `object`
# are the ones that remain un-encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 314084 entries, 0 to 396029
Data columns (total 39 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   loan_amnt                            314084 non-null  int64  
 1   term                                 314084 non-null  int64  
 2   int_rate                             314084 non-null  float64
 3   installment                          314084 non-null  float64
 4   sub_grade                            314084 non-null  object 
 5   emp_title                            314084 non-null  object 
 6   emp_length                           314084 non-null  int64  
 7   annual_inc                           314084 non-null  float64
 8   loan_status                          314084 non-null  int64  
 9   purpose                              314084 non-null  object 
 10  title                                314084 non-null  object 
 11  dti               

In [56]:
# Mean-target encoding: replace every remaining object column by the mean of
# loan_status within each of its categories.
# NOTE(review): the means are computed over the whole frame, target included,
# so rare categories encode (almost) their own label. If this file feeds a
# model, the encoding should be fitted on the training split only -- TODO confirm.
text_columns = df.select_dtypes(include='object').columns
for name in text_columns:
    category_rate = df.groupby(name)["loan_status"].transform("mean")
    df[name] = category_rate

In [57]:
# Final schema check: no `object` columns should remain after the encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 314084 entries, 0 to 396029
Data columns (total 39 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   loan_amnt                            314084 non-null  int64  
 1   term                                 314084 non-null  int64  
 2   int_rate                             314084 non-null  float64
 3   installment                          314084 non-null  float64
 4   sub_grade                            314084 non-null  float64
 5   emp_title                            314084 non-null  float64
 6   emp_length                           314084 non-null  int64  
 7   annual_inc                           314084 non-null  float64
 8   loan_status                          314084 non-null  int64  
 9   purpose                              314084 non-null  float64
 10  title                                314084 non-null  float64
 11  dti               

In [58]:
# (rows, columns) of the engineered frame.
df.shape

(314084, 39)

In [59]:
# Write the engineered frame (index included) to disk.
df.to_excel(excel_writer="../data/data_fe.xlsx")