In [2]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
from typing import Union, List
from sklearn.preprocessing import StandardScaler, MinMaxScaler # type: ignore
from sklearn.decomposition import PCA                   # type: ignore

import warnings

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
warnings.filterwarnings(action='ignore')

In [3]:
class Preprocessor:
    def __init__(self):
        self.file_path = ""
        self.folder_path = ""
        self.df = pd.DataFrame()
        self.pca_df = pd.DataFrame()

    def __init__(self, data_file_path:str="", folder_path:str=""):
        self.file_path = data_file_path
        self.folder_path = folder_path
        self.df = pd.DataFrame()
        self.pca_df = pd.DataFrame()
        
    def load_origin_file(self, file_path:str)->None:
        if file_path!="":
            self.file_path = file_path
        self.df = pd.read_csv(self.file_path)
        
    def drop_columns(self, drop_columns_file_path:str = "drop_columns.txt")->None:
        with open(drop_columns_file_path, mode='r') as f:
            drop_fields = f.readlines()
            drop_fields = [drop_field.strip('\n') for drop_field in drop_fields]
        self.df.drop(columns=drop_fields, inplace=True)
    
    def __preprocess_target_variable(self, target_variable:str="loan_status")->None:
        # loan_status가 "current", "issued", "policy" 인 행을 필터링하여 삭제
        modified_df = self.df[~self.df[target_variable].isin(['Current', 'Issued', 'Does not meet the credit policy. Status:Fully Paid', 'Does not meet the credit policy. Status:Charged Off'])]
        # risk = 1, safe = 0 으로 처리
        modified_df.loc[modified_df['loan_status'].isin(['Fully Paid', 'In Grace Period']), 'loan_status'] = 0
        modified_df.loc[modified_df['loan_status'].isin(['Charged Off', 'Default', 'Late (16-30 days)', 'Late (31-120 days)']), 'loan_status'] = 1
        modified_df['loan_status'] = modified_df['loan_status'].astype('int')
        self.df = modified_df
        
    ## 5. 데이터 처리용 함수
    def __delete_suffix(self, term:str)->int:
        '''첫 단어만을 저장하는 함수'''
        term = term.strip().split()[0]
        return int(term)

    def __delete_suffix_percentage(self, term:str)->float:
        '''%를 자르는 함수'''
        term = term.strip('%')
        return float(term)
    
    def __fill_na_with_value(self, columns:List[str], filling_value:Union[str, int])->None:
        '''
        df: dataframe to fill NA
        column_name : column name to change NA values
        filling_value : value type or just value to fill column's NA
        '''
        for column_name in columns:
            if filling_value=="mode":
                mode_value = self.df[column_name].mode()[0]
            elif filling_value=="median":
                mode_value = self.df[column_name].median()
            else:
                mode_value = filling_value
            self.df[column_name].fillna(mode_value, inplace=True)
        
    def __preprocessing_na(self, is_train:bool)->None:
        '''
        'acc_open_past_24mths', 확인필요
        'avg_cur_bal', 확인필요
        '''
        ## 결측 처리
        # 결측 개수가 1천 건 이하인 경우는 해당 데이터(row) 삭제
        self.df.dropna(subset=['chargeoff_within_12_mths','collections_12_mths_ex_med','dti',
                                                'pub_rec_bankruptcies','revol_util','tax_liens'], inplace=True)
        
        if is_train:
            # A1. 최빈값 대체
            self.__fill_na_with_value(columns=['mo_sin_old_il_acct', 'mths_since_recent_bc', 'mths_since_recent_inq', 'emp_length'], filling_value='mode')
            
            # A2. 중앙값 대체
            self.__fill_na_with_value(columns=['bc_open_to_buy'], filling_value='median')
        # B. 2015년 대체
        # is_after_2015 컬럼 생성. all_util 변수를 기준으로 사용
        self.df['is_after_2015'] = self.df['all_util'].apply(lambda x: 0 if pd.isnull(x) else 1)
        # 결측값을 0으로 채우기
        
        # C. 2012년 대체
        # is_after_2012 컬럼 생성. pct_tl_nvr_dlq 변수를 기준으로 사용
        self.df['is_after_2012'] = self.df['pct_tl_nvr_dlq'].apply(lambda x: 0 if pd.isnull(x) else 1)
        # D. 결측 0 대체
        if is_train:
            self.__fill_na_with_value(columns=['open_acc_6m','open_act_il', 'open_il_12m', 'open_il_24m', 'total_bal_il',
                                           'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_cu_tl', 'mths_since_rcnt_il',
                                           'tot_cur_bal', 'total_rev_hi_lim', 'mo_sin_old_rev_tl_op',
                                           'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'num_bc_sats', 'num_bc_tl',
                                           'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_il_tl',
                                           'num_op_rev_tl','num_rev_accts','num_rev_tl_bal_gt_0','num_sats','num_tl_120dpd_2m','num_tl_30dpd',
                                           'num_tl_90g_dpd_24m','num_tl_op_past_12m','pct_tl_nvr_dlq','tot_hi_cred_lim','total_bal_ex_mort',
                                           'total_bc_limit','total_il_high_credit_limit'], filling_value=0)
        self.__fill_na_with_value(columns=['annual_inc_joint','dti_joint', 'revol_bal_joint'], filling_value=0)
        
        
    def __convert_object_to_numeric(self, column_name:str)->pd.DataFrame:
        unique_values = sorted(self.df[column_name].unique())
        value_map = {value:index for index, value in enumerate(unique_values)}
        self.df[column_name] = self.df[column_name].apply(lambda x:value_map.get(x))
        return self.df
    
    def __convert_object_to_one_hot(self, column_name:str)->None:
        encoded = pd.get_dummies(self.df[column_name], drop_first=True, prefix='cat')
        self.df = pd.concat([self.df, encoded], axis=1)
        self.df.drop(column_name, axis=1, inplace=True)
        
    def __preprocessing_objects(self)->None:
        # term
        self.df['term'] = self.df['term'].apply(self.__delete_suffix)
        # emp_length
        self.df = self.df.dropna(subset=['emp_length'])
        self.df['emp_length'] = self.df['emp_length'].astype(str).apply(lambda x: x.replace(' years','').replace(' year','').replace('+','').replace('< 1', '0'))
        self.df['emp_length'] = self.df['emp_length'].astype(int)
        # revol_util
        self.df['revol_util'] = self.df['revol_util'].apply(self.__delete_suffix_percentage)
        self.df['int_rate'] = self.df['int_rate'].apply(self.__delete_suffix_percentage)
        ## numeric
        # application_type
        self.df = self.__convert_object_to_numeric('application_type')
        # sub_grade
        self.df = self.__convert_object_to_numeric('sub_grade')

        ## one-hot
        # home_ownership
        self.df['home_ownership'] = self.df['home_ownership'].replace(['ANY', 'OTHER', 'NONE'], 'OTHERS')
        self.__convert_object_to_one_hot('home_ownership')
        # self.__purpose()
        # purpose
        self.__convert_object_to_one_hot('purpose')
        self.__verification_status()

    def __Multicollinearity(self)->None:
        # self.df.drop(columns=['fico_range_low'], inplace=True)
        self.df['fico_avg'] = (self.df['fico_range_low'] + self.df['fico_range_high'])/2
        self.df.drop(columns=['fico_range_low', 'fico_range_high'], inplace=True)
    def __purpose(self):
        self.df['purpose'] = self.df['purpose'].map({"small_business":"essential_purpose", 
                                                     "house":"essential_purpose", 
                                                     "moving":"essential_purpose",
                                                     "medical":"essential_purpose",
                                                     "renewable_energy":"essential_purpose",
                                                     "other":"essential_purpose",
                                                     "wedding":"optional_purpose",
                                                     "vacation":"optional_purpose",
                                                     "car":"optional_purpose",
                                                     "home_improvement":"optional_purpose",
                                                     "educational":"optional_purpose",
                                                     "debt_consolidation":"debt_consolidation",
                                                     "credit_card":"credit_card",})
    def __verification_status(self):
        self.df['verification_status'] = self.df['verification_status'].map({"Not Verified":False, 'Source Verified':True, 'Verified':True})

    def __log_transform(self):
        variables = [
          "all_util", "annual_inc", "annual_inc_joint", "bc_open_to_buy", # "avg_cur_bal",
          "delinq_amnt", "dti", "max_bal_bc", "mo_sin_old_il_acct", "mo_sin_old_rev_tl_op",
          "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc", "mths_since_rcnt_il",
          "mths_since_recent_bc", "num_accts_ever_120_pd", "num_actv_bc_tl", "num_actv_rev_tl",
          "num_bc_sats", "num_bc_tl", "num_il_tl", "num_op_rev_tl", "num_rev_accts",
          "num_rev_tl_bal_gt_0", "num_sats", "open_acc", "open_acc_6m", "open_act_il",
          "open_il_12m", "open_il_24m", "open_rv_12m", "open_rv_24m", "pub_rec_bankruptcies",
          "revol_bal", "revol_bal_joint", "tax_liens", "tot_cur_bal", "tot_hi_cred_lim",
          "total_acc", "total_bal_ex_mort", "total_bal_il", "total_bc_limit", "total_cu_tl",
          "total_il_high_credit_limit", "total_rev_hi_lim"
        ]
        self.df = self.df.loc[~(self.df[variables]<0).any(axis=1)]
        # 각 변수에 대해 로그 변환 수행 및 기존 변수 삭제
        for var in variables:
            # 로그 변환 후 변수 이름에 '_log'를 추가하여 새로운 변수 생성
            new_var = var + "_log"
            # 해당 변수가 0보다 큰 경우에만 로그 변환 수행하여 음수 무한대를 방지
            # 로그 변환 후에는 기존 값이 0인 경우 음수 무한대로 처리되므로 이에 대한 처리도 필요
            # 여기서 로그는 자연로그 (밑이 e인 로그)
            self.df[new_var] = np.log(self.df[var] + 1)  # 0이 아닌 값이어야 하므로 +1 추가
            # 기존 변수 삭제
            self.df.drop(columns=[var], inplace=True)
        return self.df
    
    def preprocess(self, is_train:bool=True)->None:
        # loan_status 제외 모든 column이 결측치(na)인 행 제거 (1개 행 제거됨)
        self.df.dropna(subset=self.df.columns.difference(['loan_status']),how='all', inplace=True)
        self.__preprocess_target_variable()
        # 결측치 제거
        self.__preprocessing_na(is_train)
        ## object 처리하기
        self.__preprocessing_objects()
        ## 다중공선성 제거 - 0419 추가
        self.__Multicollinearity()
        ## 로그변환 - 0419 추가
        self.__log_transform()
        # index 재설정
        self.df.reset_index(drop=True, inplace=True)
        if not is_train:
            self.df.dropna(subset=self.df.columns.difference(['loan_status']),inplace=True)
        self.df.reset_index(drop=True, inplace=True)
        
    def do_scaling(self, scaler:StandardScaler|MinMaxScaler, fitted_scaler=None):
        all_features = self.df.columns
        nemeric_features = [feature for feature in all_features if 'cat' not in feature]
        numeric_df = self.df[nemeric_features].drop(columns=['loan_status'])
        if fitted_scaler==None:
            scaler = scaler()
            scaler.fit(numeric_df)
        else:
            scaler = fitted_scaler
        self.scaled_df = scaler.transform(numeric_df)
        return self.scaled_df, scaler
    
    def do_pca(self, threshold:float=0.95, fitted_pca=None, n_components=0):
        if fitted_pca==None:
            pca = PCA()
            pca.fit(self.scaled_df)
            explained_variance = pca.explained_variance_ratio_
            cumulative_explained_variance = np.cumsum(explained_variance)
            n_components = np.argmax(cumulative_explained_variance >= threshold) + 1
            column_names = [f'PC{i+1}' for i in range(n_components)]
            pca = PCA(n_components=n_components)
            pca.fit(self.scaled_df)
        else:
            pca = fitted_pca
            column_names = [f'PC{i+1}' for i in range(n_components)]
        pca_components = pca.transform(self.scaled_df).astype('float32')
        self.pca_df = pd.DataFrame(data=pca_components, columns=column_names)
        return pca, n_components
    
    def get_final_df(self, is_pca:bool):
        """PCA적용한데이터프레임"""
        all_features = self.df.columns
        cat_features = [feature for feature in all_features if 'cat' in feature]
        cat_df = self.df[cat_features].reset_index(drop=True)
        target_df = self.df[['loan_status']].reset_index(drop=True)
        if is_pca:
            pca_df = self.pca_df.reset_index(drop=True)
        else:
            nemeric_features = [feature for feature in all_features if 'cat' not in feature]
            pca_df = self.df[nemeric_features].drop(columns=['loan_status'])
        self.df = pd.concat([cat_df, pca_df, target_df], axis=1)
        return self.df
    
    def get_df(self)->pd.DataFrame:
        return self.df

In [8]:
train_preprocessor = Preprocessor()
# lending_club_2020_train.csv 파일이 있는 절대 경로 혹은 상대 경로를 명시해주세요
train_preprocessor.load_origin_file(file_path="lending_club_2020_train.csv")
# drop_columns_0410.txt 파일의 위치를 명시해주세요
train_preprocessor.drop_columns(drop_columns_file_path='drop_columns.txt')
# preprocess를 돌리면, addr_state를 제외한 object field 및, na(결측치) 처리됩니다.
train_preprocessor.preprocess(is_train=True)
train_df = train_preprocessor.get_df()
_, train_scaler = train_preprocessor.do_scaling(StandardScaler)
# train_pca, train_n_components = train_preprocessor.do_pca()
scaled_train_df = train_preprocessor.get_final_df(is_pca=False)

In [9]:
print(scaled_train_df.shape)

(1131682, 86)


In [10]:
scaled_train_df.head()

Unnamed: 0,verification_status,application_type,cat_OTHERS,cat_OWN,cat_RENT,cat_credit_card,cat_debt_consolidation,cat_educational,cat_home_improvement,cat_house,cat_major_purchase,cat_medical,cat_moving,cat_other,cat_renewable_energy,cat_small_business,cat_vacation,cat_wedding,loan_amnt,term,int_rate,installment,sub_grade,emp_length,delinq_2yrs,inq_last_6mths,pub_rec,revol_util,collections_12_mths_ex_med,dti_joint,acc_now_delinq,chargeoff_within_12_mths,mths_since_recent_inq,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,is_after_2015,is_after_2012,fico_avg,all_util_log,annual_inc_log,annual_inc_joint_log,bc_open_to_buy_log,delinq_amnt_log,dti_log,max_bal_bc_log,mo_sin_old_il_acct_log,mo_sin_old_rev_tl_op_log,mo_sin_rcnt_rev_tl_op_log,mo_sin_rcnt_tl_log,mort_acc_log,mths_since_rcnt_il_log,mths_since_recent_bc_log,num_accts_ever_120_pd_log,num_actv_bc_tl_log,num_actv_rev_tl_log,num_bc_sats_log,num_bc_tl_log,num_il_tl_log,num_op_rev_tl_log,num_rev_accts_log,num_rev_tl_bal_gt_0_log,num_sats_log,open_acc_log,open_acc_6m_log,open_act_il_log,open_il_12m_log,open_il_24m_log,open_rv_12m_log,open_rv_24m_log,pub_rec_bankruptcies_log,revol_bal_log,revol_bal_joint_log,tax_liens_log,tot_cur_bal_log,tot_hi_cred_lim_log,total_acc_log,total_bal_ex_mort_log,total_bal_il_log,total_bc_limit_log,total_cu_tl_log,total_il_high_credit_limit_log,total_rev_hi_lim_log,loan_status
0,True,1,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,6000.0,36,7.97,187.94,4,2,1.0,0.0,0.0,14.0,0.0,10.2,0.0,0.0,3.0,0.0,0.0,0.0,4.0,97.1,1,1,757.0,4.174387,10.71444,11.517923,9.385218,0.0,2.269028,8.03625,4.997212,4.912655,2.564949,1.386294,0.693147,1.386294,4.043051,0.0,0.693147,0.693147,0.693147,1.609438,3.367296,1.609438,2.079442,0.693147,2.70805,2.70805,0.693147,2.397895,1.386294,1.386294,0.693147,0.693147,0.0,8.03625,8.275631,0.0,10.820878,11.27228,3.610918,10.820878,10.757158,9.615872,0.0,10.942208,10.003378,0
1,True,0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,23200.0,60,24.99,680.82,23,10,1.0,1.0,0.0,55.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,94.8,1,1,672.0,4.454347,11.608245,0.0,8.702012,0.0,3.575151,8.455105,4.844187,5.627621,1.94591,1.94591,1.098612,2.397895,2.079442,0.0,2.197225,2.397895,2.197225,2.70805,3.610918,2.564949,3.135494,2.397895,3.218876,3.218876,0.693147,2.397895,0.693147,1.098612,1.609438,1.94591,0.0,9.73566,0.0,0.0,13.315176,13.386134,4.110874,12.592905,12.533767,9.918425,1.386294,12.582679,10.31745,1
2,False,0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,16000.0,36,7.07,494.55,1,10,0.0,0.0,0.0,5.5,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,1.0,100.0,1,1,822.0,3.332205,11.082158,0.0,10.91908,0.0,2.93066,7.85632,4.905275,5.993961,1.94591,1.94591,1.386294,2.772589,1.94591,0.0,1.94591,1.94591,2.197225,2.302585,2.079442,2.772589,2.944439,1.94591,2.944439,2.944439,0.693147,1.098612,0.0,1.098612,0.693147,1.098612,0.0,8.264106,0.0,0.0,11.522113,12.186584,3.367296,10.390594,10.263641,10.987003,0.0,10.80649,11.157692,1
3,False,0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,4500.0,36,10.42,146.1,7,5,0.0,0.0,2.0,49.4,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,1.0,77.8,1,1,702.0,3.637586,10.819798,0.0,7.814803,0.0,2.078191,8.656433,5.42495,4.875197,2.70805,1.791759,0.693147,5.42495,2.70805,0.0,1.386294,1.386294,1.386294,1.609438,1.098612,1.609438,1.94591,1.386294,2.079442,2.079442,0.693147,1.098612,0.0,0.0,0.0,1.098612,1.098612,8.828348,0.0,0.0,12.178635,12.263577,2.302585,9.166806,7.919356,9.137877,0.0,9.390242,9.532496,0
4,True,0,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,20000.0,36,9.99,645.25,7,10,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,2.0,77.5,0,1,677.0,0.0,11.002117,0.0,6.890609,0.0,3.399195,0.0,5.010635,5.252273,2.639057,2.302585,0.693147,0.0,3.044522,1.791759,1.791759,2.397895,1.791759,2.484907,2.639057,2.70805,3.295837,2.397895,2.995732,2.995732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.718783,0.0,0.0,11.83159,12.137612,3.713572,11.101704,0.0,9.305741,0.0,11.199461,10.229224,0


In [11]:
train_df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,sub_grade,emp_length,verification_status,loan_status,delinq_2yrs,inq_last_6mths,pub_rec,revol_util,collections_12_mths_ex_med,application_type,dti_joint,acc_now_delinq,chargeoff_within_12_mths,mths_since_recent_inq,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,is_after_2015,is_after_2012,cat_OTHERS,cat_OWN,cat_RENT,cat_credit_card,cat_debt_consolidation,cat_educational,cat_home_improvement,cat_house,cat_major_purchase,cat_medical,cat_moving,cat_other,cat_renewable_energy,cat_small_business,cat_vacation,cat_wedding,fico_avg,all_util_log,annual_inc_log,annual_inc_joint_log,bc_open_to_buy_log,delinq_amnt_log,dti_log,max_bal_bc_log,mo_sin_old_il_acct_log,mo_sin_old_rev_tl_op_log,mo_sin_rcnt_rev_tl_op_log,mo_sin_rcnt_tl_log,mort_acc_log,mths_since_rcnt_il_log,mths_since_recent_bc_log,num_accts_ever_120_pd_log,num_actv_bc_tl_log,num_actv_rev_tl_log,num_bc_sats_log,num_bc_tl_log,num_il_tl_log,num_op_rev_tl_log,num_rev_accts_log,num_rev_tl_bal_gt_0_log,num_sats_log,open_acc_log,open_acc_6m_log,open_act_il_log,open_il_12m_log,open_il_24m_log,open_rv_12m_log,open_rv_24m_log,pub_rec_bankruptcies_log,revol_bal_log,revol_bal_joint_log,tax_liens_log,tot_cur_bal_log,tot_hi_cred_lim_log,total_acc_log,total_bal_ex_mort_log,total_bal_il_log,total_bc_limit_log,total_cu_tl_log,total_il_high_credit_limit_log,total_rev_hi_lim_log
0,6000.0,36,7.97,187.94,4,2,True,0,1.0,0.0,0.0,14.0,0.0,1,10.2,0.0,0.0,3.0,0.0,0.0,0.0,4.0,97.1,1,1,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,757.0,4.174387,10.71444,11.517923,9.385218,0.0,2.269028,8.03625,4.997212,4.912655,2.564949,1.386294,0.693147,1.386294,4.043051,0.0,0.693147,0.693147,0.693147,1.609438,3.367296,1.609438,2.079442,0.693147,2.70805,2.70805,0.693147,2.397895,1.386294,1.386294,0.693147,0.693147,0.0,8.03625,8.275631,0.0,10.820878,11.27228,3.610918,10.820878,10.757158,9.615872,0.0,10.942208,10.003378
1,23200.0,60,24.99,680.82,23,10,True,1,1.0,1.0,0.0,55.9,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,94.8,1,1,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,672.0,4.454347,11.608245,0.0,8.702012,0.0,3.575151,8.455105,4.844187,5.627621,1.94591,1.94591,1.098612,2.397895,2.079442,0.0,2.197225,2.397895,2.197225,2.70805,3.610918,2.564949,3.135494,2.397895,3.218876,3.218876,0.693147,2.397895,0.693147,1.098612,1.609438,1.94591,0.0,9.73566,0.0,0.0,13.315176,13.386134,4.110874,12.592905,12.533767,9.918425,1.386294,12.582679,10.31745
2,16000.0,36,7.07,494.55,1,10,False,1,0.0,0.0,0.0,5.5,0.0,0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,1.0,100.0,1,1,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,822.0,3.332205,11.082158,0.0,10.91908,0.0,2.93066,7.85632,4.905275,5.993961,1.94591,1.94591,1.386294,2.772589,1.94591,0.0,1.94591,1.94591,2.197225,2.302585,2.079442,2.772589,2.944439,1.94591,2.944439,2.944439,0.693147,1.098612,0.0,1.098612,0.693147,1.098612,0.0,8.264106,0.0,0.0,11.522113,12.186584,3.367296,10.390594,10.263641,10.987003,0.0,10.80649,11.157692
3,4500.0,36,10.42,146.1,7,5,False,0,0.0,0.0,2.0,49.4,0.0,0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,1.0,77.8,1,1,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,702.0,3.637586,10.819798,0.0,7.814803,0.0,2.078191,8.656433,5.42495,4.875197,2.70805,1.791759,0.693147,5.42495,2.70805,0.0,1.386294,1.386294,1.386294,1.609438,1.098612,1.609438,1.94591,1.386294,2.079442,2.079442,0.693147,1.098612,0.0,0.0,0.0,1.098612,1.098612,8.828348,0.0,0.0,12.178635,12.263577,2.302585,9.166806,7.919356,9.137877,0.0,9.390242,9.532496
4,20000.0,36,9.99,645.25,7,10,True,0,0.0,0.0,0.0,60.0,0.0,0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,2.0,77.5,0,1,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,677.0,0.0,11.002117,0.0,6.890609,0.0,3.399195,0.0,5.010635,5.252273,2.639057,2.302585,0.693147,0.0,3.044522,1.791759,1.791759,2.397895,1.791759,2.484907,2.639057,2.70805,3.295837,2.397895,2.995732,2.995732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.718783,0.0,0.0,11.83159,12.137612,3.713572,11.101704,0.0,9.305741,0.0,11.199461,10.229224


In [12]:
scaled_train_df.to_csv("scaled_train.csv", index=False)
train_df.to_csv("train.csv", index=False)

In [8]:
test_preprocessor = Preprocessor()
# lending_club_2020_train.csv 파일이 있는 절대 경로 혹은 상대 경로를 명시해주세요
test_preprocessor.load_origin_file(file_path="../data/lending_club_2020_test.csv")
# drop_columns_0410.txt 파일의 위치를 명시해주세요
test_preprocessor.drop_columns(drop_columns_file_path='drop_columns.txt')
# preprocess를 돌리면, addr_state를 제외한 object field 및, na(결측치) 처리됩니다.
test_preprocessor.preprocess(is_train=False)
test_df = test_preprocessor.get_df()
# test_df['total_loan_amnt'] = test_df['loan_amnt']*test_df['int_rate']
test_preprocessor.do_scaling(scaler=StandardScaler, fitted_scaler=train_scaler)
scaled_test_df = test_preprocessor.get_final_df(is_pca=False)

In [9]:
test_df.to_csv("test.csv", index=False)
scaled_test_df.to_csv("scaled_test.csv", index=False)

In [10]:
test_df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,sub_grade,emp_length,verification_status,loan_status,delinq_2yrs,inq_last_6mths,pub_rec,revol_util,collections_12_mths_ex_med,application_type,dti_joint,acc_now_delinq,chargeoff_within_12_mths,mths_since_recent_inq,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,is_after_2015,is_after_2012,cat_OTHERS,cat_OWN,cat_RENT,cat_credit_card,cat_debt_consolidation,cat_educational,cat_home_improvement,cat_house,cat_major_purchase,cat_medical,cat_moving,cat_other,cat_renewable_energy,cat_small_business,cat_vacation,cat_wedding,fico_avg,all_util_log,annual_inc_log,annual_inc_joint_log,bc_open_to_buy_log,delinq_amnt_log,dti_log,max_bal_bc_log,mo_sin_old_il_acct_log,mo_sin_old_rev_tl_op_log,mo_sin_rcnt_rev_tl_op_log,mo_sin_rcnt_tl_log,mort_acc_log,mths_since_rcnt_il_log,mths_since_recent_bc_log,num_accts_ever_120_pd_log,num_actv_bc_tl_log,num_actv_rev_tl_log,num_bc_sats_log,num_bc_tl_log,num_il_tl_log,num_op_rev_tl_log,num_rev_accts_log,num_rev_tl_bal_gt_0_log,num_sats_log,open_acc_log,open_acc_6m_log,open_act_il_log,open_il_12m_log,open_il_24m_log,open_rv_12m_log,open_rv_24m_log,pub_rec_bankruptcies_log,revol_bal_log,revol_bal_joint_log,tax_liens_log,tot_cur_bal_log,tot_hi_cred_lim_log,total_acc_log,total_bal_ex_mort_log,total_bal_il_log,total_bc_limit_log,total_cu_tl_log,total_il_high_credit_limit_log,total_rev_hi_lim_log
0,19125.0,36,8.81,606.49,4,0,False,0,0.0,0.0,0.0,72.5,0.0,0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,2.0,100.0,1,1,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,702.0,4.110874,11.082158,0.0,8.685754,0.0,2.675527,8.982184,4.26268,4.736198,3.78419,2.197225,1.098612,3.332205,3.78419,0.0,1.386294,1.386294,1.386294,1.386294,1.791759,1.386294,1.609438,1.386294,1.791759,1.791759,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,9.654,0.0,0.0,12.028106,12.108272,2.484907,9.840495,8.069655,9.975855,1.098612,9.21044,9.975855
1,15000.0,36,5.32,451.73,0,1,False,0,0.0,0.0,0.0,12.6,0.0,0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,3.0,95.5,1,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,752.0,3.73767,11.225257,0.0,9.534017,0.0,2.033398,7.699389,4.795791,4.859812,1.386294,1.386294,1.386294,2.772589,1.386294,0.693147,1.386294,1.386294,1.386294,2.397895,1.386294,1.609438,2.70805,1.386294,1.94591,1.94591,1.386294,0.693147,0.0,0.693147,1.098612,1.098612,0.0,8.183118,0.0,0.0,12.291883,12.41218,3.135494,9.871171,9.666815,9.764283,0.0,9.8457,10.25418
2,30000.0,36,15.99,1054.57,14,2,True,0,0.0,0.0,1.0,44.2,0.0,1,19.22,0.0,0.0,2.0,0.0,0.0,0.0,8.0,84.4,1,1,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,662.0,4.521789,11.176067,11.681157,8.630879,0.0,3.433019,8.104099,4.59512,4.812184,2.484907,2.079442,1.386294,2.079442,2.484907,0.693147,1.386294,1.609438,1.386294,1.94591,2.995732,1.791759,2.397895,1.609438,2.995732,2.995732,0.0,2.639057,1.386294,1.791759,1.609438,1.609438,0.0,8.799812,0.0,0.0,12.93943,12.976732,3.496508,11.711834,11.655943,9.259226,0.693147,11.684734,9.615872
3,6000.0,36,13.58,203.85,11,5,True,0,1.0,0.0,0.0,34.8,0.0,0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,4.0,93.8,1,1,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,672.0,3.931826,11.225257,0.0,9.87241,0.0,3.047376,8.064322,4.70953,4.189655,2.397895,2.397895,0.693147,2.397895,2.397895,0.0,1.791759,2.079442,1.791759,1.94591,1.386294,2.397895,2.564949,2.079442,2.639057,2.639057,0.0,0.693147,1.098612,1.098612,1.098612,1.609438,0.0,9.637371,0.0,0.0,12.327591,12.488707,2.833213,10.395283,9.763018,10.335302,0.0,9.998843,10.691968
4,7500.0,36,10.56,243.99,7,6,True,0,0.0,1.0,0.0,20.0,0.0,0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,100.0,1,1,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,767.0,4.077537,10.947239,0.0,9.744081,0.0,2.758109,7.718241,3.526361,4.770685,1.098612,1.098612,1.098612,1.609438,3.465736,0.0,0.693147,1.386294,1.098612,1.098612,2.079442,2.079442,2.079442,1.386294,2.70805,2.70805,1.098612,1.94591,1.386294,1.94591,0.693147,1.098612,0.0,8.64365,0.0,0.0,11.871746,12.094878,2.833213,11.117138,11.029099,9.867912,0.0,11.053316,10.250652


In [11]:
scaled_test_df.head()

Unnamed: 0,verification_status,application_type,cat_OTHERS,cat_OWN,cat_RENT,cat_credit_card,cat_debt_consolidation,cat_educational,cat_home_improvement,cat_house,cat_major_purchase,cat_medical,cat_moving,cat_other,cat_renewable_energy,cat_small_business,cat_vacation,cat_wedding,loan_amnt,term,int_rate,installment,sub_grade,emp_length,delinq_2yrs,inq_last_6mths,pub_rec,revol_util,collections_12_mths_ex_med,dti_joint,acc_now_delinq,chargeoff_within_12_mths,mths_since_recent_inq,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,is_after_2015,is_after_2012,fico_avg,all_util_log,annual_inc_log,annual_inc_joint_log,bc_open_to_buy_log,delinq_amnt_log,dti_log,max_bal_bc_log,mo_sin_old_il_acct_log,mo_sin_old_rev_tl_op_log,mo_sin_rcnt_rev_tl_op_log,mo_sin_rcnt_tl_log,mort_acc_log,mths_since_rcnt_il_log,mths_since_recent_bc_log,num_accts_ever_120_pd_log,num_actv_bc_tl_log,num_actv_rev_tl_log,num_bc_sats_log,num_bc_tl_log,num_il_tl_log,num_op_rev_tl_log,num_rev_accts_log,num_rev_tl_bal_gt_0_log,num_sats_log,open_acc_log,open_acc_6m_log,open_act_il_log,open_il_12m_log,open_il_24m_log,open_rv_12m_log,open_rv_24m_log,pub_rec_bankruptcies_log,revol_bal_log,revol_bal_joint_log,tax_liens_log,tot_cur_bal_log,tot_hi_cred_lim_log,total_acc_log,total_bal_ex_mort_log,total_bal_il_log,total_bc_limit_log,total_cu_tl_log,total_il_high_credit_limit_log,total_rev_hi_lim_log,loan_status
0,False,0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,19125.0,36,8.81,606.49,4,0,0.0,0.0,0.0,72.5,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,2.0,100.0,1,1,702.0,4.110874,11.082158,0.0,8.685754,0.0,2.675527,8.982184,4.26268,4.736198,3.78419,2.197225,1.098612,3.332205,3.78419,0.0,1.386294,1.386294,1.386294,1.386294,1.791759,1.386294,1.609438,1.386294,1.791759,1.791759,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,9.654,0.0,0.0,12.028106,12.108272,2.484907,9.840495,8.069655,9.975855,1.098612,9.21044,9.975855,0
1,False,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,15000.0,36,5.32,451.73,0,1,0.0,0.0,0.0,12.6,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,3.0,95.5,1,1,752.0,3.73767,11.225257,0.0,9.534017,0.0,2.033398,7.699389,4.795791,4.859812,1.386294,1.386294,1.386294,2.772589,1.386294,0.693147,1.386294,1.386294,1.386294,2.397895,1.386294,1.609438,2.70805,1.386294,1.94591,1.94591,1.386294,0.693147,0.0,0.693147,1.098612,1.098612,0.0,8.183118,0.0,0.0,12.291883,12.41218,3.135494,9.871171,9.666815,9.764283,0.0,9.8457,10.25418,0
2,True,1,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,30000.0,36,15.99,1054.57,14,2,0.0,0.0,1.0,44.2,0.0,19.22,0.0,0.0,2.0,0.0,0.0,0.0,8.0,84.4,1,1,662.0,4.521789,11.176067,11.681157,8.630879,0.0,3.433019,8.104099,4.59512,4.812184,2.484907,2.079442,1.386294,2.079442,2.484907,0.693147,1.386294,1.609438,1.386294,1.94591,2.995732,1.791759,2.397895,1.609438,2.995732,2.995732,0.0,2.639057,1.386294,1.791759,1.609438,1.609438,0.0,8.799812,0.0,0.0,12.93943,12.976732,3.496508,11.711834,11.655943,9.259226,0.693147,11.684734,9.615872,0
3,True,0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,6000.0,36,13.58,203.85,11,5,1.0,0.0,0.0,34.8,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,4.0,93.8,1,1,672.0,3.931826,11.225257,0.0,9.87241,0.0,3.047376,8.064322,4.70953,4.189655,2.397895,2.397895,0.693147,2.397895,2.397895,0.0,1.791759,2.079442,1.791759,1.94591,1.386294,2.397895,2.564949,2.079442,2.639057,2.639057,0.0,0.693147,1.098612,1.098612,1.098612,1.609438,0.0,9.637371,0.0,0.0,12.327591,12.488707,2.833213,10.395283,9.763018,10.335302,0.0,9.998843,10.691968,0
4,True,0,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,7500.0,36,10.56,243.99,7,6,0.0,1.0,0.0,20.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,100.0,1,1,767.0,4.077537,10.947239,0.0,9.744081,0.0,2.758109,7.718241,3.526361,4.770685,1.098612,1.098612,1.098612,1.609438,3.465736,0.0,0.693147,1.386294,1.098612,1.098612,2.079442,2.079442,2.079442,1.386294,2.70805,2.70805,1.098612,1.94591,1.386294,1.94591,0.693147,1.098612,0.0,8.64365,0.0,0.0,11.871746,12.094878,2.833213,11.117138,11.029099,9.867912,0.0,11.053316,10.250652,0


In [12]:
train_df.shape, scaled_train_df.shape, test_df.shape, scaled_test_df.shape

((1131682, 86), (1131682, 86), (320678, 86), (320678, 86))

In [19]:
origin_train = pd.read_csv("../data/lending_club_2020_train.csv")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
origin_train['recovery_rate'] = (origin_train['total_rec_prncp']+origin_train['recoveries'])/origin_train['loan_amnt']
origin_train['recovery_rate']