In [2]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings(action='ignore')

In [7]:
from typing import Union, List
'''
TODO
1) addr_state 변수 설정해야한다
2) 결측치에서 'acc_open_past_24mths', 'avg_cur_bal': 확인필요
'''
class Preprocessor:
    def __init__(self):
        self.file_path = ""
        self.folder_path = ""
        self.df = pd.DataFrame()
    
    def __init__(self, data_file_path:str="", folder_path:str=""):
        self.file_path = data_file_path
        self.folder_path = folder_path
        self.df = pd.DataFrame()
        
    def load_origin_file(self, file_path:str)->None:
        if file_path!="":
            self.file_path = file_path
        self.df = pd.read_csv(self.file_path)
        
    def drop_columns(self, drop_columns_file_path:str = "drop_columns_0410.txt")->None:
        with open(drop_columns_file_path, mode='r') as f:
            drop_fields = f.readlines()
            drop_fields = [drop_field.strip('\n') for drop_field in drop_fields]
        self.df.drop(columns=drop_fields, inplace=True)
    
    def __preprocess_target_variable(self, target_variable:str="loan_status")->None:
        # loan_status가 "current", "issued", "policy" 인 행을 필터링하여 삭제
        modified_df = self.df[~self.df[target_variable].isin(['Current', 'Issued', 'Does not meet the credit policy. Status:Fully Paid', 'Does not meet the credit policy. Status:Charged Off'])]
        # risk = 1, safe = 0 으로 처리
        modified_df.loc[modified_df['loan_status'].isin(['Fully Paid', 'In Grace Period']), 'loan_status'] = 0
        modified_df.loc[modified_df['loan_status'].isin(['Charged Off', 'Default', 'Late (16-30 days)', 'Late (31-120 days)']), 'loan_status'] = 1
        modified_df['loan_status'] = modified_df['loan_status'].astype('int')
        self.df = modified_df
        
    ## 5. 데이터 처리용 함수
    def __delete_suffix(self, term:str)->int:
        '''첫 단어만을 저장하는 함수'''
        term = term.strip().split()[0]
        return int(term)

    def __delete_suffix_percentage(self, term:str)->float:
        '''%를 자르는 함수'''
        term = term.strip('%')
        return float(term)
    
    def __fill_na_with_value(self, columns:List[str], filling_value:Union[str, int])->None:
        '''
        df: dataframe to fill NA
        column_name : column name to change NA values
        filling_value : value type or just value to fill column's NA
        '''
        for column_name in columns:
            if filling_value=="mode":
                mode_value = self.df[column_name].mode()[0]
            elif filling_value=="median":
                mode_value = self.df[column_name].median()
            else:
                mode_value = filling_value
            self.df[column_name].fillna(mode_value, inplace=True)
        
    def __preprocessing_na(self)->None:
        '''
        'acc_open_past_24mths', 확인필요
        'avg_cur_bal', 확인필요
        '''
        ## 결측 처리
        # 결측 개수가 1천 건 이하인 경우는 해당 데이터(row) 삭제
        self.df.dropna(subset=['chargeoff_within_12_mths','collections_12_mths_ex_med','dti',
                                                'pub_rec_bankruptcies','revol_util','tax_liens'], inplace=True)
        
        # A1. 최빈값 대체
        self.__fill_na_with_value(columns=['mo_sin_old_il_acct', 'mths_since_recent_bc', 'mths_since_recent_inq', 'emp_length'], filling_value='mode')
        
        # A2. 중앙값 대체
        self.__fill_na_with_value(columns=['bc_open_to_buy'], filling_value='median')
        # B. 2015년 대체
        # is_after_2015 컬럼 생성. all_util 변수를 기준으로 사용
        self.df['is_after_2015'] = self.df['all_util'].apply(lambda x: 0 if pd.isnull(x) else 1)
        # 결측값을 0으로 채우기
        
        # C. 2012년 대체
        # is_after_2012 컬럼 생성. pct_tl_nvr_dlq 변수를 기준으로 사용
        self.df['is_after_2012'] = self.df['pct_tl_nvr_dlq'].apply(lambda x: 0 if pd.isnull(x) else 1)
        # D. 결측 0 대체
        self.__fill_na_with_value(columns=['annual_inc_joint','dti_joint','revol_bal_joint', 'open_acc_6m',
                                           'open_act_il', 'open_il_12m', 'open_il_24m', 'total_bal_il',
                                           'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_cu_tl', 'mths_since_rcnt_il',
                                           'tot_cur_bal', 'total_rev_hi_lim', 'mo_sin_old_rev_tl_op',
                                           'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'num_bc_sats', 'num_bc_tl',
                                           'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_il_tl',
                                           'num_op_rev_tl','num_rev_accts','num_rev_tl_bal_gt_0','num_sats','num_tl_120dpd_2m','num_tl_30dpd',
                                           'num_tl_90g_dpd_24m','num_tl_op_past_12m','pct_tl_nvr_dlq','tot_hi_cred_lim','total_bal_ex_mort',
                                           'total_bc_limit','total_il_high_credit_limit'], filling_value=0)
        
        
    def __convert_object_to_numeric(self, column_name:str)->pd.DataFrame:
        unique_values = sorted(self.df[column_name].unique())
        value_map = {value:index for index, value in enumerate(unique_values)}
        self.df[column_name] = self.df[column_name].apply(lambda x:value_map.get(x))
        return self.df
    
    def __convert_object_to_one_hot(self, column_name:str)->None:
        encoded = pd.get_dummies(self.df[column_name])
        self.df = pd.concat([self.df, encoded], axis=1)
        self.df.drop(column_name, axis=1, inplace=True)
        
    def __preprocessing_objects(self)->None:
        ## TODO : 'addr_state' 필드 해결하기
        # term
        self.df['term'] = self.df['term'].apply(self.__delete_suffix)
        # emp_length
        self.df['emp_length'] = self.df['emp_length'].apply(lambda x: x.replace(' years','').replace(' year','').replace('+','').replace('< 1', '0'))
        self.df['emp_length'] = self.df['emp_length'].astype(int)
        # revol_util
        self.df['revol_util'] = self.df['revol_util'].apply(self.__delete_suffix_percentage)
        
        ## numeric
        # application_type
        self.df = self.__convert_object_to_numeric('application_type')
        # sub_grade
        self.df = self.__convert_object_to_numeric('sub_grade')

        ## one-hot
        # home_ownership
        self.df['home_ownership'] = self.df['home_ownership'].replace(['ANY', 'OTHER', 'NONE'], 'OTHERS')
        self.__convert_object_to_one_hot('home_ownership')
        # purpose
        self.__convert_object_to_one_hot('purpose')
        # verification_status
        self.__convert_object_to_one_hot('verification_status')
        # addr_state : 해야함...

    def preprocess(self)->None:
        # loan_status 제외 모든 column이 결측치(na)인 행 제거 (1개 행 제거됨)
        self.df.dropna(subset=self.df.columns.difference(['loan_status']),how='all', inplace=True)
        self.__preprocess_target_variable()
        # 결측치 제거
        self.__preprocessing_na()
        ## object 처리하기
        self.__preprocessing_objects()
        # index 재설정
        self.df.reset_index(drop=True, inplace=True)
        self.df.dropna(subset=self.df.columns.difference(['loan_status']),how='all', inplace=True)
        
    def get_df(self)->pd.DataFrame:
        return self.df

In [8]:
p = Preprocessor()
# lending_club_2020_train.csv 파일이 있는 절대 경로 혹은 상대 경로를 명시해주세요
p.load_origin_file(file_path="lending_club_2020_train.csv")
# drop_columns_0410.txt 파일의 위치를 명시해주세요
p.drop_columns(drop_columns_file_path='drop_columns_0410.txt')
# preprocess를 돌리면, addr_state를 제외한 object field 및, na(결측치) 처리됩니다.
p.preprocess()

In [9]:
## 여기 뭔가가 이상함,,,
df = p.get_df()
columns_with_na = df.columns[df.isna().any()].tolist()
print(columns_with_na)

['acc_open_past_24mths', 'avg_cur_bal']


아래는 RFECV 관련 코드입니다

In [6]:
y_train = df['loan_status']
X_train = df.drop(columns=['addr_state', 'loan_status','acc_open_past_24mths', 'avg_cur_bal'])

In [8]:
# 학습시킬 모델 지정
#estimator = LGBMClassifier(random_state=1111, n_estimators=100, learning_rate=0.01)
# 한 step에 제거할 featrue 개수 및 cross validation fold 수 지정
#selector = RFECV(estimator, step=1, cv = 5, min_features_to_select=20)
#selector = selector.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 182948, number of negative: 722397
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054146 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7412
[LightGBM] [Info] Number of data points in the train set: 905345, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.202075 -> initscore=-1.373373
[LightGBM] [Info] Start training from score -1.373373
[LightGBM] [Info] Number of positive: 182948, number of negative: 722397
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7385
[LightGBM] [Info] Number of data points in the train set: 905345, number of used features: 87
[LightGBM] [

KeyboardInterrupt: 

In [9]:
#selected_columns = X_train.columns[selector.support_]
#selected_columns

AttributeError: 'RFECV' object has no attribute 'support_'

In [10]:
#X_train_selected = X_train[X_train.columns[selector.support_]]
#y_train

AttributeError: 'RFECV' object has no attribute 'support_'

In [7]:
X_train_selected = X_train[['loan_amnt', 'term', 'sub_grade', 'annual_inc', 'dti', 'fico_range_low',
       'revol_bal', 'tot_cur_bal', 'open_rv_24m', 'max_bal_bc',
       'mo_sin_old_rev_tl_op', 'mort_acc', 'mths_since_recent_bc',
       'num_actv_rev_tl', 'num_tl_op_past_12m', 'tot_hi_cred_lim',
       'revol_bal_joint', 'is_after_2015', 'MORTGAGE', 'RENT']]

from sklearn import svm
clf = svm.SVC()
clf.fit(X_train_selected, y_train)
SVC()

In [8]:
X_train_selected = X_train[['loan_amnt', 'term', 'sub_grade', 'annual_inc', 'dti', 'fico_range_low',
       'revol_bal', 'tot_cur_bal', 'open_rv_24m', 'max_bal_bc',
       'mo_sin_old_rev_tl_op', 'mort_acc', 'mths_since_recent_bc',
       'num_actv_rev_tl', 'num_tl_op_past_12m', 'tot_hi_cred_lim',
       'revol_bal_joint', 'is_after_2015', 'MORTGAGE', 'RENT']]
from sklearn.model_selection import train_test_split

In [12]:
from ISLP import confusion_table
import pygam
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, precision_recall_curve
X_train_selected, y_train
X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, test_size = 0.3, random_state = 1111)#X_train_Selected로 바꿀 수도 있음.

In [59]:
from sklearn.linear_model import RidgeClassifier
clf = RidgeClassifier().fit(X_tr, y_tr)
clf.score(X_te, y_te)

y_pred = clf.predict(X_te)
print(classification_report(y_te, y_pred))
confusion_table(y_te, y_pred)

              precision    recall  f1-score   support

           0       0.80      0.99      0.89    270805
           1       0.57      0.03      0.06     68700

    accuracy                           0.80    339505
   macro avg       0.69      0.51      0.47    339505
weighted avg       0.76      0.80      0.72    339505



Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,269181,1624
1,66510,2190


In [62]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf = QuadraticDiscriminantAnalysis().fit(X_tr, y_tr)
clf.score(X_te, y_te)

y_pred = clf.predict(X_te)
print(classification_report(y_te, y_pred))
confusion_table(y_te, y_pred)

              precision    recall  f1-score   support

           0       0.82      0.83      0.83    270805
           1       0.31      0.30      0.31     68700

    accuracy                           0.72    339505
   macro avg       0.57      0.57      0.57    339505
weighted avg       0.72      0.72      0.72    339505



Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,224358,46447
1,47955,20745


In [63]:
y_pred = (clf.predict_proba(X_te)[:,1] >= 0.2).astype(bool)
print(clf.score(X_te, y_pred))
print(classification_report(y_te, y_pred))
confusion_table(y_te, y_pred)

0.8936657781181426
              precision    recall  f1-score   support

           0       0.84      0.73      0.78    270805
           1       0.30      0.46      0.37     68700

    accuracy                           0.68    339505
   macro avg       0.57      0.60      0.58    339505
weighted avg       0.73      0.68      0.70    339505



Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,199001,71804
1,37211,31489


In [64]:
#LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis().fit(X_tr, y_tr)
clf.score(X_te, y_te)

y_pred = clf.predict(X_te)
print(classification_report(y_te, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.98      0.89    270805
           1       0.51      0.10      0.16     68700

    accuracy                           0.80    339505
   macro avg       0.66      0.54      0.52    339505
weighted avg       0.75      0.80      0.74    339505



In [65]:
#LDA
y_pred = (clf.predict_proba(X_te)[:,1] >= 0.2).astype(bool)
print(clf.score(X_te, y_pred))
print(classification_report(y_te, y_pred))
confusion_table(y_te, y_pred)

0.6425708016082238
              precision    recall  f1-score   support

           0       0.88      0.67      0.76    270805
           1       0.33      0.64      0.43     68700

    accuracy                           0.66    339505
   macro avg       0.60      0.65      0.59    339505
weighted avg       0.77      0.66      0.69    339505



Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,180283,90522
1,25021,43679


In [68]:
#Naive Bayes
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB().fit(X_tr, y_tr)
y_pred = clf.predict(X_te)
print(classification_report(y_te, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.95      0.87    270805
           1       0.40      0.13      0.20     68700

    accuracy                           0.78    339505
   macro avg       0.60      0.54      0.54    339505
weighted avg       0.73      0.78      0.74    339505



In [69]:
y_pred = (clf.predict_proba(X_te)[:,1] >= 0.2).astype(bool)
print(clf.score(X_te, y_pred))
print(classification_report(y_te, y_pred))

0.8096552333544424
              precision    recall  f1-score   support

           0       0.85      0.79      0.81    270805
           1       0.34      0.43      0.38     68700

    accuracy                           0.72    339505
   macro avg       0.59      0.61      0.60    339505
weighted avg       0.74      0.72      0.73    339505



import scipy
from pygam import LogisticGAM, s, f, te

gam = LogisticGAM(s(0, n_splines=200) + te(3, 1) + s(2)).fit(X_te, y_te)

gam.summary()

from sklearn import linear_model
from sklearn.svm import l1_min_c

cs = l1_min_c(X_train_selected, y_train, loss="log") * np.logspace(0, 10, 16)

clf = linear_model.LogisticRegression(
    penalty="l1",
    solver="liblinear",
    tol=1e-6,
    max_iter=int(1e6),
    warm_start=True,
    intercept_scaling=10000.0,
)
coefs_ = []
for c in cs:
    clf.set_params(C=c)
    clf.fit(X_train_selected, y_train)
    coefs_.append(clf.coef_.ravel().copy())

coefs_ = np.array(coefs_)