In [1]:
## 1. import libraries
import pandas as pd
import numpy as np

## 2. file path and load csv
FOLDER_PATH = "" ## 파일 경로
file_path = FOLDER_PATH+"lending_club_2020_train.csv"
origin_data = pd.read_csv(file_path)
origin_data.head()

## 3. drop할 컬럼들을 일단 처리한다.
with open('drop_columns.txt', mode='r') as f:
    drop_fields = f.readlines()
    drop_fields = [drop_field.strip('\n') for drop_field in drop_fields]
    
drop_df = origin_data.drop(columns=drop_fields)

# 컬럼 개수 출력
print(len(drop_df.columns)) # 96

## 4. 반응변수 처리
# loan_status가 "current", "issued", "policy" 인 행을 필터링하여 삭제
modified_df = drop_df[~drop_df['loan_status'].isin(['Current', 'Issued', 'Does not meet the credit policy. Status:Fully Paid', 'Does not meet the credit policy. Status:Charged Off'])]

# risk = 1, safe = 0 으로 처리
modified_df.loc[modified_df['loan_status'].isin(['Fully Paid', 'In Grace Period']), 'loan_status'] = 0
modified_df.loc[modified_df['loan_status'].isin(['Charged Off', 'Default', 'Late (16-30 days)', 'Late (31-120 days)']), 'loan_status'] = 1

print(len(modified_df)) # 1133666


## 5. 데이터 처리용 함수
def delete_suffix(term:str)->int:
    '''첫 단어만을 저장하는 함수'''
    term = term.strip().split()[0]
    return int(term)

def delete_suffix_percentage(term:str)->float:
    '''%를 자르는 함수'''
    term = term.strip('%')
    return float(term)

def delete_suffix_xx(term:str)->int:
    '''xx를 자르는 함수'''
    term = term.strip('xx')
    return int(term)

def apply_to_multiple_columns(date_columns:list)->None:
    '''modified_df에서 %b-%Y(Aug-2006) 형식의 날짜 정보를 %Y-%m-%d(2006-08-01)형태로 저장하는 함수'''
    for col in date_columns:
        modified_df[col]=pd.to_datetime(modified_df[col], format='%b-%Y')

  origin_data = pd.read_csv(file_path)


79
1133666


In [2]:
THRESHOLD = 100

null_percentages = (modified_df.isnull().sum() * 100) / len(modified_df)
columns_to_drop = null_percentages[null_percentages >= THRESHOLD].index
columns_to_keep = null_percentages[null_percentages < THRESHOLD].index

# 필드 추출
filtered_df = modified_df[columns_to_keep]
filtered_df.columns
print(columns_to_drop)

Index([], dtype='object')


In [3]:
filtered_df = filtered_df.dropna(subset=filtered_df.columns.difference(['loan_status']),how='all')

In [4]:
## 6. 결측 처리
# 6-1) 결측 개수가 1천 건 이하인 경우는 해당 데이터(row) 삭제
filtered_df = filtered_df.dropna(subset=['chargeoff_within_12_mths','collections_12_mths_ex_med','dti',
                                         'pub_rec_bankruptcies','revol_util','tax_liens'])
print(filtered_df.shape) # (1131682, 79)

# 6-2) 대체하기 어려운 변수 또는 결측이 많은 변수 삭제
filtered_df = filtered_df.drop(columns=['bc_util', 'mths_since_last_delinq','mths_since_last_major_derog',
                                        'mths_since_last_record','mths_since_recent_bc_dlq',
                                        'mths_since_recent_revol_delinq','percent_bc_gt_75'])
print(filtered_df.shape) #(1131682, 72)

# 6-3) 결측 대체
# A. joint 관련 변수 : 결측 0 대체
filtered_df[['annual_inc_joint','dti_joint','revol_bal_joint']] = filtered_df[['annual_inc_joint','dti_joint','revol_bal_joint']].fillna(0)

# B. 최빈값 대체
filtered_df['mo_sin_old_il_acct'] = filtered_df['mo_sin_old_il_acct'].fillna(129)
filtered_df['mths_since_recent_bc'] = filtered_df['mths_since_recent_bc'].fillna(13)
filtered_df['mths_since_recent_inq'] = filtered_df['mths_since_recent_inq'].fillna(5)
filtered_df['bc_open_to_buy'] = filtered_df['bc_open_to_buy'].fillna(5200.0)
filtered_df['emp_length'] = filtered_df['emp_length'].fillna('10+ years')

# C. 2012년 대체
### 업데이트예정 ###


# D. 2015년 대체
### 업데이트 예정 ###



(1131682, 79)
(1131682, 72)


In [None]:
### 수정중입니다 ###

# 수치형 처리
# 변수 20번
filtered_df['emp_length'] = filtered_df['emp_length'].apply(lambda x: x.replace(' years','').replace(' year','').replace('+','').replace('< ', ''))
# 변수 92번
filtered_df['revol_util'].apply(delete_suffix_percentage)

#### sklearn test

In [5]:
filtered_df

Unnamed: 0,loan_amnt,term,sub_grade,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,addr_state,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint
0,6000.0,36 months,A5,2 years,MORTGAGE,45000.0,Verified,0,debt_consolidation,MN,...,0.0,4.0,97.1,0.0,0.0,78611.0,50054.0,15000.0,56511.0,3926.0
2,23200.0,60 months,E4,10+ years,MORTGAGE,110000.0,Verified,1,debt_consolidation,OK,...,0.0,5.0,94.8,0.0,0.0,650914.0,294461.0,20300.0,291465.0,0.0
5,16000.0,36 months,A2,10+ years,MORTGAGE,65000.0,Not Verified,1,home_improvement,TX,...,0.0,1.0,100.0,0.0,0.0,196139.0,32551.0,59100.0,49339.0,0.0
6,4500.0,36 months,B3,5 years,MORTGAGE,50000.0,Not Verified,0,home_improvement,FL,...,0.0,1.0,77.8,2.0,0.0,211837.0,9573.0,9300.0,11970.0,0.0
7,20000.0,36 months,B3,10+ years,MORTGAGE,60000.0,Source Verified,0,credit_card,IL,...,0.0,2.0,77.5,0.0,0.0,186765.0,66283.0,11000.0,73090.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1755288,26500.0,60 months,C5,2 years,MORTGAGE,60000.0,Verified,0,debt_consolidation,MN,...,0.0,2.0,100.0,0.0,0.0,258273.0,34014.0,18900.0,16784.0,0.0
1755290,3600.0,36 months,E1,7 years,MORTGAGE,33800.0,Not Verified,1,debt_consolidation,TX,...,0.0,2.0,100.0,0.0,0.0,87427.0,11973.0,6500.0,13375.0,0.0
1755291,19000.0,36 months,A3,4 years,MORTGAGE,71000.0,Not Verified,0,debt_consolidation,UT,...,0.0,2.0,96.9,0.0,0.0,281551.0,50883.0,17600.0,43550.0,0.0
1755292,9600.0,36 months,C1,9 years,MORTGAGE,44000.0,Verified,0,debt_consolidation,TX,...,0.0,2.0,100.0,0.0,0.0,151287.0,13537.0,4200.0,10414.0,0.0


In [19]:
from sklearn.model_selection import train_test_split

# train, test 셋 분리
X = filtered_df.iloc[:, [0, 5]] # target column을 제외한 모든 column을 후보 feature로 간주
y = filtered_df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 37)

In [20]:
X

Unnamed: 0,loan_amnt,annual_inc
0,6000.0,45000.0
2,23200.0,110000.0
5,16000.0,65000.0
6,4500.0,50000.0
7,20000.0,60000.0
...,...,...
1755288,26500.0,60000.0
1755290,3600.0,33800.0
1755291,19000.0,71000.0
1755292,9600.0,44000.0


In [21]:
y

0          0
2          1
5          1
6          0
7          0
          ..
1755288    0
1755290    1
1755291    0
1755292    0
1755293    0
Name: loan_status, Length: 1131682, dtype: object

In [22]:
clf = LGBMClassifier(random_state=1234, n_estimators=100, learning_rate=0.01)
min_features_to_select = 30
step = 10
selector = RFECV(clf, step=step, cv=5, min_features_to_select=min_features_to_select)
selector = selector.fit(X, y)


NameError: name 'LGBMClassifier' is not defined

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
X_train.astype('int')
y_train.astype('int')

estimator = LogisticRegression() # 학습시킬 모델 지정
selector = RFECV(estimator, step=1, cv = 5) # 한 step에 제거할 featrue 개수 및 cross validation fold 수 지정
selector = selector.fit(X_train, y_train) # feature selection 진행

ValueError: Unknown label type: 'unknown'

Unnamed: 0,loan_amnt,annual_inc
1513147,18225,50000
313041,35000,84000
1030951,4000,62000
428910,4950,42000
1447854,6000,31200
...,...,...
678792,15000,65000
1064609,15000,49000
1538867,8000,70000
438643,12000,75000
