In [1]:
# !pip install pandas
# !pip install numpy
# !pip install seaborn
# !pip install matplotlib
# !pip install xgboost
# !pip install lightgbm
# !pip install sklearn

# 라이브러리 임포트
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import xgboost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, RidgeCV, RidgeClassifier
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, RandomizedSearchCV

import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# EDA

## 변수 설명
  - **int_rate** : 대출자에 부여된 이자율 (Interest rate of the loan the applicant received)
  - **annual_inc** : 연 소득 (annual income)
  - **dti** : 소득 대비 부채 비율 (Debt-to-income ratio)
  - **delinq_2yrs** : 지난 2년 간 체납 발생 횟수 (Delinquencies on lines of credit in the last 2 years)
  - **inq_last_6mths** : 지난 6개월 간 신용 조회 수 (Inquiries into the applicant's credit during the last 6 months)
  - **pub_rec** : 파산 횟수 (Number of bankruptcies listed in the public record)
  - **revol_bal** : 리볼빙 잔액 (Total credit revolving balance)
  - **total_acc** : 지금까지 소유했던 신용카드 개수 (num_total_cc_accounts : Total number of credit card accounts in the applicant's history)
  - **collections_12_mths_ex_med** : 의료부문을 제외한 지난 12개월 간 추심 발생 횟수 (num_collections_last_12m : Number of collections in the last 12 months. This excludes medical collections)
  - **acc_now_delinq** : 대출자가 체납 상태에 있지 않은 계좌의 수 (The number of accounts on which the borrower is now delinquent)
  - **tot_coll_amt** : 대출자에 대한 현재까지의 총 추심액 (total_collection_amount_ever : The total amount that the applicant has had against them in collections)
  - **tot_cur_bal** : 전 계좌의 현재 통합 잔고 (Total current balance of all accounts)
  - **chargeoff_within_12_mths** : 대출 부 신청인의 대출 신청 직전 12개월 간 세금 공제 횟수 (Number of charge-offs within last 12 months at time of application for the secondary applicant)
  - **delinq_amnt** : 체납 금액 (delinquency amount)
  - **tax_liens** : 세금 저당권의 수 (Number of tax liens)
  - **emp_length1 ~ 12** : 고용 연수 (Number of years in the job)
  - **home_ownership1 ~ 6** : 대출 신청자의 주거 소유 형태 (The ownership status of the applicant's residence)
  - **verification_status1 ~ 3** : 공동 소득 발생 여부 및 형태 (verification_income_joint : Type of verification of the joint income)
  - **purpose1 ~ 14** : 대출 목적 (The purpose of the loan)
  - **initial_list_status1 ~ 2** : 최초 대출 상태 (Initial listing status of the loan)
  - **mths_since_last_delinq1 ~ 11** : 마지막 체납이 지금으로부터 몇개월 전에 있었는지를 나타내는 변수 (Months since the last delinquency)
  
  - **funded_amnt** : 대출액 (Funded amount)
  - **funded_amnt_inv** : 사채 대출액 (Funded amount by investors)
  - **total_rec_late_fee** : 총 연체료 중 납부액 (Late fees received to date)
  - **term1** : 상환 기간 (The number of payments on the loan. Values are in months and can be either 36 or 60)
  - **open_acc** : 개설 개좌 수 (The number of open credit lines in the borrower's credit file)
  - **installment** : 대출 발생 시 월 상환액 (The monthly payment owed by the borrower if the loan originates)
  - **revol_util** : 리볼빙 한도 대비 리볼빙 사용 비율 (Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit)
  - **out_prncp** : 대출액 중 원리금 잔액 (Remaining outstanding principal for total amount funded)
  - **out_prncp_inv** : 사채 대출액 중 원리금 잔액 (Remaining outstanding principal for total amount funded by investors)
  - **total_rec_int** : 이자 상환액 (Interest received to date)
  - **fico_range_low** : FICO(일종의 신용점수) 최저값 (The lower boundary range the borrower’s FICO at loan origination belongs to)
  - **fico_range_high** : FICO(일종의 신용점수) 최고값 (The upper boundary range the borrower’s FICO at loan origination belongs to)
  
  - **depvar** : 고객의 부도 여부 (dependent variable)

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/220126/data/train.csv')
df.head()

Unnamed: 0,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,pub_rec,revol_bal,total_acc,collections_12_mths_ex_med,acc_now_delinq,tot_coll_amt,tot_cur_bal,chargeoff_within_12_mths,delinq_amnt,tax_liens,emp_length1,emp_length2,emp_length3,emp_length4,emp_length5,emp_length6,emp_length7,emp_length8,emp_length9,emp_length10,emp_length11,emp_length12,home_ownership1,home_ownership2,home_ownership3,home_ownership4,home_ownership5,home_ownership6,verification_status1,verification_status2,verification_status3,purpose1,purpose2,purpose3,purpose4,purpose5,purpose6,purpose7,purpose8,purpose9,purpose10,purpose11,purpose12,purpose13,purpose14,initial_list_status1,initial_list_status2,mths_since_last_delinq1,mths_since_last_delinq2,mths_since_last_delinq3,mths_since_last_delinq4,mths_since_last_delinq5,mths_since_last_delinq6,mths_since_last_delinq7,mths_since_last_delinq8,mths_since_last_delinq9,mths_since_last_delinq10,mths_since_last_delinq11,funded_amnt,funded_amnt_inv,total_rec_late_fee,term1,open_acc,installment,revol_util,out_prncp,out_prncp_inv,total_rec_int,fico_range_low,fico_range_high,depvar
0,0.0824,21000.0,29.19,0,1,0,3016,26,0,0,0,11773,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1200,1200.0,0.0,1,18,37.74,0.076,0.0,0.0,157.94,765,769,0
1,0.1299,80000.0,4.82,0,1,1,5722,24,0,0,0,21875,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,8000,8000.0,0.0,1,8,269.52,0.447,0.0,0.0,1702.42,665,669,0
2,0.1299,38000.0,23.66,0,3,0,6511,18,0,0,0,31868,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,5000,5000.0,0.0,1,7,168.45,0.88,0.0,0.0,1066.64,670,674,0
3,0.1367,100000.0,16.27,4,2,0,6849,30,0,0,0,326049,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,15000,15000.0,0.0,1,12,510.27,0.457,0.0,0.0,1256.24,680,684,1
4,0.1269,30000.0,25.28,0,1,2,8197,12,0,0,2506,8840,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,10000,10000.0,0.0,1,8,335.45,0.416,0.0,0.0,871.04,660,664,1


### 컬럼 합쳐주기

In [None]:
df['emp_length'] = 0

for i, column in enumerate(df.loc[:,'emp_length1':'emp_length12'].columns,1):
    for j, data in enumerate(df[column]):
        if data == 1:
            df['emp_length'][j] = i

sum(df['emp_length'] > 0)

In [None]:
df = df.drop(df.loc[:,'emp_length1':'emp_length12'].columns, axis=1)

In [None]:
df['home_ownership'] = 0

for i, column in enumerate(df.loc[:,'home_ownership1':'home_ownership6'].columns,1):
    for j, data in enumerate(df[column]):
        if data == 1:
            df['home_ownership'][j] = i

sum(df['home_ownership'] > 0)

In [None]:
df = df.drop(df.loc[:,'home_ownership1':'home_ownership6'].columns, axis=1)

In [None]:
df['verification_status'] = 0

for i, column in enumerate(df.loc[:,'verification_status1':'verification_status3'].columns,1):
    for j, data in enumerate(df[column]):
        if data == 1:
            df['verification_status'][j] = i

sum(df['verification_status'] > 0)

In [None]:
df = df.drop(df.loc[:,'verification_status1':'verification_status3'].columns, axis=1)

In [None]:
df['purpose'] = 0

for i, column in enumerate(df.loc[:,'purpose1':'purpose14'].columns,1):
    for j, data in enumerate(df[column]):
        if data == 1:
            df['purpose'][j] = i

sum(df['purpose'] > 0)

In [None]:
df = df.drop(df.loc[:,'purpose1':'purpose14'].columns, axis=1)

In [None]:
df['initial_list_status'] = 0

for i, column in enumerate(df.loc[:,'initial_list_status1':'initial_list_status2'].columns,1):
    for j, data in enumerate(df[column]):
        if data == 1:
            df['initial_list_status'][j] = i

sum(df['initial_list_status'] > 0)

In [None]:
df = df.drop(df.loc[:,'initial_list_status1':'initial_list_status2'].columns, axis=1)

In [None]:
df['mths_since_last_delinq'] = 0

for i, column in enumerate(df.loc[:,'mths_since_last_delinq1':'mths_since_last_delinq11'].columns,1):
    for j, data in enumerate(df[column]):
        if data == 1:
            df['mths_since_last_delinq'][j] = i

sum(df['mths_since_last_delinq'] > 0)

In [None]:
df = df.drop(df.loc[:,'mths_since_last_delinq1':'mths_since_last_delinq11'].columns, axis=1)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# 결측치 확인
pd.DataFrame(df.isnull().sum()).rename(columns={0:'Non-Null Count'}).T

### 다중공선성 제거 시도

In [None]:
plt.figure(figsize = (40,40))
sns.heatmap(df.corr(), annot=True, cmap='Blues')
plt.show()

In [None]:
plt.figure(figsize = (10,10))

In [None]:
# open_acc 추가
df = df.drop(['out_prncp_inv','funded_amnt_inv','fico_range_high','funded_amnt'],axis=1)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(
    df.values, i) for i in range(df.shape[1])]
vif["features"] = df.columns
vif

In [None]:
vif[vif["VIF Factor"] > 10].sort_values('VIF Factor',ascending=False)

In [None]:
for column in vif[vif["VIF Factor"] > 10]['features']:
    print(column, df[column].corr(df['depvar']))

In [None]:
# 라벨(depvar) 비율 확인
print('y=1 ratio :', df.depvar.sum()/len(df))

In [None]:
X = df.drop('depvar', axis=1)
y = df['depvar']

# Train Valid Split

In [None]:
# data를 training set과 validation set으로 나누기
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [None]:
# training set과 validation set의 데이터 수 확인
print('training set length :', len(X_train))
print('validation set length :', len(X_valid))

In [None]:
# validation set의 라벨 비율 확인
y_valid.sum()/len(y_valid)

### Robust scaler(median & IQR) -> StandardScaler로 변경

In [None]:
from sklearn.preprocessing import RobustScaler
sc = RobustScaler()
print(sc.fit(X_train))
X_train_scaled = sc.transform(X_train)
X_valid_scaled = sc.transform(X_valid)

X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_valid = pd.DataFrame(X_valid_scaled, index=X_valid.index, columns=X_valid.columns)

### Oversampling Undersampling

In [None]:
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE

In [None]:
X_train_tomek, y_train_tomek = TomekLinks().fit_resample(X_train, y_train)

In [None]:
X_train_smote, y_train_smote = SMOTE().fit_resample(X_train, y_train)

# Single Model(XGBoost)
## XGBClassifier의 하이퍼 파라미터 설명
- **LINK** : https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier
- **max_depth=3** : 디시전 트리의 최대 깊이
- **learning_rate=0.1** : 0과 1 사이의 값을 가지는 부스팅에 대한 학습률(eta). 매 부스팅 단계 이후 새로이 추가된 가중치는 이 파라미터로 조정된다. 이 값이 낮을수록 보수적이며, 수렴에 필요한 더 많은 디시전 트리가 필요하다.
- **n_estimators=100** : 라운드의 횟수 또는 부스팅된 디시전 트리의 개수
- **silent=True** : 부스팅의 수행 동안 메시지를 출력할지에 대한 여부
- **objective="reg:linear"** : 모델이 학습할 작업의 분류, 미리 정의된 작업은 문자열로 지정이 가능하지만, 그렇지 않은 경우 콜러블callable을 만들어서 지정할 수도 있다.
- **booster="gbtree"** : 'gbtree', 'gblinear', 'dart'일 수 있다. 'dart'는 드롭아웃(dropout)이라는 특성을 추가한다(과적합 방지를 위해 무작위로 디시전 트리를 선택해 제거(드롭)한다). 'gblinear'는 정규화된 선형 모델을 만든다(디시전 트리가 아니라 라소 회귀와 유사하다).
- **nthread=None** : 더 이상 사용되지 않는다.
- **n_jobs** : 사용할 스레드의 개수
- **gamma=0** : 노드 분할에 필요한 최소 손실 감소
- **min_child_weight=1** : 자식 노드 생성에 필요한 헤시안(hessian) 합의 최솟값
- **max_delta_step=0** : 보다 보수적으로 갱신을 수행하도록 만드는 값. 불균형 범주의 데이터셋에 대해서는 1부터 10까지의 값으로 설정한다.
- **subsample=1** : 부스팅에 사용할 샘플의 비율
- **colsample_bytree=1** : 부스팅에 사용할 특징 열의 비율
- **colsample_bylevel=1** : 각 디시전 트리의 수준별 사용할 특징 열의 비율
- **colsample_bynode=1** : 각 디시전 트리의 노드별 사용할 특징 열의 비율
- **reg_alpha=0** : L1 정규화(가중치의 평균). 이 값이 클수록 보수적이게 된다.
- **reg_lambda=1** : L2 정규화(가중치의 제곱근). 이 값이 클수록 보수적이게 된다.
- **base_score=.5** : 초기 편향치(bias)
- **seed=None** : 더 이상 사용되지 않는다.
- **random_state=0** : 난수 생성 시드
- **missing=None** : 누락된 데이터가 해석될 값. None은 np.nan을 의미한다.
- **importance_type='gain'** : 특징 중요도의 유형. 'gain', 'weight', 'cover', 'total_gain', 'total_cover'로 설정될 수 있다.

In [None]:
# # 하이퍼 파라미터 튜닝
# xgb_clf = xgboost.XGBClassifier()

# param_grid = {'max_depth':[10], # 20 < 15 < 10
#               'n_estimators':[300, 400], # 100 < 150 < 160 < 170 < 180 < 200 < 300
#               #learning rate는 0.1이 0.001보다 높음
#               }

# cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3) #n_repeats=2

# # grid_search = GridSearchCV(estimator=xgb_clf,
# #                            param_grid=param_grid, 
# #                            n_jobs=-1,
# #                            cv=cv,
# #                            scoring='f1_macro', 
# #                            error_score=0) 

# n_estimators = [int(x) for x in range(300, 400, 5)]
# max_features = ['log2','sqrt']
# max_depth = [int(x) for x in range(5, 15)]
# min_samples_split = [2, 5, 10]
# min_samples_leaf = [1, 2, 4]
# bootstrap = [True, False]

# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf':min_samples_leaf,
#                'bootstrap':bootstrap}

# rnd_search = RandomizedSearchCV(estimator=xgb_clf,
#                            param_distributions=random_grid,
#                            n_iter=50,
#                            n_jobs=-1,
#                            cv=cv,
#                            scoring='f1_macro', 
#                            verbose=3,
#                            error_score=0,
#                            random_state=42)

# results=rnd_search.fit(X_train, y_train)

# results.best_params_

In [None]:
# best parameter를 사용하여 prediction 수행
xgb_clf_tomek = xgboost.XGBClassifier(max_depth = 10,
                                n_estimators = 300
                                  )

xgb_clf_smote = xgboost.XGBClassifier(max_depth = 10,
                                n_estimators = 300
                                  )

# xgb_clf_tomek = xgboost.XGBClassifier(max_depth = results.best_params_['max_depth'],
#                                 n_estimators = results.best_params_['n_estimators'],
#                                 max_features = results.best_params_['max_features'],
#                                 min_samples_split = results.best_params_['min_samples_split'],
#                                 min_samples_leaf = results.best_params_['min_samples_leaf'],
#                                 bootstrap = results.best_params_['bootstrap'],
#                                 eval_metric='mlogloss',
#                                 # tree_method='gpu_hist',
#                                 # predictor='gpu_predictor'
#                                 )

# xgb_clf_smote = xgboost.XGBClassifier(max_depth = results.best_params_['max_depth'],
#                                 n_estimators = results.best_params_['n_estimators'],
#                                 max_features = results.best_params_['max_features'],
#                                 min_samples_split = results.best_params_['min_samples_split'],
#                                 min_samples_leaf = results.best_params_['min_samples_leaf'],
#                                 bootstrap = results.best_params_['bootstrap'],
#                                 eval_metric='mlogloss',
#                                 # tree_method='gpu_hist',
#                                 # predictor='gpu_predictor'
#                                 )

xgb_clf_tomek.fit(X_train_tomek, y_train_tomek)
xgb_clf_smote.fit(X_train_smote, y_train_smote)

y_pred_xgb_tomek = xgb_clf_tomek.predict(X_valid)
y_pred_xgb_smote = xgb_clf_smote.predict(X_valid)

In [None]:
# y_valid가 0 또는 1일 확률 출력
y_pred_prob_tomek = xgb_clf_tomek.predict_proba(X_valid)

print(len(y_pred_prob_tomek))
y_pred_prob_tomek

In [None]:
y_pred_prob_smote = xgb_clf_smote.predict_proba(X_valid)

print(len(y_pred_prob_smote))
y_pred_prob_smote

In [None]:
# y_valid가 1일 확률 출력
y_pred_prob_smote[:,1]

In [None]:
# y_valid가 1일 확률 출력
y_pred_prob_tomek[:,1]

In [None]:
# y_valid 예측값 출력
# y_pred_prob[:,1]이 0.5(threshold)보다 큰 경우 1의 예측값을 return한다

print(len(y_pred_xgb_smote))
print(len(y_pred_xgb_tomek))
y_pred_xgb_smote

In [None]:
y_pred_xgb_tomek

In [None]:
# 1로 예측된 y_valid 갯수 및 비율 출력
print(y_pred_xgb_smote.sum())
print(y_pred_xgb_smote.sum()/len(y_pred_xgb_smote))

In [None]:
# 1로 예측된 y_valid 갯수 및 비율 출력
print(y_pred_xgb_tomek.sum())
print(y_pred_xgb_tomek.sum()/len(y_pred_xgb_tomek))

In [None]:
# 평가 함수 정의
def get_clf_eval(y_actual, y_pred):
    accuracy = accuracy_score(y_actual, y_pred)
    precision = precision_score(y_actual, y_pred)
    recall = recall_score(y_actual, y_pred)
    AUC = roc_auc_score(y_actual, y_pred)
    F1 = f1_score(y_actual, y_pred)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('AUC: {:.4f}'.format(AUC))
    print('F1: {:.4f}'.format(F1))
    
    sns.heatmap(confusion_matrix(y_actual, y_pred), annot=True, fmt='d', cmap='YlGnBu')

In [None]:
# xgboost 성능 확인
get_clf_eval(y_valid, y_pred_xgb_smote)
# 정확도 0.7488, 정밀도: 0.6404 재현율: 0.5105 AUC: 0.6866 F1: 0.5681

In [None]:
# xgboost 성능 확인
get_clf_eval(y_valid, y_pred_xgb_tomek)
# 정확도: 0.7515 정밀도: 0.6324 재현율: 0.5548 AUC: 0.7002 F1: 0.5911

In [None]:
# threshold 변경 및 성능 확인
get_clf_eval(y_valid, y_pred_prob_smote[:,1]>0.38)
# 정확도: 0.7135 정밀도: 0.5409 재현율: 0.7592 AUC: 0.7254 F1: 0.6317

In [None]:
# threshold 변경 및 성능 확인
get_clf_eval(y_valid, y_pred_prob_tomek[:,1]>0.38)
# 정확도: 0.7433 정밀도: 0.5925 재현율: 0.6636 AUC: 0.7226 F1: 0.6260

### RandomForest

In [None]:
# rnd_clf = RandomForestClassifier()

# rnd_clf.fit(X_train, y_train)

# y_pred = rnd_clf.predict(X_valid)

# get_clf_eval(y_valid, y_pred)

# param_grid = {'max_depth':[30, 32], # 10 < 20 < 25, 40, 35 < 30 < 32
#               'n_estimators':[300, 350] # 100 < 150 < 200 < 250, 400, 350 < 300
#               }


# cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=2) #n_repeats=2

# grid_search = GridSearchCV(estimator=rnd_clf,
#                            param_grid=param_grid, 
#                            n_jobs=-1,
#                            cv=cv,
#                            scoring='f1_macro', 
#                            error_score=0) 

# results=grid_search.fit(X_train, y_train)

# results.best_params_

In [None]:
# best parameter를 사용하여 prediction 수행
# rnd_clf = RandomForestClassifier(max_depth = results.best_params_['max_depth'],
#                                 n_estimators = results.best_params_['n_estimators'],
#                                 )

# rnd_clf = RandomForestClassifier(max_depth = 32,
#                                 n_estimators = 300,
#                                 )

# rnd_clf.fit(X_train, y_train)

###LGBM

In [None]:
# lgbm_clf = LGBMClassifier()

# # param_grid = {'max_depth':[15], #10, 20 < 15
# #               'n_estimators':[400, 450] # 100 < 150 < 200 < 250 < 300, 500, 450 < 400
# #               }


# cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3) #n_repeats=2

# # lgbm_clf = GridSearchCV(estimator=lgbm_clf,
# #                            param_grid=param_grid, 
# #                            n_jobs=-1,
# #                            cv=cv,
# #                            scoring='f1_macro', 
# #                            error_score=0) 

# n_estimators = [int(x) for x in range(100, 450, 5)]
# max_features = ['log2','sqrt']
# max_depth = [int(x) for x in range(10, 20)]
# min_samples_split = [2, 5, 10]
# min_samples_leaf = [1, 2, 4]
# bootstrap = [True, False]

# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf':min_samples_leaf,
#                'bootstrap':bootstrap}

# rnd_search = RandomizedSearchCV(estimator=xgb_clf,
#                            param_distributions=random_grid,
#                            n_iter=50,
#                            n_jobs=-1,
#                            cv=cv,
#                            scoring='f1_macro', 
#                            verbose=2,
#                            error_score=0,
#                            random_state=42)

# results=rnd_search.fit(X_train, y_train)

# results.best_params_

In [None]:
lgbm_clf_tomek = LGBMClassifier(max_depth = 15,
                          n_estimators = 400
                          )

lgbm_clf_smote = LGBMClassifier(max_depth = 15,
                          n_estimators = 400
                          )

# lgbm_clf_tomek = LGBMClassifier(max_depth = results.best_params_['max_depth'],
#                                 n_estimators = results.best_params_['n_estimators'],
#                                 max_features = results.best_params_['max_features'],
#                                 min_samples_split = results.best_params_['min_samples_split'],
#                                 min_samples_leaf = results.best_params_['min_samples_leaf'],
#                                 bootstrap = results.best_params_['bootstrap']
#                           )

# lgbm_clf_smote = LGBMClassifier(max_depth = results.best_params_['max_depth'],
#                                 n_estimators = results.best_params_['n_estimators'],
#                                 max_features = results.best_params_['max_features'],
#                                 min_samples_split = results.best_params_['min_samples_split'],
#                                 min_samples_leaf = results.best_params_['min_samples_leaf'],
#                                 bootstrap = results.best_params_['bootstrap']
#                           )

lgbm_clf_tomek.fit(X_train_tomek, y_train_tomek)
lgbm_clf_smote.fit(X_train_smote, y_train_smote)

In [None]:
y_pred_tomek = lgbm_clf_tomek.predict(X_valid)
y_pred_smote = lgbm_clf_smote.predict(X_valid)

get_clf_eval(y_valid, y_pred_tomek)
get_clf_eval(y_valid, y_pred_smote)

In [None]:
y_pred_prob_tomek = lgbm_clf_tomek.predict_proba(X_valid)
y_pred_prob_smote = lgbm_clf_smote.predict_proba(X_valid)

get_clf_eval(y_valid, y_pred_prob_tomek[:,1] > 0.38)
get_clf_eval(y_valid, y_pred_prob_smote[:,1] > 0.38)

# Ensemble(Voting)

In [None]:
# rnd_clf = RandomForestClassifier()
# lgbm_clf = LGBMClassifier()
# rid_clf = RidgeClassifier()
# dtc_clf = DecisionTreeClassifier()
# svm_clf = SVC()
# ada_clf = AdaBoostClassifier()
# gbc_clf = GradientBoostingClassifier()
# bag_clf = BaggingClassifier()
# ext_clf = ExtraTreesClassifier()
# knn_clf = KNeighborsClassifier()
# svc_clf = SVC()

# voting_clf = VotingClassifier(estimators=[('rnd', rnd_clf), ('lgbm', lgbm_clf), ('xgb', xgb_clf)], voting='soft')

In [None]:
# for clf in (rnd_clf, lgbm_clf, xgb_clf, voting_clf):
#     clf.fit(X_train, y_train)
#     y_pred_vote = clf.predict(X_valid)

# # ensemble(voting) 성능 확인
# get_clf_eval(y_valid, y_pred_vote)

# Ensemble(Stacking)

In [None]:
stacking_clf = StackingClassifier(estimators=[('lgbm1', lgbm_clf_tomek), ('lgbm2', lgbm_clf_smote), ('xgb1', xgb_clf_tomek), ('xgb2', xgb_clf_smote)], final_estimator = LGBMClassifier())
stacking_clf.fit(X_train, y_train)

In [None]:
y_pred_stack = stacking_clf.predict_proba(X_valid)
y_pred_stack

In [None]:
y_pred_stack = y_pred_stack[:,1] > 0.41
# ensemble(stacking) 성능 확인
get_clf_eval(y_valid, y_pred_stack)
#정확도: 0.7564, F1: 0.6095 -> random forest, lgbm, xgb
#정확도: 0.7550, F1: 0.6074 -> lgbm, xgb
#정확도: 0.7511, F1: 0.6058 -> rnd, lgbm, xgb 파라미터 튜닝
#정확도: 0.7337, F1: 0.6357 -> rnd, lgbm, xgb threshold 0.4 적용 -> 실제 테스트에서 점수 낮아짐
#정확도: 0.7548, F1: 0.6216 -> rnd, lgbm, xgb
#정확도: 0.7322 정밀도: 0.5695 재현율: 0.7042 AUC: 0.7249 F1: 0.6297 -> smote, tomek로 각각 lgbm, xgb 총 4개 모델 stack으로 학습 후 threshold 0.38일 경우
# smote만 학습한 lgbm, xgb 모델로 데이터도 smote 적용된 데이터셋으로 다시 학습했더니 성능 저하

# 정확도: 0.7477 정밀도: 0.6005 재현율: 0.6591 AUC: 0.7246 F1: 0.6284 -> 동일 조건에 데이터 합쳤을 경우 테스트에서도 0.713으로 떨어짐
# 정확도: 0.7407 정밀도: 0.5849 재현율: 0.6927 AUC: 0.7282 F1: 0.6343 -> StandardScaler로 변경 성능 저하
# 정확도: 0.7431 정밀도: 0.5934 재현율: 0.6863 AUC: 0.7285 F1: 0.6345 -> 0.7198775585 threshold 0.41일때가 가장 높음

# Submission

In [None]:
# 제출 양식 다운로드
submit = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/220126//data/sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/220126//data/test.csv')

df_test = df_test.drop(columns=['ID'])


### 컬럼 합쳐주기

In [None]:
df_test['emp_length'] = 0

for i, column in enumerate(df_test.loc[:,'emp_length1':'emp_length12'].columns,1):
    for j, data in enumerate(df_test[column]):
        if data == 1:
            df_test['emp_length'][j] = i

df_test = df_test.drop(df_test.loc[:,'emp_length1':'emp_length12'].columns, axis=1)

In [None]:
df_test['home_ownership'] = 0

for i, column in enumerate(df_test.loc[:,'home_ownership1':'home_ownership6'].columns,1):
    for j, data in enumerate(df_test[column]):
        if data == 1:
            df_test['home_ownership'][j] = i

df_test = df_test.drop(df_test.loc[:,'home_ownership1':'home_ownership6'].columns, axis=1)

In [None]:
df_test['verification_status'] = 0

for i, column in enumerate(df_test.loc[:,'verification_status1':'verification_status3'].columns,1):
    for j, data in enumerate(df_test[column]):
        if data == 1:
            df_test['verification_status'][j] = i

df_test = df_test.drop(df_test.loc[:,'verification_status1':'verification_status3'].columns, axis=1)

In [None]:
df_test['purpose'] = 0

for i, column in enumerate(df_test.loc[:,'purpose1':'purpose14'].columns,1):
    for j, data in enumerate(df_test[column]):
        if data == 1:
            df_test['purpose'][j] = i

df_test = df_test.drop(df_test.loc[:,'purpose1':'purpose14'].columns, axis=1)

In [None]:
df_test['initial_list_status'] = 0

for i, column in enumerate(df_test.loc[:,'initial_list_status1':'initial_list_status2'].columns,1):
    for j, data in enumerate(df_test[column]):
        if data == 1:
            df_test['initial_list_status'][j] = i

df_test = df_test.drop(df_test.loc[:,'initial_list_status1':'initial_list_status2'].columns, axis=1)

In [None]:
df_test['mths_since_last_delinq'] = 0

for i, column in enumerate(df_test.loc[:,'mths_since_last_delinq1':'mths_since_last_delinq11'].columns,1):
    for j, data in enumerate(df_test[column]):
        if data == 1:
            df_test['mths_since_last_delinq'][j] = i

df_test = df_test.drop(df_test.loc[:,'mths_since_last_delinq1':'mths_since_last_delinq11'].columns, axis=1)

In [None]:
df_test = df_test.drop(['out_prncp_inv','funded_amnt_inv','fico_range_high','funded_amnt'],axis=1)

### 스케일링 및 학습

In [None]:
df_test_scaled = sc.transform(df_test)
df_test = pd.DataFrame(df_test_scaled, index=df_test.index, columns=df_test.columns)

y_predict_test = stacking_clf.predict_proba(df_test)
y_predict_test

### 결과



In [None]:
y_predict_test.shape

In [None]:
y_predict_test[:,1] > 0.41

In [None]:
submit['answer'] = (y_predict_test[:,1] > 0.41).astype(int)

sum(submit['answer'])/len(submit['answer'])

In [None]:
# 제출 파일 저장
submit.to_csv('/content/drive/MyDrive/Colab Notebooks/220126//data/prediction.csv', index=False)