In [None]:
import pandas as pd
import numpy as np
import copy
import re
import random
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import log_loss
import wandb
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc, plot_calibration_curve, plot_summary_metrics, plot_precision_recall, plot_feature_importances

def seed_everything(seed):
    """Seed the RNG sources this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed shared by all three.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(42)  # fix every RNG seed before anything stochastic runs

# Competition files are read from the current working directory.
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))
submission = pd.read_csv('sample_submission.csv')

In [None]:
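# Hot-deck imputation (핫덱대체)
# Idea: for rows where 근로기간 is "Unknown", train on the remaining rows and predict the value?

# Value counts of 근로기간 (employment length)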
# 10+ years    31585
# 2 years       8450
# < 1 year      7774
# 3 years       7581
# 1 year        6249
# Unknown       5671 << roughly 5.8%
# 5 years       5665
# 4 years       5588
# 8 years       4888
# 6 years       3874
# 7 years       3814
# 9 years       3744
# 10+years       896
# <1 year        370
# 3               89
# 1 years         56


In [None]:
# 0: valid score: 0.9010355506977307
# 1: valid score: 0.9050661554969067
# 2: valid score: 0.901121413402887
# 3: valid score: 0.9037491766255578
# 4: valid score: 0.9032539417389811
# 5: valid score: 0.9051953433118983
# 6: valid score: 0.9071750113042895
# 7: valid score: 0.9024298085584201
# 8: 
# 9: 
# 10: 

In [None]:
def only_int(string, default="6"):
    """Return the decimal digits contained in ``string`` as a single string.

    Every non-digit character is removed, so ``"10+ years"`` becomes ``"10"``
    and ``"< 1 year"`` becomes ``"1"``.

    Args:
        string: Raw text value. ``None`` and float ``NaN`` (what pandas puts
            in empty CSV cells) are treated as text without digits instead of
            raising ``TypeError`` inside ``re.sub``.
        default: Value returned when no digit is found. Defaults to ``"6"``,
            the fallback previously hard-coded here (it is what labels such
            as ``"Unknown"`` map to).

    Returns:
        str: The concatenated digits, or ``default`` when there are none.
    """
    # `string != string` is true only for NaN, so no extra import is needed.
    is_missing = string is None or (isinstance(string, float) and string != string)
    if is_missing:
        return default
    digits = re.sub(r'[^0-9]', '', string)
    return digits if digits else default

def extract_categorical_columns(df):
    """List the columns of ``df`` whose dtype is ``object``.

    Args:
        df: DataFrame to inspect.

    Returns:
        list: Column names, in the frame's column order.
    """
    return [name for name in df.columns if df[name].dtypes == 'object']

def ordinal_encoding(train_df, test_df, categorical_columns):
    """Ordinal-encode the given columns with encoders fitted on the train frame.

    Both inputs are copied first, so the caller's frames are left untouched.
    One ``OrdinalEncoder`` is fitted per column on the training values;
    categories not seen during fitting are encoded as ``-1``. A column that is
    missing from the test frame is only encoded in the train frame.

    Args:
        train_df: Training DataFrame.
        test_df: Test DataFrame.
        categorical_columns: Iterable of column names to encode.

    Returns:
        tuple: ``(encoded_train, encoded_test, encoders)`` where ``encoders``
        maps each column name to its fitted ``OrdinalEncoder``.
    """
    from sklearn.preprocessing import OrdinalEncoder

    encoded_train = train_df.copy()
    encoded_test = test_df.copy()
    encoders = {}

    def as_2d(series):
        # The encoder is fed a single-feature 2-D array.
        return series.values.reshape(-1, 1)

    for col in categorical_columns:
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        encoder.fit(as_2d(encoded_train[col]))
        encoded_train[col] = encoder.transform(as_2d(encoded_train[col])).reshape(-1)
        if col in encoded_test:
            encoded_test[col] = encoder.transform(as_2d(encoded_test[col])).reshape(-1)
        encoders[col] = encoder

    return encoded_train, encoded_test, encoders

def sep_ml_xy(df, target):
    """Split ``df`` into a feature frame and the ``target`` column(s).

    Args:
        df: DataFrame holding features and target.
        target: Column label (or labels) to separate out.

    Returns:
        tuple: ``(x, y)``, where ``x`` is ``df`` without ``target`` and ``y``
        is ``df[target]``.
    """
    features = df.drop(columns=target)
    labels = df[target]
    return features, labels

def ml_train_valid(model, metric, metric_options, train_data, train_target, test_data, test_target):
    """Fit ``model``, predict on the hold-out data and score the predictions.

    Args:
        model: Estimator whose ``fit(X, y)`` returns the fitted estimator.
        metric: Callable invoked as ``metric(y_true, y_pred, **metric_options)``.
        metric_options: Dict of extra keyword arguments for ``metric``.
        train_data: Training features.
        train_target: Training labels.
        test_data: Hold-out features.
        test_target: Hold-out labels.

    Returns:
        tuple: ``(predictions, score, fitted_model)``.
    """
    fitted = model.fit(train_data, train_target)
    predictions = fitted.predict(test_data)
    score = metric(test_target, predictions, **metric_options)
    return predictions, score, fitted

def ml_predict(model, test_data):
    """Return ``model.predict(test_data)``.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_data: Features to predict on.

    Returns:
        Whatever ``model.predict`` returns.
    """
    return model.predict(test_data)

In [None]:
# The ID column is not used as a feature.
train = train.drop(columns=['ID'])
test = test.drop(columns=['ID'])

# (columns, converter) pairs, applied to train and test in this order.
conversions = [
    # Keep only the digits of the label; labels without digits take only_int's fallback.
    (["근로기간"], lambda x: int(only_int(x))),
    # Digits divided by 12 and truncated (presumably months -> years; confirm against the data).
    (["대출기간"], lambda x: int(int(only_int(x))/12)),
    # Plain integer cast.
    (["총상환이자", "총연체금액", "연체계좌수"], lambda x: int(x)),
]

for columns, convert in conversions:
    for span in columns:
        train[span] = train[span].apply(convert)
        test[span] = test[span].apply(convert)
# display(train.head(3), test.head(3))

In [None]:
# 대출금액, 대출기간, 근로기간, 주택소유상태, 연간소득, 부채_대비_소득_비율, 총계좌수, 대출목적, 최근_2년간_연체_횟수, 총상환원금, 총상환이자, 총연체금액, 연체계좌수

def add_var(train):
    """Append ratio / interaction features to ``train`` and return it.

    The columns are written into the frame that is passed in (it is modified
    in place) and the same object is returned. Ratios are scaled by 100. No
    zero checks are made on the denominators here.

    Args:
        train: DataFrame containing 총상환원금, 총상환이자, 총연체금액, 대출금액,
            대출기간, 연간소득, 근로기간, 최근_2년간_연체_횟수, 주택소유상태 and
            대출목적.

    Returns:
        The same DataFrame with twelve additional columns.
    """
    df = train  # alias only; the caller's frame receives the new columns

    principal = df['총상환원금']
    interest = df['총상환이자']
    amount = df['대출금액']
    term = df['대출기간']
    income = df['연간소득']
    tenure = df['근로기간']

    # Repayment progress relative to the loan amount.
    df['총상환원금+총상환이자-총연체금액/대출금액'] = (principal + interest - df['총연체금액']) / amount * 100
    df['총상환원금/대출금액'] = principal / amount * 100

    # Loan size relative to its term and to the borrower's income.
    df['대출금액/대출기간'] = amount / term * 100
    df['대출금액/연간소득'] = amount / income * 100

    df['총상환이자/총상환원금'] = interest / principal * 100

    # Per-term ratios.
    df['근로기간/대출기간'] = tenure / term * 100
    df['연간소득/대출기간'] = income / term * 100
    df['최근_2년간_연체_횟수/대출기간'] = df['최근_2년간_연체_횟수'] / term * 12 * 100
    df['총상환원금/대출기간'] = principal / term * 100
    df['총상환이자/대출기간'] = interest / term * 100

    # Interaction terms: one numeric product, one combined category label.
    df['근로기간*연간소득'] = tenure * income
    df['주택소유상태_대출목적'] = df['주택소유상태'] + "_" + df['대출목적']
    return df

In [None]:
def drop_var(train):
    """Return ``train`` without the three delinquency columns.

    Args:
        train: DataFrame containing 총연체금액, 연체계좌수 and
            최근_2년간_연체_횟수.

    Returns:
        A new DataFrame with those columns removed; the input is not modified.
    """
    unused = ['총연체금액', '연체계좌수', '최근_2년간_연체_횟수']
    return train.drop(columns=unused)

In [None]:
def numeric_process(train):
    """Add rescaled, squared-deviation and log1p variants of numeric columns.

    Steps, in order:
      1. ``총상환이자/총상환원금`` is set to 0 on rows where ``총상환원금`` is 0.
      2. ``연간소득`` and ``대출금액`` are divided by 10000 into ``…(만)`` columns.
      3. For each entry of the table below, ``<col>.편차^2`` (squared deviation
         from the column mean) and/or ``<col>.log`` (``np.log1p``) is appended.

    The mean used for ``.편차^2`` is computed on the frame passed in, so a
    train frame and a test frame are each centred on their own mean. The
    frame is modified in place and returned.

    Args:
        train: DataFrame that already holds the raw numeric columns and the
            ratio columns named in the table below.

    Returns:
        The same DataFrame with the additional columns.
    """
    df = train  # alias only; the caller's frame is updated

    # The interest/principal ratio is overwritten with 0 where principal is 0.
    df.loc[df['총상환원금'] == 0, '총상환이자/총상환원금'] = 0

    for base in ('연간소득', '대출금액'):
        df[base + '(만)'] = df[base] / 10000

    # (column, add squared deviation?, add log1p?) - row order fixes column order.
    # Intentionally not generated: the log of 총상환원금 and the squared
    # deviations of 근로기간, 대출기간, 총상환이자/총상환원금 and 대출금액/연간소득.
    plan = (
        ('연간소득(만)', True, True),
        ('대출금액(만)', True, True),
        ('근로기간', False, True),
        ('대출기간', False, True),
        ('총상환이자/총상환원금', False, True),
        ('총상환원금/대출금액', True, True),
        ('부채_대비_소득_비율', True, True),
        ('대출금액/연간소득', False, True),
        ('총상환원금+총상환이자-총연체금액/대출금액', True, True),
        ('근로기간*연간소득', True, True),
        ('연간소득/대출기간', True, True),
        ('대출금액/대출기간', True, True),
        ('총상환이자/대출기간', True, True),
        ('총상환이자', True, True),
        ('총상환원금', True, False),
    )

    for col, with_deviation, with_log in plan:
        if with_deviation:
            df[col + '.편차^2'] = (df[col] - df[col].mean())**2
        if with_log:
            df[col + '.log'] = df[col].apply(np.log1p)

    return df

In [None]:
# Same three steps for both frames: add ratio features, drop the delinquency
# columns, then add the rescaled / squared-deviation / log variants.
train = train.pipe(add_var).pipe(drop_var).pipe(numeric_process)
test = test.pipe(add_var).pipe(drop_var).pipe(numeric_process)

In [None]:
categorical_features = ['주택소유상태', '대출목적', '주택소유상태_대출목적']

for i in categorical_features:
    # Fit on the training values only, then replace the column with its codes.
    le = LabelEncoder().fit(train[i])
    train[i] = le.transform(train[i])

    # Labels that occur only in the test set are appended to the known
    # classes so that transform() can encode them as well.
    for case in np.unique(test[i]):
        if case in le.classes_:
            continue
        le.classes_ = np.append(le.classes_, case)
    test[i] = le.transform(test[i])

In [None]:
# columns_to_scale = ['대출금액','연간소득', '부채_대비_소득_비율', '총상환원금', '총상환이자', '총연체금액']
# 대출금액, 대출기간, 근로기간, 주택소유상태, 연간소득, 부채_대비_소득_비율, 총계좌수, 대출목적, 최근_2년간_연체_횟수, 총상환원금, 총상환이자, 총연체금액, 연체계좌수

# scaler = MinMaxScaler(feature_range=(0, 10))
# train[columns_to_scale] = scaler.fit_transform(train[columns_to_scale])
# test[columns_to_scale] = scaler.transform(test[columns_to_scale])

# Quick look at the engineered frames (head() shows five rows by default).
display(train.head(), test.head())
train.info()

In [None]:
from sklearn.model_selection import GridSearchCV

# Ordinal-encode whatever object-typed columns remain; the fitted encoders are
# kept in ord_dict so predictions can be mapped back to labels later.
categorical_columns = extract_categorical_columns(train)
train, test, ord_dict = ordinal_encoding(train, test, categorical_columns)

# 80/20 hold-out split with a fixed seed.
train_x, train_y = sep_ml_xy(train, "대출등급")
train_x, valid_x, train_y, valid_y = tts(
    train_x, train_y,
    train_size=0.8,
    shuffle=True,
    random_state=0,
)

model = CatBoostClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=10,
    verbose=100,
    task_type='GPU',
    bootstrap_type ='Bernoulli',
)
# Training stops early once the hold-out metric has not improved for 50 rounds.
model = model.fit(
    train_x, train_y,
    cat_features=categorical_features,
    eval_set=[(valid_x, valid_y)],
    early_stopping_rounds=50,
)

In [None]:
# The fitted model's parameters double as the W&B run config.
model_params = model.get_params()

# Hold-out outputs and feature ranking; nothing else in this cell consumes them.
y_probas = model.predict_proba(valid_x)
y_pred = model.predict(valid_x)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, most important first

wandb.init(project='Dacon-loan-rating', config=model_params)

# wandb.config.update({
#                     "test_size" : 0.2,
#                     "train_len" : len(train_x),
#                     "test_len" : len(valid_x),
#                     "learning_rate" : (0.001, 0.3),
#                     "max_depth" : (5, 15),
#                     })

# plot_class_proportions(train_y, valid_y, '대출등급')
# plot_learning_curve(model, train_x, train_y)
# plot_roc(valid_y, y_probas, '대출등급')
# plot_precision_recall(valid_y, y_probas, '대출등급')
# plot_feature_importances(model, train_x.columns, "Feature Importances")
# # plot_calibration_curve(model, train_x, train_y, "CatBoostClassifier")
# plot_summary_metrics(model, train_x, train_y, valid_x, valid_y)
# # wandb.finish()


In [None]:

# # sweep 설정
# sweep_config = {
#     'method': 'bayes',  # 'random' or 'grid' or 'bayes'
#     'metric': {
#       'name': 'accuracy',
#       'goal': 'maximize'   
#     },
#     'parameters': {
#         'learning_rate': {
#             'min': 0.05,
#             'max': 0.5
#         },
#         'max_depth': {
#             'min': 5,
#             'max': 15
#         }
#     }
# }

# sweep_id = wandb.sweep(sweep_config, project="Dacon-loan-rating")

# # sweep 실행
# def train():
#     # wandb.init()을 사용하여 새로운 실행을 시작
#     with wandb.init() as run:
#         model = CatBoostClassifier(
#             learning_rate=run.config.learning_rate,
#             max_depth=run.config.max_depth,
#             task_type='GPU',
#             bootstrap_type='Bernoulli',
#             verbose=100
#         )
        
#         model.fit(train_x, train_y)
#         y_pred = model.predict(valid_x)
#         accuracy = f1_score(valid_y, y_pred,average='macro')
        
#         # log the metric
#         wandb.log({"accuracy": accuracy})
#         wandb.log({"learning_rate": run.config.learning_rate})
#         wandb.log({"max_depth": run.config.max_depth})
        

# wandb.agent(sweep_id, function=train)
# wandb.finish()

In [None]:

def to_categorical(y, num_classes=None, dtype="float32"):
    """One-hot encode integer class labels.

    Args:
        y: Array-like of class indices; it is cast to ``int``. A trailing
            axis of length 1 (shape ``(..., 1)``) is dropped before encoding.
        num_classes: Width of the one-hot axis. When falsy it is inferred as
            ``max(y) + 1``.
        dtype: dtype of the returned array.

    Returns:
        numpy.ndarray: Array of shape ``y.shape + (num_classes,)`` (after the
        trailing-axis squeeze) with a 1 at each label's index.
    """
    labels = np.array(y, dtype="int")

    # Drop a trailing singleton axis: (..., 1) -> (...).
    lead_shape = labels.shape
    if len(lead_shape) > 1 and lead_shape[-1] == 1:
        lead_shape = tuple(lead_shape[:-1])

    flat = labels.reshape(-1)
    if not num_classes:
        num_classes = np.max(flat) + 1

    count = flat.shape[0]
    one_hot = np.zeros((count, num_classes), dtype=dtype)
    one_hot[np.arange(count), flat] = 1
    return np.reshape(one_hot, lead_shape + (num_classes,))

# Fitted CatBoost models keyed by fold index. With several seeds, a later
# seed's model replaces the earlier one stored under the same fold index.
cat_models={}

def cat_kfold(max_depth, learning_rate, random_seed):
    """Train CatBoost with stratified 8-fold CV and average test probabilities.

    Reads the notebook globals ``train_x``, ``train_y``, ``test`` and
    ``categorical_features`` and stores every fitted model in ``cat_models``.
    The fold split uses a fixed ``random_state``, so each seed is trained on
    the same folds; only the model seed changes.

    Args:
        max_depth: Tree depth passed to ``CatBoostClassifier``.
        learning_rate: Learning rate passed to ``CatBoostClassifier``.
        random_seed: Iterable of model seeds; one full CV round is run per seed.

    Returns:
        numpy.ndarray: ``(len(test), n_classes)`` class probabilities averaged
        over all folds and seeds.
    """
    folds = StratifiedKFold(n_splits=8, shuffle=True, random_state=55)

    # Derive the class count from the labels instead of hard-coding 7.
    # Assumes labels are the ordinal codes 0..K-1, as to_categorical requires.
    n_classes = int(train_y.max()) + 1

    outcomes = []
    sub = np.zeros((test.shape[0], n_classes))

    for seed in random_seed:
        for n_fold, (train_index, val_index) in enumerate(folds.split(train_x, train_y)):
            print(f'===================================={n_fold+1}============================================')

            X_train, X_val = train_x.iloc[train_index], train_x.iloc[val_index]
            y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

            # Use the learning rate that scored best with early stopping at 50 rounds.
            cat = CatBoostClassifier(n_estimators=3000, max_depth=max_depth, random_seed=seed, learning_rate=learning_rate, bootstrap_type ='Bernoulli')
            cat.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  early_stopping_rounds=50,
                  cat_features=categorical_features,
                  verbose=100)

            cat_models[n_fold] = cat

            # Validation-fold and test-set probabilities.
            predictions = cat.predict_proba(X_val)
            test_predictions = cat.predict_proba(test)

            # Passing num_classes keeps the one-hot width equal to the number
            # of probability columns even if a fold lacks the highest class.
            logloss = log_loss(to_categorical(y_val, num_classes=n_classes), predictions)
            outcomes.append(logloss)
            print(f"FOLD {n_fold+1} : logloss:{logloss}")

            # Accumulate test probabilities; they are averaged over folds and seeds below.
            sub += test_predictions
            print(f'================================================================================\n\n')

    # The mean validation logloss is the figure used to compare configurations.
    mean_outcome = np.mean(outcomes)
    print("Mean:{}".format(mean_outcome))

    return sub/(folds.n_splits * len(random_seed))

In [None]:
# Averaged CV test probabilities for one configuration and one model seed.
my_submission = cat_kfold(max_depth=5, learning_rate=0.07556, random_seed=[1042])

In [None]:
# Font used for every figure (presumably chosen so the Korean feature names render).
plt.rc('font', family='NanumGothic')
def plot_feature_importance(importance, names, model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: Sequence of importance values, one per feature.
        names: Sequence of feature names aligned with ``importance``.
        model_type: Label placed in front of the chart title.

    Returns:
        None. The chart is drawn on a new matplotlib figure.
    """
    # sort_values returns a new frame, so no in-place mutation is needed.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(10,8))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    # A space separates the model label from the fixed title text
    # (plain concatenation rendered e.g. "CatBOOSTFEATURE IMPORTANCE").
    plt.title(f'{model_type} FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

# Importances of the model stored under fold index 0.
plot_feature_importance(cat_models[0].get_feature_importance(), train_x.columns,'CatBOOST')

In [None]:

model = CatBoostClassifier(
    n_estimators=1000,
    learning_rate=0.07556,
    max_depth=5,
    verbose=100,
    task_type='GPU',
    bootstrap_type ='Bernoulli',
)
# Early stopping is monitored on the 20% hold-out split.
model = model.fit(
    train_x, train_y,
    cat_features=categorical_features,
    eval_set=[(valid_x, valid_y)],
    early_stopping_rounds=50,
)

# Macro-averaged F1 on the hold-out split.
pred = model.predict(valid_x)
evaluate = f1_score(valid_y, pred, average="macro")

#_, evaluate, model = ml_train_valid(model, f1_score, {"average": "macro"}, train_x, train_y, valid_x, valid_y)

In [None]:
from datetime import datetime

print("valid score:", evaluate)

# Predict on the test set and map the encoded classes back to grade labels
# with the encoder fitted on the 대출등급 column.
pred = ml_predict(model, test)
grade_encoder = ord_dict["대출등급"]
decoded_grades = grade_encoder.inverse_transform(pred.reshape(-1, 1))
submission['대출등급'] = decoded_grades.reshape(-1)

print(submission)

# The timestamp (yymmdd.HHMM) keeps successive submission files apart.
current_datetime = datetime.now().strftime("%y%m%d.%H%M")
print(current_datetime)

submission_path = 'cat'+current_datetime+'.csv'
submission.to_csv(submission_path, index=False)