In [129]:
import pandas as pd
import numpy as np
import copy
import re
import random
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

def seed_everything(seed):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    writes the seed to the ``PYTHONHASHSEED`` environment variable.

    Note: ``PYTHONHASHSEED`` is read at interpreter start-up, so assigning it
    here is only visible to processes started afterwards.

    Args:
        seed: Integer seed value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(42) # Fix the random seeds so reruns are reproducible

# Load the training set, the test set and the submission template.
# Paths are relative to the notebook's working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv('sample_submission.csv')

In [None]:
# Hot-deck imputation
# 근로기간 (employment length) == "Unknown" <<< train on the remaining rows and predict the missing value?
"""
    근로기간
    10+ years    31585
    2 years       8450
    < 1 year      7774
    3 years       7581
    1 year        6249
    Unknown       5671 << 5.8% 정도
    5 years       5665
    4 years       5588
    8 years       4888
    6 years       3874
    7 years       3814
    9 years       3744
    10+years       896
    <1 year        370
    3               89
    1 years         56
"""

In [130]:
def only_int(string):
    """Return the decimal digits contained in ``string``, concatenated.

    Every character other than ``0``-``9`` is removed. If nothing is left
    (for example for ``"Unknown"``), the fallback ``"5"`` is returned.

    Args:
        string: Text to extract digits from.

    Returns:
        str: The remaining digits, or ``"5"`` when there are none.
    """
    digits = re.sub(r'[^0-9]', '', string)
    return digits or "5"

def extract_categorical_columns(df):
    """List the columns of ``df`` whose dtype compares equal to ``'object'``.

    Args:
        df: DataFrame to inspect.

    Returns:
        list: Matching column labels, in the frame's column order.
    """
    return [name for name in df.columns if df[name].dtypes == 'object']

def ordinal_encoding(train_df, test_df, categorical_columns):
    """Ordinal-encode the given columns, fitting one encoder per column on train.

    Both input frames are copied; the originals are not modified. Each encoder
    is fitted on the training column only and configured so that categories
    unseen during fitting are encoded as ``-1``. A column is transformed in the
    test frame only when it exists there.

    Args:
        train_df: Training DataFrame.
        test_df: Test DataFrame.
        categorical_columns: Iterable of column names to encode.

    Returns:
        tuple: ``(encoded_train, encoded_test, encoders)`` where ``encoders``
        maps each column name to its fitted ``OrdinalEncoder``.
    """
    from sklearn.preprocessing import OrdinalEncoder

    def _as_column(frame, name):
        # The encoder works on 2-D input, so present the column as (n_rows, 1).
        return frame[name].values.reshape(-1, 1)

    train, test = train_df.copy(), test_df.copy()
    encoders = {}
    for col in categorical_columns:
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        encoder.fit(_as_column(train, col))
        train[col] = encoder.transform(_as_column(train, col)).reshape(-1)
        if col in test:
            test[col] = encoder.transform(_as_column(test, col)).reshape(-1)
        encoders[col] = encoder
    return train, test, encoders

def sep_ml_xy(df, target):
    """Split a frame into a feature frame and a target column.

    Args:
        df: DataFrame containing both features and the target.
        target: Label of the target column.

    Returns:
        tuple: ``(x, y)`` where ``x`` is ``df`` without the target column and
        ``y`` is the target column itself.
    """
    labels = df[target]
    return df.drop(columns=target), labels

def ml_train_valid(model, metric, metric_options, train_data, train_target, test_data, test_target):
    """Fit a model, predict on held-out data and score the predictions.

    Args:
        model: Estimator exposing ``fit(X, y)`` (returning the fitted
            estimator) and ``predict(X)``.
        metric: Callable invoked as ``metric(y_true, y_pred, **metric_options)``.
        metric_options: Dict of extra keyword arguments for ``metric``.
        train_data: Features used for fitting.
        train_target: Targets used for fitting.
        test_data: Features to predict on.
        test_target: True targets for scoring.

    Returns:
        tuple: ``(predictions, score, fitted_model)``.
    """
    fitted = model.fit(train_data, train_target)
    predictions = fitted.predict(test_data)
    score = metric(test_target, predictions, **metric_options)
    return predictions, score, fitted

def ml_predict(model, test_data):
    """Return ``model.predict(test_data)``.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_data: Features to predict on.

    Returns:
        Whatever ``model.predict`` returns for ``test_data``.
    """
    return model.predict(test_data)

In [131]:
# The row identifier is not a feature.
train = train.drop(columns=['ID'])
test = test.drop(columns=['ID'])

# Apply the same clean-up to both frames.
for frame in (train, test):
    # 근로기간: keep only the digits of the text value ("Unknown" -> 5 via only_int).
    frame["근로기간"] = frame["근로기간"].apply(lambda x: int(only_int(x)))

    # 대출기간: digits of the text value divided by 12 and truncated
    # (presumably a month count converted to whole years).
    frame["대출기간"] = frame["대출기간"].apply(lambda x: int(int(only_int(x))/12))

    # Amount/count columns are cast to plain integers.
    for span in ["총상환이자", "총연체금액", "연체계좌수"]:
        frame[span] = frame[span].apply(lambda x: int(x))
# display(train.head(3), test.head(3))

In [132]:
categorical_features = ['주택소유상태', '대출목적']

for i in categorical_features:
    # Fit on the training column only, then encode it.
    le = LabelEncoder().fit(train[i])
    train[i] = le.transform(train[i])

    # Labels that occur only in the test column are appended to the encoder's
    # known classes before the test column is transformed.
    unseen = [case for case in np.unique(test[i]) if case not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test[i] = le.transform(test[i])

# display(train.head(3), test.head(3))

In [133]:
# Feature notes (column names are kept exactly as they appear in the data):
#   all columns      : 대출금액, 대출기간, 근로기간, 주택소유상태, 연간소득, 부채_대비_소득_비율, 총계좌수, 대출목적, 최근_2년간_연체_횟수, 총상환원금, 총상환이자, 총연체금액, 연체계좌수
#   judged meaningful: 근로기간, 주택소유상태, 연간소득, 부채_대비_소득_비율, 대출목적, 최근_2년간_연체_횟수, 총상환원금, 총상환이자, 총연체금액
#   judged meaningless: 대출금액, 대출기간, 총계좌수, 연체계좌수
#   candidates to add: 연체계좌수/총계좌수, (총상환원금+총상환이자-총연체금액)/대출원금, 대출금액/대출기간/연간소득
#
# Earlier experiment, kept for reference: min-max scaling of the raw amount columns.
# columns_to_scale = ['대출금액','연간소득', '부채_대비_소득_비율', '총상환원금', '총상환이자', '총연체금액']
# scaler = MinMaxScaler(feature_range=(0, 10))
# train[columns_to_scale] = scaler.fit_transform(train[columns_to_scale])
# test[columns_to_scale] = scaler.transform(test[columns_to_scale])

# Derived ratio features, computed the same way for both frames.
# (Only the first two candidates from the notes are implemented.)
for frame in (train, test):
    frame['연체계좌수/총계좌수'] = frame['연체계좌수'] / frame['총계좌수']
    frame['총상환원금+총상환이자-총연체금액/대출원금'] = (
        frame['총상환원금'] + frame['총상환이자'] - frame['총연체금액']
    ) / frame['대출금액']

# Rescale the derived columns to the 0-10 range; the scaler is fitted on train
# only and then applied unchanged to test.
columns_to_scale = ['연체계좌수/총계좌수', '총상환원금+총상환이자-총연체금액/대출원금']
scaler = MinMaxScaler(feature_range=(0, 10))
train[columns_to_scale] = scaler.fit_transform(train[columns_to_scale])
test[columns_to_scale] = scaler.transform(test[columns_to_scale])

# Drop columns that are not fed to the model.
# NOTE(review): the notes above list '대출금액', '대출기간', '총계좌수', '연체계좌수' as
# meaningless, but the columns actually dropped here are the two below.
dropped_columns = ['최근_2년간_연체_횟수', '총계좌수']
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

display(train.head(3), test.head(3))

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,대출목적,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급,연체계좌수/총계좌수,총상환원금+총상환이자-총연체금액/대출원금
0,12480000,3,6,3,72000000,18.9,1,0,0,0,0,C,0.0,0.002985
1,14400000,5,10,1,130800000,22.33,10,373572,234060,0,0,B,0.0,0.406111
2,12000000,3,5,1,96000000,8.6,1,928644,151944,0,0,A,0.0,0.863269


Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,대출목적,총상환원금,총상환이자,총연체금액,연체계좌수,연체계좌수/총계좌수,총상환원금+총상환이자-총연체금액/대출원금
0,16800000,3,8,1,132000000,19.64,10,394692,146604,0,0,0.0,0.310799
1,8400000,3,5,3,89971200,15.84,1,0,0,0,0,0.0,0.002985
2,17280000,3,6,3,150000000,8.41,3,1786980,281820,0,0,0.0,1.146752


In [134]:
from sklearn.model_selection import GridSearchCV

# Ordinal-encode every column that is still of object dtype and keep the fitted
# encoders in `ord_dict` so predictions can be decoded later.
categorical_columns = extract_categorical_columns(train)
train, test, ord_dict = ordinal_encoding(train, test, categorical_columns)

# Separate the target and hold out 20% of the rows for validation.
train_x, train_y = sep_ml_xy(train, "대출등급")
train_x, valid_x, train_y, valid_y = tts(train_x, train_y, train_size=0.8, shuffle=True, random_state=0)

# Unfitted base estimator (e.g. as a template for a grid search).
# It is deliberately not fitted here: the model is re-created and trained in the
# training cell below, so fitting it now would only repeat a full training run
# whose result is discarded.
model = XGBClassifier()

In [135]:
# # {'learning_rate': 0.1, 'max_depth': 10, 'reg_alpha': 0.01}
# params = {'max_depth':[8,9,10], 'reg_alpha':[0.08,0.010,0.12], 'learning_rate':[0.08, 0.09, 0.10]}
# gridcv = GridSearchCV(model, param_grid=params, cv=3)
# gridcv.fit(train_x, train_y, early_stopping_rounds=30, eval_metric='merror', eval_set=[(valid_x, valid_y)])
# print(gridcv.best_params_)

In [136]:
# model = XGBClassifier(n_estimators=1000, learning_rate=0.5, max_depth=12, reg_alpha=0.05)
# Early stopping and the evaluation metric are configured on the estimator:
# passing them to fit() has been deprecated since xgboost 1.6 and is rejected
# by newer releases.
model = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=10,
    reg_alpha=0.01,
    early_stopping_rounds=200,
    eval_metric='merror',
)
model = model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)])

# Macro-averaged F1 on the hold-out split.
# NOTE(review): the same split drives early stopping, so this score is likely
# optimistic compared with a truly unseen set.
pred = model.predict(valid_x)
evaluate = f1_score(valid_y, pred, average="macro")

#_, evaluate, model = ml_train_valid(model, f1_score, {"average": "macro"}, train_x, train_y, valid_x, valid_y)

[0]	validation_0-merror:0.19736
[1]	validation_0-merror:0.19357
[2]	validation_0-merror:0.19394
[3]	validation_0-merror:0.19362
[4]	validation_0-merror:0.19207




[5]	validation_0-merror:0.19129
[6]	validation_0-merror:0.18864
[7]	validation_0-merror:0.18589
[8]	validation_0-merror:0.18568
[9]	validation_0-merror:0.18480
[10]	validation_0-merror:0.18423
[11]	validation_0-merror:0.18334
[12]	validation_0-merror:0.18241
[13]	validation_0-merror:0.18246
[14]	validation_0-merror:0.18220
[15]	validation_0-merror:0.18116
[16]	validation_0-merror:0.18002
[17]	validation_0-merror:0.17877
[18]	validation_0-merror:0.17753
[19]	validation_0-merror:0.17623
[20]	validation_0-merror:0.17488
[21]	validation_0-merror:0.17275
[22]	validation_0-merror:0.17166
[23]	validation_0-merror:0.17098
[24]	validation_0-merror:0.17057
[25]	validation_0-merror:0.16953
[26]	validation_0-merror:0.16854
[27]	validation_0-merror:0.16735
[28]	validation_0-merror:0.16668
[29]	validation_0-merror:0.16610
[30]	validation_0-merror:0.16522
[31]	validation_0-merror:0.16429
[32]	validation_0-merror:0.16377
[33]	validation_0-merror:0.16263
[34]	validation_0-merror:0.16133
[35]	validation

In [137]:
from datetime import datetime

print("valid score:", evaluate)

# Predict on the test frame and map the encoded classes back to the original
# labels with the encoder fitted on the target column.
pred = ml_predict(model, test)
encoded_grades = pred.reshape(-1, 1)
submission['대출등급'] = ord_dict["대출등급"].inverse_transform(encoded_grades).reshape(-1)

print(submission)

# The output file is named after the current local time (yymmdd.HHMM).
current_datetime = datetime.now().strftime("%y%m%d.%H%M")
print(current_datetime)

submission.to_csv(f"{current_datetime}.csv", index=False)

valid score: 0.8179394085534073
               ID 대출등급
0      TEST_00000    B
1      TEST_00001    B
2      TEST_00002    A
3      TEST_00003    C
4      TEST_00004    C
...           ...  ...
64192  TEST_64192    D
64193  TEST_64193    D
64194  TEST_64194    D
64195  TEST_64195    C
64196  TEST_64196    A

[64197 rows x 2 columns]
240124.1557
