In [1]:
import random
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

seed = 42
deterministic = False

# Seed every random number generator used here with the same value:
# PyTorch (default generator and all CUDA devices), NumPy, and Python's `random`.
torch.cuda.manual_seed_all(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Opt-in: set the cuDNN deterministic flag and switch off cuDNN benchmark mode.
if deterministic:
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [2]:
# Debug switch: when True, the classifiers built later in this notebook are
# created with 2 boosting rounds instead of `n_estimators`.
DEBUG = False

In [3]:
# Read both preprocessed splits and discard the 'ID' column from each.
train_df = pd.read_csv("./data/train_preproc.csv").drop(columns='ID')
test_df = pd.read_csv("./data/test_preproc.csv").drop(columns='ID')

In [4]:
# Preview the first rows of the training frame (features plus the '대출등급' label column).
train_df.head()

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급
0,12480000,36,6,0,72000000,18.9,15,4,0,0,0.0,0.0,0.0,2
1,14400000,60,10,2,130800000,22.33,21,2,0,373572,234060.0,0.0,0.0,1
2,12000000,36,5,2,96000000,8.6,14,4,0,928644,151944.0,0.0,0.0,0
3,14400000,36,8,2,132000000,15.09,15,4,0,325824,153108.0,0.0,0.0,2
4,18000000,60,-1,0,71736000,25.39,19,3,0,228540,148956.0,0.0,0.0,1


In [5]:
# Preview the first rows of the test frame (same feature columns, no label column).
test_df.head()

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
0,16800000,36,8,2,132000000,19.64,12,2,0,394692,146604.0,0.0,0.0
1,8400000,36,5,0,89971200,15.84,25,4,0,0,0.0,0.0,0.0
2,17280000,36,6,0,150000000,8.41,20,0,0,1786980,281820.0,0.0,0.0
3,14400000,36,5,2,66000000,13.72,30,0,1,669024,281724.0,0.0,0.0
4,27600000,36,5,0,55200000,30.5,12,0,0,1250052,614844.0,0.0,0.0


In [6]:
# Separate the label column ('대출등급') from the feature columns.
y_data = train_df['대출등급']
x_data = train_df.drop(columns='대출등급')

In [7]:
# Standardise the feature columns with StandardScaler.
# Other candidate scalers: MaxAbsScaler, MinMaxScaler, RobustScaler.
#
# The scaler is fitted on the feature columns taken straight from `train_df`
# rather than on the current value of `x_data`. On a fresh run the result is
# the same, but re-running this cell no longer re-fits the scaler on
# already-scaled values (which would make `x_scaler` useless for the test set).
x_scaler = StandardScaler()
x_data = x_scaler.fit_transform(train_df.drop('대출등급', axis=1))

In [8]:
# Hold out 20% of the rows for validation / early stopping.
# `random_state=seed` pins the shuffle so the split is the same on every run
# of this cell, independent of how much of the global NumPy RNG was consumed.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    shuffle=True,
    random_state=seed,
)
print(len(x_train), len(x_test))

77035 19259


In [9]:
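# Reference only: regression objectives offered by each library.
# None of these are passed to the classifiers defined in the next cell.

# XGBoost regression objectives
    # reg:squarederror
    # reg:squaredlogerror
    # reg:logistic
    # reg:pseudohubererror
    # reg:absoluteerror
    # reg:quantileerror

# LightGBM regression objectives
    # regression
    # regression_l1
    # huber
    # fair
    # poisson
    # quantile
    # mape
    # tweedie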

In [10]:
# Upper bound on boosting rounds; DEBUG shrinks it to 2 for quick smoke runs.
n_estimators = 10000

# One XGBoost and one LightGBM classifier sharing the same learning rate.
clf_list = [
    XGBClassifier(
        max_depth=7,
        learning_rate=0.08,
        n_estimators=2 if DEBUG else n_estimators,
        subsample=0.75,
        colsample_bytree=1,
        gamma=0,
    ),
    LGBMClassifier(
        learning_rate=0.08,
        n_estimators=2 if DEBUG else n_estimators,
    ),
]

In [11]:
from sklearn.metrics import f1_score
import numpy as np
import xgboost

def f1_eval_xgb(y_pred, dtrain):
    """Custom XGBoost evaluation metric: macro-F1 error (``1 - macro F1``).

    Args:
        y_pred: Per-class scores; the predicted label is the argmax over axis 1.
        dtrain: Object whose ``get_label()`` returns the true labels.

    Returns:
        The tuple ``('f1_err', value)``; lower is better.
    """
    true_labels = dtrain.get_label()
    predicted_labels = np.argmax(y_pred, axis=1)
    macro_f1 = f1_score(true_labels, predicted_labels, average='macro')
    return 'f1_err', 1 - macro_f1

def f1_eval_lgbm(y_true, y_pred):
    """Custom LightGBM evaluation metric: macro F1 for a multi-class task.

    Args:
        y_true: True labels, one per sample.
        y_pred: Per-class scores. Either a 2-D array of shape
            ``(n_samples, n_classes)`` or the flat 1-D class-major layout of
            length ``n_samples * n_classes`` (element ``j * n_samples + i`` is
            the score of sample ``i`` for class ``j``), which older LightGBM
            releases pass to custom metrics.

    Returns:
        The tuple ``('f1', score, True)``; the final ``True`` marks the metric
        as higher-is-better.
    """
    scores = np.asarray(y_pred)
    if scores.ndim == 1:
        # Flat class-major layout: rows of the reshaped array are classes,
        # so transpose to get one row per sample before taking the argmax.
        scores = scores.reshape(-1, len(y_true)).T
    score = f1_score(y_true, np.argmax(scores, axis=1), average='macro')
    return 'f1', score, True

In [12]:
# Module-level dict; fit_clf itself neither reads nor writes it.
eval_result = {}
def fit_clf(classifier, x_train, y_train, eval_set, early_stopping_rounds):
    """Fit a classifier with early stopping on a macro-F1 validation metric.

    Args:
        classifier: An ``XGBClassifier`` or ``LGBMClassifier`` instance
            (subclasses are accepted). It is fitted in place.
        x_train: Training features.
        y_train: Training labels.
        eval_set: A single ``(x_valid, y_valid)`` pair used for evaluation.
        early_stopping_rounds: Number of rounds passed to the library's
            early-stopping mechanism.

    Returns:
        The same ``classifier`` object, after fitting.

    Raises:
        TypeError: If ``classifier`` is neither of the supported types.
    """
    if isinstance(classifier, XGBClassifier):
        classifier.fit(
            x_train,
            y_train,
            early_stopping_rounds=early_stopping_rounds,
            eval_metric=f1_eval_xgb,
            eval_set=[eval_set],
        )
        return classifier

    if isinstance(classifier, LGBMClassifier):
        callback = lgb.early_stopping(stopping_rounds=early_stopping_rounds)
        classifier.fit(
            x_train,
            y_train,
            eval_metric=f1_eval_lgbm,
            eval_set=[eval_set],
            callbacks=[callback],
        )
        return classifier

    # Fail loudly instead of returning None, which would only surface later
    # as an AttributeError when the "model" is used for prediction.
    raise TypeError(
        f"fit_clf supports XGBClassifier and LGBMClassifier, got {type(classifier).__name__}"
    )

In [13]:
# Fit every classifier against the held-out split, stopping after 30 rounds
# without improvement, and keep the fitted models in order.
models = []
for clf in clf_list:
    models += [
        fit_clf(clf, x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=30)
    ]

[0]	validation_0-mlogloss:1.83925	validation_0-f1_err:0.41373




[1]	validation_0-mlogloss:1.73880	validation_0-f1_err:0.36236
[2]	validation_0-mlogloss:1.65806	validation_0-f1_err:0.34543
[3]	validation_0-mlogloss:1.58608	validation_0-f1_err:0.34107
[4]	validation_0-mlogloss:1.51604	validation_0-f1_err:0.32553
[5]	validation_0-mlogloss:1.45446	validation_0-f1_err:0.31436
[6]	validation_0-mlogloss:1.40034	validation_0-f1_err:0.30371
[7]	validation_0-mlogloss:1.35155	validation_0-f1_err:0.29936
[8]	validation_0-mlogloss:1.30926	validation_0-f1_err:0.30232
[9]	validation_0-mlogloss:1.25955	validation_0-f1_err:0.29188
[10]	validation_0-mlogloss:1.21440	validation_0-f1_err:0.28420
[11]	validation_0-mlogloss:1.17456	validation_0-f1_err:0.28037
[12]	validation_0-mlogloss:1.14213	validation_0-f1_err:0.28090
[13]	validation_0-mlogloss:1.11113	validation_0-f1_err:0.27754
[14]	validation_0-mlogloss:1.08149	validation_0-f1_err:0.27536
[15]	validation_0-mlogloss:1.05272	validation_0-f1_err:0.27336
[16]	validation_0-mlogloss:1.02620	validation_0-f1_err:0.27248
[

In [25]:
# Scale the test features with the scaler fitted on the training features.
test_x = x_scaler.transform(test_df)

# Collect each model's class probabilities for the test set.
predictions = []
for model in models:
    predictions += [model.predict_proba(test_x)]

# Soft voting: average the probabilities across models, then take the most likely class.
pred_value = np.mean(predictions, axis=0).argmax(axis=1)

In [29]:
# Sanity check: number of predicted labels for the test set.
len(pred_value)

64197

In [28]:
# Map class indices to letter grades: 0 -> 'A', 1 -> 'B', 2 -> 'C', ...
pred_chr = [chr(ord('A') + int(pred)) for pred in pred_value]
len(pred_chr)

(64197,
 ['B',
  'B',
  'A',
  'C',
  'C',
  'A',
  'B',
  'B',
  'B',
  'C',
  'C',
  'C',
  'C',
  'C',
  'B',
  'B',
  'C',
  'D',
  'B',
  'C',
  'C',
  'C',
  'C',
  'F',
  'A',
  'B',
  'A',
  'D',
  'B',
  'A',
  'D',
  'B',
  'A',
  'B',
  'B',
  'D',
  'B',
  'B',
  'B',
  'E',
  'C',
  'C',
  'B',
  'C',
  'C',
  'E',
  'D',
  'E',
  'A',
  'F',
  'B',
  'C',
  'D',
  'C',
  'D',
  'D',
  'A',
  'C',
  'B',
  'C',
  'C',
  'B',
  'A',
  'C',
  'B',
  'D',
  'C',
  'C',
  'B',
  'E',
  'B',
  'B',
  'C',
  'C',
  'C',
  'C',
  'C',
  'A',
  'B',
  'C',
  'B',
  'F',
  'A',
  'B',
  'B',
  'C',
  'C',
  'D',
  'C',
  'C',
  'B',
  'E',
  'C',
  'B',
  'C',
  'D',
  'B',
  'D',
  'B',
  'C',
  'B',
  'C',
  'B',
  'C',
  'C',
  'B',
  'C',
  'E',
  'C',
  'D',
  'B',
  'C',
  'C',
  'B',
  'D',
  'D',
  'B',
  'C',
  'B',
  'F',
  'C',
  'B',
  'C',
  'C',
  'D',
  'B',
  'E',
  'C',
  'A',
  'C',
  'C',
  'A',
  'D',
  'B',
  'B',
  'B',
  'B',
  'D',
  'C',
  'G',
  'C',
  'A'

In [30]:
# Destination of the submission file; create its directory first because
# `to_csv` does not create missing parent directories.
submission_path = Path('./data/submission/lgbm+gxb_1.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)

# Fill the sample submission's '대출등급' column with the predicted letter grades.
submission_df = pd.read_csv('./data/sample_submission.csv')
submission_df['대출등급'] = pred_chr
submission_df.to_csv(submission_path, index=False)

# Read the file back to confirm what was written.
pd.read_csv(submission_path)

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,B
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C
...,...,...
64192,TEST_64192,D
64193,TEST_64193,D
64194,TEST_64194,D
64195,TEST_64195,C
