In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv('./data/train.csv')
train = train.drop(['index'], axis=1)

test = pd.read_csv('./data/test.csv')
test = test.drop(['index'], axis=1)

submit = pd.read_csv('./data/sample_submission.csv')

# 전처리

## Occyp_type
- DAYS_EMPLOYED가 양수인 사람의 occyp_type을 'No Job'으로 변경, 나머지 결측값은 unknown으로 채운다.

In [3]:
def occyptype_pre(data):
    data.loc[data["DAYS_EMPLOYED"] >= 0,'occyp_type'] = "No job"
    data["occyp_type"] = data["occyp_type"].fillna("unknown")
    return

In [4]:
occyptype_pre(train)
occyptype_pre(test)

- 빈도수가 적은 카테고리 : Sales staff, specialist로 카테고리화

In [5]:
# 빈도수가 적은 카테고리 확인(1% 기준)
def find_minority_cats(cat_cols, data_df, composite_category='Sales staff', threshold=0.03):
    """ Find minority categories for each feature column, and create a 
        dictionary that maps those to selected composite category """
    minority_col_dict = {}
    minority_mapping_dict = {}
    
    # find all feature categories with less than 1% proportion
    for feature in cat_cols:
        minority_col_dict[feature] = []
        minority_mapping_dict[feature] = {}
        
        for category, proportion in data_df[feature].value_counts(normalize=True).iteritems():
            if proportion < threshold:
                minority_col_dict[feature].append(category)
                
                # map those minority cats to chosen composite feature
                minority_mapping_dict[feature] = { x : composite_category for x 
                                                  in minority_col_dict[feature]}
                
    return minority_mapping_dict, minority_col_dict

In [6]:
cat_min_mappings, minority_cols = find_minority_cats(["occyp_type"], train)
minority_cols

{'occyp_type': ['Cooking staff',
  'Security staff',
  'Cleaning staff',
  'Private service staff',
  'Low-skill Laborers',
  'Waiters/barmen staff',
  'Secretaries',
  'Realty agents',
  'HR staff',
  'IT staff']}

In [7]:
cat_min_mappings

{'occyp_type': {'Cooking staff': 'Sales staff',
  'Security staff': 'Sales staff',
  'Cleaning staff': 'Sales staff',
  'Private service staff': 'Sales staff',
  'Low-skill Laborers': 'Sales staff',
  'Waiters/barmen staff': 'Sales staff',
  'Secretaries': 'Sales staff',
  'Realty agents': 'Sales staff',
  'HR staff': 'Sales staff',
  'IT staff': 'Sales staff'}}

In [8]:
train["occyp_type"] = train["occyp_type"].replace(cat_min_mappings["occyp_type"])
test["occyp_type"] = test["occyp_type"].replace(cat_min_mappings["occyp_type"])

In [9]:
special_type = ['High skill tech staff','Medicine staff','Accountants']

def specialist(data):
    data.loc[data["occyp_type"] == 'High skill tech staff','occyp_type'] = "specialist"
    data.loc[data["occyp_type"] == 'Medicine staff','occyp_type'] = "specialist"
    data.loc[data["occyp_type"] == 'Accountants','occyp_type'] = "specialist"

    data["occyp_type"] = data["occyp_type"].fillna("unknown")
    return

In [10]:
specialist(train)
specialist(test)

In [11]:
train["occyp_type"].unique()

array(['unknown', 'Laborers', 'Managers', 'Sales staff', 'specialist',
       'Core staff', 'Drivers', 'No job'], dtype=object)

## 필요없는 컬럼 제거 - Child_num(다중공선성-family_size), FLAG_MOBIL(only 1)

In [12]:
train= train.drop(['child_num', 'FLAG_MOBIL'],axis=1)
test= test.drop(['child_num', 'FLAG_MOBIL'],axis=1)

In [13]:
train.count()

gender           26457
car              26457
reality          26457
income_total     26457
income_type      26457
edu_type         26457
family_type      26457
house_type       26457
DAYS_BIRTH       26457
DAYS_EMPLOYED    26457
work_phone       26457
phone            26457
email            26457
occyp_type       26457
family_size      26457
begin_month      26457
credit           26457
dtype: int64

## binary 변수 0,1로 변경

In [14]:
#Binary variables
def binary(data):
    data['reality'] = data['reality'].replace(['N','Y'],[0,1])
    data['gender'] = data['gender'].replace(['F','M'],[0,1])
    data['car'] = data['car'].replace(['N','Y'],[0,1])
    print('--------------')

In [15]:
binary(train)
binary(test)

--------------
--------------


In [16]:
train.columns

Index(['gender', 'car', 'reality', 'income_total', 'income_type', 'edu_type',
       'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'work_phone', 'phone', 'email', 'occyp_type', 'family_size',
       'begin_month', 'credit'],
      dtype='object')

## 인코딩 - LabelEncoder

In [79]:
def preprocess(data):
    label_encoder = LabelEncoder()
    data['income_type']=label_encoder.fit_transform(data['income_type'])
    data['occyp_type']=label_encoder.fit_transform(data['occyp_type'])
    data['edu_type']=label_encoder.fit_transform(data['edu_type'])
    data['family_type']=label_encoder.fit_transform(data['family_type'])
    data['house_type']=label_encoder.fit_transform(data['house_type'])
    return

In [80]:
preprocess(train)
preprocess(test)

In [81]:
train.head()

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,F,0,0,202500.0,0,1,1,2,-13899,-4709,0,0,0,7,2.0,-6.0,1.0
1,F,0,1,247500.0,0,4,0,1,-11380,-1540,0,0,1,2,3.0,-5.0,1.0
2,M,1,1,450000.0,4,1,1,1,-19087,-4434,0,1,0,3,2.0,-22.0,2.0
3,F,0,1,202500.0,0,4,1,1,-15088,-2092,0,1,0,5,2.0,-37.0,0.0
4,F,1,1,157500.0,2,1,1,1,-15037,-2105,0,0,0,3,2.0,-26.0,2.0


## 소득 파생변수 생성(소득0~9)

In [82]:
count, bin_dividers =np.histogram(train['income_total'], bins=10)
count1, bin_dividers1 =np.histogram(test['income_total'], bins=10)

bin_names=['소득'+str(i) for i in range(10) ]
#bin_dividers는 train기준!!
train['income_cut']=pd.cut(x=train['income_total'], bins=bin_dividers, labels=bin_names, include_lowest=True)
test['income_cut']=pd.cut(x=test['income_total'], bins=bin_dividers1, labels=bin_names, include_lowest=True)

train['income_cut'] = train['income_cut'].astype('object')
test['income_cut'] = test['income_cut'].astype('object')

In [83]:
label_encoder = LabelEncoder()
train['income_cut']=label_encoder.fit_transform(train['income_cut'])
test['income_cut']=label_encoder.fit_transform(test['income_cut'])

## 날짜변수 파생변수 생성

In [84]:
# DAYS_BIRTH
train['DAYS_BIRTH_month']=np.floor((-train['DAYS_BIRTH'])/30)-((np.floor((-train['DAYS_BIRTH'])/30)/12).astype(int)*12)
train['DAYS_BIRTH_week']=np.floor((-train['DAYS_BIRTH'])/7)-((np.floor((-train['DAYS_BIRTH'])/7)/4).astype(int)*4)


# DAYS_EMPLOYED
train['DAYS_EMPLOYED_month']=np.floor((-train['DAYS_EMPLOYED'])/30)-((np.floor((-train['DAYS_EMPLOYED'])/30)/12).astype(int)*12)
train['DAYS_EMPLOYED_week']=np.floor((-train['DAYS_EMPLOYED'])/7)-((np.floor((-train['DAYS_EMPLOYED'])/7)/4).astype(int)*4)

# before_EMPLOYED
train['before_EMPLOYED']=train['DAYS_BIRTH']-train['DAYS_EMPLOYED']
train['before_EMPLOYED_month']=np.floor((-train['before_EMPLOYED'])/30)-((np.floor((-train['before_EMPLOYED'])/30)/12).astype(int)*12)
train['before_EMPLOYED_week']=np.floor((-train['before_EMPLOYED'])/7)-((np.floor((-train['before_EMPLOYED'])/7)/4).astype(int)*4)

# DAYS_BIRTH
test['DAYS_BIRTH_month']=np.floor((-test['DAYS_BIRTH'])/30)-((np.floor((-test['DAYS_BIRTH'])/30)/12).astype(int)*12)
test['DAYS_BIRTH_week']=np.floor((-test['DAYS_BIRTH'])/7)-((np.floor((-test['DAYS_BIRTH'])/7)/4).astype(int)*4)


# DAYS_EMPLOYED
test['DAYS_EMPLOYED_month']=np.floor((-test['DAYS_EMPLOYED'])/30)-((np.floor((-test['DAYS_EMPLOYED'])/30)/12).astype(int)*12)
test['DAYS_EMPLOYED_week']=np.floor((-test['DAYS_EMPLOYED'])/7)-((np.floor((-test['DAYS_EMPLOYED'])/7)/4).astype(int)*4)

# before_EMPLOYED
test['before_EMPLOYED']=test['DAYS_BIRTH']-test['DAYS_EMPLOYED']
test['before_EMPLOYED_month']=np.floor((-test['before_EMPLOYED'])/30)-((np.floor((-test['before_EMPLOYED'])/30)/12).astype(int)*12)
test['before_EMPLOYED_week']=np.floor((-test['before_EMPLOYED'])/7)-((np.floor((-test['before_EMPLOYED'])/7)/4).astype(int)*4)

# Normalization
## 수치형(연속형) 변수 표준화

In [85]:
continuos_variable = ['income_total','DAYS_BIRTH', 'DAYS_EMPLOYED','begin_month']
train_continuos = train[continuos_variable]
test_continuos = test[continuos_variable]
train_continuos_copy = train[continuos_variable]
test_continuos_copy = test[continuos_variable]

In [86]:
train_continuos_copy.columns

Index(['income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month'], dtype='object')

In [87]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler = scaler.fit(train_continuos)
train_continuos_scaled = scaler.transform(train_continuos)
test_continuos_scaled = scaler.transform(test_continuos)

train_df_continuos_scaled = pd.DataFrame(train_continuos_scaled, columns=train_continuos_copy.columns, index=list(train_continuos_copy.index.values))
test_df_continuos_scaled = pd.DataFrame(test_continuos_scaled, columns=test_continuos_copy.columns, index=list(test_continuos_copy.index.values))

In [88]:
train_df_continuos_scaled

Unnamed: 0,income_total,DAYS_BIRTH,DAYS_EMPLOYED,begin_month
0,0.149136,0.490075,-0.463930,1.215231
1,0.590848,1.089621,-0.440878,1.275620
2,2.578550,-0.744719,-0.461929,0.249003
3,0.149136,0.207081,-0.444893,-0.656836
4,-0.292575,0.219220,-0.444988,0.007446
...,...,...,...,...
26452,0.369992,0.923252,-0.444108,1.456788
26453,-0.071719,0.158765,-0.447679,-1.260729
26454,1.032559,1.398558,-0.444333,0.067835
26455,-0.160062,1.383563,-0.430454,-1.985400


In [89]:
train = train.drop(columns=continuos_variable)
test = test.drop(columns=continuos_variable)
train = pd.concat([train,train_df_continuos_scaled], axis = 1)
test = pd.concat([test,test_df_continuos_scaled], axis = 1)

In [95]:
train

Unnamed: 0,gender,car,reality,income_type,edu_type,family_type,house_type,work_phone,phone,email,...,DAYS_BIRTH_week,DAYS_EMPLOYED_month,DAYS_EMPLOYED_week,before_EMPLOYED,before_EMPLOYED_month,before_EMPLOYED_week,income_total,DAYS_BIRTH,DAYS_EMPLOYED,begin_month
0,0,0,0,0,1,1,2,0,0,0,...,1.0,0.0,0.0,-9190,6.0,0.0,0.149136,0.490075,-0.463930,1.215231
1,0,0,1,0,4,0,1,0,0,1,...,1.0,3.0,0.0,-9840,4.0,1.0,0.590848,1.089621,-0.440878,1.275620
2,1,1,1,4,1,1,1,0,1,0,...,2.0,3.0,1.0,-14653,8.0,1.0,2.578550,-0.744719,-0.461929,0.249003
3,0,0,1,0,4,1,1,0,1,0,...,3.0,9.0,2.0,-12996,1.0,0.0,0.149136,0.207081,-0.444893,-0.656836
4,0,1,1,2,1,1,1,0,0,0,...,0.0,10.0,0.0,-12932,11.0,3.0,-0.292575,0.219220,-0.444988,0.007446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,0,0,2,4,1,1,0,0,0,...,1.0,6.0,3.0,-10095,0.0,2.0,0.369992,0.923252,-0.444108,1.456788
26453,0,0,1,4,1,2,1,0,0,0,...,0.0,10.0,1.0,-12816,7.0,2.0,-0.071719,0.158765,-0.447679,-1.260729
26454,0,1,0,4,4,0,5,0,0,0,...,0.0,7.0,3.0,-8067,4.0,0.0,1.032559,1.398558,-0.444333,0.067835
26455,1,0,1,4,2,3,1,0,0,0,...,1.0,3.0,3.0,-10038,10.0,2.0,-0.160062,1.383563,-0.430454,-1.985400


# Optuna로 하이퍼 파라미터 찾기(사용안함)

In [96]:
X=train.drop('credit',axis=1)
y=train['credit']

In [97]:
def objective(trial,train=train,target=y):

    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

    param={
      'max_depth':trial.suggest_int('max_depth',20,30),
      'learning_rate':trial.suggest_categorical('learning_rate',[0.01]),
      'subsample':trial.suggest_categorical('subsample',[0.3,0.4,0.5,0.6,0.7,0.8,0.9]),
      'colsample_bytree':trial.suggest_categorical('colsample_bytree',[0.3,0.4,0.5,0.6,0.7,0.8,0.9]),
      'eval_metric': 'mlogloss',
      'objective' : 'multi:softprob',
      'random_state': 2,
      'n_estimators': 1000
  }

    model=XGBClassifier(**param)

    model.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=30,verbose=False)

    predictions=model.predict_proba(X_test)

    logloss = log_loss(y_test,predictions)
    return logloss

In [98]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
study.best_params

[32m[I 2021-05-24 16:37:44,740][0m A new study created in memory with name: no-name-a71a793d-51fb-4985-9ec3-c098c91dd557[0m


KeyboardInterrupt: 

# 모델링 - XGBClassifier
- **10Fold로 학습**

In [99]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

In [101]:
import random
random.seed(2)
lgb_models={}
for fold in range(10):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = XGBClassifier(n_estimators=1000,
                        objective='multi:softprob',
                        eval_metric='mlogloss',
                        learning_rate=0.01,
                        max_depth=27,
                        subsample=0.7,
                        colsample_bytree=0.5
                        )
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=100,
           verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')

[0]	validation_0-mlogloss:1.09122	validation_1-mlogloss:1.09351
[100]	validation_0-mlogloss:0.65121	validation_1-mlogloss:0.81705
[200]	validation_0-mlogloss:0.45586	validation_1-mlogloss:0.73140
[300]	validation_0-mlogloss:0.34901	validation_1-mlogloss:0.70884
[400]	validation_0-mlogloss:0.28197	validation_1-mlogloss:0.70917
[444]	validation_0-mlogloss:0.25981	validation_1-mlogloss:0.71309


[0]	validation_0-mlogloss:1.09129	validation_1-mlogloss:1.09353
[100]	validation_0-mlogloss:0.65343	validation_1-mlogloss:0.81234
[200]	validation_0-mlogloss:0.45809	validation_1-mlogloss:0.72269
[300]	validation_0-mlogloss:0.35127	validation_1-mlogloss:0.69653
[400]	validation_0-mlogloss:0.28421	validation_1-mlogloss:0.69432
[460]	validation_0-mlogloss:0.25523	validation_1-mlogloss:0.69859


[0]	validation_0-mlogloss:1.09116	validation_1-mlogloss:1.09347
[100]	validation_0-mlogloss:0.65291	validation_1-mlogloss:0.80993
[200]	validation_0-mlogloss:0.45739	validation_1-mlogloss:0.71911
[300]	valida

In [102]:
test

Unnamed: 0,gender,car,reality,income_type,edu_type,family_type,house_type,work_phone,phone,email,...,DAYS_BIRTH_week,DAYS_EMPLOYED_month,DAYS_EMPLOYED_week,before_EMPLOYED,before_EMPLOYED_month,before_EMPLOYED_week,income_total,DAYS_BIRTH,DAYS_EMPLOYED,begin_month
0,1,1,0,1,4,0,1,0,1,0,...,1.0,-7.0,-2.0,-387233,7.0,3.0,-0.734287,-1.435662,2.227162,-2.045789
1,0,0,1,2,1,1,1,0,1,0,...,1.0,1.0,2.0,-10293,7.0,2.0,-0.513431,-0.715444,-0.492750,-0.596447
2,0,0,1,4,4,1,1,1,1,0,...,1.0,7.0,3.0,-15670,6.0,2.0,-1.157623,0.016912,-0.431254,-0.838004
3,1,1,0,0,4,1,1,1,0,0,...,0.0,0.0,1.0,-16739,5.0,3.0,-0.734287,-0.788275,-0.448087,-0.898393
4,0,1,1,2,1,1,1,1,0,0,...,2.0,0.0,0.0,-8437,5.0,1.0,0.369992,-0.443637,-0.497944,1.094452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,1,1,4,2,1,1,1,1,0,...,0.0,1.0,0.0,-13159,6.0,3.0,0.149136,-0.627143,-0.469204,0.430170
9996,1,1,1,4,4,0,1,1,0,0,...,3.0,7.0,3.0,-9571,7.0,3.0,0.149136,1.207198,-0.439241,-0.475668
9997,0,0,1,4,4,1,1,0,0,0,...,2.0,11.0,2.0,-6998,5.0,3.0,1.032559,-1.203840,-0.531645,-1.743843
9998,0,1,0,0,4,1,1,0,1,0,...,3.0,0.0,3.0,-15456,11.0,0.0,-0.071719,-0.138747,-0.437568,-0.415279


## 각 Fold에서 test예측 후 평균 값 저장

In [123]:
submit.iloc[:,1:]=0
for fold in range(10):
    submit.iloc[:,1:] += lgb_models[fold].predict_proba(test)/10



In [124]:
submit.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.120542,0.159349,0.720109
1,26458,0.327916,0.302545,0.369539
2,26459,0.041448,0.058088,0.900464
3,26460,0.07347,0.078621,0.847909
4,26461,0.093778,0.209428,0.696795
5,26462,0.056275,0.070091,0.873634
6,26463,0.516964,0.349179,0.133858
7,26464,0.050674,0.059157,0.890169
8,26465,0.056693,0.095136,0.848172
9,26466,0.069137,0.348355,0.582508


In [126]:
submit.tail(20)

Unnamed: 0,index,0,1,2
9980,36437,0.075559,0.163162,0.761279
9981,36438,0.066307,0.06688,0.866813
9982,36439,0.075908,0.08761,0.836482
9983,36440,0.056962,0.418324,0.524714
9984,36441,0.07094,0.469137,0.459923
9985,36442,0.063205,0.572513,0.364282
9986,36443,0.055227,0.078308,0.866465
9987,36444,0.08356,0.052287,0.864153
9988,36445,0.139319,0.218896,0.641785
9989,36446,0.056817,0.049174,0.894009


In [127]:
submit.to_csv('./data/sub_final_xgb.csv', index=False)

# ** 모델 FOLD 지정해서 해봄 - Not good

In [128]:
pred = lgb_models[9].predict_proba(test)
pred



array([[0.11430281, 0.13718724, 0.74850994],
       [0.34229144, 0.34029436, 0.3174142 ],
       [0.03757434, 0.05160284, 0.9108228 ],
       ...,
       [0.04597031, 0.10831912, 0.8457106 ],
       [0.06054073, 0.6106652 , 0.32879403],
       [0.07838729, 0.38789922, 0.53371346]], dtype=float32)

In [129]:
submit1 = submit.copy()
submit1.iloc[:,1:]=0
submit1.iloc[:,1:] += lgb_models[9].predict_proba(test)



In [132]:
lgb_models[9]

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=27, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=0.7,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [130]:
submit1

Unnamed: 0,index,0,1,2
0,26457,0.114303,0.137187,0.748510
1,26458,0.342291,0.340294,0.317414
2,26459,0.037574,0.051603,0.910823
3,26460,0.058444,0.064084,0.877472
4,26461,0.089859,0.225346,0.684795
...,...,...,...,...
9995,36452,0.075330,0.262400,0.662270
9996,36453,0.248099,0.394262,0.357639
9997,36454,0.045970,0.108319,0.845711
9998,36455,0.060541,0.610665,0.328794


In [131]:
submit1.to_csv('./data/sub_final_xgb1.csv', index=False)