## 목차
### 1. Library Import
### 2. 학습데이터 생성
### 3. Light-gbm 모델 훈련
### 4. 교차 검증 점수 확인
### 5. 제출 파일 생성

# 1. Library Import

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc #garbage collection 감시
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
import seaborn as sns
warnings.filterwarnings(action='ignore')

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    min  = int(x[10:12])
    sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^0-9]+", '', str(x))
    if x =='':
        return 0
    else:
        return int(x)


PATH = '/content/drive/MyDrive/YBIGTA/21-1 겨울방학/lg 품질변화 공모전/quality_change_prediction/data/'

# 3. Light-gbm 모델 훈련

In [2]:
# 변수 이름 변경
# error  -> train_x
# problem-> train_y
train = pd.read_csv("data/0201_train.csv")
test = pd.read_csv("data/0201_test.csv")
train_x = train.drop(columns=["Unnamed: 0","user_id","label"])
train_y = train["label"]
test_x = test.drop(columns=["Unnamed: 0","user_id"])


print(train_x.shape)
print(train_y.shape)

(15000, 61)
(15000,)


In [68]:
# Train
#-------------------------------------------------------------------------------------
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#-------------------------------------------------------------------------------------
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.55

# 5 Kfold cross validation
k_fold = KFold(n_splits=5, shuffle=True, random_state=22)
for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x.iloc[train_idx,:]
    y = train_y[train_idx]
    valid_x = train_x.iloc[val_idx,:]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = CatBoostClassifier(iterations=100000,
                                 learning_rate=0.01,
                                 bootstrap_type='Bernoulli',
                                 eval_metric='AUC',
                                 metric_period=20,
                                 od_type='Iter',
                                 od_wait=50,
                                 random_seed=345,
                                 allow_writing_files=False)

    model.fit(trn_x, trn_y,
             eval_set=(val_x, val_y),
             cat_features=[],
             use_best_model=True,
             verbose=500)
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')



0:	test: 0.7684140	best: 0.7684140 (0)	total: 18.2ms	remaining: 30m 21s
500:	test: 0.8162714	best: 0.8162714 (500)	total: 4.59s	remaining: 15m 11s
1000:	test: 0.8197352	best: 0.8197798 (994)	total: 8.95s	remaining: 14m 45s
1500:	test: 0.8212048	best: 0.8213022 (1453)	total: 13.3s	remaining: 14m 32s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8213021795
bestIteration = 1453

Shrink model to first 1454 iterations.
0:	test: 0.7684140	best: 0.7684140 (0)	total: 10.6ms	remaining: 17m 44s




500:	test: 0.8162714	best: 0.8162714 (500)	total: 4.64s	remaining: 15m 22s


KeyboardInterrupt: 

# 4. 교차검증 점수 확인

In [65]:
auc_scores

[0.7569701665329958, 0.7628791142845875, 0.7521404818489825, 0.740648470583917]

In [66]:
print(np.mean(auc_scores))

0.7531595583126207


In [26]:
import lightgbm as lgbm

In [21]:
import numpy as np
import pandas as pd
import os
from scipy import stats
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn import linear_model
import datetime

# K Fold 및 기본 데이터 준비
folds = KFold(n_splits = 5, shuffle = True, random_state = 231)
oof_preds = np.zeros(train_x.shape[0])

In [23]:
# catboost 이용
print("using catboost")

sub_preds_catboost = np.zeros(test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_x)) :
    trn_x, trn_y = train_x.iloc[trn_idx,:], train_y[trn_idx]
    val_x, val_y = train_x.iloc[val_idx,:], train_y[val_idx]
    
    cb_model = CatBoostClassifier(iterations=1000,
                                 learning_rate=0.01,
                                 depth=16,
                                 l2_leaf_reg=4,
                                 bootstrap_type='Bernoulli',
                                 subsample=0.7,
                                 eval_metric='AUC',
                                 metric_period=50,
                                 od_type='Iter',
                                 od_wait=45,
                                 random_seed=43,
                                 allow_writing_files=False)
    cb_model.fit(trn_x, trn_y,
             eval_set=(val_x, val_y),
             cat_features=[],
             use_best_model=True,
             verbose=True)
    oof_preds_catboost[val_idx] = cb_model.predict(val_x)
    sub_preds_catboost += cb_model.predict(test_x) / folds.n_splits

using catboost




0:	test: 0.7796408	best: 0.7796408 (0)	total: 5.93s	remaining: 1h 38m 43s
50:	test: 0.8093445	best: 0.8093445 (50)	total: 4m 2s	remaining: 1h 15m 4s


KeyboardInterrupt: 

In [None]:
0:	test: 0.7644054	best: 0.7644054 (0)	total: 1.83s	remaining: 30m 31s
50:	test: 0.7953621	best: 0.7955361 (36)	total: 1m 18s	remaining: 24m 11s
100:	test: 0.8029209	best: 0.8029209 (100)	total: 2m 35s	remaining: 23m 2s
150:	test: 0.8062789	best: 0.8063342 (149)	total: 4m	remaining: 22m 30s
200:	test: 0.8093402	best: 0.8093402 (200)	total: 5m 30s	remaining: 21m 52s
250:	test: 0.8114553	best: 0.8114553 (250)	total: 6m 56s	remaining: 20m 42s
300:	test: 0.8126050	best: 0.8126232 (299)	total: 8m 27s	remaining: 19m 37s
350:	test: 0.8140343	best: 0.8140343 (350)	total: 9m 49s	remaining: 18m 9s
400:	test: 0.8149283	best: 0.8149323 (397)	total: 11m 15s	remaining: 16m 49s
450:	test: 0.8152823	best: 0.8155497 (438)	total: 12m 38s	remaining: 15m 23s
500:	test: 0.8157984	best: 0.8158212 (498)	total: 14m 3s	remaining: 14m
550:	test: 0.8159630	best: 0.8160268 (541)	total: 15m 27s	remaining: 12m 35s
600:	test: 0.8163525	best: 0.8163905 (593)	total: 16m 57s	remaining: 11m 15s
650:	test: 0.8166073	best: 0.8166407 (648)	total: 18m 26s	remaining: 9m 53s
700:	test: 0.8169411	best: 0.8169411 (700)	total: 19m 56s	remaining: 8m 30s

In [None]:
sub_preds_catboost += cb_model.predict(x_test) / folds.n_splits 

sub_cat = pd.DataFrame()
sub_cat['ID'] = test_1['ID']
sub_cat['target'] = np.exp(sub_preds_catboost)-1
sub_cat.to_csv('sansander_catboost.csv', index = False)

# 5. 제출 파일 생성

In [None]:
test_err  = pd.read_csv(PATH+'test_err_data.csv')
display(test_err.head())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,20201101030227,model_1,04.16.3553,31,1
1,30000,20201101030227,model_1,04.16.3553,33,2
2,30000,20201101030228,model_1,04.16.3553,15,1
3,30000,20201101030256,model_1,04.16.3553,22,1
4,30000,20201101030300,model_1,04.16.3553,11,1


In [None]:
# 데이터 설명을 확인하면
# test 데이터는 ueser_id가 30000부터 44998까지 총 14999개가 존재.
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [None]:
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
    # person_idx - test_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    test_x[person_idx - test_user_id_min,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
print(test_x.shape)

100%|██████████| 16532648/16532648 [00:40<00:00, 408044.66it/s]

(14999, 42)





In [None]:
# 예측
pred_y_list = []
for model in models:
    pred_y = model.predict(test_x)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [None]:
pred_ensemble

array([[0.77981028],
       [0.21930262],
       [0.29921489],
       ...,
       [0.43839372],
       [0.76868059],
       [0.35637135]])

In [None]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [None]:
sample_submssion['problem'] = pred_ensemble.reshape(-1)

In [None]:
SUB_PATH = '/content/drive/MyDrive/YBIGTA/21-1 겨울방학/lg 품질변화 공모전/quality_change_prediction/submissions/'
sample_submssion.to_csv(SUB_PATH+"dacon_baseline_0111(1).csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.779810
1,30001,0.219303
2,30002,0.299215
3,30003,0.648380
4,30004,0.578343
...,...,...
14994,44994,0.330235
14995,44995,0.311575
14996,44996,0.438394
14997,44997,0.768681
