## 목차
### 1. Library Import
### 2. 학습데이터 생성
### 3. Light-gbm 모델 훈련
### 4. 교차 검증 점수 확인
### 5. 제출 파일 생성

# 1. Library Import

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc #garbage collection 감시
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
import seaborn as sns
from functools import partial
from bayes_opt import BayesianOptimization
from lightgbm import LGBMClassifier

warnings.filterwarnings(action='ignore')

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    min  = int(x[10:12])
    sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^0-9]+", '', str(x))
    if x =='':
        return 0
    else:
        return int(x)

# 2. 학습 데이터 생성

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings

PATH = "data/"

## 2.1 train_err

* train_err_data.csv 파일의 errtype column의 value_counts를 카운트하여 학습에 사용.  
* 주어진 기간동안 user_id별로 peroblem가 한 번이라도 발생했는 지를 확인.  

In [3]:
train = pd.read_csv(PATH+"0203_train.csv")
test = pd.read_csv(PATH+"0203_test.csv")

In [4]:
print(train.columns)
print(test.columns)

Index(['user_id', 'model_nm', 'update', 'err_1', 'err_2', 'err_3', 'err_4',
       'err_5', 'err_6', 'err_7', 'err_8', 'err_9', 'err_10', 'err_11',
       'err_12', 'err_13', 'err_14', 'err_15', 'err_16', 'err_17', 'err_18',
       'err_19', 'err_20', 'err_21', 'err_22', 'err_23', 'err_24', 'err_25',
       'err_26', 'err_27', 'err_28', 'err_30', 'err_31', 'err_32', 'err_33',
       'err_34', 'err_35', 'err_36', 'err_37', 'err_38', 'err_39', 'err_40',
       'err_41', 'err_42', 'fw_rank', 'q_meanmax_0', 'q_meanmax_1',
       'q_meanmax_2', 'q_meanmax_5', 'q_meanmax_6', 'q_meanmax_7',
       'q_meanmax_8', 'q_meanmax_9', 'q_meanmax_10', 'q_meanmax_11',
       'q_meanmax_12', 'q_chg_0', 'q_chg_1', 'q_chg_2', 'q_chg_5', 'q_chg_6',
       'q_chg_11', 'q1_nonzero_counts', 'q2_nonzero_counts',
       'q5_nonzero_counts', 'q6_nonzero_counts', 'q11_nonzero_counts',
       'have_quality', 'weekend_error', 'sunday_error', 'night_error',
       'max_err_day', 'max_err_sunday', 'max_err_weekend', 

In [5]:
# 변수 이름 변경
# error  -> train_x
# problem-> train_y

train_x = train.drop(columns = ["user_id", "update", "label", "30min_second"])
train_y = train.label

test_x = test.drop(columns = ["user_id", "update", "30min_second"])

print(train_x.shape)
print(train_y.shape)

(15000, 82)
(15000,)


In [8]:
def lgbm_cv(feature_fraction, 
            bagging_fraction,
            bagging_freq,
            lr,
            lamb,
          x_data=None, y_data=None, n_splits=5, output='score'):
    
    auc_scores   = []
    k_fold = KFold(n_splits=5, shuffle=True, random_state=718917)
    models = []
    for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
        X = train_x.iloc[train_idx,:]
        y = train_y[train_idx]
        valid_x = train_x.iloc[val_idx,:]
        valid_y = train_y[val_idx]

        d_train= lgb.Dataset(X, y)
        d_val  = lgb.Dataset(valid_x, valid_y)
    
        params =      {
                'boosting_type' : 'dart',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'feature_fraction' : feature_fraction,
                'bagging_fraction': bagging_fraction, 
                'bagging_freq': int(bagging_freq),
                'seed': 1015,
                'learning_rate' : lr,
                "categorical_feature" : None,
                "lambda" : lamb
                }
        
        model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1500,
                        valid_sets      = d_val,
                        verbose_eval    = 1500,
        )
        
        valid_prob = model.predict(valid_x)
        valid_pred = np.where(valid_prob > 0.55, 1, 0)

        # cal scores
        auc_score = roc_auc_score(   valid_y, valid_prob)

        # append scores
        models.append(model)
        auc_scores.append(auc_score)
    
    if output == 'score':
        return np.mean(auc_scores)
    if output == 'model':
        return models

In [9]:
func_fixed = partial(lgbm_cv, x_data=train_x, y_data=train_y, n_splits=5, output="score")
regBO = BayesianOptimization(
    func_fixed, 
    {
                'feature_fraction' : (0.6, 0.9),
                'bagging_fraction': (0.7,1), 
                'bagging_freq': (10,100),
                'lr' : (0.001,0.01),
                "lamb" : (0.2,0.4)
    }, random_state=231)

regBO.maximize(init_points=5, n_iter=10000)

|   iter    |  target   | baggin... | baggin... | featur... |   lamb    |    lr     |
-------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 4024, number of negative: 7976
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12596
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 82
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.335333 -> initscore=-0.684161
[LightGBM] [Info] Start training from score -0.684161
[1500]	valid_0's auc: 0.827701
[LightGBM] [Info] Number of positive: 3975, number of negative: 8025
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12566
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 82
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.331250 -> initscore=-0.702537
[LightGBM] [Info] Start training from score -0.702537
[1500]	valid_0's au

KeyboardInterrupt: 

In [None]:
params = regBO.max['params']

In [None]:
params

{'colsample_bytree': 0.9532847935680113,
 'learning_rate': 0.09278453605939078,
 'max_depth': 578.1033906895178,
 'n_estimators': 39.602849442683194,
 'num_leaves': 413.07521762936597}

In [None]:
models = lgbm_cv(
    params['colsample_bytree'], 
    params['learning_rate'],
    params["max_depth"],
    params["n_estimators"],
    params["num_leaves"],
    x_data=train_x, y_data=train_y, n_splits=5, output='model')

# 5. 제출 파일 생성

In [None]:
test_err  = pd.read_csv(PATH+'test_err_data.csv')
display(test_err.head())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,20201101030227,model_1,04.16.3553,31,1
1,30000,20201101030227,model_1,04.16.3553,33,2
2,30000,20201101030228,model_1,04.16.3553,15,1
3,30000,20201101030256,model_1,04.16.3553,22,1
4,30000,20201101030300,model_1,04.16.3553,11,1


In [None]:
# 데이터 설명을 확인하면
# test 데이터는 ueser_id가 30000부터 44998까지 총 14999개가 존재.
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [None]:
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
    # person_idx - test_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    test_x[person_idx - test_user_id_min,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
print(test_x.shape)

100%|██████████| 16532648/16532648 [00:40<00:00, 404165.80it/s]

(14999, 42)





In [None]:
# 예측
pred_y_list = []
for model in models:
    pred_y = model.predict(test_x)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [None]:
pred_ensemble

array([[1. ],
       [0. ],
       [0. ],
       ...,
       [0.6],
       [1. ],
       [0.2]])

In [None]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [None]:
sample_submssion['problem'] = pred_ensemble.reshape(-1)

In [None]:
SUB_PATH = '/content/drive/MyDrive/YBIGTA/21-1 겨울방학/lg 품질변화 공모전/quality_change_prediction/submissions/'
sample_submssion.to_csv(SUB_PATH+"dacon_baseline_0111(3).csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,1.0
1,30001,0.0
2,30002,0.0
3,30003,1.0
4,30004,1.0
...,...,...
14994,44994,0.0
14995,44995,0.0
14996,44996,0.6
14997,44997,1.0
