## 목차
### 1. Library Import
### 2. 학습데이터 생성
### 3. Light-gbm 모델 훈련
### 4. 교차 검증 점수 확인
### 5. 제출 파일 생성

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1. Library Import

In [20]:
!pip install bayesian-optimization

Collecting bayesian-optimization

  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz





Building wheels for collected packages: bayesian-optimization

  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone

  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp36-none-any.whl size=11685 sha256=be707e9d030dba1a2f1fba1083b38d5998b627672c71bed9c514aa79956ee513

  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec

Successfully built bayesian-optimization

Installing collected packages: bayesian-optimization

Successfully installed bayesian-optimization-1.2.0


In [23]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc #garbage collection 감시
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
import seaborn as sns
from functools import partial
from bayes_opt import BayesianOptimization
from lightgbm import LGBMClassifier

warnings.filterwarnings(action='ignore')

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    min  = int(x[10:12])
    sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^0-9]+", '', str(x))
    if x =='':
        return 0
    else:
        return int(x)


PATH = '/content/drive/MyDrive/YBIGTA/21-1 겨울방학/lg 품질변화 공모전/quality_change_prediction/data/'

# 2. 학습 데이터 생성

## 2.1 train_err

* train_err_data.csv 파일의 errtype column의 value_counts를 카운트하여 학습에 사용.  
* 주어진 기간동안 user_id별로 peroblem가 한 번이라도 발생했는 지를 확인.  

In [6]:
train_err  = pd.read_csv(PATH+'train_err_data.csv')
display(train_err.head())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,10000,20201101025616,model_3,05.15.2138,15,1
1,10000,20201101030309,model_3,05.15.2138,12,1
2,10000,20201101030309,model_3,05.15.2138,11,1
3,10000,20201101050514,model_3,05.15.2138,16,1
4,10000,20201101050515,model_3,05.15.2138,4,0


In [7]:
# 데이터 설명을 확인하면
# ueser_id가 10000부터 24999까지 총 15000개가 연속적으로 존재.
display(train_err.head())
train_user_id_max = 24999
train_user_id_min = 10000
train_user_number = 15000

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,10000,20201101025616,model_3,05.15.2138,15,1
1,10000,20201101030309,model_3,05.15.2138,12,1
2,10000,20201101030309,model_3,05.15.2138,11,1
3,10000,20201101050514,model_3,05.15.2138,16,1
4,10000,20201101050515,model_3,05.15.2138,4,0


In [8]:
print(np.sort(train_err.errtype.unique()))
# errtype이 1부터 42까지 29를 제외한 41개가 존재. 
#이후 나올 데이터에서는 29 또는 42 이후 error 존재할 확률

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24

 25 26 27 28 30 31 32 33 34 35 36 37 38 39 40 41 42]


In [9]:
# user_id와 errtype만을 사용하여 데이터 셋 생성
# 모든 일자에 대해서 errtype별 발생 건수를 count
# pandas의 groupby를 활용할 경우 큰 연산 자원이 필요.
# numpy로 placeholder를 만들어 구현함.
id_error = train_err[['user_id','errtype']].values
error = np.zeros((train_user_number,42))

#사람별로 어떤 error가 몇번 일어났는지
for person_idx, err in tqdm(id_error):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    error[person_idx - train_user_id_min,err - 1] += 1
error.shape

100%|██████████| 16554663/16554663 [00:40<00:00, 411976.72it/s]


(15000, 42)

In [10]:
#참고) 일별 합산 데이터 생성 코드      
'''
train_err             = pd.read_csv(PATH+'train_err_data.csv')
train_err['datetime'] = train_err['time'].apply(make_datetime)

min_day = train_err.datetime.min().date()

train_err['days'] = train_err.loc[:,'datetime'].dt.date - min_day
train_err['days'] = train_err['days'].dt.days
display(train_err)

id_error = train_err[['user_id','errtype','days']].values
error = np.zeros((train_user_number,42, 33))
for idx, err, days in tqdm(id_error):
    error[idx - train_user_id_min,err - 1, days] += 1
error.shape
'''

"\ntrain_err             = pd.read_csv(PATH+'train_err_data.csv')\ntrain_err['datetime'] = train_err['time'].apply(make_datetime)\n\nmin_day = train_err.datetime.min().date()\n\ntrain_err['days'] = train_err.loc[:,'datetime'].dt.date - min_day\ntrain_err['days'] = train_err['days'].dt.days\ndisplay(train_err)\n\nid_error = train_err[['user_id','errtype','days']].values\nerror = np.zeros((train_user_number,42, 33))\nfor idx, err, days in tqdm(id_error):\n    error[idx - train_user_id_min,err - 1, days] += 1\nerror.shape\n"

## 2.2 problem

In [11]:
train_prob = pd.read_csv(PATH+'train_problem_data.csv')
problem = np.zeros(15000)
# error와 동일한 방법으로 person_idx - 10000 위치에 
# person_idx의 problem이 한 번이라도 발생했다면 1
# 없다면 0
problem[train_prob.user_id.unique()-10000] = 1 
problem.shape

(15000,)

## EDA

# 3. Light-gbm 모델 훈련

In [12]:
# 변수 이름 변경
# error  -> train_x
# problem-> train_y

train_x = error
train_y = problem
del error, problem
print(train_x.shape)
print(train_y.shape)

(15000, 42)

(15000,)


In [84]:
def lgbm_cv(colsample_bytree, 
            learning_rate,
            max_depth,
            n_estimators,
            num_leaves,
          x_data=None, y_data=None, n_splits=5, output='score'):
    
    score = 0
    kf = KFold(n_splits=n_splits, random_state=4321)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data[train_index], y_data[train_index]
        x_valid, y_valid = x_data[valid_index], y_data[valid_index]

        model = LGBMClassifier(
            colsample_bytree = colsample_bytree,
            learning_rate = learning_rate,
            max_depth = int(max_depth),
            n_estimators = int(n_estimators),
            num_leaves = int(num_leaves)                        
        )
        

        
        model.fit(x_train, y_train)
        models.append(model)
        
        prob = model.predict(x_valid)
        pred = np.where(prob > 0.6, 1, 0)
        true = y_valid
        score += roc_auc_score(true, pred)/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [85]:
func_fixed = partial(lgbm_cv, x_data=train_x, y_data=train_y, n_splits=5, output="score")
regBO = BayesianOptimization(
    func_fixed, 
    {
         'colsample_bytree' : (0.5,1),
         'learning_rate':(0.01, 0.1),
         "max_depth" : (200,700),
         "n_estimators" : (30,50),
         "num_leaves" : (200,600),
    }, random_state=1029)

regBO.maximize(init_points=100, n_iter=0)

|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | num_le... |

-------------------------------------------------------------------------------------

| [0m 1       [0m | [0m 0.6975  [0m | [0m 0.5205  [0m | [0m 0.04688 [0m | [0m 494.4   [0m | [0m 48.0    [0m | [0m 248.4   [0m |

| [95m 2       [0m | [95m 0.7042  [0m | [95m 0.5851  [0m | [95m 0.09462 [0m | [95m 497.7   [0m | [95m 36.71   [0m | [95m 376.7   [0m |

| [0m 3       [0m | [0m 0.6642  [0m | [0m 0.7479  [0m | [0m 0.01791 [0m | [0m 337.1   [0m | [0m 43.2    [0m | [0m 232.1   [0m |

| [0m 4       [0m | [0m 0.7017  [0m | [0m 0.5193  [0m | [0m 0.07076 [0m | [0m 293.7   [0m | [0m 48.79   [0m | [0m 548.5   [0m |

| [0m 5       [0m | [0m 0.7011  [0m | [0m 0.6791  [0m | [0m 0.06111 [0m | [0m 639.6   [0m | [0m 39.05   [0m | [0m 482.0   [0m |

| [95m 6       [0m | [95m 0.705   [0m | [95m 0.8981  [0m | [95m 0.09494 [0m | [95m 351.9   [0m

In [86]:
params = regBO.max['params']

In [87]:
params

{'colsample_bytree': 0.9532847935680113,
 'learning_rate': 0.09278453605939078,
 'max_depth': 578.1033906895178,
 'n_estimators': 39.602849442683194,
 'num_leaves': 413.07521762936597}

In [88]:
models = lgbm_cv(
    params['colsample_bytree'], 
    params['learning_rate'],
    params["max_depth"],
    params["n_estimators"],
    params["num_leaves"],
    x_data=train_x, y_data=train_y, n_splits=5, output='model')

# 5. 제출 파일 생성

In [89]:
test_err  = pd.read_csv(PATH+'test_err_data.csv')
display(test_err.head())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,20201101030227,model_1,04.16.3553,31,1
1,30000,20201101030227,model_1,04.16.3553,33,2
2,30000,20201101030228,model_1,04.16.3553,15,1
3,30000,20201101030256,model_1,04.16.3553,22,1
4,30000,20201101030300,model_1,04.16.3553,11,1


In [90]:
# 데이터 설명을 확인하면
# test 데이터는 ueser_id가 30000부터 44998까지 총 14999개가 존재.
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [91]:
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
    # person_idx - test_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    test_x[person_idx - test_user_id_min,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
print(test_x.shape)

100%|██████████| 16532648/16532648 [00:40<00:00, 404165.80it/s]

(14999, 42)





In [92]:
# 예측
pred_y_list = []
for model in models:
    pred_y = model.predict(test_x)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [93]:
pred_ensemble

array([[1. ],
       [0. ],
       [0. ],
       ...,
       [0.6],
       [1. ],
       [0.2]])

In [94]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [95]:
sample_submssion['problem'] = pred_ensemble.reshape(-1)

In [96]:
SUB_PATH = '/content/drive/MyDrive/YBIGTA/21-1 겨울방학/lg 품질변화 공모전/quality_change_prediction/submissions/'
sample_submssion.to_csv(SUB_PATH+"dacon_baseline_0111(3).csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,1.0
1,30001,0.0
2,30002,0.0
3,30003,1.0
4,30004,1.0
...,...,...
14994,44994,0.0
14995,44995,0.0
14996,44996,0.6
14997,44997,1.0
