## 목차
### 1. Library Import
### 2. 학습데이터 생성
### 3. Light-gbm 모델 훈련
### 4. 교차 검증 점수 확인
### 5. 제출 파일 생성

# 1. Library Import

In [3]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc #garbage collection 감시
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
import seaborn as sns
warnings.filterwarnings(action='ignore')

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    min  = int(x[10:12])
    sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^0-9]+", '', str(x))
    if x =='':
        return 0
    else:
        return int(x)


PATH = '/content/drive/MyDrive/YBIGTA/21-1 겨울방학/lg 품질변화 공모전/quality_change_prediction/data/'

# 3. Light-gbm 모델 훈련

In [7]:
# 변수 이름 변경
# error  -> train_x
# problem-> train_y
train = pd.read_csv("data/0201_train.csv")
test = pd.read_csv("data/0201_test.csv")
train_x = train.drop(columns=["Unnamed: 0","user_id","label"])
train_y = train["label"]
test_x = test.drop(columns=["Unnamed: 0","user_id"])


print(train_x.shape)
print(train_y.shape)

(15000, 61)
(15000,)


In [65]:
# Train
#-------------------------------------------------------------------------------------
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#-------------------------------------------------------------------------------------
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5
# 파라미터 설정
params =      {
                'boosting_type' : 'dart',
                'objective'     : 'cross_entropy',
                'metric'        : 'auc',
                "learning_rate" : 0.007,
                'seed': 234,
                "xgboost_dart_mode": True,
    "linear_tree": True,
    "min_data_in_leaf": 10
                }
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x.iloc[train_idx,:]
    y = train_y[train_idx]
    valid_x = train_x.iloc[val_idx,:]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 2000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20, 
                        early_stopping_rounds = 3
                       )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

[LightGBM] [Info] [cross_entropy:Init]: (objective) labels passed interval [0, 1] check
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8294
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 61
[LightGBM] [Info] [cross_entropy:BoostFromScore]: pavg = 0.302750 -> initscore = -0.834237
[LightGBM] [Info] Start training from score -0.834237
[20]	valid_0's auc: 0.79024	valid_0's pr_auc: 0.796019
[40]	valid_0's auc: 0.794663	valid_0's pr_auc: 0.800061
[60]	valid_0's auc: 0.798625	valid_0's pr_auc: 0.802555
[80]	valid_0's auc: 0.799326	valid_0's pr_auc: 0.801028
[100]	valid_0's auc: 0.800152	valid_0's pr_auc: 0.801653
[120]	valid_0's auc: 0.801291	valid_0's pr_auc: 0.801802
[140]	valid_0's auc: 0.803843	valid_0's pr_auc: 0.806334
[160]	valid_0's auc: 0.80512	valid_0's pr_auc: 0.807408
[180]	valid_0's auc: 0.806738	valid_0's pr_auc: 0.808976
[200]	valid

# 4. 교차검증 점수 확인

In [66]:
auc_scores

[0.8226635983964601,
 0.8151726094536363,
 0.7870755248696633,
 0.8232235462871014,
 0.823227991000582]

In [68]:
print(np.mean(auc_scores))

0.8142726540014886


In [26]:
import lightgbm as lgbm

In [None]:
params = {
    'objective' :'binary',
    'learning_rate' : 0.02,
    'num_leaves' : 76,
    'feature_fraction': 0.64, 
    'bagging_fraction': 0.8, 
    'bagging_freq':1,
    'boosting_type' : 'gbdt',
    'metric': 'binary_logloss'
}

from sklearn.model_selection import train_test_split


for i, j in enumerate(col):
    print('fitting column : '+j)
    # making test - valid sets
    X_train, X_valid, Y_train, Y_valid = train_test_split(countvec_df_train_,  train[j], random_state=7, test_size=0.33)
    
    # making lgbm datasets for train and valid
    d_train = lgbm.Dataset(X_train, Y_train)
    d_valid = lgbm.Dataset(X_valid, Y_valid)
    
    # training with early stop
    bst = lgbm.train(params, d_train, 5000, valid_sets=[d_valid], verbose_eval=50, early_stopping_rounds=100)
    
    # making prediciton for one column
    print('predicting for :' +j)
    preds[:,i] = bst.predict(countvec_df_test_)

print('Fininshed Training')

# 5. 제출 파일 생성

In [None]:
test_err  = pd.read_csv(PATH+'test_err_data.csv')
display(test_err.head())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,20201101030227,model_1,04.16.3553,31,1
1,30000,20201101030227,model_1,04.16.3553,33,2
2,30000,20201101030228,model_1,04.16.3553,15,1
3,30000,20201101030256,model_1,04.16.3553,22,1
4,30000,20201101030300,model_1,04.16.3553,11,1


In [None]:
# 데이터 설명을 확인하면
# test 데이터는 ueser_id가 30000부터 44998까지 총 14999개가 존재.
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [None]:
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
    # person_idx - test_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    test_x[person_idx - test_user_id_min,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
print(test_x.shape)

100%|██████████| 16532648/16532648 [00:40<00:00, 408044.66it/s]

(14999, 42)





In [None]:
# 예측
pred_y_list = []
for model in models:
    pred_y = model.predict(test_x)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [None]:
pred_ensemble

array([[0.77981028],
       [0.21930262],
       [0.29921489],
       ...,
       [0.43839372],
       [0.76868059],
       [0.35637135]])

In [None]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [None]:
sample_submssion['problem'] = pred_ensemble.reshape(-1)

In [None]:
SUB_PATH = '/content/drive/MyDrive/YBIGTA/21-1 겨울방학/lg 품질변화 공모전/quality_change_prediction/submissions/'
sample_submssion.to_csv(SUB_PATH+"dacon_baseline_0111(1).csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.779810
1,30001,0.219303
2,30002,0.299215
3,30003,0.648380
4,30004,0.578343
...,...,...
14994,44994,0.330235
14995,44995,0.311575
14996,44996,0.438394
14997,44997,0.768681
