## 목차
### 1. Library Import
### 2. 학습데이터 생성
### 3. Light-gbm 모델 훈련
### 4. 교차 검증 점수 확인
### 5. 제출 파일 생성

# 1. Library Import

In [116]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc #garbage collection 감시
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
import seaborn as sns
from functools import partial
from bayes_opt import BayesianOptimization
from lightgbm import LGBMClassifier

warnings.filterwarnings(action='ignore')

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    min  = int(x[10:12])
    sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^0-9]+", '', str(x))
    if x =='':
        return 0
    else:
        return int(x)

# 2. 학습 데이터 생성

In [117]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings

PATH = "data/"

## 2.1 train_err

* train_err_data.csv 파일의 errtype column의 value_counts를 카운트하여 학습에 사용.  
* 주어진 기간동안 user_id별로 peroblem가 한 번이라도 발생했는 지를 확인.  

In [118]:
train = pd.read_csv(PATH+"0203_train(1pm).csv")
test = pd.read_csv(PATH+"0203_test(1pm).csv")

In [119]:
print(train.columns)
print(test.columns)

Index(['user_id', 'model_nm', 'update', 'err_1', 'err_2', 'err_3', 'err_4',
       'err_5', 'err_6', 'err_7',
       ...
       '2hours_maxerr_34', '2hours_maxerr_35', '2hours_maxerr_36',
       '2hours_maxerr_37', '2hours_maxerr_38', '2hours_maxerr_39',
       '2hours_maxerr_40', '2hours_maxerr_41', '2hours_maxerr_42', 'label'],
      dtype='object', length=258)
Index(['user_id', 'model_nm', 'update', 'err_1', 'err_2', 'err_3', 'err_4',
       'err_5', 'err_6', 'err_7',
       ...
       'day_maxerr_33', 'day_maxerr_34', 'day_maxerr_35', 'day_maxerr_36',
       'day_maxerr_37', 'day_maxerr_38', 'day_maxerr_39', 'day_maxerr_40',
       'day_maxerr_41', 'day_maxerr_42'],
      dtype='object', length=257)


In [120]:
# 변수 이름 변경
# error  -> train_x
# problem-> train_y

train_x = train.drop(columns = ["user_id", "label"])
train_y = train.label

test_x = test.drop(columns = ["user_id"])

print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

(15000, 256)
(15000,)
(14999, 256)


In [123]:
# Train
#-------------------------------------------------------------------------------------
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#-------------------------------------------------------------------------------------
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.55
# 파라미터 설정
params =      {
                    'bagging_fraction': 0.7,
                     'bagging_freq': 7,
                     'feature_fraction': 0.7188365561866897,
                     'lambda': 0.21873976026271127,
                     'learning_rate': 0.01,
                      'boosting_type' : 'dart',
                        'objective'     : 'binary',
                        'metric'        : 'auc',
                'seed': 1015,
                "categorical_feature" : None
                }
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
k_fold = KFold(n_splits=5, shuffle=True, random_state=34)
for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x.iloc[train_idx,:]
    y = train_y[train_idx]
    valid_x = train_x.iloc[val_idx,:]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1500,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 300,

    )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

[LightGBM] [Info] Number of positive: 4004, number of negative: 7996
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21543
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 247
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333667 -> initscore=-0.691648
[LightGBM] [Info] Start training from score -0.691648
[300]	valid_0's auc: 0.831212	valid_0's pr_auc: 0.755768
[600]	valid_0's auc: 0.836415	valid_0's pr_auc: 0.761694
[900]	valid_0's auc: 0.839717	valid_0's pr_auc: 0.76859
[1200]	valid_0's auc: 0.842587	valid_0's pr_auc: 0.771758
[1500]	valid_0's auc: 0.843822	valid_0's pr_auc: 0.773837
[LightGBM] [Info] Number of positive: 4013, number of negative: 7987
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21626
[LightGBM] [Info] Number of data p

In [124]:
print(np.mean(auc_scores))

0.8353387592357423


In [125]:
# 예측
pred_y_list = []
for model in models:
    pred_y = model.predict(test_x)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [126]:
pred_ensemble

array([[0.89155388],
       [0.23112865],
       [0.36689337],
       ...,
       [0.59886577],
       [0.86749515],
       [0.34373481]])

In [127]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [128]:
sample_submssion['problem'] = pred_ensemble.reshape(-1)

In [129]:
sample_submssion.to_csv(PATH+"0203_bayesian.csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.891554
1,30001,0.231129
2,30002,0.366893
3,30003,0.720104
4,30004,0.846370
...,...,...
14994,44994,0.249742
14995,44995,0.276339
14996,44996,0.598866
14997,44997,0.867495


In [130]:
best = pd.read_csv(PATH + "0202.csv")

In [131]:
best["mine2"] = pred_ensemble

In [132]:
best

Unnamed: 0,user_id,problem,mine2
0,30000,0.925746,0.891554
1,30001,0.179326,0.231129
2,30002,0.329626,0.366893
3,30003,0.745617,0.720104
4,30004,0.898933,0.846370
...,...,...,...
14994,44994,0.295214,0.249742
14995,44995,0.261761,0.276339
14996,44996,0.584120,0.598866
14997,44997,0.888072,0.867495


In [133]:
best.corr()

Unnamed: 0,user_id,problem,mine2
user_id,1.0,-0.006602,-0.008155
problem,-0.006602,1.0,0.97423
mine2,-0.008155,0.97423,1.0


In [81]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [82]:
train['label'].head()

0    0.0
1    1.0
2    0.0
3    0.0
4    1.0
Name: label, dtype: float64

In [93]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5)

In [94]:
X_pca = pca.fit_transform(train_x)
PCA_df = pd.DataFrame(data = X_pca, columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']) #, , 'PC8', 'PC9', 'PC10'
PCA_df = pd.concat([PCA_df, train['label']], axis = 1)
PCA_df['label'] = LabelEncoder().fit_transform(PCA_df['label'])

train = PCA_df.copy()

X_pca = pca.transform(test_x)
PCA_df = pd.DataFrame(data = X_pca, columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']) 

test = PCA_df.copy()

In [96]:
train_x = train.drop(columns = [label"])
train_y = train.label

test_x = test

In [108]:
def lgbm_cv(feature_fraction, 
            bagging_fraction,
            bagging_freq,
            lr,
            lamb,
          x_data=None, y_data=None, n_splits=5, output='score'):
    
    auc_scores   = []
    k_fold = KFold(n_splits=5, shuffle=True, random_state=718917)
    models = []
    for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
        X = train_x.iloc[train_idx,:]
        y = train_y[train_idx]
        valid_x = train_x.iloc[val_idx,:]
        valid_y = train_y[val_idx]

        d_train= lgb.Dataset(X, y)
        d_val  = lgb.Dataset(valid_x, valid_y)
    
        params =      {
                'boosting_type' : 'dart',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'feature_fraction' : feature_fraction,
                'bagging_fraction': bagging_fraction, 
                'bagging_freq': int(bagging_freq),
                'seed': 1015,
                'learning_rate' : lr,
                "categorical_feature" : None,
                "lambda" : lamb
                }
        
        model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1500,
                        valid_sets      = d_val,
                        verbose_eval    = 1500,
        )
        
        valid_prob = model.predict(valid_x)
        valid_pred = np.where(valid_prob > 0.55, 1, 0)

        # cal scores
        auc_score = roc_auc_score(   valid_y, valid_prob)

        # append scores
        models.append(model)
        auc_scores.append(auc_score)
    
    if output == 'score':
        return np.mean(auc_scores)
    if output == 'model':
        return models

In [110]:
func_fixed = partial(lgbm_cv, x_data=train_x, y_data=train_y, n_splits=5, output="score")
regBO = BayesianOptimization(
    func_fixed, 
    {
                'feature_fraction' : (0.6, 0.9),
                'bagging_fraction': (0.7,1), 
                'bagging_freq': (5,10),
                'lr' : (0.001,0.01),
                "lamb" : (0.2,0.4)
    }, random_state=231)

regBO.maximize(init_points=5, n_iter=10000)

|   iter    |  target   | baggin... | baggin... | featur... |   lamb    |    lr     |
-------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 4024, number of negative: 7976
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21571
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 247
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.335333 -> initscore=-0.684161
[LightGBM] [Info] Start training from score -0.684161
[1500]	valid_0's auc: 0.840819
[LightGBM] [Info] Number of positive: 3975, number of negative: 8025
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21581
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 247
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.331250 -> initscore=-0.702537
[LightG

KeyboardInterrupt: 

In [112]:
params = regBO.max['params']

In [113]:
params

{'bagging_fraction': 0.7,
 'bagging_freq': 7.224086222877524,
 'feature_fraction': 0.7188365561866897,
 'lamb': 0.21873976026271127,
 'lr': 0.01}

In [115]:
models = lgbm_cv(
    params['feature_fraction'], 
    params['bagging_fraction'],
    params["bagging_freq"],
    params["lr"],
    params["lamb"],
    x_data=train_x, y_data=train_y, n_splits=5, output='model')

[LightGBM] [Info] Number of positive: 4024, number of negative: 7976
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21571
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 247
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.335333 -> initscore=-0.684161
[LightGBM] [Info] Start training from score -0.684161
[1500]	valid_0's auc: 0.842704
[LightGBM] [Info] Number of positive: 3975, number of negative: 8025
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21581
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 247
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.331250 -> initscore=-0.702537
[LightGBM] [Info] Start training from score -0.702537


KeyboardInterrupt: 

# 5. 제출 파일 생성

In [None]:
test_err  = pd.read_csv(PATH+'test_err_data.csv')
display(test_err.head())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,20201101030227,model_1,04.16.3553,31,1
1,30000,20201101030227,model_1,04.16.3553,33,2
2,30000,20201101030228,model_1,04.16.3553,15,1
3,30000,20201101030256,model_1,04.16.3553,22,1
4,30000,20201101030300,model_1,04.16.3553,11,1


In [None]:
# 데이터 설명을 확인하면
# test 데이터는 ueser_id가 30000부터 44998까지 총 14999개가 존재.
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [None]:
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
    # person_idx - test_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    test_x[person_idx - test_user_id_min,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
print(test_x.shape)

100%|██████████| 16532648/16532648 [00:40<00:00, 404165.80it/s]

(14999, 42)





In [None]:
# 예측
pred_y_list = []
for model in models:
    pred_y = model.predict(test_x)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [None]:
pred_ensemble

array([[1. ],
       [0. ],
       [0. ],
       ...,
       [0.6],
       [1. ],
       [0.2]])

In [None]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [None]:
sample_submssion['problem'] = pred_ensemble.reshape(-1)

In [None]:
SUB_PATH = '/content/drive/MyDrive/YBIGTA/21-1 겨울방학/lg 품질변화 공모전/quality_change_prediction/submissions/'
sample_submssion.to_csv(SUB_PATH+"dacon_baseline_0111(3).csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,1.0
1,30001,0.0
2,30002,0.0
3,30003,1.0
4,30004,1.0
...,...,...
14994,44994,0.0
14995,44995,0.0
14996,44996,0.6
14997,44997,1.0
