## 목차
### 1. Library Import
### 2. 학습데이터 생성
### 3. Light-gbm 모델 훈련
### 4. 교차 검증 점수 확인
### 5. 제출 파일 생성

# 1. Library Import

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')


# 3. Light-gbm 모델 훈련

In [2]:
# Load the competition training and test tables from the local data folder.
train_csv = "Jupyter Temp/lg 공모전/0126 data/train.csv"
test_csv = "Jupyter Temp/lg 공모전/0126 data/test.csv"
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [24]:
# Separate the model inputs from the target.
#   train_x: every training column except the label and the user id
#   train_y: the "label" column
train_y = train["label"]
train_x = train.drop(columns=["label", "user_id"])

# The user id is dropped from the test table too, so its columns match train_x.
test = test.drop(columns=["user_id"])

# Sanity-check the resulting shapes.
for part in (train_x, train_y):
    print(part.shape)

(15000, 69)
(15000,)


In [25]:
# Train
#-------------------------------------------------------------------------------------
# Custom eval function so the validation PR-AUC is reported during training
def f_pr_auc(probas_pred, y_true):
    """Custom eval function reporting the area under the precision-recall curve.

    Args:
        probas_pred: Predicted scores for the evaluation set.
        y_true: Object exposing ``get_label()``; despite the name this is the
            evaluation dataset handed over by the trainer, not a label array.

    Returns:
        Tuple ``("pr_auc", score, True)``: the metric name, its value, and a
        flag marking the metric as higher-is-better.
    """
    prec, rec, _thresholds = precision_recall_curve(
        y_true.get_label(), probas_pred
    )
    return "pr_auc", auc(rec, prec), True
#-------------------------------------------------------------------------------------
# Per-fold artefacts collected during cross-validation.
models, recalls, precisions, auc_scores = [], [], [], []

# Probability cut-off used to turn scores into 0/1 predictions.
threshold = 0.5

# LightGBM parameters.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    seed=1015,
)
#-------------------------------------------------------------------------------------
# 5-fold cross-validation (shuffled, fixed random_state).
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # Slice this fold's training and validation rows.
    X, valid_x = train_x.iloc[train_idx], train_x.iloc[val_idx]
    y, valid_y = train_y.iloc[train_idx], train_y.iloc[val_idx]

    d_train = lgb.Dataset(X, y)
    d_val = lgb.Dataset(valid_x, valid_y)

    # Run training: one booster per fold, with f_pr_auc evaluated on d_val in
    # addition to the built-in 'auc' metric.
    train_options = dict(
        train_set=d_train,
        num_boost_round=1000,
        valid_sets=d_val,
        feval=f_pr_auc,
        verbose_eval=20,
        early_stopping_rounds=3,
    )
    model = lgb.train(params, **train_options)

    # Score the held-out fold: probabilities first, then thresholded labels.
    valid_prob = model.predict(valid_x)
    valid_pred = (valid_prob > threshold).astype(int)

    # Threshold-dependent metrics use the labels; ROC-AUC uses the raw scores.
    recall = recall_score(valid_y, valid_pred)
    precision = precision_score(valid_y, valid_pred)
    auc_score = roc_auc_score(valid_y, valid_prob)

    # Keep the booster and its validation scores for later cells.
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

[LightGBM] [Info] Number of positive: 3633, number of negative: 8367
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6723
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302750 -> initscore=-0.834237
[LightGBM] [Info] Start training from score -0.834237
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[13]	valid_0's auc: 0.812305	valid_0's pr_auc: 0.812984
[LightGBM] [Info] Number of positive: 4828, number of negative: 7172
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6891
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.402333 -> initscore=-0.395752
[LightGBM] [Info] Start training from score -0.395752
Training 

# 4. 교차 검증 점수 확인

In [26]:
# Mean ROC-AUC over the cross-validation folds.
mean_auc = np.mean(auc_scores)
print(mean_auc)

0.8029743085655111


# 5. 제출 파일 생성

In [27]:
# Predict the test set with every fold model and average the results.
# Each prediction is kept as a column vector, so the mean has shape (n, 1).
pred_y_list = [fold_model.predict(test).reshape(-1, 1) for fold_model in models]
pred_ensemble = np.mean(pred_y_list, axis=0)

In [28]:
pred_ensemble

array([[0.6505471 ],
       [0.27931295],
       [0.31776057],
       ...,
       [0.38540448],
       [0.65914218],
       [0.30982861]])

In [29]:
# Load the submission template that the predictions are written into.
submission_template_path = "Jupyter Temp/lg 공모전/data/sample_submission.csv"
sample_submssion = pd.read_csv(submission_template_path)

In [30]:
# Flatten the (n, 1) ensemble output into the 1-D 'problem' column.
sample_submssion['problem'] = pred_ensemble.ravel()

In [31]:
# Write the first submission without the index column, then display the table.
submission_path = "Jupyter Temp/lg 공모전/submissions/0127(1).csv"
sample_submssion.to_csv(submission_path, index=False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.650547
1,30001,0.279313
2,30002,0.317761
3,30003,0.596035
4,30004,0.676201
...,...,...
14994,44994,0.307844
14995,44995,0.342251
14996,44996,0.385404
14997,44997,0.659142


In [32]:
len(sample_submssion)

14999

In [16]:
!pip install pycaret



You should consider upgrading via the 'c:\users\sue_b\anaconda3\python.exe -m pip install --upgrade pip' command.


In [17]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

In [33]:
# Names of all feature columns; passed to PyCaret as the numeric features.
col = train_x.columns.tolist()

In [35]:
# Training table for PyCaret: keeps the label, drops only the user id.
auto_train = train.drop("user_id", axis=1)

In [36]:
# Initialise the PyCaret experiment on the labelled table, declaring every
# feature column as numeric.
# session_id fixes PyCaret's random seed so this experiment can be re-run with
# the same results; without it a new random id is drawn on every run. 1112 is
# the id the recorded run reported in its setup summary.
clf = setup(
    data=auto_train,
    target='label',
    numeric_features=col,
    session_id=1112,
)

Unnamed: 0,Description,Value
0,session_id,1112
1,Target,label
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(15000, 70)"
5,Missing Values,False
6,Numeric Features,69
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [37]:
# Compare the candidate classifiers and keep the top five ranked by accuracy.
best_5 = compare_models(
    n_select=5,
    sort='Accuracy',
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.7935,0.8222,0.5201,0.7955,0.6285,0.4942,0.5159,22.402
gbc,Gradient Boosting Classifier,0.7917,0.8177,0.4932,0.8148,0.6139,0.4831,0.5118,3.149
lightgbm,Light Gradient Boosting Machine,0.7914,0.8141,0.528,0.7818,0.6297,0.4922,0.5108,1.11
rf,Random Forest Classifier,0.7886,0.8169,0.5039,0.7922,0.6155,0.4798,0.5033,2.098
et,Extra Trees Classifier,0.7886,0.8093,0.5133,0.7842,0.62,0.4825,0.5035,2.833
ada,Ada Boost Classifier,0.7836,0.8035,0.4994,0.7789,0.608,0.4684,0.4907,0.802
xgboost,Extreme Gradient Boosting,0.7803,0.802,0.5402,0.7365,0.623,0.4732,0.4846,3.962
lda,Linear Discriminant Analysis,0.7678,0.7646,0.3862,0.8349,0.5276,0.4004,0.4528,0.157
ridge,Ridge Classifier,0.7665,0.0,0.3794,0.8381,0.5218,0.3955,0.4499,0.071
qda,Quadratic Discriminant Analysis,0.7514,0.7556,0.369,0.7745,0.4993,0.3608,0.4048,0.104


In [38]:
# Combine the five selected models into a soft-voting ensemble, evaluated
# over 5 folds.
blended = blend_models(
    estimator_list=best_5,
    method='soft',
    fold=5,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8024,0.8233,0.5431,0.8067,0.6492,0.5188,0.5385
1,0.7943,0.8288,0.5219,0.797,0.6308,0.4966,0.518
2,0.7838,0.8149,0.4837,0.7935,0.6011,0.4645,0.4913
3,0.8014,0.8291,0.512,0.8341,0.6345,0.5087,0.5373
4,0.7794,0.8144,0.4717,0.7872,0.5899,0.4517,0.4794
Mean,0.7923,0.8221,0.5065,0.8037,0.6211,0.4881,0.5129
SD,0.0092,0.0064,0.0259,0.0165,0.0221,0.0258,0.0239


In [39]:
# Score the blended model without passing any data.
# NOTE(review): presumably this evaluates on PyCaret's internal hold-out
# split (hence the variable name) -- confirm against the PyCaret docs.
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7969,0.8118,0.4939,0.808,0.613,0.486,0.5131


In [40]:
# Finalize the blended model before predicting on the competition test set.
# NOTE(review): finalize_model presumably refits on all data given to setup(),
# hold-out included -- confirm for the installed PyCaret version.
final_model = finalize_model(blended)


In [41]:
# Predict the competition test table (user_id already dropped) with the
# finalized model; the 'Label' and 'Score' columns of the result are read by
# the next cell.
predictions = predict_model(final_model, data = test)

In [42]:
# Convert PyCaret's output into P(label == 1) for every row.
# 'Score' is treated here as the probability of the *predicted* class, so rows
# predicted as the negative class are flipped with 1 - Score.
# The label is compared numerically rather than against the string '1.0': if
# 'Label' comes back as a number (or as '1'), the string comparison is False
# for every row and all positive scores would be silently inverted.
# Working on the underlying arrays also avoids label-based row lookups, so the
# result does not depend on the frame's index.
label_is_positive = (
    predictions['Label'].astype(str).astype(float).to_numpy() == 1.0
)
predicted_score = predictions['Score'].to_numpy()
x = np.where(label_is_positive, predicted_score, 1 - predicted_score).tolist()

In [43]:
# Reload the submission template, fill in the blended-model probabilities,
# and save the second submission file without the index column.
sample_submssion = pd.read_csv("Jupyter Temp/lg 공모전/data/sample_submission.csv")
sample_submssion = sample_submssion.assign(problem=x)
sample_submssion.to_csv("Jupyter Temp/lg 공모전/submissions/0127(2).csv", index=False)

In [44]:
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.9429
1,30001,0.2141
2,30002,0.3186
3,30003,0.7615
4,30004,0.8329
...,...,...
14994,44994,0.4111
14995,44995,0.3111
14996,44996,0.5184
14997,44997,0.8479
