## Load Data

In [1]:
import pandas as pd
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
from dataset import custom_train_test_split, make_dataset

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import pandas as pd

from vecstack import stacking


# Compact integer dtypes keyed by column name.
# NOTE(review): not referenced by any cell visible in this notebook —
# presumably meant for a pandas reader's `dtype=` argument; confirm or remove.
dtype = dict(
    userID='int16',
    answerCode='int8',
    KnowledgeTag='int16',
)

def test_to_csv(preds, name:str):
    result = []
    for n,i in enumerate(preds):
        row = {}    
        row['id'] = n
        row['prediction'] = i
        result.append(row)
    from datetime import date, datetime, timezone, timedelta

    KST = timezone(timedelta(hours=9)); time_record = datetime.now(KST); _day = str(time_record)[:10]; _time = str(time_record.time())[:8]; now_time = _day+'_'+_time

    pd.DataFrame(result).to_csv(f'/opt/ml/level2-dkt-level2-recsys-08/LetsEnsemble/output/{name}_{now_time}.csv', index=None)

In [2]:
# Load the full pickled table (all.pkl) that the split below is built from.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
train_data = pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/all.pkl')

# Both helpers are imported from the project-local `dataset` module.
# NOTE(review): the (y_train, x_train, y_valid, x_valid) order is taken from
# this call site only — confirm against make_dataset.
train, valid = custom_train_test_split(train_data)
y_train, x_train, y_valid, x_valid = make_dataset(train, valid)

In [3]:
# Read the pickled test table and keep only the rows whose answerCode is -1
# (presumably the rows that still need a prediction — confirm).
test_data = (
    pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/test_data.pkl')
    .loc[lambda frame: frame['answerCode'] == -1]
)

## Cat Boost

In [4]:
from catboost import CatBoostClassifier, Pool

# Categorical feature columns for the CatBoost model. The commented-out
# entries are features that are currently switched off.
cate_cols = [
            'assessmentItemID',
            'testId',
            'KnowledgeTag',
            'hour',
            'dow',
            # 'i_head',
            # 'i_mid',
            # 'i_tail',
]
# Continuous feature columns for the CatBoost model (commented-out entries
# are switched off).
cont_cols = [                        
            'user_correct_answer',
            'user_total_answer',
            'user_acc',            
            't_elapsed',            
            'cum_correct',
            'last_problem',
            'head_term',
            # 'left_asymptote',
            'elo_prob',
            'pkt',
            'u_head_mean',
            'u_head_count',
            'u_head_std',
            'u_head_elapsed',
            'i_mid_elapsed',
            'i_mid_mean',
            'i_mid_std',
            'i_mid_sum',
            'i_mid_count',
            'i_mid_tag_count',
            # 'assessment_mean',
            # 'assessment_sum',
            # 'assessment_std',
            'tag_mean',
            'tag_sum',
            # 'tag_std',
            'tail_mean',
            'tail_sum',
            # 'tail_std',
            'hour_mean',
            'hour_sum',
            # 'hour_std',
            'dow_mean',
            'dow_sum',
            # 'dow_std',
            'tag_elapsed',
            'tag_elapsed_o',
            'tag_elapsed_x',
            'assessment_elapsed',
            'assessment_elapsed_o',
            'assessment_elapsed_x',
            'tail_elapsed',
            'tail_elapsed_o',
            'tail_elapsed_x'
            ]

# Column order used whenever a frame is passed to the CatBoost model:
# categorical columns first, then continuous ones.
# NOTE(review): presumably this must match the feature order the saved model
# was trained with — confirm before editing either list.
CAT_FEATS = cate_cols + cont_cols

# Load an already-trained CatBoost model from disk; the visible cells never
# fit it.
catboost_model = CatBoostClassifier().load_model('/opt/ml/level2-dkt-level2-recsys-08/catboost/model_save/catboost2022-05-07_11:21:56.cbm')

## LGBM

In [5]:
# Feature columns fed to the LightGBM model. These lists were previously named
# `lstm_*`, which was misleading: they are only used to build LGBM_FEATS.
lgbm_cat_cols = ['i_head', 'i_mid', 'i_tail', 'hour', 'dow']
lgbm_cont_cols = [
        'user_correct_answer',
        'user_total_answer',
        'user_acc',
        't_elapsed',
        'cum_correct',
        'last_problem',
        'head_term',
        # 'left_asymptote',
        'elo_prob',
        'pkt',
        'u_head_mean',
        'u_head_count',
        'u_head_std',
        'u_head_elapsed',
        'i_mid_elapsed',
        'i_mid_mean',
        'i_mid_std',
        'i_mid_sum',
        'i_mid_count',
        'i_mid_tag_count',
        'assessment_mean',
        'assessment_sum',
        # 'assessment_std',
        'tag_mean',
        'tag_sum',
        # 'tag_std',
        'tail_mean',
        'tail_sum',
        # 'tail_std',
        'hour_mean',
        'hour_sum',
        # 'hour_std',
        'dow_mean',
        'dow_sum',
        # 'dow_std',
        'tag_elapsed',
        'tag_elapsed_o',
        'tag_elapsed_x',
        'assessment_elapsed',
        'assessment_elapsed_o',
        'assessment_elapsed_x',
        'tail_elapsed',
        'tail_elapsed_o',
        'tail_elapsed_x']

# Backward-compatible aliases for any cell that still uses the old names.
lstm_cat_cols = lgbm_cat_cols
lstm_cont_cols = lgbm_cont_cols

# Column order used whenever a frame is passed to the LightGBM model:
# categorical columns first, then continuous ones.
LGBM_FEATS = lgbm_cat_cols + lgbm_cont_cols

import lightgbm as lgb
# Load an already-trained LightGBM booster from its text dump; the visible
# cells never fit it.
lgbm_model = lgb.Booster(model_file='/opt/ml/level2-dkt-level2-recsys-08/lgbm/best_model/model_best.txt')


## Stacking

In [6]:
# Column 1 of CatBoost's class-probability matrix for every validation row,
# then the ROC-AUC of those scores (shown as the cell output).
catboost_preds = catboost_model.predict_proba(x_valid[CAT_FEATS])[:, 1]
roc_auc_score(y_true=y_valid, y_score=catboost_preds)

0.8604425834937618

In [7]:
# Score the validation rows with the loaded LightGBM booster, then show the
# ROC-AUC of those scores as the cell output.
lgbm_preds = lgbm_model.predict(x_valid[LGBM_FEATS])
roc_auc_score(y_valid, lgbm_preds)

0.8981938435292222

In [8]:
# Stack the two validation prediction vectors as rows: one row per base model.
new_data = np.vstack((catboost_preds, lgbm_preds))

In [9]:
# At this point: one row per base model, one column per validation sample.
new_data.shape

(2, 1488)

In [10]:
# Build the meta-feature matrix straight from the base-model predictions:
# one row per validation sample, one column per model (CatBoost, LightGBM).
# Deriving it from the inputs instead of transposing `new_data` in place keeps
# this cell idempotent: re-running it can no longer flip the matrix back to
# (2, n_samples) and break the meta-model training below.
new_data = np.column_stack([catboost_preds, lgbm_preds])
new_data.shape

(1488, 2)

### Final model

In [50]:
# Meta-learner: a LightGBM binary classifier fit on the two base-model
# prediction columns, with the validation labels as targets.
# (`max_depth` = 10 and `num_leaves` = 6 were tried earlier and are disabled.)
final_model = lgb.train(
    params={
        'boosting': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'force_col_wise': True,
        'learning_rate': 0.01,
    },
    train_set=lgb.Dataset(data=new_data, label=y_valid),
)
    
            

[LightGBM] [Info] Number of positive: 781, number of negative: 707
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1488, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.524866 -> initscore=0.099544
[LightGBM] [Info] Start training from score 0.099544


In [51]:
# Meta-model predictions on the same rows it was trained on (new_data).
new_preds = final_model.predict(new_data)

In [52]:
# NOTE: `final_model` was fit on exactly (new_data, y_valid), so the value this
# cell displays is an in-sample AUC and overstates how well the blend
# generalises. An out-of-fold estimate is printed first: every row is scored
# by a meta-model that never saw it during fitting.
from sklearn.model_selection import StratifiedKFold

_meta_labels = np.asarray(y_valid)
_oof_preds = np.zeros(len(_meta_labels))
_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for _fit_idx, _held_idx in _folds.split(new_data, _meta_labels):
    # Same hyper-parameters as the cell that fits `final_model` (keep in sync);
    # `verbose: -1` only silences the per-fold training log.
    _fold_model = lgb.train(
        {
            'boosting': 'gbdt',
            'objective': 'binary',
            'metric': 'binary_logloss',
            'force_col_wise': True,
            'learning_rate': 0.01,
            'verbose': -1,
        },
        lgb.Dataset(new_data[_fit_idx], _meta_labels[_fit_idx]),
    )
    _oof_preds[_held_idx] = _fold_model.predict(new_data[_held_idx])
print(f'out-of-fold AUC of the blend: {roc_auc_score(_meta_labels, _oof_preds):.4f}')

# In-sample AUC (optimistic), kept only for comparison with the figure above.
roc_auc_score(y_valid, new_preds)

0.9313894166076568

## Test 추출

In [14]:
# Column 1 of CatBoost's class-probability matrix for every test row.
test_catboost_preds = catboost_model.predict_proba(test_data[CAT_FEATS])[:, 1]


In [15]:
# Score the test rows with the loaded LightGBM booster.
test_lgbm_preds = lgbm_model.predict(test_data[LGBM_FEATS])


In [16]:
# Stack the two test prediction vectors as rows: one row per base model.
test_preds = np.vstack((test_catboost_preds, test_lgbm_preds))

In [17]:
# Build the test meta-feature matrix straight from the base-model predictions:
# one row per test sample, one column per model, in the same column order as
# the matrix the meta-model was trained on (CatBoost, LightGBM).
# Deriving it from the inputs instead of transposing `test_preds` in place
# keeps this cell idempotent: re-running it can no longer flip the matrix back
# to (2, n_samples).
test_preds = np.column_stack([test_catboost_preds, test_lgbm_preds])
test_preds.shape

(744, 2)

## Final_model 통과

In [53]:
# Blend the two base-model test predictions with the meta-model.
# NOTE(review): this rebinds `new_preds`, which earlier held the validation-set
# predictions; the validation AUC cell is no longer valid if re-run after this.
new_preds = final_model.predict(test_preds)

In [54]:
# Write the blended test predictions to a timestamped CSV whose name starts
# with "ensemble_".
test_to_csv(new_preds,'ensemble')