In [1]:
# https://www.kaggle.com/code/sarmat/lgbm-stacking-example/notebook

In [13]:
import numpy as np # linear algebra
import pandas as pd
import random
from scipy.sparse.linalg import svds
import os

from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, PCA

import torch
from tqdm import tqdm
from dataset import custom_train_test_split, make_dataset

from sklearn.metrics import RocCurveDisplay, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc


def get_metric(targets, preds):
    auc = roc_auc_score(targets, preds)
    acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))
    precsion = precision_score(targets, np.where(preds >= 0.5, 1, 0))
    recall = recall_score(targets, np.where(preds >= 0.5, 1, 0))
    F1_score = f1_score(targets, np.where(preds >= 0.5, 1, 0))

    print('auc :',auc)
    print('acc :',acc)
    print('precision :',precsion)
    print('recall :',recall)

def test_to_csv(preds, name:str):
    
    result = []
    for n,i in enumerate(preds):
        row = {}    
        row['id'] = n
        row['prediction'] = i
        result.append(row)
    pd.DataFrame(result).to_csv(name+'.csv', index=None)
    


In [2]:
train_data = pd.read_csv('/opt/ml/input/data/train_data.csv')
test_data  = pd.read_csv('/opt/ml/input/data/test_data.csv')

In [3]:
train_data.drop_duplicates(subset = ["userID", "assessmentItemID"], keep = "last", inplace = True)
train_data.drop(['Timestamp','testId','KnowledgeTag'], axis=1, inplace=True, errors='ignore')

In [4]:
matrix_train = train_data.pivot_table('answerCode', index='userID', columns='assessmentItemID')
matrix_train.fillna(0.5, inplace=True)

In [5]:
user_id2idx = {v:i for i,v in enumerate(matrix_train.index)}
user_idx2id = {i:v for i,v in enumerate(matrix_train.index)}

item_id2idx = {v:i for i,v in enumerate(matrix_train.columns)}
item_idx2id = {i:v for i,v in enumerate(matrix_train.columns)}

In [6]:
def predict(matrix, userid, itemid, user_id2idx, item_id2idx, pca_model):
 
    X = matrix
    
    X_pred = pca_model.inverse_transform(pca_model.transform(X))

    ret = [X_pred[user_id2idx[u], item_id2idx[i]] for u,i in zip(userid, itemid)]
    return ret

In [7]:
model01 = PCA(n_components= 10) 
model02 = PCA(n_components= 14 )
model03 = PCA(n_components= 18 )
model04 = PCA(n_components= 22 )
model05 = PCA(n_components= 26 )
model06 = PCA(n_components= 30 )
model07 = PCA(n_components= 34 )
model08 = PCA(n_components= 38 )
model09 = PCA(n_components= 42 )
model10 = PCA(n_components= 46 )

In [8]:
model01.fit(matrix_train)
model02.fit(matrix_train)
model03.fit(matrix_train)
model04.fit(matrix_train)
model05.fit(matrix_train)
model06.fit(matrix_train)
model07.fit(matrix_train)
model08.fit(matrix_train)
model09.fit(matrix_train)
model10.fit(matrix_train)

PCA(n_components=46)

In [9]:
valid_user  = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv').userID.unique()
all_train = pd.read_csv('/opt/ml/input/data/all.csv')
valid_data = all_train[all_train.userID.isin(valid_user)]
userid = sorted(list(set([u for u in valid_data.userID])))
user_id2idx_valid = {v:i for i,v in enumerate(userid)}

matrix_valid = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(valid_data.userID, valid_data.assessmentItemID, valid_data.answerCode):
    user,item = user_id2idx_valid[user],item_id2idx[item]
    matrix_valid[user,item] = a

valid_predict01 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, model01)
valid_predict02 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, model02)
valid_predict03 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, model03)
valid_predict04 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, model04)
valid_predict05 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, model05)
valid_predict06 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, model06)
valid_predict07 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, model07)
valid_predict08 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, model08)
valid_predict09 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, model09)
valid_predict10 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, model10)

# item_id2idx는 train에서 사용한 것을 다시 사용한다.
test_data  = pd.read_csv('/opt/ml/input/data/test_data.csv')

userid = sorted(list(set([u for u in test_data.userID])))
user_id2idx_test = {v:i for i,v in enumerate(userid)}

matrix_test = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(test_data.userID, test_data.assessmentItemID, test_data.answerCode):
    user,item = user_id2idx_test[user],item_id2idx[item]
    matrix_test[user,item] = a

test_data = test_data[test_data.answerCode==-1]

test_predict01 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, model01)
test_predict02 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, model02)
test_predict03 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, model03)
test_predict04 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, model04)
test_predict05 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, model05)
test_predict06 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, model06)
test_predict07 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, model07)
test_predict08 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, model08)
test_predict09 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, model09)
test_predict10 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, model10)

# print('Fold no: {}'.format(fold_))
print("AUC SVD01:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict01))))
print("AUC SVD02:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict02))))
print("AUC SVD03:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict03))))
print("AUC SVD04:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict04))))
print("AUC SVD05:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict05))))
print("AUC SVD06:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict06))))
print("AUC SVD07:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict07))))
print("AUC SVD08:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict08))))
print("AUC SVD09:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict09))))
print("AUC SVD10:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict10))))



auc : 0.7736901133182142
acc : 0.7579689548337316
precision : 0.7736442273099363
recall : 0.891453633332332
AUC SVD01:None 
auc : 0.7802639083086065
acc : 0.7605120917103974
precision : 0.7753398372118488
recall : 0.8933701823425155
AUC SVD02:None 
auc : 0.7906537542325437
acc : 0.7637913997882032
precision : 0.7773394753572136
recall : 0.896229985881222
AUC SVD03:None 
auc : 0.7957495058662343
acc : 0.7657794557057205
precision : 0.7791600323666832
recall : 0.8967046171407973
AUC SVD04:None 
auc : 0.8038326773265547
acc : 0.7715625332162808
precision : 0.785557007100866
recall : 0.8959536183123554
AUC SVD05:None 
auc : 0.8084598575576581
acc : 0.774310380801285
precision : 0.7883249482095295
recall : 0.8962119619093394
AUC SVD06:None 
auc : 0.8117871114720614
acc : 0.7763811083510159
precision : 0.790605585123301
recall : 0.896049746162396
AUC SVD07:None 
auc : 0.8166637564790294
acc : 0.7794950731643945
precision : 0.7940911729772627
recall : 0.8957493466310192
AUC SVD08:None 
auc : 

In [14]:
test_path = '/opt/ml/level2-dkt-level2-recsys-08/LetsEnsemble/test4feature'
valid_path = '/opt/ml/level2-dkt-level2-recsys-08/LetsEnsemble/valid4feature'

test_to_csv( valid_predict01, os.path.join(valid_path, 'PCA_valid01') )
test_to_csv( valid_predict02, os.path.join(valid_path, 'PCA_valid02') )
test_to_csv( valid_predict03, os.path.join(valid_path, 'PCA_valid03') )
test_to_csv( valid_predict04, os.path.join(valid_path, 'PCA_valid04') )
test_to_csv( valid_predict05, os.path.join(valid_path, 'PCA_valid05') )
test_to_csv( valid_predict06, os.path.join(valid_path, 'PCA_valid06') )
test_to_csv( valid_predict07, os.path.join(valid_path, 'PCA_valid07') )
test_to_csv( valid_predict08, os.path.join(valid_path, 'PCA_valid08') )
test_to_csv( valid_predict09, os.path.join(valid_path, 'PCA_valid09') )
test_to_csv( valid_predict10, os.path.join(valid_path, 'PCA_valid10') )

test_to_csv( test_predict01, os.path.join(test_path, 'PCA_test01') )
test_to_csv( test_predict02, os.path.join(test_path, 'PCA_test02') )
test_to_csv( test_predict03, os.path.join(test_path, 'PCA_test03') )
test_to_csv( test_predict04, os.path.join(test_path, 'PCA_test04') )
test_to_csv( test_predict05, os.path.join(test_path, 'PCA_test05') )
test_to_csv( test_predict06, os.path.join(test_path, 'PCA_test06') )
test_to_csv( test_predict07, os.path.join(test_path, 'PCA_test07') )
test_to_csv( test_predict08, os.path.join(test_path, 'PCA_test08') )
test_to_csv( test_predict09, os.path.join(test_path, 'PCA_test09') )
test_to_csv( test_predict10, os.path.join(test_path, 'PCA_test10') )

In [26]:
new_valid = np.array([    
                    valid_predict01,
                    valid_predict02,
                    valid_predict03,
                    valid_predict04,
                    valid_predict05,
                    valid_predict06,
                    valid_predict07,
                    valid_predict08,
                    valid_predict09,
                    valid_predict10]).T

new_test = np.array([
                    test_predict01,
                    test_predict02,
                    test_predict03,
                    test_predict04,
                    test_predict05,
                    test_predict06,
                    test_predict07,
                    test_predict08,
                    test_predict09,
                    test_predict10,]).T

In [27]:
val = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv')
tail_idx = val.index[val.answerCode==-1].to_numpy()

In [28]:
y_valid = valid_data.answerCode.to_numpy()

valid_tail = [new_valid[i] for i in range(len(new_valid)) if i in tail_idx]
y_tail = [y_valid[i] for i in range(len(y_valid)) if i in tail_idx]

new_valid = [new_valid[i] for i in range(len(new_valid)) if not i in tail_idx]
y_new_valid = [y_valid[i] for i in range(len(y_valid)) if not i in tail_idx]

In [29]:
from catboost import CatBoostClassifier, Pool

train_pool = Pool(new_valid, y_new_valid)
# eval_pool = Pool(valid_tail , y_tail)
eval_pool = Pool(valid_tail, y_tail)

Final_cat = CatBoostClassifier(
            iterations = 3000,
            random_seed = 42,
            learning_rate = 0.001,
            loss_function = 'Logloss', 
            custom_metric = ['Logloss','AUC'],
            early_stopping_rounds = 30,
            use_best_model =  True,
            task_type = "GPU",
            bagging_temperature = 1,
            verbose = False)

Final_cat.fit(train_pool, eval_set=eval_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f9867a003a0>

In [30]:
preds = Final_cat.predict(new_test , prediction_type='Probability')[:,1]

from datetime import date, datetime, timezone, timedelta

KST = timezone(timedelta(hours=9))
time_record = datetime.now(KST)
_day = str(time_record)[:10]
_time = str(time_record.time())[:8]
now_time = _day+'_'+_time

test_to_csv(new_test.mean(axis=1),f'Stacking_PCA_{now_time}')