In [5]:
# https://www.kaggle.com/code/sarmat/lgbm-stacking-example/notebook

In [1]:
import numpy as np # linear algebra
import pandas as pd
import random
from scipy.sparse.linalg import svds
import os

from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split

import torch
from tqdm import tqdm
from dataset import custom_train_test_split, make_dataset

from sklearn.metrics import RocCurveDisplay, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc


def get_metric(targets, preds):
    auc = roc_auc_score(targets, preds)
    acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))
    precsion = precision_score(targets, np.where(preds >= 0.5, 1, 0))
    recall = recall_score(targets, np.where(preds >= 0.5, 1, 0))
    F1_score = f1_score(targets, np.where(preds >= 0.5, 1, 0))

    print('auc :',auc)
    print('acc :',acc)
    print('precision :',precsion)
    print('recall :',recall)

def test_to_csv(preds, name:str):
    
    result = []
    for n,i in enumerate(preds):
        row = {}    
        row['id'] = n
        row['prediction'] = i
        result.append(row)
    pd.DataFrame(result).to_csv(f'output/{name}.csv', index=None)
    


In [2]:
train_data = pd.read_csv('/opt/ml/input/data/train_data.csv')
test_data  = pd.read_csv('/opt/ml/input/data/test_data.csv')

In [3]:
train_data.drop_duplicates(subset = ["userID", "assessmentItemID"], keep = "last", inplace = True)
train_data.drop(['Timestamp','testId','KnowledgeTag'], axis=1, inplace=True, errors='ignore')

In [4]:
matrix_train = train_data.pivot_table('answerCode', index='userID', columns='assessmentItemID')
matrix_train.fillna(0.5, inplace=True)

In [5]:
user_id2idx = {v:i for i,v in enumerate(matrix_train.index)}
user_idx2id = {i:v for i,v in enumerate(matrix_train.index)}

item_id2idx = {v:i for i,v in enumerate(matrix_train.columns)}
item_idx2id = {i:v for i,v in enumerate(matrix_train.columns)}

In [9]:
def predict(matrix, userid, itemid, user_id2idx, item_id2idx, k):
    A = matrix_train.values
    a_mean = np.mean(A, axis=1)
    Am = A - a_mean.reshape(-1,1)

    U, sigma, V = svds(Am, k=k)
    
    Sigma = np.diag(sigma)
    Sigma_i = np.diag(1/sigma)
    pred_matrix = V.T @ Sigma_i @ Sigma @ V
    
    B = matrix
    B_mean = np.mean(B, axis=1)
    Bm = B - B_mean.reshape(-1,1)

    B_pred =  B @ pred_matrix + B_mean.reshape(-1,1)

    ret = [B_pred[user_id2idx[u], item_id2idx[i]] for u,i in zip(userid, itemid)]
    return ret

In [38]:
valid_user  = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv').userID.unique()
all_train = pd.read_csv('/opt/ml/input/data/all.csv')
valid_data = all_train[all_train.userID.isin(valid_user)]
userid = sorted(list(set([u for u in valid_data.userID])))
user_id2idx_valid = {v:i for i,v in enumerate(userid)}

matrix_valid = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(valid_data.userID, valid_data.assessmentItemID, valid_data.answerCode):
    user,item = user_id2idx_valid[user],item_id2idx[item]
    matrix_valid[user,item] = a

valid_predict01 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 30)
valid_predict02 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 32)
valid_predict03 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 34)
valid_predict04 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 36)
valid_predict05 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 38)
valid_predict06 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 40)
valid_predict07 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 42)
valid_predict08 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 44)
valid_predict09 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 46)
valid_predict10 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 48)

# item_id2idx는 train에서 사용한 것을 다시 사용한다.
test_data  = pd.read_csv('/opt/ml/input/data/test_data.csv')

userid = sorted(list(set([u for u in test_data.userID])))
user_id2idx_test = {v:i for i,v in enumerate(userid)}

matrix_test = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(test_data.userID, test_data.assessmentItemID, test_data.answerCode):
    user,item = user_id2idx_test[user],item_id2idx[item]
    matrix_test[user,item] = a

test_data = test_data[test_data.answerCode==-1]

test_predict01 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 30)
test_predict02 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 32)
test_predict03 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 34)
test_predict04 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 36)
test_predict05 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 38)
test_predict06 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 40)
test_predict07 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 42)
test_predict08 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 44)
test_predict09 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 46)
test_predict10 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 48)

# print('Fold no: {}'.format(fold_))
print("AUC SVD01:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict01))))
print("AUC SVD02:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict02))))
print("AUC SVD03:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict03))))
print("AUC SVD04:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict04))))
print("AUC SVD05:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict05))))
print("AUC SVD06:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict06))))
print("AUC SVD07:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict07))))
print("AUC SVD08:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict08))))
print("AUC SVD09:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict09))))
print("AUC SVD10:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict10))))

auc : 0.809582539643665
acc : 0.7753890487644528
precision : 0.7896275271914682
recall : 0.8958995463967077
AUC SVD01:None 
auc : 0.8110194269747696
acc : 0.7762787529968467
precision : 0.7907709611017975
recall : 0.8955150349965454
AUC SVD02:None 
auc : 0.8132519611518163
acc : 0.7780463512284611
precision : 0.7924289684416365
recall : 0.895959626302983
AUC SVD03:None 
auc : 0.8160304483911209
acc : 0.7798336331820311
precision : 0.7944299696829228
recall : 0.8957974105560396
AUC SVD04:None 
auc : 0.8177256284819437
acc : 0.7806249188046469
precision : 0.7953163341512856
recall : 0.8957313226591367
AUC SVD05:None 
auc : 0.8191915662575665
acc : 0.7816366621131656
precision : 0.796379679629956
recall : 0.8957853945747845
AUC SVD06:None 
auc : 0.8204886614668208
acc : 0.7823610230811323
precision : 0.7972775653197123
recall : 0.8955691069121933
AUC SVD07:None 
auc : 0.8224968640301832
acc : 0.7836325915194652
precision : 0.7985453532713483
recall : 0.8957793865841569
AUC SVD08:None 
auc

In [None]:
test_to_csv( valid_predict01, os.path.join(valid_path, 'SVD_valid01') )
test_to_csv( valid_predict02, os.path.join(valid_path, 'SVD_valid02') )
test_to_csv( valid_predict03, os.path.join(valid_path, 'SVD_valid03') )
test_to_csv( valid_predict04, os.path.join(valid_path, 'SVD_valid04') )
test_to_csv( valid_predict05, os.path.join(valid_path, 'SVD_valid05') )
test_to_csv( valid_predict06, os.path.join(valid_path, 'SVD_valid06') )
test_to_csv( valid_predict07, os.path.join(valid_path, 'SVD_valid07') )
test_to_csv( valid_predict08, os.path.join(valid_path, 'SVD_valid08') )
test_to_csv( valid_predict09, os.path.join(valid_path, 'SVD_valid09') )
test_to_csv( valid_predict10, os.path.join(valid_path, 'SVD_valid10') )

test_to_csv( test_predict01, os.path.join(test_path, 'SVD_test01') )
test_to_csv( test_predict02, os.path.join(test_path, 'SVD_test02') )
test_to_csv( test_predict03, os.path.join(test_path, 'SVD_test03') )
test_to_csv( test_predict04, os.path.join(test_path, 'SVD_test04') )
test_to_csv( test_predict05, os.path.join(test_path, 'SVD_test05') )
test_to_csv( test_predict06, os.path.join(test_path, 'SVD_test06') )
test_to_csv( test_predict07, os.path.join(test_path, 'SVD_test07') )
test_to_csv( test_predict08, os.path.join(test_path, 'SVD_test08') )
test_to_csv( test_predict09, os.path.join(test_path, 'SVD_test09') )
test_to_csv( test_predict10, os.path.join(test_path, 'SVD_test10') )

In [39]:
new_valid = np.array([    
                    valid_predict01,
                    valid_predict02,
                    valid_predict03,
                    valid_predict04,
                    valid_predict05,
                    valid_predict06,
                    valid_predict07,
                    valid_predict08,
                    valid_predict09,
                    valid_predict10]).T

new_test = np.array([
                    test_predict01,
                    test_predict02,
                    test_predict03,
                    test_predict04,
                    test_predict05,
                    test_predict06,
                    test_predict07,
                    test_predict08,
                    test_predict09,
                    test_predict10,]).T

In [40]:
val = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv')
tail_idx = val.index[val.answerCode==-1].to_numpy()

In [41]:
y_valid = valid_data.answerCode.to_numpy()

valid_tail = [new_valid[i] for i in range(len(new_valid)) if i in tail_idx]
y_tail = [y_valid[i] for i in range(len(y_valid)) if i in tail_idx]

new_valid = [new_valid[i] for i in range(len(new_valid)) if not i in tail_idx]
y_new_valid = [y_valid[i] for i in range(len(y_valid)) if not i in tail_idx]

In [42]:
from catboost import CatBoostClassifier, Pool

train_pool = Pool(new_valid, y_new_valid)
# eval_pool = Pool(valid_tail , y_tail)
eval_pool = Pool(valid_tail, y_tail)

Final_cat = CatBoostClassifier(
            iterations = 3000,
            random_seed = 42,
            learning_rate = 0.001,
            loss_function = 'Logloss', 
            custom_metric = ['Logloss','AUC'],
            early_stopping_rounds = 30,
            use_best_model =  True,
            task_type = "GPU",
            bagging_temperature = 1,
            verbose = False)

Final_cat.fit(train_pool, eval_set=eval_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f67d4d8c490>

In [43]:
preds = Final_cat.predict(new_test , prediction_type='Probability')[:,1]

from datetime import date, datetime, timezone, timedelta

KST = timezone(timedelta(hours=9))
time_record = datetime.now(KST)
_day = str(time_record)[:10]
_time = str(time_record.time())[:8]
now_time = _day+'_'+_time

test_to_csv(new_test.mean(axis=1),f'Stacking_SVD_{now_time}')

In [101]:
test_to_csv()

744