In [1]:
# https://www.kaggle.com/code/sarmat/lgbm-stacking-example/notebook

In [22]:
import numpy as np # linear algebra
import pandas as pd
import random
from scipy.sparse.linalg import svds
import os
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

import torch
from tqdm import tqdm
from dataset import custom_train_test_split, make_dataset

from sklearn.metrics import RocCurveDisplay, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc


def get_metric(targets, preds):
    auc = roc_auc_score(targets, preds)
    acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))
    precsion = precision_score(targets, np.where(preds >= 0.5, 1, 0))
    recall = recall_score(targets, np.where(preds >= 0.5, 1, 0))
    F1_score = f1_score(targets, np.where(preds >= 0.5, 1, 0))

    print('auc :',auc)
    print('acc :',acc)
    print('precision :',precsion)
    print('recall :',recall)

def test_to_csv(preds, name:str):
    
    result = []
    for n,i in enumerate(preds):
        row = {}    
        row['id'] = n
        row['prediction'] = i
        result.append(row)
    pd.DataFrame(result).to_csv(name+'.csv', index=None)
    


In [None]:
'''
NMF01 
auc : 0.7953
acc : 0.7684
precision : 0.7832
recall : 0.8940

auc : 0.7972041283367117
acc : 0.7695351098548522
precision : 0.7839268704708476
recall : 0.8949562918681847
AUC NMF02:None 
auc : 0.7998546055565555
acc : 0.7694918056665498
precision : 0.7834012776330167
recall : 0.8959235783592178
AUC NMF03:None 
auc : 0.8023755707592811
acc : 0.7718459788124417
precision : 0.7858219257479477
recall : 0.8960197062092583
AUC NMF04:None 
auc : 0.804514090366049
acc : 0.7735427156450159
precision : 0.787716934780886
recall : 0.8958094265372946
AUC NMF05:None 
auc : 0.8066111525597577
acc : 0.7742040887027246
precision : 0.7885876035025529
recall : 0.8954729790621526
AUC NMF06:None 
auc : 0.8082522468764606
acc : 0.7749717538589936
precision : 0.7896567078730307
recall : 0.8949743158400673
AUC NMF07:None 
auc : 0.8099958597197554
acc : 0.776227575319762
precision : 0.7909170537491705
recall : 0.8951245156057557
AUC NMF08:None 
auc : 0.8118606391044008
acc : 0.7777747158654735
precision : 0.7926297194909122
recall : 0.8950103637838325
AUC NMF09:None 
auc : 0.8139937635119844
acc : 0.7786368628871296
precision : 0.7937734893514939
recall : 0.8945898044399051
AUC NMF10:None 
'''

In [2]:
train_data = pd.read_csv('/opt/ml/input/data/all.csv')
test_data  = pd.read_csv('/opt/ml/input/data/test_data.csv')

In [3]:
train_data.drop_duplicates(subset = ["userID", "assessmentItemID"], keep = "last", inplace = True)
train_data.drop(['Timestamp','testId','KnowledgeTag'], axis=1, inplace=True, errors='ignore')

In [4]:
matrix_train = train_data.pivot_table('answerCode', index='userID', columns='assessmentItemID')
matrix_train.fillna(0.5, inplace=True)

In [5]:
user_id2idx = {v:i for i,v in enumerate(matrix_train.index)}
user_idx2id = {i:v for i,v in enumerate(matrix_train.index)}

item_id2idx = {v:i for i,v in enumerate(matrix_train.columns)}
item_idx2id = {i:v for i,v in enumerate(matrix_train.columns)}

In [16]:
nmf01 = NMF(n_components=20, max_iter=1000)
nmf02 = NMF(n_components=22, max_iter=1000)
nmf03 = NMF(n_components=24, max_iter=1000)
nmf04 = NMF(n_components=26, max_iter=1000)
nmf05 = NMF(n_components=28, max_iter=1000)
nmf06 = NMF(n_components=30, max_iter=1000)
nmf07 = NMF(n_components=32, max_iter=1000)
nmf08 = NMF(n_components=34, max_iter=1000)
nmf09 = NMF(n_components=36, max_iter=1000)
nmf10 = NMF(n_components=38, max_iter=1000)

nmf01.fit(matrix_train)
nmf02.fit(matrix_train)
nmf03.fit(matrix_train)
nmf04.fit(matrix_train)
nmf05.fit(matrix_train)
nmf06.fit(matrix_train)
nmf07.fit(matrix_train)
nmf08.fit(matrix_train)
nmf09.fit(matrix_train)
nmf10.fit(matrix_train)





NMF(max_iter=1000, n_components=38)

In [19]:
def predict(matrix, userid, itemid, user_id2idx, item_id2idx, model):

    X_pred = model.inverse_transform(model.transform(matrix))

    ret = [X_pred[user_id2idx[u], item_id2idx[i]] for u,i in zip(userid, itemid)]
    return ret

In [20]:
valid_user  = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv').userID.unique()
all_train = pd.read_csv('/opt/ml/input/data/all.csv')
valid_data = all_train[all_train.userID.isin(valid_user)]
userid = sorted(list(set([u for u in valid_data.userID])))
user_id2idx_valid = {v:i for i,v in enumerate(userid)}

matrix_valid = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(valid_data.userID, valid_data.assessmentItemID, valid_data.answerCode):
    user,item = user_id2idx_valid[user],item_id2idx[item]
    matrix_valid[user,item] = a



valid_predict01 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, nmf01 )
valid_predict02 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, nmf02 )
valid_predict03 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, nmf03 )
valid_predict04 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, nmf04 )
valid_predict05 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, nmf05 )
valid_predict06 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, nmf06 )
valid_predict07 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, nmf07 )
valid_predict08 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, nmf08 )
valid_predict09 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, nmf09 )
valid_predict10 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, nmf10 )

# item_id2idx는 train에서 사용한 것을 다시 사용한다.
test_data  = pd.read_csv('/opt/ml/input/data/test_data.csv')

userid = sorted(list(set([u for u in test_data.userID])))
user_id2idx_test = {v:i for i,v in enumerate(userid)}

matrix_test = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(test_data.userID, test_data.assessmentItemID, test_data.answerCode):
    user,item = user_id2idx_test[user],item_id2idx[item]
    if a<0:a=0.5
    matrix_test[user,item] = a

test_data = test_data[test_data.answerCode==-1]

test_predict01 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, nmf01 )
test_predict02 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, nmf02 )
test_predict03 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, nmf03 )
test_predict04 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, nmf04 )
test_predict05 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, nmf05 )
test_predict06 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, nmf06 )
test_predict07 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, nmf07 )
test_predict08 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, nmf08 )
test_predict09 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, nmf09 )
test_predict10 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, nmf10 )

# print('Fold no: {}'.format(fold_))
print("AUC NMF01:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict01))))
print("AUC NMF02:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict02))))
print("AUC NMF03:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict03))))
print("AUC NMF04:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict04))))
print("AUC NMF05:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict05)))) 
print("AUC NMF06:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict06)))) 
print("AUC NMF07:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict07)))) 
print("AUC NMF08:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict08)))) 
print("AUC NMF09:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict09)))) 
print("AUC NMF10:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict10))))



auc : 0.7953082801102949
acc : 0.7684643153804667
precision : 0.7832797292246793
recall : 0.8940010213584066
AUC NMF01:None 
auc : 0.7972041283367117
acc : 0.7695351098548522
precision : 0.7839268704708476
recall : 0.8949562918681847
AUC NMF02:None 
auc : 0.7998546055565555
acc : 0.7694918056665498
precision : 0.7834012776330167
recall : 0.8959235783592178
AUC NMF03:None 
auc : 0.8023755707592811
acc : 0.7718459788124417
precision : 0.7858219257479477
recall : 0.8960197062092583
AUC NMF04:None 
auc : 0.804514090366049
acc : 0.7735427156450159
precision : 0.787716934780886
recall : 0.8958094265372946
AUC NMF05:None 
auc : 0.8066111525597577
acc : 0.7742040887027246
precision : 0.7885876035025529
recall : 0.8954729790621526
AUC NMF06:None 
auc : 0.8082522468764606
acc : 0.7749717538589936
precision : 0.7896567078730307
recall : 0.8949743158400673
AUC NMF07:None 
auc : 0.8099958597197554
acc : 0.776227575319762
precision : 0.7909170537491705
recall : 0.8951245156057557
AUC NMF08:None 
auc

In [23]:
test_path = '/opt/ml/level2-dkt-level2-recsys-08/LetsEnsemble/test4feature'
valid_path = '/opt/ml/level2-dkt-level2-recsys-08/LetsEnsemble/valid4feature'

test_to_csv( valid_predict01, os.path.join(valid_path, 'NMF_valid01') )
test_to_csv( valid_predict02, os.path.join(valid_path, 'NMF_valid02') )
test_to_csv( valid_predict03, os.path.join(valid_path, 'NMF_valid03') )
test_to_csv( valid_predict04, os.path.join(valid_path, 'NMF_valid04') )
test_to_csv( valid_predict05, os.path.join(valid_path, 'NMF_valid05') )
test_to_csv( valid_predict06, os.path.join(valid_path, 'NMF_valid06') )
test_to_csv( valid_predict07, os.path.join(valid_path, 'NMF_valid07') )
test_to_csv( valid_predict08, os.path.join(valid_path, 'NMF_valid08') )
test_to_csv( valid_predict09, os.path.join(valid_path, 'NMF_valid09') )
test_to_csv( valid_predict10, os.path.join(valid_path, 'NMF_valid10') )

test_to_csv( test_predict01, os.path.join(test_path, 'NMF_test01') )
test_to_csv( test_predict02, os.path.join(test_path, 'NMF_test02') )
test_to_csv( test_predict03, os.path.join(test_path, 'NMF_test03') )
test_to_csv( test_predict04, os.path.join(test_path, 'NMF_test04') )
test_to_csv( test_predict05, os.path.join(test_path, 'NMF_test05') )
test_to_csv( test_predict06, os.path.join(test_path, 'NMF_test06') )
test_to_csv( test_predict07, os.path.join(test_path, 'NMF_test07') )
test_to_csv( test_predict08, os.path.join(test_path, 'NMF_test08') )
test_to_csv( test_predict09, os.path.join(test_path, 'NMF_test09') )
test_to_csv( test_predict10, os.path.join(test_path, 'NMF_test10') )

In [15]:
new_valid = np.array([
                        valid_predict01,
                        valid_predict02,
                        valid_predict03,
                        valid_predict04,
                        valid_predict05,
                        valid_predict06,
                        valid_predict07,
                        valid_predict08,
                        valid_predict09,
                        valid_predict10,
                        valid_predict11,
                        valid_predict12,
                        valid_predict13,
                        valid_predict14,
                        valid_predict15,
                        valid_predict16,
                        valid_predict17,
                        valid_predict18,]).T

new_test = np.array([
                        test_predict01,
                        test_predict02,
                        test_predict03,
                        test_predict04,
                        test_predict05,
                        test_predict06,
                        test_predict07,
                        test_predict08,
                        test_predict09,
                        test_predict10,
                        test_predict11,
                        test_predict12,
                        test_predict13,
                        test_predict14,
                        test_predict15,
                        test_predict16,
                        test_predict17,
                        test_predict18,]).T

In [16]:
val = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv')
tail_idx = val.index[val.answerCode==-1].to_numpy()

In [17]:
y_valid = valid_data.answerCode.to_numpy()

valid_tail = [new_valid[i] for i in range(len(new_valid)) if i in tail_idx]
y_tail = [y_valid[i] for i in range(len(y_valid)) if i in tail_idx]

new_valid = [new_valid[i] for i in range(len(new_valid)) if not i in tail_idx]
y_new_valid = [y_valid[i] for i in range(len(y_valid)) if not i in tail_idx]

In [18]:
from catboost import CatBoostClassifier, Pool

train_pool = Pool(new_valid, y_new_valid)
# eval_pool = Pool(valid_tail , y_tail)
eval_pool = Pool(valid_tail, y_tail)

Final_cat = CatBoostClassifier(
            iterations = 500,
            random_seed = 42,
            learning_rate = 0.01,
            loss_function = 'Logloss', 
            custom_metric = ['Logloss','AUC'],
            early_stopping_rounds = 30,
            use_best_model =  True,
            task_type = "GPU",
            bagging_temperature = 1,
            verbose = False)

Final_cat.fit(train_pool, eval_set=eval_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f04c04f3a00>

In [20]:
preds = Final_cat.predict(new_test , prediction_type='Probability')[:,1]
val_preds = Final_cat.predict(valid_tail , prediction_type='Probability')[:,1]

get_metric(y_tail, val_preds)

from datetime import date, datetime, timezone, timedelta

KST = timezone(timedelta(hours=9))
time_record = datetime.now(KST)
_day = str(time_record)[:10]
_time = str(time_record.time())[:8]
now_time = _day+'_'+_time

test_to_csv(new_test.mean(axis=1),f'Stacking_NMF_{now_time}')

auc : 0.9045335626734506
acc : 0.8198924731182796
precision : 0.8145161290322581
recall : 0.8233695652173914


In [None]:
test_to_csv( new_test[:,0], 'NMF_38')
test_to_csv( new_test[:,1], 'NMF_40')
test_to_csv( new_test[:,2], 'NMF_42')
test_to_csv( new_test[:,3], 'NMF_44')
test_to_csv( new_test[:,4], 'NMF_46')
test_to_csv( new_test[:,5], 'NMF_48')
test_to_csv( new_test[:,6], 'NMF_50')
test_to_csv( new_test[:,7], 'NMF_52')


In [32]:
new_test[:,-1]

744