In [1]:
import numpy as np # linear algebra
import pandas as pd
import random
from scipy.sparse.linalg import svds

from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, PCA

import torch
from tqdm import tqdm

from sklearn.metrics import RocCurveDisplay, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc


def get_metric(targets, preds):
    auc = roc_auc_score(targets, preds)
    acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))
    precsion = precision_score(targets, np.where(preds >= 0.5, 1, 0))
    recall = recall_score(targets, np.where(preds >= 0.5, 1, 0))
    F1_score = f1_score(targets, np.where(preds >= 0.5, 1, 0))

    print('auc :',round(auc,4))
    print('acc :',round(acc,4))
    print('precision :',round(precsion,4))
    print('recall :',round(recall,4))

def test_to_csv(preds, name:str):
    
    result = []
    for n,i in enumerate(preds):
        row = {}    
        row['id'] = n
        row['prediction'] = i
        result.append(row)
    pd.DataFrame(result).to_csv(f'output/{name}.csv', index=None)
    

In [2]:
def predict(matrix, userid, itemid, user_id2idx, item_id2idx, pca_model):
 
    X = matrix
    
    X_pred = pca_model.inverse_transform(pca_model.transform(X))

    ret = [X_pred[user_id2idx[u], item_id2idx[i]] for u,i in zip(userid, itemid)]
    return ret

In [3]:
valid_user  = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv').userID.unique()
all_train = pd.read_csv('/opt/ml/input/data/all.csv')
valid_data = all_train[all_train.userID.isin(valid_user)]
userid = sorted(list(set([u for u in valid_data.userID])))
user_id2idx_valid = {v:i for i,v in enumerate(userid)}

matrix_valid = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(valid_data.userID, valid_data.assessmentItemID, valid_data.answerCode):
    user,item = user_id2idx_valid[user],item_id2idx[item]
    matrix_valid[user,item] = a

valid_predict = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx)

NameError: name 'item_id2idx' is not defined

In [None]:
# item_id2idx는 train에서 사용한 것을 다시 사용한다.
test_data  = pd.read_csv('/opt/ml/input/data/test_data.csv')

userid = sorted(list(set([u for u in test_data.userID])))
user_id2idx_test = {v:i for i,v in enumerate(userid)}

matrix_test = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(test_data.userID, test_data.assessmentItemID, test_data.answerCode):
    user,item = user_id2idx_test[user],item_id2idx[item]
    if a<0:a=0.5
    matrix_test[user,item] = a

test_data = test_data[test_data.answerCode==-1]

test_predict = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, pca)

In [42]:
print("AUC NMF2:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict))))

auc : 0.780231974890282
acc : 0.7606853084636067
precision : 0.7755436285018334
recall : 0.8933161104268678
AUC NMF2:None 


In [43]:
test_to_csv(test_predict,'PCA')

In [6]:
print(
'''
PCA 
auc : 0.7802
acc : 0.7606
precision : 0.7755
recall : 0.8933
''')


PCA 
auc : 0.7802
acc : 0.7606
precision : 0.7755
recall : 0.8933



In [23]:
targets = [0, 0, 0, 0, 0, 0, 1]
preds = [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]

roc_auc_score(targets, preds)

0.5