In [21]:
from sklearn.linear_model import LogisticRegression, ElasticNet, SGDClassifier, MultiTaskElasticNet, BayesianRidge, ARDRegression
import pandas as pd
import numpy as np
from sklearn.metrics import RocCurveDisplay, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc


def get_metric(targets, preds):
    auc = roc_auc_score(targets, preds)
    acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))
    precsion = precision_score(targets, np.where(preds >= 0.5, 1, 0))
    recall = recall_score(targets, np.where(preds >= 0.5, 1, 0))
    F1_score = f1_score(targets, np.where(preds >= 0.5, 1, 0))

    print('auc :',auc)
    print('acc :',acc)
    print('precision :',precsion)
    print('recall :',recall)

def test_to_csv(preds, name:str):
    
    result = []
    for n,i in enumerate(preds):
        row = {}    
        row['id'] = n
        row['prediction'] = i
        result.append(row)
    pd.DataFrame(result).to_csv(f'{name}', index=None)
    

In [2]:
cate_cols = [
            # 'assessmentItemID',
            # 'testId',
            # 'KnowledgeTag',
            'hour',
            'dow',
            'i_head',
            'i_mid',
            'i_tail',
]
cont_cols = [                        
            'user_correct_answer',
            'user_total_answer',
            'user_acc',            
            't_elapsed',            
            'cum_correct',
            'last_problem',
            'head_term',
            # 'left_asymptote',
            'elo_prob',
            'pkt',
            'u_head_mean',
            'u_head_count',
            'u_head_std',
            'u_head_elapsed',
            'i_mid_elapsed',
            'i_mid_mean',
            'i_mid_std',
            'i_mid_sum',
            'i_mid_count',
            'i_mid_tag_count',
            'assessment_mean',
            'assessment_sum',
            # 'assessment_std',
            'tag_mean',
            'tag_sum',
            # 'tag_std',
            'tail_mean',
            'tail_sum',
            # 'tail_std',
            'hour_mean',
            'hour_sum',
            # 'hour_std',
            'dow_mean',
            'dow_sum',
            # 'dow_std',
            'tag_elapsed',
            'tag_elapsed_o',
            'tag_elapsed_x',
            'assessment_elapsed',
            'assessment_elapsed_o',
            'assessment_elapsed_x',
            'tail_elapsed',
            'tail_elapsed_o',
            'tail_elapsed_x'
            ]

FEATS = cate_cols + cont_cols

In [3]:
data = pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/all.pkl')
label = data.answerCode.to_numpy()

valid_user = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv').userID.unique()
test_user = pd.read_csv('/opt/ml/input/data/test_data.csv').userID.unique()

train = data[data.userID.isin(valid_user)==False]
train = data[data.userID.isin(test_user)==False]

X_train = data[FEATS]
y_train = data.answerCode
X_valid = data[data.userID.isin(valid_user)][FEATS]
y_valid = data[data.userID.isin(valid_user)].answerCode

In [4]:
ARD = ARDRegression()

In [5]:
ARD.fit(X_train, y_train)

ARDRegression()

In [17]:
valid_preds = ARD.predict(X_valid[FEATS])

In [18]:
get_metric(y_valid,valid_preds)

auc : 0.8453184936669191
acc : 0.7900573583657786
precision : 0.8059670002704896
recall : 0.8950884676619905


In [9]:
test = pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/test_data-1.pkl')

In [10]:
test = test[test.answerCode==-1]

In [12]:
test_preds = ARD.predict(test[FEATS])

In [24]:
test_to_csv(valid_preds,'/opt/ml/level2-dkt-level2-recsys-08/LetsEnsemble/valid4feature/ARD.csv')

test