In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, ElasticNet, SGDClassifier, MultiTaskElasticNet, BayesianRidge, Lasso
import pandas as pd
import numpy as np
from sklearn.metrics import RocCurveDisplay, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.decomposition import NMF, PCA


def get_metric(targets, preds):
    auc = roc_auc_score(targets, preds)
    acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))
    precsion = precision_score(targets, np.where(preds >= 0.5, 1, 0))
    recall = recall_score(targets, np.where(preds >= 0.5, 1, 0))
    F1_score = f1_score(targets, np.where(preds >= 0.5, 1, 0))

    print('auc :',auc)
    print('acc :',acc)
    print('precision :',precsion)
    print('recall :',recall)

def test_to_csv(preds, name:str):
    
    result = []
    for n,i in enumerate(preds):
        row = {}    
        row['id'] = n
        row['prediction'] = i
        result.append(row)
    pd.DataFrame(result).to_csv(f'output/{name}.csv', index=None)
    
    

In [2]:
cate_cols = [
            # 'assessmentItemID',
            # 'testId',
            # 'KnowledgeTag',
            'hour',
            'dow',
            'i_head',
            'i_mid',
            'i_tail',
]
cont_cols = [                        
            'user_correct_answer',
            'user_total_answer',
            'user_acc',            
            't_elapsed',            
            'cum_correct',
            'last_problem',
            'head_term',
            # 'left_asymptote',
            'elo_prob',
            'pkt',
            'u_head_mean',
            'u_head_count',
            'u_head_std',
            'u_head_elapsed',
            'i_mid_elapsed',
            'i_mid_mean',
            'i_mid_std',
            'i_mid_sum',
            'i_mid_count',
            'i_mid_tag_count',
            'assessment_mean',
            'assessment_sum',
            # 'assessment_std',
            'tag_mean',
            'tag_sum',
            # 'tag_std',
            'tail_mean',
            'tail_sum',
            # 'tail_std',
            'hour_mean',
            'hour_sum',
            # 'hour_std',
            'dow_mean',
            'dow_sum',
            # 'dow_std',
            'tag_elapsed',
            'tag_elapsed_o',
            'tag_elapsed_x',
            'assessment_elapsed',
            'assessment_elapsed_o',
            'assessment_elapsed_x',
            'tail_elapsed',
            'tail_elapsed_o',
            'tail_elapsed_x'
            ]

FEATS = cate_cols + cont_cols

In [3]:
data = pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/all.pkl')
label = data.answerCode.to_numpy()

In [4]:
valid = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv')
valid_index = valid[valid.answerCode==-1].index

In [5]:
X_train = data[data.index.isin(valid_index)==False]
y_train = X_train.answerCode.to_numpy()
X_valid = data[data.index.isin(valid_index)==True]
y_valid = X_valid.answerCode.to_numpy()

In [6]:
lasso = Lasso(random_state=42)

# cross_val_score(clf, data[FEATS], label, cv=10)

In [7]:
lasso.fit(data[FEATS],label)

Lasso(random_state=42)

In [15]:
test = pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/test_data-1.pkl')
test = test[test.answerCode==-1][FEATS]

In [14]:
valid_preds = lasso.predict(X_valid[FEATS])

get_metric(y_valid, valid_preds)

auc : 0.7844065656565656
acc : 0.7661290322580645
precision : 0.780952380952381
recall : 0.9318181818181818


In [16]:
test_preds = lasso.predict(test[FEATS])

In [17]:
test_to_csv(test_preds, 'lasso')