In [None]:
# https://www.kaggle.com/code/sarmat/lgbm-stacking-example/notebook

In [1]:
import numpy as np # linear algebra
import pandas as pd
import random
from scipy.sparse.linalg import svds

from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, ElasticNet

import torch
from tqdm import tqdm
from dataset import custom_train_test_split, make_dataset

from sklearn.metrics import RocCurveDisplay, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc


def get_metric(targets, preds):
    auc = roc_auc_score(targets, preds)
    acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))
    precsion = precision_score(targets, np.where(preds >= 0.5, 1, 0))
    recall = recall_score(targets, np.where(preds >= 0.5, 1, 0))
    F1_score = f1_score(targets, np.where(preds >= 0.5, 1, 0))

    print('auc :',auc)
    print('acc :',acc)
    print('precision :',precsion)
    print('recall :',recall)

def test_to_csv(preds, name:str):
    
    result = []
    for n,i in enumerate(preds):
        row = {}    
        row['id'] = n
        row['prediction'] = i
        result.append(row)
    pd.DataFrame(result).to_csv(f'output/{name}.csv', index=None)
    


In [5]:
t = pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/test_data-1.pkl')
test_to_csv(t[t.answerCode==-1].elo_prob, 'elo_prob')

In [2]:
cate_cols = [
            # 'assessmentItemID',
            # 'testId',
            # 'KnowledgeTag',
            'hour',
            'dow',
            'i_head',
            'i_mid',
            'i_tail',
]
cont_cols = [                        
            'user_correct_answer',
            'user_total_answer',
            'user_acc',            
            't_elapsed',            
            'cum_correct',
            'last_problem',
            'head_term',
            # 'left_asymptote',
            'elo_prob',
            'pkt',
            'u_head_mean',
            'u_head_count',
            'u_head_std',
            'u_head_elapsed',
            'i_mid_elapsed',
            'i_mid_mean',
            'i_mid_std',
            'i_mid_sum',
            'i_mid_count',
            'i_mid_tag_count',
            'assessment_mean',
            'assessment_sum',
            # 'assessment_std',
            'tag_mean',
            'tag_sum',
            # 'tag_std',
            'tail_mean',
            'tail_sum',
            # 'tail_std',
            'hour_mean',
            'hour_sum',
            # 'hour_std',
            'dow_mean',
            'dow_sum',
            # 'dow_std',
            'tag_elapsed',
            'tag_elapsed_o',
            'tag_elapsed_x',
            'assessment_elapsed',
            'assessment_elapsed_o',
            'assessment_elapsed_x',
            'tail_elapsed',
            'tail_elapsed_o',
            'tail_elapsed_x'
            ]

FEATS = cate_cols + cont_cols

In [14]:
model01 = ElasticNet(max_iter=2000, alpha= 0.4, fit_intercept= True)
model02 = ElasticNet(max_iter=2000, alpha= 0.4, l1_ratio=0.1)
model03 = ElasticNet(max_iter=2000, alpha= 0.4, l1_ratio=0.9)
model04 = ElasticNet(max_iter=2000, alpha= 0.6, fit_intercept= True)
model05 = ElasticNet(max_iter=2000, alpha= 0.6, l1_ratio=0.1)
model06 = ElasticNet(max_iter=2000, alpha= 0.6, l1_ratio=0.9)
model07 = ElasticNet(max_iter=2000, alpha= 0.6)
model08 = ElasticNet(max_iter=2000, alpha= 0.8, fit_intercept= True)
model09 = ElasticNet(max_iter=2000, alpha= 0.8, l1_ratio=0.1)
model10 = ElasticNet(max_iter=2000, alpha= 0.8, l1_ratio=0.9)

In [15]:
data = pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/all.pkl')
label = data.answerCode.to_numpy()

valid = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv')
valid_index = valid[valid.answerCode==-1].index

X_train = data[data.index.isin(valid_index)==False]
y_train = X_train.answerCode.to_numpy()

X_valid = data[data.index.isin(valid_index)==True]
y_valid = X_valid.answerCode.to_numpy()

In [16]:
model01.fit(X_train[FEATS], y_train)
model02.fit(X_train[FEATS], y_train)
model03.fit(X_train[FEATS], y_train)
model04.fit(X_train[FEATS], y_train)
model05.fit(X_train[FEATS], y_train)
model06.fit(X_train[FEATS], y_train)
model07.fit(X_train[FEATS], y_train)
model08.fit(X_train[FEATS], y_train)
model09.fit(X_train[FEATS], y_train)
model10.fit(X_train[FEATS], y_train)

ElasticNet(alpha=0.8, l1_ratio=0.9, max_iter=2000)

In [17]:
valid_predict01 = model01.predict(X_valid[FEATS])
valid_predict02 = model02.predict(X_valid[FEATS])
valid_predict03 = model03.predict(X_valid[FEATS])
valid_predict04 = model04.predict(X_valid[FEATS])
valid_predict05 = model05.predict(X_valid[FEATS])
valid_predict06 = model06.predict(X_valid[FEATS])
valid_predict07 = model07.predict(X_valid[FEATS])
valid_predict08 = model08.predict(X_valid[FEATS])
valid_predict09 = model09.predict(X_valid[FEATS])
valid_predict10 = model10.predict(X_valid[FEATS])

# item_id2idx는 train에서 사용한 것을 다시 사용한다.
test = pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/test_data-1.pkl')
test = test[test.answerCode==-1]

test_predict01 = model01.predict(test[FEATS])
test_predict02 = model02.predict(test[FEATS])
test_predict03 = model03.predict(test[FEATS])
test_predict04 = model04.predict(test[FEATS])
test_predict05 = model05.predict(test[FEATS])
test_predict06 = model06.predict(test[FEATS])
test_predict07 = model07.predict(test[FEATS])
test_predict08 = model08.predict(test[FEATS])
test_predict09 = model09.predict(test[FEATS])
test_predict10 = model10.predict(test[FEATS])


print("AUC ELASTIC 1:{} ".format(get_metric(y_valid, np.array(valid_predict01))))
print("AUC ELASTIC 2:{} ".format(get_metric(y_valid, np.array(valid_predict02))))
print("AUC ELASTIC 3:{} ".format(get_metric(y_valid, np.array(valid_predict03))))
print("AUC ELASTIC 4:{} ".format(get_metric(y_valid, np.array(valid_predict04))))
print("AUC ELASTIC 5:{} ".format(get_metric(y_valid, np.array(valid_predict05))))
print("AUC ELASTIC 6:{} ".format(get_metric(y_valid, np.array(valid_predict06))))
print("AUC ELASTIC 7:{} ".format(get_metric(y_valid, np.array(valid_predict07))))
print("AUC ELASTIC 8:{} ".format(get_metric(y_valid, np.array(valid_predict08))))
print("AUC ELASTIC 9:{} ".format(get_metric(y_valid, np.array(valid_predict09))))
print("AUC ELASTIC 0:{} ".format(get_metric(y_valid, np.array(valid_predict10))))

auc : 0.7923155162738496
acc : 0.7634408602150538
precision : 0.8055555555555556
recall : 0.8787878787878788
AUC ELASTIC 1:None 
auc : 0.7936307519640853
acc : 0.7594086021505376
precision : 0.8066783831282952
recall : 0.8693181818181818
AUC ELASTIC 2:None 
auc : 0.7918069584736251
acc : 0.7782258064516129
precision : 0.8071065989847716
recall : 0.9034090909090909
AUC ELASTIC 3:None 
auc : 0.7922979797979798
acc : 0.771505376344086
precision : 0.8086206896551724
recall : 0.8882575757575758
AUC ELASTIC 4:None 
auc : 0.7931134259259259
acc : 0.7634408602150538
precision : 0.8109540636042403
recall : 0.8693181818181818
AUC ELASTIC 5:None 
auc : 0.790518027497194
acc : 0.7741935483870968
precision : 0.7970297029702971
recall : 0.9147727272727273
AUC ELASTIC 6:None 
auc : 0.7922979797979798
acc : 0.771505376344086
precision : 0.8086206896551724
recall : 0.8882575757575758
AUC ELASTIC 7:None 
auc : 0.7913685465768799
acc : 0.7795698924731183
precision : 0.8063973063973064
recall : 0.90719696

In [18]:
new_valid = X_valid[FEATS].copy()
new_valid.loc[:,'predict01'] = valid_predict01
valid_predict01
new_valid.loc[:,'predict02'] = valid_predict02
new_valid.loc[:,'predict03'] = valid_predict03
new_valid.loc[:,'predict04'] = valid_predict04
new_valid.loc[:,'predict05'] = valid_predict05
new_valid.loc[:,'predict06'] = valid_predict06
new_valid.loc[:,'predict07'] = valid_predict07
new_valid.loc[:,'predict08'] = valid_predict08
new_valid.loc[:,'predict09'] = valid_predict09
new_valid.loc[:,'predict10'] = valid_predict10


# valid_tail = new_valid[new_valid.index.isin(X_valid.groupby('userID').tail(1).index)==True]
# new_valid = new_valid[new_valid.index.isin(X_valid.groupby('userID').tail(1).index)==False]

new_test = test[FEATS].copy()
new_test.loc[:,'predict01'] = test_predict01
new_test.loc[:,'predict02'] = test_predict02
new_test.loc[:,'predict03'] = test_predict03
new_test.loc[:,'predict04'] = test_predict04
new_test.loc[:,'predict05'] = test_predict05
new_test.loc[:,'predict06'] = test_predict06
new_test.loc[:,'predict07'] = test_predict07
new_test.loc[:,'predict08'] = test_predict08
new_test.loc[:,'predict09'] = test_predict09
new_test.loc[:,'predict10'] = test_predict10

In [19]:
Final = ElasticNet(alpha=0.7)
Final.fit(new_valid, y_valid)

ElasticNet(alpha=0.7)

In [20]:
preds = Final.predict(new_test)

In [21]:
from datetime import date, datetime, timezone, timedelta

KST = timezone(timedelta(hours=9))
time_record = datetime.now(KST)
_day = str(time_record)[:10]
_time = str(time_record.time())[:8]
now_time = _day+'_'+_time


test_to_csv(preds, f'Blending_Elastic_{now_time}')

In [23]:
test_to_csv(test_predict01, 'Elastic01')
test_to_csv(test_predict02, 'Elastic02')
test_to_csv(test_predict03, 'Elastic03')
test_to_csv(test_predict04, 'Elastic04')
test_to_csv(test_predict05, 'Elastic05')
test_to_csv(test_predict06, 'Elastic06')
test_to_csv(test_predict07, 'Elastic07')
test_to_csv(test_predict08, 'Elastic08')
test_to_csv(test_predict09, 'Elastic09')
test_to_csv(test_predict10, 'Elastic10')