In [1]:
import os
import numpy as np # linear algebra
import pandas as pd
import random
from scipy.sparse.linalg import svds

from catboost import CatBoostClassifier, Pool

from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, ElasticNet

import torch
from tqdm import tqdm
from dataset import custom_train_test_split, make_dataset

from sklearn.metrics import RocCurveDisplay, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc


def get_metric(targets, preds):
    """Print AUC, accuracy, precision, recall and F1 for binary predictions.

    Args:
        targets: Ground-truth binary labels (array-like).
        preds: Predicted scores (array-like). AUC is computed on the raw
            scores; the other metrics use labels obtained by thresholding
            the scores at 0.5.

    Returns:
        None. The metrics are only printed.
    """
    # Accept plain lists as well as arrays; `list >= 0.5` would raise.
    preds = np.asarray(preds)
    # Binarise once and reuse the result for every threshold-based metric.
    pred_labels = np.where(preds >= 0.5, 1, 0)

    # Named `auc_score` so the imported `auc` function is not shadowed.
    auc_score = roc_auc_score(targets, preds)
    acc = accuracy_score(targets, pred_labels)
    precision = precision_score(targets, pred_labels)
    recall = recall_score(targets, pred_labels)
    f1 = f1_score(targets, pred_labels)

    print('auc :',auc_score)
    print('acc :',acc)
    print('precision :',precision)
    print('recall :',recall)
    # F1 was computed but never reported before.
    print('f1 :',f1)

def test_to_csv(preds, name:str):
    
    result = []
    for n,i in enumerate(preds):
        row = {}    
        row['id'] = n
        row['prediction'] = i
        result.append(row)
    pd.DataFrame(result).to_csv(f'output/{name}.csv', index=None)

In [29]:
# Feature column names. FEATS = cat_cols followed by cont_cols; the order is
# significant because FEATS is used to select (and thereby order) DataFrame columns.
cat_cols = ['i_head', 'i_mid', 'i_tail', 'hour', 'dow']

# cont_cols is assembled from small named groups, in the original order.
# Entries that were commented out in the original list stay disabled.
_general_cols = [
    'user_correct_answer', 'user_total_answer', 'user_acc',
    't_elapsed', 'cum_correct', 'last_problem', 'head_term',
    # 'left_asymptote',
    'elo_prob', 'pkt',
]
_u_head_cols = ['u_head_mean', 'u_head_count', 'u_head_std', 'u_head_elapsed']
_i_mid_cols = [
    'i_mid_elapsed', 'i_mid_mean', 'i_mid_std',
    'i_mid_sum', 'i_mid_count', 'i_mid_tag_count',
]
_mean_sum_cols = [
    'assessment_mean', 'assessment_sum',  # 'assessment_std',
    'tag_mean', 'tag_sum',                # 'tag_std',
    'tail_mean', 'tail_sum',              # 'tail_std',
    'hour_mean', 'hour_sum',              # 'hour_std',
    'dow_mean', 'dow_sum',                # 'dow_std',
]
_elapsed_cols = [
    'tag_elapsed', 'tag_elapsed_o', 'tag_elapsed_x',
    'assessment_elapsed', 'assessment_elapsed_o', 'assessment_elapsed_x',
    'tail_elapsed', 'tail_elapsed_o', 'tail_elapsed_x',
]

cont_cols = _general_cols + _u_head_cols + _i_mid_cols + _mean_sum_cols + _elapsed_cols

FEATS = cat_cols + cont_cols

In [54]:
test_path = '/opt/ml/level2-dkt-level2-recsys-08/LetsEnsemble/test4feature'

# Read the `prediction` column of every file in `test_path` whose name contains
# 'csv' (sorted by file name), then transpose so each file becomes one column.
new_test = np.array([
    pd.read_csv(os.path.join(test_path, fname))['prediction'].to_numpy()
    for fname in sorted(os.listdir(test_path))
    if 'csv' in fname
]).T

In [95]:
# Prediction vectors of the files in `test_path` whose name contains 'SVD'.
# No transpose here, so axis 0 indexes files.
SVD_M = np.array([
    pd.read_csv(os.path.join(test_path, fname))['prediction'].to_numpy()
    for fname in sorted(os.listdir(test_path))
    if 'SVD' in fname
])

In [97]:
# Average across files (axis 0 of SVD_M) and write the result to output/SVD_M.csv.
test_to_csv(np.mean(SVD_M, axis=0), 'SVD_M')

In [69]:
valid_path = '/opt/ml/level2-dkt-level2-recsys-08/LetsEnsemble/valid4feature'

# Read the `prediction` column of every file in `valid_path` whose name contains
# 'csv' (sorted by file name), then transpose so each file becomes one column.
new_valid = np.array([
    pd.read_csv(os.path.join(valid_path, fname))['prediction'].to_numpy()
    for fname in sorted(os.listdir(valid_path))
    if 'csv' in fname
]).T

In [70]:
# Wrap the prediction matrix in a DataFrame (default integer column labels).
new_valid = pd.DataFrame(new_valid)

In [71]:
# Display the validation prediction frame (one column per prediction file).
new_valid

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,0.918116,0.933648,0.931913,0.779519,0.797663,0.712029,0.787183,0.791153,0.756471,0.786572,...,0.614572,0.639967,0.628232,0.634238,0.610640,0.982841,0.974085,0.964015,0.975442,0.973217
1,0.970901,0.919472,0.916695,0.637984,0.609609,0.610547,0.625726,0.619585,0.605415,0.640123,...,0.509931,0.527975,0.505947,0.515663,0.499735,0.832041,0.676492,0.855268,0.838003,0.862343
2,0.679411,0.614240,0.614876,0.521571,0.469497,0.461646,0.491434,0.483382,0.458275,0.510387,...,0.511642,0.516946,0.519482,0.516710,0.515005,0.647891,0.678279,0.695586,0.652820,0.712564
3,0.443304,0.378989,0.378319,0.397810,0.324053,0.289034,0.350591,0.340875,0.292373,0.373986,...,0.485679,0.474777,0.502003,0.493449,0.526718,0.174367,0.179221,0.279049,0.240238,0.211458
4,0.456442,0.412985,0.406406,0.406301,0.328313,0.255739,0.357771,0.346956,0.266084,0.380305,...,0.487983,0.483617,0.500965,0.498902,0.526614,0.697683,0.658814,0.622517,0.678679,0.671731
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254012,0.380841,0.395753,0.394698,0.537085,0.537361,0.597468,0.528282,0.531533,0.571448,0.527866,...,0.497018,0.495628,0.495748,0.495673,0.495369,0.065116,0.091831,0.062530,0.069911,0.057821
254013,0.283810,0.273031,0.274308,0.613243,0.600292,0.599663,0.612619,0.607764,0.598443,0.620992,...,0.497553,0.497044,0.496619,0.496665,0.496894,0.226089,0.268577,0.242929,0.248453,0.184142
254014,0.032606,0.019868,0.023070,0.476197,0.447102,0.418394,0.461375,0.456094,0.428348,0.472600,...,0.491932,0.491120,0.491464,0.491398,0.491194,0.009692,0.009458,0.010611,0.020361,0.008770
254015,0.269548,0.267726,0.268909,0.623331,0.629422,0.608601,0.634944,0.633089,0.616653,0.638834,...,0.496589,0.496307,0.496234,0.496338,0.496521,0.047741,0.085718,0.102905,0.085990,0.077035


In [72]:
# Fetch the ground-truth answers for the validation users.
valid_user = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv')['userID'].unique()

# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
data = pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/all.pkl')
data = data.loc[data['userID'].isin(valid_user)]

# Take the labels before `data` is narrowed down to the feature columns.
y_valid = data['answerCode'].to_numpy()
data = data[FEATS]

In [None]:
# NOTE(review): bare reference to a built-in name; it has no effect. This
# unexecuted cell looks like a leftover and can probably be deleted.
PendingDeprecationWarning

In [73]:
# Attach the FEATS columns to the per-file predictions, side by side.
# The two frames are matched purely by row position, so stop when their lengths
# differ: pd.concat(axis=1) would otherwise pad the shorter one with NaN rows.
if len(data) != len(new_valid):
    raise ValueError(
        f'row count mismatch: {len(data)} feature rows vs '
        f'{len(new_valid)} prediction rows'
    )
new_valid = pd.concat(
    [data.reset_index(drop=True), new_valid.reset_index(drop=True)],
    axis=1,
)

In [74]:
# Display the combined validation frame (feature columns + prediction columns).
new_valid

Unnamed: 0,i_head,i_mid,i_tail,hour,dow,user_correct_answer,user_total_answer,user_acc,t_elapsed,cum_correct,...,41,42,43,44,45,46,47,48,49,50
0,4,93,1,23,3,0.0,0,0.000000,0.0,0.000000,...,0.614572,0.639967,0.628232,0.634238,0.610640,0.982841,0.974085,0.964015,0.975442,0.973217
1,4,93,2,23,3,1.0,1,1.000000,60.0,1.000000,...,0.509931,0.527975,0.505947,0.515663,0.499735,0.832041,0.676492,0.855268,0.838003,0.862343
2,4,93,3,23,3,1.0,2,0.500000,43.0,0.500000,...,0.511642,0.516946,0.519482,0.516710,0.515005,0.647891,0.678279,0.695586,0.652820,0.712564
3,4,93,4,23,3,1.0,3,0.333333,19.0,0.333333,...,0.485679,0.474777,0.502003,0.493449,0.526718,0.174367,0.179221,0.279049,0.240238,0.211458
4,4,93,5,23,3,1.0,4,0.250000,171.0,0.250000,...,0.487983,0.483617,0.500965,0.498902,0.526614,0.697683,0.658814,0.622517,0.678679,0.671731
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254012,3,171,1,7,4,7.0,9,0.777778,0.0,0.000000,...,0.497018,0.495628,0.495748,0.495673,0.495369,0.065116,0.091831,0.062530,0.069911,0.057821
254013,3,171,2,7,4,7.0,10,0.700000,36.0,0.000000,...,0.497553,0.497044,0.496619,0.496665,0.496894,0.226089,0.268577,0.242929,0.248453,0.184142
254014,3,171,3,7,4,7.0,11,0.636364,2.0,0.000000,...,0.491932,0.491120,0.491464,0.491398,0.491194,0.009692,0.009458,0.010611,0.020361,0.008770
254015,3,171,4,7,4,7.0,12,0.583333,1.0,0.000000,...,0.496589,0.496307,0.496234,0.496338,0.496521,0.047741,0.085718,0.102905,0.085990,0.077035


In [55]:
# Wrap the test prediction matrix in a DataFrame (default integer column labels).
new_test = pd.DataFrame(new_test)

In [87]:
# Load the pickled test frame.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
test_data = pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/test_data.pkl')

In [88]:
# Display the raw test frame.
test_data

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,hour,...,dow_std,tag_elapsed,tag_elapsed_o,tag_elapsed_x,assessment_elapsed,assessment_elapsed_o,assessment_elapsed_x,tail_elapsed,tail_elapsed_o,tail_elapsed_x
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626,0.0,0,0.000000,10,...,0.480501,71.311828,76.240741,64.487179,152.000000,182.000000,92.000000,142.857053,146.182621,132.386643
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626,1.0,1,1.000000,10,...,0.480501,71.311828,76.240741,64.487179,73.096774,69.411765,77.571429,54.837210,55.774326,52.407706
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625,2.0,2,1.000000,10,...,0.480501,69.839196,78.957265,56.829268,93.466667,105.500000,87.450000,55.713450,58.404177,49.635740
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625,2.0,3,0.666667,10,...,0.480501,69.839196,78.957265,56.829268,49.437500,83.937500,14.937500,54.394718,58.958252,45.354587
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623,2.0,4,0.500000,10,...,0.480501,58.286089,68.246305,46.926966,40.156250,53.333333,35.000000,55.045978,65.101522,41.996363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260109,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832,8.0,11,0.727273,23,...,0.482976,64.899563,66.930233,62.585859,170.625000,202.833333,74.000000,142.857053,146.182621,132.386643
260110,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832,8.0,12,0.666667,23,...,0.482976,64.899563,66.930233,62.585859,47.341463,56.190476,38.050000,54.837210,55.774326,52.407706
260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244,9.0,13,0.692308,23,...,0.482976,34.006579,36.500000,26.289474,29.000000,31.612903,20.900000,55.713450,58.404177,49.635740
260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244,10.0,14,0.714286,23,...,0.482976,34.006579,36.500000,26.289474,44.214286,48.416667,19.000000,54.394718,58.958252,45.354587


In [57]:
# Keep only the rows with answerCode == -1, restricted to the FEATS columns.
# This overwrites `test_data` and drops `answerCode`, so the cell cannot be
# re-run without reloading the pickle first.
test_data = test_data.loc[test_data['answerCode'] == -1, FEATS]

In [58]:
# Attach the FEATS columns to the per-file test predictions, side by side.
# The two frames are matched purely by row position, so stop when their lengths
# differ: pd.concat(axis=1) would otherwise pad the shorter one with NaN rows.
if len(test_data) != len(new_test):
    raise ValueError(
        f'row count mismatch: {len(test_data)} feature rows vs '
        f'{len(new_test)} prediction rows'
    )
new_test = pd.concat(
    [test_data.reset_index(drop=True), new_test.reset_index(drop=True)],
    axis=1,
)

In [59]:
# Display the combined test frame (feature columns + prediction columns).
new_test

Unnamed: 0,i_head,i_mid,i_tail,hour,dow,user_correct_answer,user_total_answer,user_acc,t_elapsed,cum_correct,...,41,42,43,44,45,46,47,48,49,50
0,5,133,8,13,0,717.0,1035,0.692754,46.0,0.857143,...,0.578802,0.584042,0.571803,0.583117,0.567278,0.526163,0.674929,0.605678,0.662589,0.594922
1,7,146,8,2,6,465.0,670,0.694030,23.0,0.857143,...,0.645740,0.644783,0.643798,0.648254,0.653150,0.800086,0.884491,0.844972,0.845496,0.907504
2,7,111,8,4,6,915.0,1316,0.695289,8.0,0.428571,...,0.410186,0.325136,0.365284,0.366662,0.360052,0.202562,0.219156,0.209493,0.240123,0.232104
3,9,64,6,5,4,1031.0,1259,0.818904,75.0,1.000000,...,0.615056,0.629136,0.623329,0.624016,0.623162,0.667712,0.750481,0.767879,0.816710,0.712494
4,6,135,7,11,4,293.0,386,0.759067,17.0,0.666667,...,0.491498,0.489242,0.490462,0.493413,0.495987,0.310624,0.285298,0.264130,0.293787,0.280257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,4,122,5,2,1,7.0,23,0.304348,2.0,0.250000,...,0.495342,0.496767,0.496771,0.497086,0.496689,0.003992,0.003884,0.003353,0.005917,0.006627
740,3,111,5,9,1,7.0,14,0.500000,107.0,0.500000,...,0.500796,0.501698,0.502320,0.503535,0.503188,0.854108,0.818918,0.879979,0.797587,0.899924
741,5,193,4,2,6,7.0,14,0.500000,24.0,0.666667,...,0.508781,0.508427,0.508477,0.508130,0.508132,0.806766,0.778813,0.878524,0.885150,0.840537
742,5,193,4,13,6,2.0,14,0.142857,21.0,0.666667,...,0.507866,0.507274,0.507308,0.506940,0.506948,0.879703,0.857131,0.923626,0.900812,0.868369


In [104]:
# Load the CV validation split (the same file `valid_user` was read from);
# its answerCode == -1 rows are used below to select rows of `new_valid`.
v = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv')

In [107]:
# Show the rows of `new_valid` whose index label also belongs to an
# answerCode == -1 row of `v`.
new_valid.loc[
    new_valid.index.isin(v.loc[v['answerCode'] == -1].index)
]

Unnamed: 0,i_head,i_mid,i_tail,hour,dow,user_correct_answer,user_total_answer,user_acc,t_elapsed,cum_correct,...,41,42,43,44,45,46,47,48,49,50
690,1,112,6,10,0,567.0,690,0.821739,0.0,0.600000,...,0.496760,0.488694,0.492336,0.481946,0.521601,0.574127,0.658865,0.708638,0.645739,0.683056
1226,7,148,8,8,4,249.0,535,0.465421,0.0,0.714286,...,0.481680,0.466683,0.464883,0.464323,0.468223,0.527919,0.524251,0.518357,0.497389,0.474085
1955,5,135,7,7,0,501.0,728,0.688187,7.0,0.666667,...,0.370629,0.362848,0.340628,0.321788,0.321753,0.043070,0.029936,0.031450,0.030838,0.025529
2626,7,160,8,4,6,541.0,670,0.807463,98.0,1.000000,...,0.583105,0.591106,0.588659,0.590834,0.603935,0.730409,0.763569,0.732371,0.714161,0.733945
3296,8,130,8,9,6,170.0,669,0.254111,1.0,0.000000,...,0.372949,0.353966,0.368778,0.370443,0.377119,0.067979,0.065165,0.051247,0.087836,0.082109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253960,7,121,1,3,3,7.0,15,0.466667,1.0,0.200000,...,0.495489,0.494868,0.493611,0.494186,0.494257,0.016149,0.014160,0.021054,0.010883,0.025744
253973,5,193,4,8,1,7.0,12,0.583333,60.0,1.000000,...,0.517111,0.518042,0.518502,0.519435,0.519277,0.998172,0.999136,0.999462,0.998449,0.999498
253989,4,169,5,12,2,3.0,15,0.200000,37.0,0.500000,...,0.511077,0.512295,0.513245,0.513477,0.513273,0.582991,0.554037,0.696441,0.501505,0.560566
254002,2,124,5,3,3,8.0,12,0.666667,30.0,0.750000,...,0.491319,0.491617,0.491923,0.491374,0.491377,0.022291,0.015642,0.019376,0.014994,0.012387


In [110]:
# `target` column of valid_target.csv; used as the label vector for `valid_pool`.
valid_target = pd.read_csv('/opt/ml/input/data/valid_target.csv').target

In [157]:
# Pool over every row of `new_valid`, labelled with `y_valid`.
train_pool = Pool(new_valid, y_valid)

# Pool restricted to the rows whose index label matches an answerCode == -1 row
# of `v`, labelled with `valid_target`.
# NOTE(review): assumes `v` and `new_valid` share row positions — confirm.
valid_pool = Pool(
    new_valid.loc[new_valid.index.isin(v.loc[v['answerCode'] == -1].index)],
    valid_target,
)

model = CatBoostClassifier(
    # objective / reported metrics
    loss_function='Logloss',
    custom_metric=['Logloss', 'AUC'],
    # boosting schedule
    iterations=1500,
    learning_rate=0.0005,
    early_stopping_rounds=30,
    use_best_model=True,
    bagging_temperature=1,
    # runtime
    random_seed=42,
    task_type="GPU",
    verbose=False,
)

In [158]:
# NOTE(review): the same pool is passed as training data and as eval_set, so the
# early stopping / best-model selection configured on `model` is driven by the
# training data itself; `train_pool` is built but not used here. Confirm intent.
model.fit(valid_pool, eval_set=valid_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f60330e93d0>

In [159]:
# Take the last column of predict_proba for every row of `new_valid`
# (presumably the positive-class probability — TODO confirm).
valid_preds = model.predict_proba(new_valid)[:,-1]

In [160]:
# Print the metrics of the fitted model over all rows of `new_valid`.
# Note that these rows include the ones the model was fitted on (`valid_pool`).
get_metric(y_valid, valid_preds)

auc : 0.8931017631561702
acc : 0.8404516233165497
precision : 0.8616745655608214
recall : 0.901174562167683


In [161]:
# Same column selection as for `valid_preds`, applied to the test frame.
test_preds = model.predict_proba(new_test)[:,-1]

In [162]:
# Write the test predictions to output/all12.csv.
test_to_csv(test_preds, 'all12')

In [85]:
# Plain average of the per-file predictions, one blended value per test row.
# The mean must run across columns (axis=1): axis=0 returns one value per
# column rather than one per row. Feature columns, when already attached to
# `new_test`, are dropped first so only prediction columns are averaged.
test_to_csv(new_test.drop(columns=FEATS, errors='ignore').mean(axis=1), 'all4')

In [102]:
import lightgbm as lgb


In [117]:
# Parameters handed to lgb.train below.
param = dict(
    objective='binary',
    boosting='gbdt',
    random_state=42,
    num_leaves=70,
    metric='auc',
    num_threads=-1,
    learning_rate=0.005,
)

In [118]:
# Train a LightGBM booster on every row of `new_valid`. No validation set is
# supplied, so all `num_boost_round` rounds are run.
# NOTE: this rebinds `model`, replacing the CatBoost classifier created earlier.
lgb_valid = lgb.Dataset(new_valid, label=y_valid)

# `verbose_eval` is intentionally not passed: without `valid_sets` there is
# nothing to log, and the argument was removed from lgb.train in LightGBM 4.0
# (use callbacks=[lgb.log_evaluation(period)] once a validation set is added).
model = lgb.train(
    param,
    lgb_valid,
    num_boost_round=1500,
)

Final_test_predict = model.predict(new_test)



[LightGBM] [Info] Number of positive: 166445, number of negative: 87572
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9180
[LightGBM] [Info] Number of data points in the train set: 254017, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655251 -> initscore=0.642204
[LightGBM] [Info] Start training from score 0.642204


In [119]:
# Write the LightGBM test predictions to output/all5.csv.
test_to_csv(Final_test_predict, 'all5')