In [1]:
import pickle
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict, deque
from tqdm.notebook import tqdm
import lightgbm as lgb

## setting
CV files are generated by [this notebook](https://www.kaggle.com/its7171/cv-strategy)

In [2]:
train_pickle = '../input/riiid-cross-validation-files/cv1_train.pickle'
valid_pickle = '../input/riiid-cross-validation-files/cv1_valid.pickle'
question_file = '../input/features/question3.csv'
debug = False
validaten_flg = False

## feature engineering

In [3]:
def get_user_feats_for_nn_without_update(df, past_question, past_tag, past_part, past_answer, past_prior_elaps, past_time_diff, past_prior_exp):
    current_part = []
    current_question = []
    current_tag = []
    past_part_answer = []
    past_question_answer = []
    past_tag_answer = []
    past_answer_correctly = []
    past_time = []
    past_prior = []
    past_prior_explanation = []
    #past_other_feats = []
    
    
    for cnt,row in enumerate(tqdm(df[['user_id','content_id','part','tag_num','prior_question_elapsed_time','time_diff','prior_question_had_explanation']].values)):
                
        
        
        if row[0] not in past_answer:
            temp_answer = [0]*60
            temp_past_answer = [0]*59 + [2]
        else:
            temp_answer = past_answer[row[0]].copy()
            temp_past_answer = past_answer[row[0]].copy()
        
        
        if row[0] not in past_question:
            temp_question = [0]*59 + [13523*2+1]
        else:
            temp_question = past_question[row[0]].copy()
        
        
        temp_past_answer= [x+1 if y > 0 else 0 for x , y in zip(temp_past_answer, temp_question)]
        past_answer_correctly.append(temp_past_answer)
        
        temp_current_question = [x if x!= 13523*2+1 else 0 for x in temp_question]
        temp_current_question.append(row[1]+1)
        current_question.append(temp_current_question[1:])
        
        temp_past_question_answer = [x+y*13523 for x,y in zip(temp_question, temp_answer)]
        past_question_answer.append(temp_past_question_answer)
        
        
        
        
        if row[0] not in past_tag:
            temp_tag = [0]*59 + [1520*2+1]
        else:
            temp_tag = past_tag[row[0]].copy()
        
        temp_current_tag = [x if x!= 1520*2+1 else 0 for x in temp_tag]
        temp_current_tag.append(row[3]+1)
        current_tag.append(temp_current_tag[1:])
        
        temp_past_tag_answer = [x+y*1520 for x,y in zip(temp_tag, temp_answer)]
        past_tag_answer.append(temp_past_tag_answer)
        
        
        

        if row[0] not in past_part:
            temp_part = [0]*59 + [15]
        else:
            temp_part = past_part[row[0]].copy()
        
        temp_current_part = [x if x!= 15 else 0 for x in temp_part]
        temp_current_part.append(row[2])
        current_part.append(temp_current_part[1:])
        
        temp_past_part_answer = [x+y*7 for x,y in zip(temp_part,temp_answer)]
        past_part_answer.append(temp_past_part_answer)
        
        
        
        if row[0] not in past_prior_elaps:
            temp_elaps = [0]*59 + [row[4]/3e5]
        else:
            temp_elaps = past_prior_elaps[row[0]].copy()
            temp_elaps.append(row[4]/3e5)
        past_prior.append(temp_elaps)


        if row[0] not in past_prior_exp:
            temp_prior_exp = [0]*59 + [row[6]]
        else:
            temp_prior_exp = past_prior_exp[row[0]].copy()
            temp_prior_exp.append(row[6])
        past_prior_explanation.append(temp_prior_exp)

        
        if row[0] not in past_time_diff:
            temp_time_diff = [0]*59 + [row[5]/1e6]
        else:
            temp_time_diff = past_time_diff[row[0]].copy()
            temp_time_diff.append(row[5]/1e6)
        past_time.append(temp_time_diff)
    
    current_part = np.array(current_part)
    current_tag = np.array(current_tag)
    current_question = np.array(current_question)
    past_part_answer = np.array(past_part_answer)
    past_tag_answer = np.array(past_tag_answer)
    past_question_answer = np.array(past_question_answer)    
    past_other_feats = np.dstack((past_prior,past_time))
    past_answer_correctly = np.array(past_answer_correctly)
    past_prior_explanation = np.array(past_prior_explanation)
    
    return current_part, current_tag, current_question, past_part_answer, past_tag_answer, past_question_answer, past_other_feats, past_answer_correctly, past_prior_explanation

In [4]:
def add_user_feats_without_update(df , all_last60_avg, part1_last30_avg, part2_last30_avg, part3_last30_avg, 
                                  part4_last30_avg, part5_last30_avg, part6_last30_avg, part7_last30_avg, user_tag_lag1, user_answer_lag1,
                                  last_time_u_dict, part1_answered_correctly_sum_u_dict, part1_count_u_dict, part2_answered_correctly_sum_u_dict,
                                  part2_count_u_dict, part3_answered_correctly_sum_u_dict, part3_count_u_dict, part4_answered_correctly_sum_u_dict,
                                  part4_count_u_dict, part5_answered_correctly_sum_u_dict, part5_count_u_dict, part6_answered_correctly_sum_u_dict,
                                  part6_count_u_dict, part7_answered_correctly_sum_u_dict, part7_count_u_dict ):
    acsu_part = np.zeros(len(df), dtype=np.int32)
    cu_part = np.zeros(len(df), dtype=np.int32)
    acsu = np.zeros(len(df), dtype=np.int32)
    cu = np.zeros(len(df), dtype=np.int32)
    td = np.zeros(len(df), dtype=np.int32)
    user_prev_tag_lag1 = np.zeros(len(df), dtype = np.int32)
    acsu_part_30 = np.zeros(len(df), dtype=np.float32)
    acsu_60 = np.zeros(len(df), dtype=np.float32)
    
    for cnt,row in enumerate(df[['user_id','part','timestamp']].values):
        td[cnt] = row[2] - last_time_u_dict[row[0]]
        acsu[cnt] = part1_answered_correctly_sum_u_dict[row[0]]+part2_answered_correctly_sum_u_dict[row[0]]+part3_answered_correctly_sum_u_dict[row[0]]+part4_answered_correctly_sum_u_dict[row[0]]+part5_answered_correctly_sum_u_dict[row[0]]+part6_answered_correctly_sum_u_dict[row[0]]+part7_answered_correctly_sum_u_dict[row[0]]
        cu[cnt] = part1_count_u_dict[row[0]]+part2_count_u_dict[row[0]]+part3_count_u_dict[row[0]]+part4_count_u_dict[row[0]]+part5_count_u_dict[row[0]]+part6_count_u_dict[row[0]]+part7_count_u_dict[row[0]]

            
            
        if row[0] not in all_last60_avg:
            acsu_60[cnt] = 0
        else:
            acsu_60[cnt] = sum(all_last60_avg[row[0]])/len(all_last60_avg[row[0]])
            
        
        if row[0] in user_tag_lag1:
            user_prev_tag_lag1[cnt] = user_tag_lag1[row[0]]+user_answer_lag1[row[0]]*(283+1)
        else:
            user_prev_tag_lag1[cnt] = 568
            
            
        if row[1] == 1:
            acsu_part[cnt] = part1_answered_correctly_sum_u_dict[row[0]]
            cu_part[cnt] = part1_count_u_dict[row[0]]
            
            if row[0] not in part1_last30_avg:
                acsu_part_30[cnt] = 0
            else:
                acsu_part_30[cnt] = sum(part1_last30_avg[row[0]])/len(part1_last30_avg[row[0]])  
            
            
        elif row[1] == 2:
            acsu_part[cnt] = part2_answered_correctly_sum_u_dict[row[0]]
            cu_part[cnt] = part2_count_u_dict[row[0]]

            if row[0] not in part2_last30_avg:
                acsu_part_30[cnt] = 0
            else:
                acsu_part_30[cnt] = sum(part2_last30_avg[row[0]])/len(part2_last30_avg[row[0]])  
            
            
        elif row[1] == 3:
            acsu_part[cnt] = part3_answered_correctly_sum_u_dict[row[0]]
            cu_part[cnt] = part3_count_u_dict[row[0]]

            if row[0] not in part3_last30_avg:
                acsu_part_30[cnt] = 0
            else:
                acsu_part_30[cnt] = sum(part3_last30_avg[row[0]])/len(part3_last30_avg[row[0]])  
            
            
        elif row[1] == 4:
            acsu_part[cnt] = part4_answered_correctly_sum_u_dict[row[0]]
            cu_part[cnt] = part4_count_u_dict[row[0]]
            if row[0] not in part4_last30_avg:
                acsu_part_30[cnt] = 0
            else:
                acsu_part_30[cnt] = sum(part4_last30_avg[row[0]])/len(part4_last30_avg[row[0]])      
                
                
        elif row[1] == 5:
            acsu_part[cnt] = part5_answered_correctly_sum_u_dict[row[0]]
            cu_part[cnt] = part5_count_u_dict[row[0]]
            if row[0] not in part5_last30_avg:
                acsu_part_30[cnt] = 0
            else:
                acsu_part_30[cnt] = sum(part5_last30_avg[row[0]])/len(part5_last30_avg[row[0]])         
                
                
        elif row[1] == 6:
            acsu_part[cnt] = part6_answered_correctly_sum_u_dict[row[0]]
            cu_part[cnt] = part6_count_u_dict[row[0]]
            if row[0] not in part6_last30_avg:
                acsu_part_30[cnt] = 0
            else:
                acsu_part_30[cnt] = sum(part6_last30_avg[row[0]])/len(part6_last30_avg[row[0]])  
                
                
        elif row[1] == 7:
            acsu_part[cnt] = part7_answered_correctly_sum_u_dict[row[0]]
            cu_part[cnt] = part7_count_u_dict[row[0]]
            if row[0] not in part7_last30_avg:
                acsu_part_30[cnt] = 0
            else:
                acsu_part_30[cnt] = sum(part7_last30_avg[row[0]])/len(part7_last30_avg[row[0]])  
                
                
    user_feats_df = pd.DataFrame({'answered_correctly_sum_u':acsu, 'count_u':cu, 'part_answered_correctly_sum_u':acsu_part, 'part_count_u':cu_part, 'time_diff':td,'user_prev_tag_lag1' : user_prev_tag_lag1, 'last_60':acsu_60, 'part_last_30': acsu_part_30})#, 'user_prev_tag_lag2': user_prev_tag_lag2, 'user_prev_tag_lag3': user_prev_tag_lag3, 'user_prev_tag_lag4': user_prev_tag_lag4, 'user_prev_tag_lag5': user_prev_tag_lag5})
    user_feats_df['answered_correctly_avg_u'] = user_feats_df['answered_correctly_sum_u'] / user_feats_df['count_u']
    user_feats_df['part_answered_correctly_avg_u'] = user_feats_df['part_answered_correctly_sum_u'] / user_feats_df['part_count_u']
    df = pd.concat([df, user_feats_df], axis=1)
    return df

In [5]:
def update_user_feats(df , all_last60_avg, part1_last30_avg, part2_last30_avg, part3_last30_avg
                      , part4_last30_avg, part5_last30_avg, part6_last30_avg, part7_last30_avg , user_tag_lag1, user_answer_lag1 ,
                      last_time_u_dict, part1_answered_correctly_sum_u_dict, part1_count_u_dict, 
                      part2_answered_correctly_sum_u_dict, part2_count_u_dict, part3_answered_correctly_sum_u_dict, part3_count_u_dict,
                      part4_answered_correctly_sum_u_dict, part4_count_u_dict, part5_answered_correctly_sum_u_dict, part5_count_u_dict,
                      part6_answered_correctly_sum_u_dict, part6_count_u_dict, part7_answered_correctly_sum_u_dict, part7_count_u_dict, 
                      past_question, past_tag, past_part, past_answer, past_prior_elaps, past_time_diff, past_prior_exp):
    for row in df[['user_id','answered_correctly','content_type_id','part','timestamp','tag_sum','content_id','tag_num','prior_question_elapsed_time','time_diff', 'prior_question_had_explanation']].values:
        if row[2] == 0:                
            if row[0] not in past_question:
                new_list = deque([0]*60, maxlen = 60)
                new_list.append(13523*2+1)
                new_list.append(row[6]+1)
                past_question[row[0]] = new_list
            else:
                past_question[row[0]].append(row[6]+1)
            
            
            
            
            if row[0] not in past_tag:
                new_list = deque([0]*60, maxlen = 60)
                new_list.append(1520*2+1)
                new_list.append(row[7]+1)
                past_tag[row[0]] = new_list
            else:
                past_tag[row[0]].append(row[7]+1)            
            
            
            
            
            
            if row[0] not in past_part:
                new_list = deque([0]*60, maxlen = 60)
                new_list.append(15)
                new_list.append(row[3])
                past_part[row[0]] = new_list
            else:
                past_part[row[0]].append(row[3])     

                
                
            
            
            if row[0] not in past_answer:
                new_list = deque([0]*60, maxlen = 60)
                new_list.append(2)
                new_list.append(row[1])
                past_answer[row[0]] = new_list
            else:
                past_answer[row[0]].append(row[1])     

            
            
            
            
            if row[0] not in past_prior_elaps:
                new_list = deque([0]*60, maxlen = 60)
                new_list.append(row[8]/3e5)
                past_prior_elaps[row[0]] = new_list
            else:
                past_prior_elaps[row[0]].append(row[8]/3e5)   
                
            
            
            
            
            if row[0] not in past_time_diff:
                new_list = deque([0]*60, maxlen = 60)
                if row[9] >= 1e6:
                    new_list.append(1.0)
                else:
                    new_list.append(row[9]/1e6)
                past_time_diff[row[0]] = new_list
            else:
                if row[9] >= 1e6:
                    past_time_diff[row[0]].append(1.0)   
                else:
                    past_time_diff[row[0]].append(row[9]/1e6)   
             
            
            
                    
            if row[0] not in past_prior_exp:
                new_list = deque([0]*60, maxlen = 60)
                new_list.append(row[10])
                past_prior_exp[row[0]] = new_list
            else:
                past_prior_exp[row[0]].append(row[10])                    
            
            
            
                    
            if row[0] not in all_last60_avg:
                new_list = deque(maxlen = 60)
                new_list.append(row[1])
                all_last60_avg[row[0]] = new_list
            else:
                all_last60_avg[row[0]].append(row[1])            

                
                
            user_tag_lag1[row[0]] = row[5]
            
            user_answer_lag1[row[0]] = row[1]   
            
            last_time_u_dict[row[0]] = row[4]
            
            if row[3] == 1:
                part1_answered_correctly_sum_u_dict[row[0]] += row[1]
                part1_count_u_dict[row[0]] += 1
                if row[0] not in part1_last30_avg:
                    new_list = deque(maxlen = 30)
                    new_list.append(row[1])
                    part1_last30_avg[row[0]] = new_list
                else:
                    part1_last30_avg[row[0]].append(row[1])
                
            
            elif row[3] == 2:
                part2_answered_correctly_sum_u_dict[row[0]] += row[1]
                part2_count_u_dict[row[0]] += 1   
                if row[0] not in part2_last30_avg:
                    new_list = deque(maxlen = 30)
                    new_list.append(row[1])
                    part2_last30_avg[row[0]] = new_list
                else:
                    part2_last30_avg[row[0]].append(row[1])
                
            
            elif row[3] == 3:
                part3_answered_correctly_sum_u_dict[row[0]] += row[1]
                part3_count_u_dict[row[0]] += 1   
                if row[0] not in part3_last30_avg:
                    new_list = deque(maxlen = 30)
                    new_list.append(row[1])
                    part3_last30_avg[row[0]] = new_list
                else:
                    part3_last30_avg[row[0]].append(row[1])
                
            
            elif row[3] == 4:
                part4_answered_correctly_sum_u_dict[row[0]] += row[1]
                part4_count_u_dict[row[0]] += 1     
                if row[0] not in part4_last30_avg:
                    new_list = deque(maxlen = 30)
                    new_list.append(row[1])
                    part4_last30_avg[row[0]] = new_list
                else:
                    part4_last30_avg[row[0]].append(row[1])
                    
            
            elif row[3] == 5:
                part5_answered_correctly_sum_u_dict[row[0]] += row[1]
                part5_count_u_dict[row[0]] += 1   
                if row[0] not in part5_last30_avg:
                    new_list = deque(maxlen = 30)
                    new_list.append(row[1])
                    part5_last30_avg[row[0]] = new_list
                else:
                    part5_last30_avg[row[0]].append(row[1])
                    
            
            elif row[3] == 6:
                part6_answered_correctly_sum_u_dict[row[0]] += row[1]
                part6_count_u_dict[row[0]] += 1 
                if row[0] not in part6_last30_avg:
                    new_list = deque(maxlen = 30)
                    new_list.append(row[1])
                    part6_last30_avg[row[0]] = new_list
                else:
                    part6_last30_avg[row[0]].append(row[1])
                
                
            elif row[3] == 7:
                part7_answered_correctly_sum_u_dict[row[0]] += row[1]
                part7_count_u_dict[row[0]] += 1
                if row[0] not in part7_last30_avg:
                    new_list = deque(maxlen = 30)
                    new_list.append(row[1])
                    part7_last30_avg[row[0]] = new_list
                else:
                    part7_last30_avg[row[0]].append(row[1])
                    


In [6]:
# read data
# read data
feld_needed = ['user_id','content_id','answered_correctly','prior_question_elapsed_time','prior_question_had_explanation']
train = pd.read_pickle(train_pickle)[feld_needed]
if debug:
    train = train[:1000000]
    valid = valid[:10000]

In [7]:
train = train.loc[train.answered_correctly != -1].reset_index(drop=True)
_=gc.collect()

In [8]:
prior_question_elapsed_time_mean = train.prior_question_elapsed_time.dropna().values.mean()

In [9]:
content_df = train[['content_id','answered_correctly']].groupby(['content_id']).agg(['mean','std']).reset_index()
content_df.columns = ['content_id', 'answered_correctly_avg_c','answered_correctly_std_c']

In [10]:
train_time_diff = pd.read_csv('../input/for-lgbm/train_time_diff.csv')
train = pd.concat([train,train_time_diff], axis = 1)
train.time_diff.loc[train.time_diff >= 1e6] = 1e6
del(train_time_diff)
_=gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [11]:
content_df2 = train[['content_id','time_diff']].groupby(['content_id']).agg(['median']).reset_index()
content_df2.columns = ['content_id', 'time_diff_average_c']

content_df = content_df.merge(content_df2, on = 'content_id', how = 'left')

#train['prior_question_had_explanation'] = train['prior_question_had_explanation']*1
content_df2 = train[['content_id','prior_question_had_explanation']].groupby(['content_id']).agg(['mean']).reset_index()
content_df2.columns = ['content_id','prior_has_explanation_average_c']
content_df = content_df.merge(content_df2, on = 'content_id', how = 'left')

del(content_df2)

In [12]:
del(train)
_=gc.collect()

In [13]:
questions_df = pd.read_csv(question_file)
questions_df.tags.fillna('-1-1', inplace = True)
questions_df['tag_sum'] = pd.factorize(questions_df.tag_sum)[0]
questions_df['tag_num'] = pd.factorize(questions_df.tags)[0]

## TRANSFORMET

In [14]:
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Activation, Dense, Dropout, LSTM, Masking, Embedding, Concatenate, Input, Reshape,Flatten, AveragePooling1D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.metrics import AUC
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Lambda
#from tensorflow.keras.layers import merge
from tensorflow.keras.layers import multiply, Reshape
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from tqdm import tqdm
from tqdm import trange
from tensorflow.keras.utils import Sequence

In [15]:
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates


def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)

  # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

  # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)




def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

  # add extra dimensions to add the padding
  # to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)




def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)




def scaled_dot_product_attention(q, k, v, mask):
    """Calculate the attention weights.
    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type(padding or look ahead) 
    but it must be broadcastable for addition.

    Args:
      q: query shape == (..., seq_len_q, depth)
      k: key shape == (..., seq_len_k, depth)
      v: value shape == (..., seq_len_v, depth_v)
      mask: Float tensor with shape broadcastable 
            to (..., seq_len_q, seq_len_k). Defaults to None.

    Returns:
      output, attention_weights
    """

    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights




class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention, 
                                       (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights





def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])


    
    
    
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, e, training, mask):

        attn_output, _ = self.mha(x, x, e, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(e + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2



class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff,
                   maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        #self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                                self.d_model)


        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, e, training, mask):

        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
        #x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)
    
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, e, training, mask)

        return x  # (batch_size, input_seq_len, d_model)




#class Transformer(tf.keras.Model):
#    def __init__(self, num_layers, en_d_model, en_num_heads, dff, pe_input, rate=0.2):
#        super(Transformer, self).__init__()

#        self.encoder = Encoder(num_layers, en_d_model, en_num_heads, dff, 
#                           pe_input, rate)


#        self.second_final_layer = tf.keras.layers.Dense(dff)
#        self.final_layer = Dense(1,activation = 'sigmoid')
    
#    def call(self, inp1, inp2, training, mask):

#        enc_output = self.encoder(inp1, inp2, training, mask)  # (batch_size, inp_seq_len, d_model)
            
#        second_final_output = self.second_final_layer(enc_output)  # (batch_size, tar_seq_len, question_answer_pair_size)
#        final_output = self.final_layer(second_final_output)
#        return final_output

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)


    def call(self, x, enc_output, training, 
                look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(
                enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2

    
    
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff,
                    maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        #self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, 
               look_ahead_mask, padding_mask):

        seq_len = tf.shape(x)[1]
        attention_weights = {}

        #x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
              x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)

        attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
        attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

    # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights    

class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, padding_length, rate=0.1):
        super(Transformer, self).__init__()
        
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, padding_length)

        self.second_final_layer = tf.keras.layers.Dense(dff)
        self.final_layer = Dense(1,activation = 'sigmoid')
    
    def call(self, inp1, inp2, training, de_look_ahead_mask, de_padding_mask):

        dec_output, attention_weights = self.decoder(
                inp2, inp1, training, de_look_ahead_mask, de_padding_mask)
            
        second_final_output = self.second_final_layer(dec_output)  # (batch_size, tar_seq_len, question_answer_pair_size)
        final_output = self.final_layer(second_final_output)
        return final_output




In [16]:
num_layers = 6
d_model = 128
num_heads = 8
dff = 512

n_question = 13524
n_tag = 1521
n_part = 8
n_answer = 4
n_prev_q_exp = 4

pe_input = 60



def build(num_layers, d_model, num_heads, dff, n_answer, n_question, n_tag, n_part, n_prev_q_exp, pe_input):
    masking_func = lambda inputs, previous_mask: previous_mask
    
    en_input1 = Input(batch_shape = (None, None), name = 'past_answer')
    en_input1_embed = Embedding(n_answer, d_model)(en_input1)
    en_input2 = Input(batch_shape = (None, None, 1), name = 'other_feature1')
    en_input2_masked = (Masking(mask_value= 0, input_shape = (None, None, 1)))(en_input2)
    en_input2_embed = Dense(d_model, input_shape = (None, None, 1))(en_input2_masked)
    en_input3 = Input(batch_shape = (None, None, 1), name = 'other_feature2')
    en_input3_masked = (Masking(mask_value= 0, input_shape = (None, None, 1)))(en_input2)
    en_input3_embed = Dense(d_model, input_shape = (None, None, 1))(en_input2_masked)
    en_input4 = Input(batch_shape = (None, None), name = 'past_prior_exp')
    en_input4_embed = Embedding(n_prev_q_exp, d_model)(en_input4)
    
    
    en_input_embed_sum = tf.math.add_n([en_input1_embed])
    
    
    
    #en_input1_embed = K.sum(en_input1_embed, axis = -2)
    en_input5 = Input(batch_shape = (None, None), name = 'current_question')
    en_input5_embed = Embedding(n_question, d_model)(en_input5)
    en_input6 = Input(batch_shape = (None, None), name = 'current_tag')
    en_input6_embed = Embedding(n_tag, d_model)(en_input6)
    en_input7 = Input(batch_shape = (None, None), name = 'current_part')
    en_input7_embed = Embedding(n_part, d_model)(en_input7)
    en_input_embed_sum2 = tf.math.add_n([en_input5_embed, en_input6_embed, en_input7_embed, en_input2_embed, en_input3_embed, en_input4_embed])
    
    
    #en_input2_embed = K.sum(en_input2_embed, axis = -2)

    
    
    look_ahead_mask = create_look_ahead_mask(tf.shape(en_input_embed_sum)[1])
    padding_mask = create_padding_mask(en_input1)
    combined_mask = tf.maximum(look_ahead_mask, padding_mask)
    
    
    transformer = Transformer(num_layers, d_model, num_heads, dff, pe_input)
    
    final_output = transformer(en_input_embed_sum, en_input_embed_sum2, False, combined_mask, combined_mask)

    
    model = Model(inputs=[en_input1, en_input2, en_input3, en_input4, en_input5, en_input6, en_input7], outputs=final_output)
    model.compile( optimizer = 'adam',
                    loss = 'binary_crossentropy',
                    metrics=['accuracy',AUC()])
    
    return model

sakt_model = build(num_layers, d_model, num_heads, dff, n_answer, n_question, n_tag, n_part, n_prev_q_exp, pe_input)

In [17]:
sakt_model.load_weights('../input/my-sakt/Transformer_model_feature_extraction.h5')

## SAINT

In [18]:
class EncoderLayer2(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer2, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):

        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2



class Encoder2(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff,
                   maximum_position_encoding, rate=0.1):
        super(Encoder2, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        #self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                                self.d_model)


        self.enc_layers = [EncoderLayer2(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):

        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
        #x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)
    
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)

    
class DecoderLayer2(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer2, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)


    def call(self, x, enc_output, training, 
                look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(
                enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2

    
    
class Decoder2(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff,
                    maximum_position_encoding, rate=0.1):
        super(Decoder2, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        #self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer2(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, 
               look_ahead_mask, padding_mask):

        seq_len = tf.shape(x)[1]
        attention_weights = {}

        #x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
              x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)

        attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
        attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

    # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights    

class Transformer2(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, padding_length, rate=0.1):
        super(Transformer2, self).__init__()

        self.encoder = Encoder2(num_layers, d_model, num_heads, dff, padding_length)
        
        self.decoder = Decoder2(num_layers, d_model, num_heads, dff, padding_length)

        self.second_final_layer = tf.keras.layers.Dense(dff)
        self.final_layer = Dense(1,activation = 'sigmoid')
    
    def call(self, inp1, inp2, training, en_combined_mask, de_look_ahead_mask, de_padding_mask):

        enc_output = self.encoder(inp1, training, en_combined_mask)  # (batch_size, inp_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
                inp2, enc_output, training, de_look_ahead_mask, de_padding_mask)
            
        second_final_output = self.second_final_layer(dec_output)  # (batch_size, tar_seq_len, question_answer_pair_size)
        final_output = self.final_layer(second_final_output)
        return final_output

In [19]:
num_layers = 4
d_model = 160
num_heads = 8
dff = 160*4

n_question = 13524
n_tag = 1521
n_part = 8
n_answer = 4
n_prev_q_exp = 4

pe_input = 60


def build(num_layers, d_model, num_heads, dff, n_question, n_tag, n_part, n_answer, n_prev_q_exp, pe_input):

    en_input1 = Input(batch_shape = (None, None), name = 'current_question')
    en_input1_embed = Embedding(n_question, d_model)(en_input1)
    en_input2 = Input(batch_shape = (None, None), name = 'current_tag')
    en_input2_embed = Embedding(n_tag, d_model)(en_input2)
    en_input3 = Input(batch_shape = (None, None), name = 'current_part')
    en_input3_embed = Embedding(n_part, d_model)(en_input3)

    en_look_ahead_mask = create_look_ahead_mask(tf.shape(en_input1)[1])
    en_padding_mask = create_padding_mask(en_input1)
    en_combined_mask = tf.maximum(en_look_ahead_mask, en_padding_mask)
    
    
    
    #en_input1_embed = K.sum(en_input1_embed, axis = -2)
    de_input4 = Input(batch_shape = (None, None), name = 'past_answer')
    de_input4_embed = Embedding(n_answer, d_model)(de_input4)
    de_input5 = Input(batch_shape = (None, None, 1), name = 'other_feature1')
    de_input5_masked = (Masking(mask_value= 0, input_shape = (None, None, 1)))(de_input5)
    de_input5_embed = Dense(d_model, input_shape = (None, None, 1))(de_input5_masked)
    de_input6 = Input(batch_shape = (None, None, 1), name = 'other_feature2')
    de_input6_masked = (Masking(mask_value= 0, input_shape = (None, None, 1)))(de_input6)
    de_input6_embed = Dense(d_model, input_shape = (None, None, 1))(de_input6_masked)   
    de_input7 = Input(batch_shape = (None,None), name = 'prev_q_exp')
    de_input7_embed = Embedding(n_prev_q_exp, d_model)(de_input7)
    de_input = tf.math.add_n([de_input4_embed, de_input5_embed, de_input6_embed, de_input7_embed])

    en_input = tf.math.add_n([en_input1_embed, en_input2_embed, en_input3_embed])

    
    
    de_look_ahead_mask = create_look_ahead_mask(tf.shape(de_input4)[1])
    de_padding_mask = create_padding_mask(de_input4)
    de_combined_mask = tf.maximum(de_look_ahead_mask, de_padding_mask)
    
    
    transformer = Transformer2(num_layers, d_model, num_heads, dff, pe_input)
    
    final_output = transformer(en_input, de_input, False, en_combined_mask, de_combined_mask, de_combined_mask)
    
    #with tpu_strategy.scope():
    model = Model(inputs=[en_input1, en_input2, en_input3, de_input4, de_input5, de_input6, de_input7], outputs=final_output)
    #    model.build()
    #    model.compile( optimizer = 'adam',
    #                    loss = 'binary_crossentropy',
    #                    metrics=['accuracy',AUC()])
    
    return model


saint_model1 = build(num_layers, d_model, num_heads, dff, n_question, n_tag, n_part, n_answer, n_prev_q_exp, pe_input)

In [20]:
saint_model1.load_weights('../input/saint-model4/SAINT_model_feature_extraction.h5')

## SAINT 2

In [21]:
num_layers = 4
d_model = 128
num_heads = 8
dff = 512

n_question = 13524
n_tag = 1521
n_part = 8
n_answer = 4
n_prev_q_exp = 4

pe_input = 60


def build(num_layers, d_model, num_heads, dff, n_question, n_tag, n_part, n_answer, n_prev_q_exp, pe_input):

    en_input1 = Input(batch_shape = (None, None), name = 'current_question')
    en_input1_embed = Embedding(n_question, d_model)(en_input1)
    en_input2 = Input(batch_shape = (None, None), name = 'current_tag')
    en_input2_embed = Embedding(n_tag, d_model)(en_input2)
    en_input3 = Input(batch_shape = (None, None), name = 'current_part')
    en_input3_embed = Embedding(n_part, d_model)(en_input3)

    en_look_ahead_mask = create_look_ahead_mask(tf.shape(en_input1)[1])
    en_padding_mask = create_padding_mask(en_input1)
    en_combined_mask = tf.maximum(en_look_ahead_mask, en_padding_mask)
    
    
    
    #en_input1_embed = K.sum(en_input1_embed, axis = -2)
    de_input4 = Input(batch_shape = (None, None), name = 'past_answer')
    de_input4_embed = Embedding(n_answer, d_model)(de_input4)
    de_input5 = Input(batch_shape = (None, None, 1), name = 'other_feature1')
    de_input5_masked = (Masking(mask_value= 0, input_shape = (None, None, 1)))(de_input5)
    de_input5_embed = Dense(d_model, input_shape = (None, None, 1))(de_input5_masked)
    de_input6 = Input(batch_shape = (None, None, 1), name = 'other_feature2')
    de_input6_masked = (Masking(mask_value= 0, input_shape = (None, None, 1)))(de_input6)
    de_input6_embed = Dense(d_model, input_shape = (None, None, 1))(de_input6_masked)   
    de_input7 = Input(batch_shape = (None,None), name = 'prev_q_exp')
    de_input7_embed = Embedding(n_prev_q_exp, d_model)(de_input7)
    de_input = tf.math.add_n([de_input4_embed])#, de_input5_embed, de_input6_embed, de_input7_embed])

    en_input = tf.math.add_n([en_input1_embed, en_input2_embed, en_input3_embed, de_input5_embed, de_input6_embed, de_input7_embed])

    
    
    de_look_ahead_mask = create_look_ahead_mask(tf.shape(de_input4)[1])
    de_padding_mask = create_padding_mask(de_input4)
    de_combined_mask = tf.maximum(de_look_ahead_mask, de_padding_mask)
    
    
    transformer = Transformer2(num_layers, d_model, num_heads, dff, pe_input)
    
    final_output = transformer(en_input, de_input, False, en_combined_mask, de_combined_mask, de_combined_mask)
    
    #with tpu_strategy.scope():
    model = Model(inputs=[en_input1, en_input2, en_input3, de_input4, de_input5, de_input6, de_input7], outputs=final_output)
    model.compile( optimizer = 'adam',
                    loss = 'binary_crossentropy',
                    metrics=['accuracy',AUC()])
    
    return model

saint_model2 = build(num_layers, d_model, num_heads, dff, n_question, n_tag, n_part, n_answer, n_prev_q_exp, pe_input)


In [22]:
saint_model2.load_weights('../input/saint-model2/SAINT_model_feature_extraction.h5')

## DKT

In [23]:
other_input_dim = 2
hidden_layer_size = 128
input_dim_order = 13523
prev_q_perform_dim = 27046
prev_t_perform_dim = 3041
prev_p_perform_dim = 15
prev_q_exp_dim = 3


def dkt_build(hidden_layer_size, input_dim_order, prev_q_perform_dim, prev_t_perform_dim, prev_p_perform_dim, other_input_dim, prev_q_exp_dim):    
    masking_func = lambda inputs, previous_mask: previous_mask
    ## One hot encode question_id and answer/tag_id and answer
    #prev_q_a = Input(batch_shape = (None, None), dtype = 'int32', name = 'prev_q_a')
    #one_hot_prev_q_a = tf.one_hot(prev_q_a, prev_q_perform_dim, axis = -1)
    
    prev_t_a = Input(batch_shape = (None, None), dtype = 'int32', name = 'prev_t_a')
    one_hot_prev_t_a = tf.one_hot(prev_t_a, prev_t_perform_dim, axis = -1)
    
    prev_p_a = Input(batch_shape = (None, None), dtype = 'int32', name = 'prev_p_a')
    one_hot_prev_p_a = tf.one_hot(prev_p_a, prev_p_perform_dim, axis = -1)
    
    prev_q_exp = Input(batch_shape = (None, None), dtype = 'int32', name = 'prev_q_exp')
    one_hot_prev_q_exp = tf.one_hot(prev_q_exp, prev_q_exp_dim, axis = -1)
    
    other_input = Input(batch_shape = (None, None, other_input_dim), name= 'other_input')
    
    one_hot = Concatenate()([one_hot_prev_t_a, one_hot_prev_p_a, one_hot_prev_q_exp, other_input])
    
    masked_oh = (Masking(mask_value= 0, input_shape = (None, None, prev_t_perform_dim + prev_p_perform_dim + prev_q_exp + other_input_dim)))(one_hot)
    
    lstm_out = LSTM(hidden_layer_size, input_shape = (None, None, prev_t_perform_dim + prev_p_perform_dim + other_input_dim + prev_q_exp_dim),
                    dropout=0.2, recurrent_dropout =0.2, return_sequences = True)(masked_oh)
    
    
    dense_out = Dense(input_dim_order, input_shape = (None, None, hidden_layer_size), activation='sigmoid')(lstm_out)
    order = Input(batch_shape = (None, None), dtype = 'int32', name = 'order')
    #one hot encode
    one_hot_order = tf.one_hot(order, input_dim_order, axis = -1)
    #one_hot_order = K.sum(one_hot_order, axis = -2)
    
    merged = multiply([dense_out, one_hot_order])
    
    def reduce_dim(x):
        x = K.max(x, axis = 2, keepdims = True)
        return x

    def reduce_dim_shape(input_shape):
        shape = list(input_shape)
        shape[-1] = 1
        print ("reduced_shape", shape)
        return tuple(shape)
    
    reduced = Lambda(reduce_dim, output_shape = reduce_dim_shape, mask = masking_func)(merged)
    
    model = Model(inputs=[prev_t_a, prev_p_a, prev_q_exp, other_input, order], outputs=reduced)
    model.compile( optimizer = 'adam',
                    loss = 'binary_crossentropy',
                    metrics=['accuracy',AUC()])

    return model



dkt_model = dkt_build(hidden_layer_size, input_dim_order, prev_q_perform_dim, prev_t_perform_dim, prev_p_perform_dim, other_input_dim, prev_q_exp_dim)


In [24]:
dkt_model.load_weights('../input/dkt-model/dkt_model.h5')

## LGBM

In [25]:
import joblib

# load model
lgbm_model = joblib.load('../input/lgbm-model/lgb_model.pkl')

## modeling

In [26]:
TARGET = 'answered_correctly'
FEATS = ['answered_correctly_avg_u', 'last_60', 'part_last_30', 'part_answered_correctly_avg_u', 'answered_correctly_avg_c','answered_correctly_std_c','prior_has_explanation_average_c', 'time_diff_average_c'
         ,'count_u','part_answered_correctly_sum_u', 'part', 'prior_question_elapsed_time','time_diff','tag_sum','user_prev_tag_lag1']

Have a fun with loops! :)

In [27]:
import pickle

loaded_dictionary = open("../input/for-lgbm/last_time_u_dict.pkl", "rb")
last_time_u_dict = pickle.load(loaded_dictionary)

loaded_dictionary = open("../input/for-lgbm/part1_answered_correctly_sum_u_dict.pkl", "rb")
part1_answered_correctly_sum_u_dict = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part1_count_u_dict.pkl", "rb")
part1_count_u_dict = pickle.load(loaded_dictionary)

loaded_dictionary = open("../input/for-lgbm/part2_answered_correctly_sum_u_dict.pkl", "rb")
part2_answered_correctly_sum_u_dict = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part2_count_u_dict.pkl", "rb")
part2_count_u_dict = pickle.load(loaded_dictionary)

loaded_dictionary = open("../input/for-lgbm/part3_answered_correctly_sum_u_dict.pkl", "rb")
part3_answered_correctly_sum_u_dict = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part3_count_u_dict.pkl", "rb")
part3_count_u_dict = pickle.load(loaded_dictionary)

loaded_dictionary = open("../input/for-lgbm/part4_answered_correctly_sum_u_dict.pkl", "rb")
part4_answered_correctly_sum_u_dict = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part4_count_u_dict.pkl", "rb")
part4_count_u_dict = pickle.load(loaded_dictionary)

loaded_dictionary = open("../input/for-lgbm/part5_answered_correctly_sum_u_dict.pkl", "rb")
part5_answered_correctly_sum_u_dict = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part5_count_u_dict.pkl", "rb")
part5_count_u_dict = pickle.load(loaded_dictionary)

loaded_dictionary = open("../input/for-lgbm/part6_answered_correctly_sum_u_dict.pkl", "rb")
part6_answered_correctly_sum_u_dict = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part6_count_u_dict.pkl", "rb")
part6_count_u_dict = pickle.load(loaded_dictionary)

loaded_dictionary = open("../input/for-lgbm/part7_answered_correctly_sum_u_dict.pkl", "rb")
part7_answered_correctly_sum_u_dict = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part7_count_u_dict.pkl", "rb")
part7_count_u_dict = pickle.load(loaded_dictionary)

loaded_dictionary = open("../input/for-lgbm/user_answer_lag1.pkl", "rb")
user_answer_lag1 = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/user_tag_lag1.pkl", "rb")
user_tag_lag1 = pickle.load(loaded_dictionary)

loaded_dictionary = open("../input/for-lgbm/all_last60_avg.pkl", "rb")
all_last60_avg = pickle.load(loaded_dictionary)


loaded_dictionary = open("../input/for-lgbm/part1_last30_avg.pkl", "rb")
part1_last30_avg = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part2_last30_avg.pkl", "rb")
part2_last30_avg = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part3_last30_avg.pkl", "rb")
part3_last30_avg = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part4_last30_avg.pkl", "rb")
part4_last30_avg = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part5_last30_avg.pkl", "rb")
part5_last30_avg = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part6_last30_avg.pkl", "rb")
part6_last30_avg = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/for-lgbm/part7_last30_avg.pkl", "rb")
part7_last30_avg = pickle.load(loaded_dictionary)




loaded_dictionary = open("../input/prev-user-q-a/past_answer.pkl", "rb")
past_answer = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/prev-user-q-a/past_part.pkl", "rb")
past_part = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/prev-user-q-a/past_prior_elaps.pkl", "rb")
past_prior_elaps = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/prev-user-q-a/past_question.pkl", "rb")
past_question = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/prev-user-q-a/past_tag.pkl", "rb")
past_tag = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/prev-user-q-a/past_time_diff.pkl", "rb")
past_time_diff = pickle.load(loaded_dictionary)
loaded_dictionary = open("../input/prev-user-q-a/past_prior_exp.pkl", "rb")
past_prior_exp = pickle.load(loaded_dictionary)

In [28]:
class Iter_Valid(object):
    def __init__(self, df, max_user=1000):
        df = df.reset_index(drop=True)
        self.df = df
        self.user_answer = df['user_answer'].astype(str).values
        self.answered_correctly = df['answered_correctly'].astype(str).values
        df['prior_group_responses'] = "[]"
        df['prior_group_answers_correct'] = "[]"
        self.sample_df = df[df['content_type_id'] == 0][['row_id']]
        self.sample_df['answered_correctly'] = 0
        self.len = len(df)
        self.user_id = df.user_id.values
        self.task_container_id = df.task_container_id.values
        self.content_type_id = df.content_type_id.values
        self.max_user = max_user
        self.current = 0
        self.pre_user_answer_list = []
        self.pre_answered_correctly_list = []

    def __iter__(self):
        return self
    
    def fix_df(self, user_answer_list, answered_correctly_list, pre_start):
        df= self.df[pre_start:self.current].copy()
        sample_df = self.sample_df[pre_start:self.current].copy()
        df.loc[pre_start,'prior_group_responses'] = '[' + ",".join(self.pre_user_answer_list) + ']'
        df.loc[pre_start,'prior_group_answers_correct'] = '[' + ",".join(self.pre_answered_correctly_list) + ']'
        self.pre_user_answer_list = user_answer_list
        self.pre_answered_correctly_list = answered_correctly_list
        return df, sample_df

    def __next__(self):
        added_user = set()
        pre_start = self.current
        pre_added_user = -1
        pre_task_container_id = -1
        pre_content_type_id = -1
        user_answer_list = []
        answered_correctly_list = []
        while self.current < self.len:
            crr_user_id = self.user_id[self.current]
            crr_task_container_id = self.task_container_id[self.current]
            crr_content_type_id = self.content_type_id[self.current]
            if crr_user_id in added_user and (crr_user_id != pre_added_user or (crr_task_container_id != pre_task_container_id and crr_content_type_id == 0 and pre_content_type_id == 0)):
                # known user(not prev user or (differnt task container and both question))
                return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            if len(added_user) == self.max_user:
                if  crr_user_id == pre_added_user and (crr_task_container_id == pre_task_container_id or crr_content_type_id == 1):
                    user_answer_list.append(self.user_answer[self.current])
                    answered_correctly_list.append(self.answered_correctly[self.current])
                    self.current += 1
                    continue
                else:
                    return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            added_user.add(crr_user_id)
            pre_added_user = crr_user_id
            pre_task_container_id = crr_task_container_id
            pre_content_type_id = crr_content_type_id
            user_answer_list.append(self.user_answer[self.current])
            answered_correctly_list.append(self.answered_correctly[self.current])
            self.current += 1
        if pre_start < self.current:
            return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
        else:
            raise StopIteration()

In [29]:
# You can debug your inference code to reduce "Submission Scoring Error" with `validaten_flg = True`.
# Please refer https://www.kaggle.com/its7171/time-series-api-iter-test-emulator about Time-series API (iter_test) Emulator.

if validaten_flg:
    target_df = pd.read_pickle(valid_pickle)
    if debug:
        target_df = target_df[:10000]
    iter_test = Iter_Valid(target_df,max_user=1000)
    predicted = []
    def set_predict(df):
        predicted.append(df)
    # reset answered_correctly_sum_u_dict and count_u_dict
    part1_answered_correctly_sum_u_dict = defaultdict(int)
    part1_count_u_dict = defaultdict(int)
    part2_answered_correctly_sum_u_dict = defaultdict(int)
    part2_count_u_dict = defaultdict(int)
    part3_answered_correctly_sum_u_dict = defaultdict(int)
    part3_count_u_dict = defaultdict(int)
    part4_answered_correctly_sum_u_dict = defaultdict(int)
    part4_count_u_dict = defaultdict(int)
    part5_answered_correctly_sum_u_dict = defaultdict(int)
    part5_count_u_dict = defaultdict(int)
    part6_answered_correctly_sum_u_dict = defaultdict(int)
    part6_count_u_dict = defaultdict(int)
    part7_answered_correctly_sum_u_dict = defaultdict(int)
    part7_count_u_dict = defaultdict(int)
    train = pd.read_pickle(train_pickle)[['user_id','answered_correctly','content_type_id','content_id']]
    if debug:
        train = train[:1000000]
    train = train[train.content_type_id == False].reset_index(drop=True)
    train = train.merge(questions_df[['question_id','part']],left_on = 'content_id', right_on = 'question_id', how = 'left')
    update_user_feats(train, part1_answered_correctly_sum_u_dict, part1_count_u_dict, part2_answered_correctly_sum_u_dict, part2_count_u_dict, part3_answered_correctly_sum_u_dict, part3_count_u_dict, part4_answered_correctly_sum_u_dict, part4_count_u_dict, part5_answered_correctly_sum_u_dict, part5_count_u_dict, part6_answered_correctly_sum_u_dict, part6_count_u_dict, part7_answered_correctly_sum_u_dict, part7_count_u_dict)
    del train
else:
    import riiideducation
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict

In [30]:
previous_test_df = None
for (test_df, sample_prediction_df) in iter_test:
    if previous_test_df is not None:
        previous_test_df[TARGET] = np.array(eval(test_df["prior_group_answers_correct"].iloc[0]))[mask]
        update_user_feats(previous_test_df , all_last60_avg, part1_last30_avg, part2_last30_avg, part3_last30_avg
                      , part4_last30_avg, part5_last30_avg, part6_last30_avg, part7_last30_avg , user_tag_lag1, user_answer_lag1 ,
                      last_time_u_dict, part1_answered_correctly_sum_u_dict, part1_count_u_dict, 
                      part2_answered_correctly_sum_u_dict, part2_count_u_dict, part3_answered_correctly_sum_u_dict, part3_count_u_dict,
                      part4_answered_correctly_sum_u_dict, part4_count_u_dict, part5_answered_correctly_sum_u_dict, part5_count_u_dict,
                      part6_answered_correctly_sum_u_dict, part6_count_u_dict, part7_answered_correctly_sum_u_dict, part7_count_u_dict, 
                      past_question, past_tag, past_part, past_answer, past_prior_elaps, past_time_diff, past_prior_exp)
    test_df = pd.merge(test_df, questions_df[['content_id', 'part','tag_sum','tag_num']], on='content_id', how='left')
    mask = (test_df['content_type_id'] == 0).values.tolist()
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = add_user_feats_without_update(test_df , all_last60_avg, part1_last30_avg, part2_last30_avg, part3_last30_avg, 
                                  part4_last30_avg, part5_last30_avg, part6_last30_avg, part7_last30_avg, user_tag_lag1, user_answer_lag1,
                                  last_time_u_dict, part1_answered_correctly_sum_u_dict, part1_count_u_dict, part2_answered_correctly_sum_u_dict,
                                  part2_count_u_dict, part3_answered_correctly_sum_u_dict, part3_count_u_dict, part4_answered_correctly_sum_u_dict,
                                  part4_count_u_dict, part5_answered_correctly_sum_u_dict, part5_count_u_dict, part6_answered_correctly_sum_u_dict,
                                  part6_count_u_dict, part7_answered_correctly_sum_u_dict, part7_count_u_dict)
    test_df.time_diff.loc[test_df.time_diff >= 1e6] = 1e6    
    test_df = pd.merge(test_df, content_df, on='content_id',  how="left")
    test_df['prior_question_elapsed_time'] = test_df.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)    
    test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation*1 + 1
    test_df['prior_question_had_explanation'].fillna(3, inplace = True)
    
    previous_test_df = test_df[['user_id','timestamp','content_type_id','content_id','part','tag_sum','tag_num','prior_question_elapsed_time','time_diff', 'prior_question_had_explanation']].copy()
    
    lgbm_predict = lgbm_model.predict(test_df[FEATS])
    
    current_part, current_tag, current_question, past_part_answer, past_tag_answer, past_question_answer, past_other_feats, past_answer_correctly, past_prior_explanation = get_user_feats_for_nn_without_update(test_df, past_question, past_tag, past_part, past_answer, past_prior_elaps, past_time_diff, past_prior_exp)
    past_other_feats1 = np.reshape(past_other_feats[:,:,0],(-1,60,1))
    past_other_feats2 = np.reshape(past_other_feats[:,:,1],(-1,60,1))
    
    #transformer_predict = transformer_model.predict([past_question_answer, past_tag_answer, past_part_answer,
    #                           current_question, current_tag, current_part,
    #                           past_other_feats1, past_other_feats2, past_prior_explanation], batch_size = 500)
    
    
    #sakt_predict = sakt_model.predict([past_answer_correctly, past_other_feats1, past_other_feats2,past_prior_explanation,
    #                           current_question, current_tag, current_part], batch_size = 500)
    
    saint1_predict = saint_model1.predict([current_question, current_tag, current_part, 
                                past_answer_correctly, past_other_feats1, past_other_feats2, past_prior_explanation], batch_size = 500)
    
    saint2_predict = saint_model2.predict([current_question, current_tag, current_part, 
                                past_answer_correctly, past_other_feats1, past_other_feats2, past_prior_explanation], batch_size = 500)
    
    past_tag_answer = past_tag_answer - 1
    past_part_answer = past_part_answer - 1
    past_question_answer = past_question_answer - 1
    current_question = current_question - 1
    past_prior_explanation = past_prior_explanation - 1
    
    dkt_predict = dkt_model.predict([past_tag_answer, past_part_answer, past_prior_explanation, past_other_feats, current_question], batch_size = 100)
    test_df[TARGET] = dkt_predict[:,-1,0]*(0.0067) + saint1_predict[:,-1,0]*(0.5654)  + lgbm_predict*0.2491 + saint2_predict[:,-1,0]*0.1788
    test_df[TARGET].fillna(0.65, inplace = True)
    set_predict(test_df[['row_id', TARGET]])
    #---
    #print(sample_prediction_df)
    #print(test_df[['row_id', TARGET]])
    #print(test_df.shape, sample_prediction_df.shape, test_df[TARGET].shape)
    #---

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
100%|██████████| 18/18 [00:00<00:00, 9676.68it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
100%|██████████| 27/27 [00:00<00:00, 10287.63it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
100%|██████████| 26/26 [00:00<00:00, 11093.78it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the docum

In [31]:
if validaten_flg:
    y_true = target_df[target_df.content_type_id == 0].answered_correctly
    y_pred = pd.concat(predicted).answered_correctly
    print(roc_auc_score(y_true, y_pred))