In [None]:
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def get_angles(pos, i, d_model):
    """Return the raw (pre-sin/cos) positional-encoding angles.

    Computes ``pos * 1 / 10000 ** (2 * (i // 2) / d_model)``. ``pos`` and ``i``
    are combined with ordinary NumPy broadcasting, so a column of positions
    and a row of depth indices yield a (positions, depth) table.
    """
    # Indices 2k and 2k+1 share one exponent, hence the integer division.
    pair_exponent = (2 * (i//2)) / np.float32(d_model)
    inverse_wavelength = 1 / np.power(10000, pair_exponent)
    return pos * inverse_wavelength

def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to generate.
        d_model: encoding depth (columns).

    Returns:
        float32 tensor of shape (1, position, d_model); even depth indices
        hold sines and odd depth indices hold cosines of the angles produced
        by ``get_angles``.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    depth_indices = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_positions, depth_indices, d_model)

    # Even columns (2i) carry the sine component ...
    table[:, 0::2] = np.sin(table[:, 0::2])
    # ... odd columns (2i+1) carry the cosine component.
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Leading axis of size 1 lets the table broadcast over the batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [None]:
# Sanity check: build a 50-position, 512-deep encoding and visualise it.
pos_encoding = positional_encoding(50, 512)
print (pos_encoding.shape)

# pos_encoding[0] drops the leading broadcast axis -> (position, depth) heat map.
plt.pcolormesh(pos_encoding[0], cmap='RdBu')
plt.xlabel('Depth')
plt.xlim((0, 512))
plt.ylabel('Position')
plt.colorbar()
plt.show()

In [None]:
def create_padding_mask(seq):
    """Flag padding tokens (id 0) so attention can ignore them.

    Returns a float32 tensor that is 1.0 where ``seq == 0`` and 0.0 elsewhere,
    with two singleton axes inserted so it broadcasts against attention
    logits: (batch_size, 1, 1, seq_len).
    """
    is_pad = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return is_pad[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
    """Return a (size, size) mask with 1.0 strictly above the diagonal.

    A 1 at [row, col] marks key position ``col`` as hidden from query
    position ``row`` (i.e. col > row).
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle  # (seq_len, seq_len)

In [None]:
def scaled_dot_product_attention(q, k, v, mask=None):
    """Calculate scaled dot-product attention.

    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type (padding or look
    ahead) but it must be broadcastable for addition.

    Args:
      q: query shape == (..., seq_len_q, depth)
      k: key shape == (..., seq_len_k, depth)
      v: value shape == (..., seq_len_v, depth_v)
      mask: Float tensor with shape broadcastable
            to (..., seq_len_q, seq_len_k); 1.0 marks positions to hide.
            Defaults to None (no masking).

    Returns:
      output: (..., seq_len_q, depth_v)
      attention_weights: (..., seq_len_q, seq_len_k)
    """
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # Scale by sqrt(depth) of the keys.
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # Masked positions get a large negative logit so softmax sends them to ~0.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention: project q/k/v, attend per head, merge, project.

    ``d_model`` must be divisible by ``num_heads``; each head works on
    ``d_model // num_heads`` features.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0

        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        # Input projections for query, key and value.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Output projection applied to the concatenated heads.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq_len, d_model) -> (batch, num_heads, seq_len, depth)."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Return (output, attention_weights).

        output is (batch_size, seq_len_q, d_model); attention_weights is
        (batch_size, num_heads, seq_len_q, seq_len_k).
        """
        batch_size = tf.shape(q)[0]

        # Project to d_model, then split into heads:
        # (batch_size, seq_len, d_model) -> (batch_size, num_heads, seq_len, depth)
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        # per_head_output: (batch_size, num_heads, seq_len_q, depth)
        per_head_output, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        # Move heads next to depth and fold them back into d_model.
        heads_last = tf.transpose(per_head_output, perm=[0, 2, 1, 3])
        merged = tf.reshape(heads_last, (batch_size, -1, self.d_model))

        return self.dense(merged), attention_weights

In [None]:
# Smoke test: self-attention over a random batch; the cell displays the
# shapes of the output and of the attention weights.
temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
out, attn = temp_mha(y, k=y, q=y, mask=None)
out.shape, attn.shape

In [None]:
def point_wise_feed_forward_network(d_model, dff):
    """Two-layer feed-forward block: Dense(dff, relu) followed by Dense(d_model)."""
    expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
    project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([expand, project])

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Return the block output, shape (batch_size, input_seq_len, d_model)."""
        # Self-attention sub-layer (attention weights are discarded here).
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        attention_block = self.layernorm1(x + attended)

        # Position-wise feed-forward sub-layer.
        transformed = self.ffn(attention_block)
        transformed = self.dropout2(transformed, training=training)
        return self.layernorm2(attention_block + transformed)

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, encoder attention, feed-forward.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
             look_ahead_mask, padding_mask):
        """Return (output, self_attention_weights, encoder_attention_weights).

        enc_output.shape == (batch_size, input_seq_len, d_model);
        output.shape == (batch_size, target_seq_len, d_model).
        """
        # 1) Self-attention over the decoder input.
        self_attended, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        self_attended = self.dropout1(self_attended, training=training)
        self_block = self.layernorm1(self_attended + x)

        # 2) Attention over the encoder output: values and keys come from the
        #    encoder, queries from the decoder.
        cross_attended, attn_weights_block2 = self.mha2(
            enc_output, enc_output, self_block, padding_mask)
        cross_attended = self.dropout2(cross_attended, training=training)
        cross_block = self.layernorm2(cross_attended + self_block)

        # 3) Position-wise feed-forward.
        transformed = self.ffn(cross_block)
        transformed = self.dropout3(transformed, training=training)
        output = self.layernorm3(transformed + cross_block)

        return output, attn_weights_block1, attn_weights_block2

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Encoder stack over three integer input streams.

    The three inputs are embedded separately (vocab sizes
    ``input_vocab_size``, ``inp2_vocab_size``, ``inp3_vocab_size``), summed,
    offset by the positional encoding and passed through ``num_layers``
    EncoderLayer blocks.

    ``emb_matrix``, ``dec_inp2_size`` and ``dec_inp3_size`` are accepted but
    not used by this class.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, emb_matrix, inp2_vocab_size,
                 inp3_vocab_size, dec_inp2_size, dec_inp3_size, rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        # One embedding table per input stream, all projecting to d_model.
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.embedding2 = tf.keras.layers.Embedding(inp2_vocab_size, d_model)
        self.embedding3 = tf.keras.layers.Embedding(inp3_vocab_size, d_model)

        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, x2, x3, training, mask):
        """Return the encoded sequence, shape (batch_size, input_seq_len, d_model)."""
        seq_len = tf.shape(x)[1]

        # Embed each stream and sum them: (batch_size, input_seq_len, d_model).
        x = self.embedding(x) + self.embedding2(x2) + self.embedding3(x3)

        # Add the positional encoding for the first seq_len positions.
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for layer_index in range(self.num_layers):
            x = self.enc_layers[layer_index](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Decoder stack over three integer streams plus one dense stream.

    ``x``, ``x2`` and ``x3`` are embedded (vocab sizes ``target_vocab_size``,
    ``dec_inp2_size``, ``dec_inp3_size``); ``x4`` is projected with a
    time-distributed Dense layer. The four results are summed, offset by the
    positional encoding and passed through ``num_layers`` DecoderLayer blocks.

    ``emb_matrix`` and ``inp2_vocab_size`` are accepted but not used by this
    class.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 maximum_position_encoding, emb_matrix, inp2_vocab_size,
                 dec_inp2_size, dec_inp3_size, rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.embedding_2 = tf.keras.layers.Embedding(dec_inp2_size, d_model)
        self.embedding_3 = tf.keras.layers.Embedding(dec_inp3_size, d_model)

        # Dense projection applied independently at every time step.
        self.embedding_41 = tf.keras.layers.Dense(d_model)
        self.embedding_4 = tf.keras.layers.TimeDistributed(self.embedding_41)

        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, x2, x3, x4, enc_output, training,
             look_ahead_mask, padding_mask):
        """Return (output, attention_weights).

        output.shape == (batch_size, target_seq_len, d_model);
        attention_weights maps 'decoder_layer{n}_block{1,2}' to the weights
        returned by each DecoderLayer (n starts at 1).
        """
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        # Each term is (batch_size, target_seq_len, d_model).
        x = (self.embedding(x) + self.embedding_2(x2)
             + self.embedding_3(x3) + self.embedding_4(x4))

        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for layer_index in range(self.num_layers):
            x, self_attn, cross_attn = self.dec_layers[layer_index](
                x, enc_output, training, look_ahead_mask, padding_mask)

            layer_number = layer_index + 1
            attention_weights['decoder_layer{}_block1'.format(layer_number)] = self_attn
            attention_weights['decoder_layer{}_block2'.format(layer_number)] = cross_attn

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights

In [None]:
class Transformer(tf.keras.Model):
    """Encoder-decoder model with a per-timestep 3-way softmax head.

    The encoder consumes ``inp1..inp3``; the decoder consumes ``tar1..tar4``
    together with the encoder output. The head is a time-distributed
    ``Dense(3, softmax)``, so the output width is fixed at 3 and does not
    depend on ``target_vocab_size``.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 target_vocab_size, pe_input, pe_target, emb_matrix,
                 inp2_vocab_size, inp3_vocab_size, dec_inp2_size,
                 dec_inp3_size, rate=0.1):
        super().__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, pe_input, emb_matrix,
                               inp2_vocab_size, inp3_vocab_size,
                               dec_inp2_size, dec_inp3_size, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, pe_target, emb_matrix,
                               inp2_vocab_size, dec_inp2_size, dec_inp3_size,
                               rate)

        self.final_layer = tf.keras.layers.Dense(3, activation='softmax')
        self.final_layer_1 = tf.keras.layers.TimeDistributed(self.final_layer)

    def call(self, inp1, inp2, inp3, tar1, tar2, tar3, tar4, training,
             enc_padding_mask, look_ahead_mask, dec_padding_mask):
        """Return (final_output, attention_weights).

        final_output.shape == (batch_size, tar_seq_len, 3).
        """
        # (batch_size, inp_seq_len, d_model)
        enc_output = self.encoder(inp1, inp2, inp3, training, enc_padding_mask)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            tar1, tar2, tar3, tar4, enc_output, training,
            look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer_1(dec_output)  # (batch_size, tar_seq_len, 3)
        return final_output, attention_weights

In [None]:
!pip install ../input/datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Activation
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding, RNN
from keras import optimizers
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.layers import RepeatVector
# from keras_contrib.layers import CRF

from keras.layers import Conv2D,concatenate,MaxPool2D,MaxPooling2D,Flatten
from keras.layers import Lambda
from keras.layers.core import Reshape
from keras.utils import to_categorical
import datatable as dt
from matplotlib import pyplot as plt
import seaborn as sns
import os

import riiideducation

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MultiLabelBinarizer
import gc
from keras.layers import Reshape
import gensim
import pickle
from keras import layers
from sklearn.model_selection import train_test_split


In [None]:
# Read only the listed columns of train.csv with datatable, then convert to pandas.
train_csv = dt.fread("../input/riiid-test-answer-prediction/train.csv",columns=set(['timestamp','user_id','content_id','task_container_id','answered_correctly','prior_question_elapsed_time','prior_question_had_explanation'])).to_pandas()

In [None]:
# Load the saved Word2Vec model, the fitted tokenizer and the embedding matrix.
# NOTE(security): pickle.load executes arbitrary code from the file; only load
# artefacts from a trusted source.
w2v_model = gensim.models.Word2Vec.load("../input/w2v-model/w2v_model.model")
with open(r"../input/w2v-model/tokenizer.pkl", "rb") as input_file:
    tokenizer = pickle.load(input_file)

with open(r"../input/w2v-model/embedding_matrix.pkl", "rb") as input_file:
    embedding_matrix = pickle.load(input_file)

In [None]:
# Auxiliary competition tables: question metadata, the example test set and lecture metadata.
questions_csv = pd.read_csv("../input/riiid-test-answer-prediction/questions.csv")
example_test_csv = pd.read_csv("../input/riiid-test-answer-prediction/example_test.csv")
lectures_csv = pd.read_csv("../input/riiid-test-answer-prediction/lectures.csv")

In [None]:
# Missing tag strings are replaced by the placeholder tag '200'.
questions_csv['tags'].fillna('200',inplace=True)

# tags1 = the first space-separated tag of each question, as an integer.
questions_csv['tags1']=questions_csv['tags'].apply(lambda x : int(x.split(' ')[0]))

# from sklearn.preprocessing import MultiLabelBinarizer

# mlb = MultiLabelBinarizer()
# mlb.fit(questions_csv['tags1'])

In [None]:
# 'lag' is each user's previous answered_correctly value (NaN on their first row).
train_csv['lag'] = train_csv.groupby('user_id')['answered_correctly'].shift()
# Running sum and running row counter of 'lag' within each user.
cum = train_csv.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
# Running correctness rate up to, but excluding, the current row.
# NOTE(review): rows with answered_correctly == -1 are only removed in a later
# cell, so they still contribute to these running statistics - confirm intended.
train_csv['user_correctness'] = cum['cumsum'] / cum['cumcount']
train_csv.drop(columns=['lag'], inplace=True)
# Per-user totals; the inference loop keeps updating this table.
user_agg = train_csv.groupby('user_id')['answered_correctly'].agg(['sum', 'count'])

In [None]:
# Keep only the last 105 rows of every user.
train_csv = train_csv.groupby('user_id').tail(105).reset_index(drop=True)

In [None]:
# Drop rows whose answered_correctly is -1.
train_csv = train_csv[train_csv['answered_correctly'] != -1].reset_index(drop=True)

In [None]:
# answered_correctly_1 = the user's previous answer ...
train_csv['answered_correctly_1'] = train_csv.groupby('user_id')['answered_correctly'].shift()
# ... then every row of a (user, task container) group takes the group's first
# value ('first' skips NaN), so rows in one container share the same previous answer.
train_csv['answered_correctly_1'] = train_csv.groupby(['user_id','task_container_id'],sort=False)['answered_correctly_1'].transform('first')
gc.collect()
# Rows without a previous answer get 0; the column is then stored as int.
train_csv.answered_correctly_1.fillna(0,inplace=True)
train_csv.answered_correctly_1=train_csv.answered_correctly_1.astype('int')

In [None]:
# Attach question metadata (part and first tag) to every interaction.
train_csv = train_csv.merge(questions_csv[['question_id', 'part', 'tags1']],
                            how='left', left_on='content_id', right_on='question_id')

# Fill missing values by assignment rather than chained in-place calls, which
# only take effect when pandas happens to hand back a writable view.
train_csv['prior_question_had_explanation'] = train_csv['prior_question_had_explanation'].fillna(False)
train_csv['prior_question_elapsed_time'] = train_csv['prior_question_elapsed_time'].fillna(0)

# Bucket the elapsed time: divide by 100, truncate to int, cap at 300.
train_csv['prior_question_elapsed_time'] = (
    (train_csv['prior_question_elapsed_time'] / 100).astype(int).clip(upper=300)
)

# Re-code 0 as 2 for both answer columns (0 is the padding value later on).
train_csv['answered_correctly'] = train_csv['answered_correctly'].replace(to_replace=0, value=2)
train_csv['answered_correctly_1'] = train_csv['answered_correctly_1'].replace(to_replace=0, value=2)

# Same convention for the explanation flag: True -> 1, False -> 2.
train_csv['prior_question_had_explanation'] = train_csv['prior_question_had_explanation'].replace(to_replace={False: 2, True: 1})

# content_id is fed to the tokenizer as text.
train_csv.content_id = train_csv.content_id.astype('str')
train_csv['user_correctness'] = train_csv['user_correctness'].fillna(0)
# current=train_csv.groupby('user_id')
# train_csv['mask']=1

In [None]:
# Maximum number of most-recent interactions kept per user.
max_length=70
# One row per user; each cell is a Python list with that user's last
# `max_length` values of the corresponding column.
a=train_csv[['user_id','content_id','answered_correctly','prior_question_elapsed_time','prior_question_had_explanation','part','user_correctness','tags1','answered_correctly_1']].groupby('user_id').agg({'content_id':lambda x: np.array(x[-max_length::]).tolist(),'answered_correctly_1':lambda x: x[-max_length::].tolist(),'prior_question_elapsed_time':lambda x: x[-max_length::].tolist(),'prior_question_had_explanation':lambda x: x[-max_length::].tolist(),'part':lambda x: x[-max_length::].tolist(),'user_correctness':lambda x: x[-max_length::].tolist(),'tags1':lambda x: x[-max_length::].tolist(),'answered_correctly':lambda x: x[-max_length::].tolist()})
# Positional rename: relies on the result columns following the key order of
# the agg dict above (note answered_correctly_1 -> history_answered_correctly).
a.columns=['history_content_id','history_answered_correctly','history_prior_question_elapsed_time','history_prior_question_had_explanation','history_part','history_user_correctness','history_tags','answered_correctly']

In [None]:
# a_c=pd.concat([a,c],axis=1)
# del(a)
# del(c)
# Drop the row-level frame and ask the garbage collector to reclaim memory.
del(train_csv)
gc.collect()

In [None]:
# 80/20 split of users. NOTE(review): no random_state is passed, so the split
# differs from run to run.
a_b_both_train,a_b_both_test=train_test_split(a,test_size=0.2)

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Column groups passed to create_training_test_sets; the order of the groups
# there fixes the order of the model inputs.
features_content=['history_content_id']
enc_cat_features=['history_part','history_tags']
dec_cat_feature_explain=['history_prior_question_had_explanation']
dec_cat_feature_el_time=['history_prior_question_elapsed_time']
dec_cat_feature_hist_correct=['history_answered_correctly']
cont_features=['history_user_correctness']
# mask_feature=['history_mask']
target=['answered_correctly']

In [None]:
from keras.utils import to_categorical

In [None]:
def create_training_test_sets(a,tokenizer,features_content,cont_features,enc_cat_features,dec_cat_feature_explain,dec_cat_feature_el_time,dec_cat_feature_hist_correct,target,lenght=max_length):
    """Turn per-user history lists into padded model inputs and one-hot targets.

    Every column of ``a`` is expected to hold one Python list per row. All
    sequences are left-padded ('pre') to ``lenght`` with zeros.

    Args:
        a: DataFrame of history lists.
        tokenizer: object exposing ``texts_to_sequences``; used for the
            columns in ``features_content``.
        features_content: columns tokenised with ``tokenizer``.
        cont_features: float columns (padded as float32).
        enc_cat_features: integer columns padded as-is.
        dec_cat_feature_explain: integer columns whose first time step is
            overwritten with the start token 3.
        dec_cat_feature_el_time: integer columns whose first time step is
            overwritten with the start token 301.
        dec_cat_feature_hist_correct: integer columns whose first time step
            is overwritten with the start token 3.
        target: label columns; the last one listed provides the labels.
        lenght: padded sequence length (parameter name kept for callers).

    Returns:
        (X, Y_t): X is a list of arrays in the order content, encoder
        categoricals, explanation, elapsed time, answer history, continuous.
        Y_t is the one-hot encoding of the padded target, or None when
        ``target`` is empty.
    """
    X = []

    # Append rather than rebind: rebinding dropped all but the last content column.
    for column in features_content:
        token_ids = tokenizer.texts_to_sequences(a[column].to_list())
        X.append(sequence.pad_sequences(token_ids, maxlen=lenght, padding='pre'))

    for column in enc_cat_features:
        X.append(pad_sequences(a[column], maxlen=lenght, padding='pre'))

    # These groups get a start token written into time step 0. For a
    # full-length sequence this overwrites the oldest real value.
    start_token_groups = (
        (dec_cat_feature_explain, 3),
        (dec_cat_feature_el_time, 301),
        (dec_cat_feature_hist_correct, 3),
    )
    for columns, start_token in start_token_groups:
        for column in columns:
            padded = pad_sequences(a[column], maxlen=lenght, padding='pre')
            padded[:, 0] = start_token
            X.append(padded)

    for column in cont_features:
        X.append(pad_sequences(a[column], maxlen=lenght, padding='pre', dtype='float32'))

    # Initialised so an empty `target` returns None instead of raising
    # UnboundLocalError.
    Y_t = None
    for column in target:
        Y = pad_sequences(a[column], maxlen=lenght, padding='pre')
        Y_t = to_categorical(Y)

    return X, Y_t

# X_train, Y_train = create_training_test_sets(a_c,tokenizer,features_content,cont_features,enc_cat_features,dec_cat_feature_explain,dec_cat_feature_el_time,dec_cat_feature_hist_correct,target,lenght=70)
# Padded inputs and one-hot targets for the train and hold-out user splits.
X_train, Y_train = create_training_test_sets(a_b_both_train,tokenizer,features_content,cont_features,enc_cat_features,dec_cat_feature_explain,dec_cat_feature_el_time,dec_cat_feature_hist_correct,target,lenght=max_length)
X_test, Y_test = create_training_test_sets(a_b_both_test,tokenizer,features_content,cont_features,enc_cat_features,dec_cat_feature_explain,dec_cat_feature_el_time,dec_cat_feature_hist_correct,target,lenght=max_length)

In [None]:
def create_model(a,embedding_matrix,max_length):
    """Wrap the Transformer in a Keras functional model with seven inputs.

    The inputs are declared in the order in which create_training_test_sets
    emits its arrays: content ids, part, tags, explanation flag, elapsed-time
    bucket, answer history, user correctness.

    Args:
        a: unused; kept so existing callers keep working.
        embedding_matrix: forwarded to the Transformer as ``emb_matrix``.
        max_length: length of every input sequence.

    Returns:
        A Model whose output has shape (batch_size, max_length, 3).
    """
    # Creation order is kept as-is because it fixes the input order of the model.
    inp_1 = layers.Input(shape=(max_length,))   # content ids (encoder)
    inp_2 = layers.Input(shape=(max_length,))   # part (encoder)
    inp_8 = layers.Input(shape=(max_length,))   # tags (encoder)
    inp_3 = layers.Input(shape=(max_length,))   # explanation flag (decoder)
    inp_4 = layers.Input(shape=(max_length,))   # elapsed-time bucket (decoder)
    inp_5 = layers.Input(shape=(max_length,))   # answer history (decoder)
    inp_6 = layers.Input(shape=(max_length,))   # user correctness (decoder, continuous)
    inputs = [inp_1, inp_2, inp_8, inp_3, inp_4, inp_5, inp_6]

    # The continuous stream needs a trailing feature axis for TimeDistributed(Dense).
    out_6 = Reshape((max_length, 1), input_shape=(max_length,))(inp_6)

    sample_transformer = Transformer(
        num_layers=2, d_model=512, num_heads=4, dff=2048,
        input_vocab_size=13524, target_vocab_size=5,
        pe_input=10000, pe_target=6000, emb_matrix=embedding_matrix,
        inp2_vocab_size=9, inp3_vocab_size=202,
        dec_inp2_size=302, dec_inp3_size=5, rate=0.2)

    # Padding is derived from the content ids (id 0 == pad).
    dec_mask = create_padding_mask(inp_1)
    look_mask = create_look_ahead_mask(max_length)

    # training=None lets Keras supply the phase: dropout is active in fit() and
    # disabled in predict()/evaluate(). A hard-coded training=True is baked into
    # the graph and keeps dropout on at inference time.
    # NOTE(review): the encoder is given the look-ahead mask, not a padding
    # mask, exactly as before - confirm this causal encoder is intended.
    fn_out, _ = sample_transformer(inp_1, inp_2, inp_8, inp_5, inp_4, inp_3, out_6,
                                   training=None,
                                   enc_padding_mask=look_mask,
                                   look_ahead_mask=look_mask,
                                   dec_padding_mask=dec_mask)

    print(fn_out.shape)

    model = Model(inputs=inputs, outputs=fn_out)
    return model

# Build the functional model; the first argument is not used by create_model.
model = create_model(a_b_both_train,embedding_matrix,max_length=max_length)

In [None]:
# Display the shape of the one-hot training targets.
Y_train.shape

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up then inverse-square-root learning-rate schedule.

    lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    Args:
        d_model: model width used to scale the rate.
        warmup_steps: number of steps over which the rate rises linearly.
    """

    def __init__(self, d_model, warmup_steps=3000):
        super(CustomSchedule, self).__init__()

        # Keep the plain value for get_config; self.d_model stays a float tensor.
        self._d_model_config = d_model
        self.d_model = tf.cast(d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # The optimizer may pass an integer step; rsqrt needs a float.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialised."""
        return {'d_model': self._d_model_config,
                'warmup_steps': self.warmup_steps}

In [None]:
# 512 matches the d_model hard-coded in create_model.
learning_rate = CustomSchedule(512)

In [None]:
# Adam driven by the warm-up schedule; 20% of the training users are held out
# for validation by Keras.
model.compile(loss='CategoricalCrossentropy', optimizer=optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9),metrics=['accuracy']) 
model.fit(X_train,Y_train,epochs=12,batch_size=256,validation_split=.2)

In [None]:
# Hold-out evaluation. Channel 1 of the one-hot axis is the label value 1
# ("answered correctly"); 0 is padding and 2 is the re-coded incorrect answer.
y_predict=model.predict(X_test)
Y_correct_test=Y_test[:,:,1]
Y_correct_predict=y_predict[:,:,1]

# Sequences are left-padded, so the last time step is each user's most recent interaction.
scores=Y_correct_predict[:,-1]
actual=Y_correct_test[:,-1]
from sklearn.metrics import roc_auc_score,accuracy_score
roc_auc_score(actual,scores)

In [None]:
# lm=Y_correct_predict[a_b_both_test['history_content_id'].apply(lambda x: len(x))<5,:]
# ls=Y_correct_test[a_b_both_test['history_content_id'].apply(lambda x: len(x))<5,:]
# scores=lm[:,-1]
# actual=ls[:,-1]
# roc_auc_score(actual,scores)

In [None]:
# example_test = pd.read_csv("../input/riiid-test-answer-prediction/example_test.csv")
# example_test.head()
# example_test_1=example_test[example_test['group_num']==0]
# example_test_2=example_test[example_test['group_num']==1]

In [None]:
# Scratch check of the history update performed in the inference loop, for
# user 115. It works on a copy: writing the result back into `a` would add an
# extra element to this user's answer history and shift it out of step with
# the user's other histories before inference starts.
x2 = list(a.loc[115,'history_answered_correctly'])
prior_answer = a.loc[115,'answered_correctly'][-1]
x2.append(prior_answer)
x2 = x2[-max_length::]
x2

In [None]:
# Display the stored answer history of user 115.
a.loc[115,'history_answered_correctly']

In [None]:
# Display the most recent label stored for user 115.
a.loc[115,'answered_correctly'][-1]

In [None]:
# test_df is first created by the inference loop further down; guard the
# lookup so a fresh Restart & Run All does not stop here with a NameError.
test_df.head() if 'test_df' in globals() else None

In [None]:
# a_df is first created by the inference loop further down; guard the lookup
# so a fresh Restart & Run All does not stop here with a NameError.
a_df.shape if 'a_df' in globals() else None

In [None]:
# a_df is first created by the inference loop further down; guard the lookup
# so a fresh Restart & Run All does not stop here with a NameError.
a_df.head() if 'a_df' in globals() else None

In [None]:
# Create the competition environment and its test-batch iterator.
env = riiideducation.make_env()
iter_test = env.iter_test()
# Holds the previous batch so its answers can be applied when the next batch arrives.
prior_test_df = None

In [None]:
for (test_df, sample_prediction_df) in iter_test:
    # ------------------------------------------------------------------
    # 1. Feed the ground truth of the previous batch back into the state.
    # ------------------------------------------------------------------
    if prior_test_df is not None:
        # The first row of each batch carries the previous batch's answers as a
        # string.
        # NOTE(security): eval() executes arbitrary code; the string comes from
        # the competition API. Consider ast.literal_eval or json.loads instead.
        prior_test_df['answered_correctly'] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df['content_type_id'] == 0].reset_index(drop=True)

        user_ids = prior_test_df['user_id'].values
        # `target` is a one-element list, so each item below is a length-1 array.
        targets = prior_test_df[target].values

        # Update the running per-user totals behind `user_correctness`.
        for user_id, answered_correctly in zip(user_ids, targets):
            if user_id in user_agg.index:
                user_agg.loc[user_id, 'sum'] += answered_correctly[0]
                user_agg.loc[user_id, 'count'] += 1
            else:
                user_agg.loc[user_id] = [answered_correctly[0], 1]

        # Same label encoding as in training: incorrect (0) becomes 2.
        prior_test_df['answered_correctly'].replace(to_replace=0, value=2, inplace=True)

        # Append the now-known labels to each user's label history. Every user
        # in the previous batch was added to `a` while that batch was processed.
        for i in range(prior_test_df.shape[0]):
            user_id = prior_test_df.user_id[i]
            label_history = a.loc[user_id, 'answered_correctly']
            label_history.append(prior_test_df.answered_correctly[i])
            label_history = label_history[-max_length:]
            a.loc[user_id, 'answered_correctly'] = label_history

    prior_test_df = test_df.copy()

    # ------------------------------------------------------------------
    # 2. Preprocess the current batch the same way as the training data.
    # ------------------------------------------------------------------
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = pd.merge(test_df, questions_csv, left_on='content_id', right_on='question_id', how='left')

    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False)
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(0)

    # Bucket the elapsed time: divide by 100, truncate to int, cap at 300.
    test_df['prior_question_elapsed_time'] = (
        (test_df['prior_question_elapsed_time'] / 100).astype(int).clip(upper=300)
    )
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].replace(to_replace={False: 2, True: 1})

    # NOTE(review): unseen users get 0.6 here, while missing values were filled
    # with 0 during training - confirm the mismatch is intended.
    test_df['user_correctness'] = test_df['user_id'].map(user_agg['sum'] / user_agg['count']).fillna(0.6)

    test_df.content_id = test_df.content_id.astype('str')

    test_df['answered_correctly'] = 0

    # ------------------------------------------------------------------
    # 3. Extend every user's histories and collect one model input per row.
    # ------------------------------------------------------------------
    records = []
    for i in range(test_df.shape[0]):
        user_id = test_df.user_id[i]
        if user_id not in a.index:
            # First sighting of this user: start every history with one
            # placeholder element.
            new_user_row = pd.DataFrame({'history_content_id': [['0']],
                                         'history_answered_correctly': [[0]],
                                         'history_prior_question_elapsed_time': [[0]],
                                         'history_prior_question_had_explanation': [[0]],
                                         'history_part': [[0]],
                                         'history_user_correctness': [[0]],
                                         'answered_correctly': [[0]],
                                         'history_tags': [[0]]},
                                        index=[user_id])
            a = pd.concat([a, new_user_row])

        # The decoder's answer history lags by one step: it receives the
        # latest label currently stored for the user.
        prior_answer = a.loc[user_id, 'answered_correctly'][-1]

        new_values = [
            ('history_content_id', test_df.content_id[i]),
            ('history_answered_correctly', prior_answer),
            ('history_prior_question_elapsed_time', test_df.prior_question_elapsed_time[i]),
            ('history_prior_question_had_explanation', test_df.prior_question_had_explanation[i]),
            ('history_part', test_df.part[i]),
            ('history_user_correctness', test_df.user_correctness[i]),
            ('history_tags', test_df.tags1[i]),
        ]

        record = {}
        for column, value in new_values:
            history = a.loc[user_id, column]
            history.append(value)
            # Slicing truncates and also copies, so each record keeps its own
            # snapshot. 'history_part' used to skip this step: it grew without
            # bound and was shared between rows of the same user in a batch.
            history = history[-max_length:]
            a.loc[user_id, column] = history
            record[column] = history

        # Labels for the current rows are unknown; the stored history only
        # satisfies create_training_test_sets and its output is discarded.
        record['answered_correctly'] = a.loc[user_id, 'answered_correctly']
        records.append(record)

    # Build the batch frame once instead of appending row by row.
    a_df = pd.DataFrame(records)

    # ------------------------------------------------------------------
    # 4. Predict and submit.
    # ------------------------------------------------------------------
    # NOTE: this rebinds X_train / Y_train from the training cells above.
    X_train, Y_train = create_training_test_sets(a_df,tokenizer,features_content,cont_features,enc_cat_features,dec_cat_feature_explain,dec_cat_feature_el_time,dec_cat_feature_hist_correct,target,lenght=max_length)

    Y = model.predict(X_train)

    # Channel 1 = probability of "correct"; last time step = the current question.
    Y_correct_predict = Y[:, :, 1]
    scores = Y_correct_predict[:, -1]

    test_df['answered_correctly'] = scores

    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])