In [None]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

import _pickle as cPickle
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense,Concatenate, Flatten, Lambda,Dropout,Layer, SpatialDropout1D, RepeatVector, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras import regularizers
from keras_self_attention import SeqSelfAttention
from sklearn.metrics import accuracy_score
from tensorflow.keras import layers
from tensorflow import keras
from tensorflow.keras import layers


# Seed every RNG the notebook draws from; modify SEED to verify results with
# different random seeds. The batch generators shuffle with np.random, which
# tf.random.set_seed on its own does not seed.
SEED = 20
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Load Data

In [None]:
#load data from appropriate folders
# NOTE(review): unpickling executes arbitrary code; only load trusted files.

def _read_pickle(path):
    """Return the object unpickled from ``path`` (opened in binary mode)."""
    with open(path, 'rb') as handle:
        return cPickle.load(handle)

train = _read_pickle('train.txt')
test = _read_pickle('test.txt')

print("len of train", len(train[0]))
print("len of test", len(test[0]))

# element 0 holds the sequences, element 1 the labels
train_seq, train_y = train[0], train[1]
test_seq, test_y = test[0], test[1]

#vocab
vocab = np.load("vocab_yoochoose_64.npy")
print(len(vocab))

#embedding
transe_emb = _read_pickle("transe_emb_new")



In [None]:
# take recent 10 percent data as validation
# Slice from explicit split indices instead of [-k:] / [:-k]: with k == 0,
# [-0:] would select everything and [:-0] nothing, inverting the split.

k = int(0.1*len(train_seq))
seq_split = len(train_seq) - k
label_split = len(train_y) - k

val_y = train_y[label_split:]
train_y = train_y[:label_split]
val_x = train_seq[seq_split:]
train_seq = train_seq[:seq_split]


print(len(train_seq), len(val_x))

### Re-order training data based on different Curriculums

In [None]:
#Run the cell with required curriculum and comment the other cells (CL1 or CL2 or Hybrid or Reverse)

#CL with last item (CL1)
def find_similarity(a,b):
    """Cosine similarity between the ``transe_emb`` vectors of items ``a`` and ``b``.

    Both arguments are passed through ``str`` before the embedding lookup.
    """
    vec_a, vec_b = transe_emb[str(a)], transe_emb[str(b)]
    inner = dot(vec_a, vec_b)
    return inner / (norm(vec_a) * norm(vec_b))

# Build [items..., label, similarity] for every training sample and sort by
# descending similarity (ties broken by length).
# Each entry is a *copy*: appending to train_seq[i] itself would leave the
# label and score inside train_seq, so re-running this cell (or running a
# second curriculum cell) would score against corrupted sequences.
new_train_seq=[]
for seq, label in zip(train_seq, train_y):
    sim = find_similarity(seq[-1], label)
    new_train_seq.append(list(seq) + [label, sim])

new_train_seq.sort(key=lambda x: (x[-1],len(x)), reverse=True)

#reverse CL (uncomment the following line for reverse CL)
# new_train_seq.sort(key=lambda x: (x[-1],len(x)), reverse=False)



In [None]:
#CL with avg of last k items (k=10 got best results)  (CL2)
def find_similarity(a,b):
    """Cosine similarity between the mean ``transe_emb`` vector of the items in ``a`` and the vector of ``b``."""
    item_vectors = np.array([transe_emb[str(item)] for item in a])
    centroid = np.mean(item_vectors, axis=0)
    target = transe_emb[str(b)]
    return dot(centroid, target) / (norm(centroid) * norm(target))

# Build [items..., label, similarity] for every training sample, scoring the
# label against the last (up to) 10 items, then sort by descending similarity.
# Each entry is a *copy* so train_seq is not polluted with label/score
# values when the cell is re-run.
new_train_seq=[]
for seq, label in zip(train_seq, train_y):
    sim = find_similarity(seq[-10:], label)
    new_train_seq.append(list(seq) + [label, sim])

new_train_seq.sort(key=lambda x: (x[-1],len(x)), reverse=True)


In [None]:
#CL with combination of both (Hybrid)
def find_similarity1(a,b):
    """Cosine similarity between the mean ``transe_emb`` vector of the items in ``a`` and the vector of ``b``."""
    stacked = np.array([transe_emb[str(item)] for item in a])
    mean_vec = np.mean(stacked, axis=0)
    label_vec = transe_emb[str(b)]
    scale = norm(mean_vec) * norm(label_vec)
    return dot(mean_vec, label_vec) / scale

def find_similarity2(a,b):
    """Cosine similarity between the ``transe_emb`` vectors of the single items ``a`` and ``b``."""
    first = transe_emb[str(a)]
    second = transe_emb[str(b)]
    cosine = dot(first, second) / (norm(first) * norm(second))
    return cosine

# Build [items..., label, similarity] for every training sample, where the
# similarity is the equal-weight average of the "last 10 items" score and the
# "last item" score, then sort by descending similarity.
# Each entry is a *copy* so train_seq is not polluted with label/score
# values when the cell is re-run.
new_train_seq=[]
for seq, label in zip(train_seq, train_y):
    sim1 = find_similarity1(seq[-10:], label)
    sim2 = find_similarity2(seq[-1], label)
    sim = 0.5*sim1 + 0.5*sim2
    new_train_seq.append(list(seq) + [label, sim])


new_train_seq.sort(key=lambda x: (x[-1],len(x)), reverse=True)

In [None]:
#new train y to be computed after any of CL approaches
# Every entry of new_train_seq ends with [..., label, similarity]: strip both
# in place and collect the labels in curriculum order.
new_train_y=[]
for ordered_seq in new_train_seq:
    ordered_seq.pop()                       # drop the similarity score
    new_train_y.append(ordered_seq.pop())   # the label sits just before it

def padding_seq(dictList, max_len=10, pad_token='padding_id'):
    """Truncate or left-pad every sequence in ``dictList`` to ``max_len`` items.

    Sequences longer than ``max_len`` keep only their last ``max_len`` items;
    shorter ones are prefixed with ``pad_token``. Entries that need a change
    are replaced by new lists; entries already of the right length are left
    untouched.

    Args:
        dictList: mutable list of sequences; modified in place.
        max_len: target length of every sequence (default 10).
        pad_token: value used for left padding (default ``'padding_id'``).

    Returns:
        The same ``dictList`` object, for convenience.
    """
    for idx, seq in enumerate(dictList):
        length = len(seq)
        if length > max_len:
            # length - max_len (not -max_len) so that max_len == 0 yields [].
            dictList[idx] = seq[length - max_len:]
        elif length < max_len:
            # list(seq) lets tuples/arrays be padded as well as lists.
            dictList[idx] = [pad_token] * (max_len - length) + list(seq)

    return dictList

# Bring every split to the fixed sequence length (train, then test, then validation).
train_x, test_x, val_x = (
    padding_seq(new_train_seq),
    padding_seq(test_seq),
    padding_seq(val_x),
)


### Convert Data to Suitable Format

In [None]:
#convert data
# Map every vocabulary entry to a positive integer index; index 0 is the padding id.
convert_dict = {"padding_id": 0}
for item_index, item in enumerate(vocab, start=1):
    convert_dict[item] = item_index

#create embedding matrix
embed_dim = 200
total_vocab = len(vocab) + 1
unknown_id = np.zeros((200,))
vocab = list(vocab)
# Row 0 (padding) is all zeros; every other row is that item's transe_emb vector.
embedding_matrix = np.zeros((total_vocab, embed_dim))
embedding_matrix[0] = unknown_id

for item in vocab:
    row = convert_dict[item]
    embedding_matrix[row] = transe_emb[str(item)]

def type_conv(data, mapping=None):
    """Replace every item of every sequence in ``data`` with its integer index.

    Items are looked up by their ``str`` form; items missing from the mapping
    become 0. The sequences are modified in place.

    Args:
        data: iterable of mutable sequences.
        mapping: dict from ``str(item)`` to index; defaults to the
            module-level ``convert_dict``.

    Returns:
        A new list holding the same (now converted) sequence objects.
    """
    if mapping is None:
        mapping = convert_dict
    data_new = []
    for each in data:
        for i in range(len(each)):
            # dict.get replaces a bare ``except:`` that also hid unrelated errors.
            each[i] = mapping.get(str(each[i]), 0)
        data_new.append(each)
    return data_new

# Replace items with vocabulary indices and pack each split into an int array.
train_x = np.array(type_conv(train_x)).astype(int)
val_x = np.array(type_conv(val_x)).astype(int)
test_x = np.array(type_conv(test_x)).astype(int)

# Training and validation labels are looked up strictly: a KeyError here
# means a label is missing from the vocabulary.
train_y = [convert_dict[str(each)] for each in new_train_y]
val_y = [convert_dict[str(each)] for each in val_y]
# Test labels that are not in the vocabulary fall back to index 0.
# dict.get replaces a bare ``except:`` that would also hide unrelated errors.
test_y1 = [convert_dict.get(str(each), 0) for each in test_y]

### LSTM + CL

In [None]:
# # LSTM

unknown_id=np.zeros((200,))
def w2v_data_extraction(new_list):
    """Replace every index in ``new_list`` with its row of ``embedding_matrix``.

    An index whose lookup raises KeyError, ValueError or IndexError is
    replaced with the all-zero ``unknown_id`` vector. The nested result is
    returned through ``np.asarray``.
    """
    def _lookup(index):
        try:
            return embedding_matrix[index]
        except (KeyError, ValueError, IndexError):
            return unknown_id

    return np.asarray([[_lookup(index) for index in seq] for seq in new_list])

#represent output as one-hot encoded
def one_hot(seq,total_vocab):
    """Return a ``(len(seq), total_vocab)`` array of zeros with a 1 at ``[i][seq[i]]`` in every row."""
    encoded = np.zeros([len(seq), total_vocab])
    for row, col in enumerate(seq):
        encoded[row][col] = 1
    return encoded

def generator(X_data, y_data, batch_size, total_vocab):
    """Endlessly yield ``(X_batch, y_batch)`` pairs for the LSTM model.

    The data is shuffled once up front. Each ``X_batch`` is passed through
    ``w2v_data_extraction`` and each ``y_batch`` through ``one_hot``. After
    the last batch the batch counter wraps around to the start.
    """
    #shuffle data
    order = np.arange(X_data.shape[0])
    np.random.shuffle(order)
    X_data = np.array(X_data)[order]
    y_data = np.array(y_data)[order]
    number_of_batches = X_data.shape[0] / batch_size
    counter = 0
    while True:
        start = batch_size * counter
        stop = batch_size * (counter + 1)
        X_batch = w2v_data_extraction(np.array(X_data[start:stop]))
        y_batch = one_hot(np.array(y_data[start:stop]), total_vocab)
        counter += 1
        yield X_batch, y_batch

        #restart counter to yield data in the next epoch as well
        if counter >= number_of_batches:
            counter = 0

total_vocab = len(vocab)+1
batch_size=512



top20accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=20)
early_callback = tf.keras.callbacks.EarlyStopping(monitor='val_top_k_categorical_accuracy', patience=3, restore_best_weights=True)


# The model consumes the output of w2v_data_extraction, i.e. one embedding
# vector per position of the padded sequences. The sequence length is taken
# from the padded data rather than hard-coded: the previous value of 20 did
# not match the 10-item sequences produced by padding_seq.
main_input = Input(shape=(train_x.shape[1], embedding_matrix.shape[1]), name='main_input')
lstm_out = LSTM(100,return_sequences=True)(main_input)
lstm_out = Dropout(0.2)(lstm_out)
print(lstm_out.shape)
att = SeqSelfAttention(attention_type=SeqSelfAttention.ATTENTION_TYPE_MUL,kernel_regularizer=tf.keras.regularizers.l2(1e-4),
                       bias_regularizer=tf.keras.regularizers.l1(1e-4),attention_regularizer_weight=1e-4,name='Attention')(lstm_out)
# Average the attended outputs over the time axis.
att1 = Lambda(lambda xin: K.mean(xin, axis=-2), output_shape=(100,))(att)

main_output = (Dense(total_vocab, activation='softmax', name='main_output')(att1))
model = Model(inputs=main_input, outputs=main_output)
model.compile(loss='categorical_crossentropy',metrics=['accuracy',top20accuracy],optimizer="adam")


#fit model (LSTM+CL)
# Curriculum schedule: stage j trains on the first (j+1)/epochs of the
# curriculum-ordered samples; the last stage sees all of them and runs for up
# to 10 epochs with early stopping.
# Changes from the earlier version of this loop:
#   * validation / early stopping uses the validation split, not the test set;
#   * the fraction is computed with integers (summing 0.1 ten times never
#     reaches exactly 1.0, which could drop the last sample);
#   * step counts are whole numbers, and Model.fit replaces the deprecated
#     fit_generator.
epochs=10
for j in range(epochs):
    n_samples = len(train_x) * (j + 1) // epochs
    train_x_sample = train_x[:n_samples]
    train_y_sample = train_y[:n_samples]

    stage_epochs = 10 if j == epochs - 1 else 1
    model.fit(
        generator(train_x_sample, train_y_sample, batch_size, total_vocab),
        epochs=stage_epochs,
        steps_per_epoch=max(1, int(np.ceil(len(train_x_sample) / batch_size))),
        validation_data=generator(val_x, val_y, batch_size, total_vocab),
        validation_steps=max(1, int(np.ceil(len(val_x) / batch_size))),
        callbacks=[early_callback],
    )
print("model building done")




In [None]:
# from sklearn.metrics import accuracy_score
def mrr_20(pred,actual):
    """Mean reciprocal rank of the true item within the top-20 predictions.

    Args:
        pred: 2-D array-like of scores, one row per sample.
        actual: sequence of true class indices, one per row of ``pred``.

    Returns:
        The sum of ``1 / rank`` over samples whose true index is among the 20
        highest-scoring indices (rank 1 = highest score), divided by
        ``len(actual)``. Samples outside the top 20 contribute 0. Returns 0.0
        when ``actual`` is empty.
    """
    if len(actual) == 0:
        return 0.0

    sum_of_inv_rank = 0.0
    for scores, target in zip(pred, actual):
        # Highest score first, so position + 1 is the rank even when there
        # are fewer than 20 classes.
        top = np.argsort(scores)[-20:][::-1]
        hits = np.flatnonzero(top == target)
        if hits.size:
            sum_of_inv_rank += 1.0 / (hits[0] + 1)

    return sum_of_inv_rank/len(actual)

# Hit rate at 1 on test data
def hit_rate_at_1(pred,actual):
    """Fraction of samples whose highest-scoring index equals the matching entry of ``actual``."""
    hits = 0
    for row, scores in enumerate(pred):
        best = np.argsort(scores)[-1:]
        if actual[row] in best:
            hits += 1

    return hits/len(actual)

# Hit rate at 20 on test data
def hit_rate_at_20(pred,actual):
    """Fraction of samples whose true index is among the 20 highest-scoring indices."""
    hits = sum(
        1
        for row, scores in enumerate(pred)
        if actual[row] in np.argsort(scores)[-20:]
    )

    return hits/len(actual)

# Hit rate at 10 on test data
def hit_rate_at_10(pred, actual):
    """Fraction of samples whose true index is among the 10 highest-scoring indices."""
    top_ten = [np.argsort(scores)[-10:] for scores in pred]
    matched = [actual[row] in candidates for row, candidates in enumerate(top_ten)]

    return sum(1 for flag in matched if flag) /len(actual)

# Prediction on test data for LSTM
def model_predict(model,test_x,test_seq):
    """Predict on the embedded ``test_x`` and print HR@1, HR@10, HR@20 and MRR@20.

    ``test_x`` is passed through ``w2v_data_extraction`` before prediction;
    ``test_seq`` is handed to the metric functions as the ground truth.
    """
    embedded = w2v_data_extraction(test_x)
    pred = model.predict(x=embedded)
    preddy = np.argmax(a=pred, axis=1)  # arg-max class per row; not used below

    for metric in (hit_rate_at_1, hit_rate_at_10, hit_rate_at_20, mrr_20):
        print(metric(pred, test_seq))

# Prediction on test data for transformer
def model_predict1(model,test_x,test_seq):
    """Predict on ``test_x`` as-is and print HR@1, HR@10, HR@20 and MRR@20.

    ``test_seq`` is handed to the metric functions as the ground truth.
    """
    pred = model.predict(x=test_x)
    preddy = np.argmax(a=pred, axis=1)  # arg-max class per row; not used below

    for metric in (hit_rate_at_1, hit_rate_at_10, hit_rate_at_20, mrr_20):
        print(metric(pred, test_seq))

In [None]:
#Prediction using LSTM model
# Prints HR@1, HR@10, HR@20 and MRR@20 (in that order) against the converted test labels.
model_predict(model,test_x,test_y1)

### Transformers + CL

In [None]:
#Transformers

class TransformerBlock(layers.Layer):
    """Transformer encoder block.

    Applies multi-head self-attention and a two-layer feed-forward network;
    each sub-layer is followed by dropout, a residual addition and layer
    normalisation.

    Args:
        embed_dim: size of the last axis of the input; also used as the
            attention ``key_dim`` and as the feed-forward output size.
        num_heads: number of attention heads.
        ff_dim: hidden size of the feed-forward network.
        rate: dropout rate for both dropout layers.
        **kwargs: forwarded to ``layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, kernel_initializer='lecun_normal',
                                activation='gelu', kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4)),
             layers.Dense(embed_dim, kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4)),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-4)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-4)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
        self.add = tf.keras.layers.Add()


    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        ``training`` defaults to None so the layer can be called as
        ``block(x)`` even when the framework does not inject the argument.
        """
        # Self-attention: ``inputs`` is both query and value; no causal mask.
        attn_output = self.att(inputs, inputs, use_causal_mask = False)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.add([inputs, attn_output])
        out1 = self.layernorm1(out1)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.add([out1, ffn_output])
        return self.layernorm2(out2)


class TokenAndPositionEmbedding(layers.Layer):
    """Adds a learned position embedding to a frozen token embedding.

    The token embedding is initialised from the module-level
    ``embedding_matrix``, is not trainable, and is built with ``mask_zero=True``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        pretrained = keras.initializers.Constant(embedding_matrix)
        self.token_emb = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
            embeddings_initializer=pretrained,
            trainable=False,
            mask_zero=True,
        )
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return token embeddings of ``x`` plus the embeddings of positions 0..length-1."""
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
def generator(X_data, y_data, batch_size, total_vocab):
    """Endlessly yield ``(X_batch, y_batch)`` pairs of index sequences and one-hot labels.

    The data is shuffled once up front. ``X_batch`` is a slice of the
    shuffled ``X_data``; ``y_batch`` is passed through ``one_hot``. After the
    last batch the batch counter wraps around to the start.
    """
    #shuffle data
    order = np.arange(X_data.shape[0])
    np.random.shuffle(order)
    X_data = np.array(X_data)[order]
    y_data = np.array(y_data)[order]
    number_of_batches = X_data.shape[0] / batch_size
    counter = 0
    while True:
        window = slice(batch_size * counter, batch_size * (counter + 1))
        X_batch = np.array(X_data[window])
        y_batch = one_hot(np.array(y_data[window]), total_vocab)
        counter += 1
        yield X_batch, y_batch

        #restart counter to yield data in the next epoch as well
        if counter >= number_of_batches:
            counter = 0


def scheduler(epoch, lr):
    """Learning-rate schedule: keep ``lr`` for epochs 0 and 1, then multiply it by ``exp(-0.2)`` each epoch.

    Args:
        epoch: zero-based epoch index.
        lr: current learning rate.

    Returns:
        The new learning rate as a plain Python float (not a tensor), so it
        can be used wherever a float rate is required.
    """
    if epoch < 2:
        return float(lr)
    return float(lr * np.exp(-0.2))


#represent output as one-hot encoded
def one_hot(seq,total_vocab):
    """Return a ``(len(seq), total_vocab)`` array of zeros with a 1 at ``[i][seq[i]]`` in every row."""
    n_rows = len(seq)
    matrix = np.zeros([n_rows, total_vocab])
    row = 0
    while row < n_rows:
        matrix[row][seq[row]] = 1
        row += 1
    return matrix


In [None]:
# Hyper-parameters of the Transformer model.
embed_dim = 200  # Embedding size for each token
num_heads = 8 # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
maxlen=10  # length of the input index sequences (Input shape below)
dropout=0.1  # NOTE(review): not passed to TransformerBlock below, which falls back to its own default rate
batch_size = 2048

# Index sequences -> token + position embeddings -> one Transformer block.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, total_vocab, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# x = transformer_block(x)
x = layers.Dropout(0.5)(x)
# Keep only the representation at the last sequence position and classify over the vocabulary.
last_item=x[::,-1]
outputs = layers.Dense(total_vocab, activation="softmax")(last_item)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
top20accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=20)
early_callback = tf.keras.callbacks.EarlyStopping(monitor='val_top_k_categorical_accuracy', patience=3, restore_best_weights=True)
# NOTE(review): lr_callback is created but, as before, not passed to fit below — confirm intent.
lr_callback = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

model.compile(loss='categorical_crossentropy',metrics=['accuracy',top20accuracy],optimizer="adam")

# Curriculum schedule: stage j trains on the first (j+1)/epochs of the
# curriculum-ordered samples; the last stage sees all of them and runs for up
# to 10 epochs with early stopping.
# The fraction is computed with integers (summing 0.1 ten times never reaches
# exactly 1.0, which could drop the last sample), step counts are whole
# numbers, and Model.fit replaces the deprecated fit_generator.
epochs=10
for j in range(epochs):
    n_samples = len(train_x) * (j + 1) // epochs
    train_x_sample = train_x[:n_samples]
    train_y_sample = train_y[:n_samples]

    is_last_stage = (j == epochs - 1)
    stage_history = model.fit(
        generator(train_x_sample, train_y_sample, batch_size, total_vocab),
        epochs=10 if is_last_stage else 1,
        steps_per_epoch=max(1, int(np.ceil(len(train_x_sample) / batch_size))),
        validation_data=generator(val_x, val_y, batch_size, total_vocab),
        validation_steps=max(1, int(np.ceil(len(val_x) / batch_size))),
        callbacks=[early_callback],
    )
    # Same names as before: history1 for the final stage, history otherwise.
    if is_last_stage:
        history1 = stage_history
    else:
        history = stage_history
print("model building done")


In [None]:
def model_predict1(model,test_x,test_seq):
    """Predict on ``test_x`` as-is and print HR@1, HR@10, HR@20 and MRR@20.

    ``test_seq`` is handed to the metric functions as the ground truth.
    """
    pred = model.predict(x=test_x)
    preddy = np.argmax(a=pred, axis=1)  # arg-max class per row; not used below

    reports = (hit_rate_at_1, hit_rate_at_10, hit_rate_at_20, mrr_20)
    for report in reports:
        print(report(pred, test_seq))

 
def mrr_20(pred,actual):
    """Mean reciprocal rank of the true item within the top-20 predictions.

    Args:
        pred: 2-D array-like of scores, one row per sample.
        actual: sequence of true class indices, one per row of ``pred``.

    Returns:
        The sum of ``1 / rank`` over samples whose true index is among the 20
        highest-scoring indices (rank 1 = highest score), divided by
        ``len(actual)``. Samples outside the top 20 contribute 0. Returns 0.0
        when ``actual`` is empty.
    """
    if len(actual) == 0:
        return 0.0

    total = 0.0
    for scores, target in zip(pred, actual):
        # Highest score first, so position + 1 is the rank even when there
        # are fewer than 20 classes.
        ranked = np.argsort(scores)[-20:][::-1]
        positions = np.flatnonzero(ranked == target)
        if positions.size:
            total += 1.0 / (positions[0] + 1)

    return total/len(actual)

# Hit rate at 1 on test data
def hit_rate_at_1(pred,actual):
    """Fraction of samples whose highest-scoring index equals the matching entry of ``actual``."""
    winners = [np.argsort(scores)[-1:] for scores in pred]
    matches = 0
    for row, winner in enumerate(winners):
        matches += 1 if actual[row] in winner else 0

    return matches/len(actual)

# Hit rate at 20 on test data
def hit_rate_at_20(pred,actual):
    """Fraction of samples whose true index is among the 20 highest-scoring indices."""
    found = 0
    for row, scores in enumerate(pred):
        shortlist = np.argsort(scores)[-20:]
        if actual[row] in shortlist:
            found += 1

    return found/len(actual)

# Hit rate at 10 on test data
def hit_rate_at_10(pred, actual):
    """Fraction of samples whose true index is among the 10 highest-scoring indices."""
    hits = sum(
        1
        for row, scores in enumerate(pred)
        if actual[row] in np.argsort(scores)[-10:]
    )

    return hits /len(actual)


In [None]:
# Evaluate the model on the raw index sequences of the test split; prints
# HR@1, HR@10, HR@20 and MRR@20 (in that order).
model_predict1(model,test_x,test_y1)

#### Adjustable CL

In [None]:
# LOSS based CL (Adjustable CL)


def generator_sample_weights(X_data, y_data,weights, batch_size, total_vocab):
    """Endlessly yield ``(X_batch, y_batch, weights_batch)`` triples.

    The samples are shuffled once up front; labels and per-sample weights are
    permuted with the same indices so every weight stays attached to its own
    sample. ``y_batch`` is passed through ``one_hot``. After the last batch
    the batch counter wraps around to the start.

    Args:
        X_data: array-like of inputs, first axis = samples.
        y_data: array-like of integer labels (lists are accepted).
        weights: array-like with one weight per sample.
        batch_size: number of samples per batch.
        total_vocab: width of the one-hot label vectors.
    """
    # Convert first: plain lists cannot be indexed with an index array.
    X_data = np.asarray(X_data)
    y_data = np.asarray(y_data)
    weights = np.asarray(weights)

    #shuffle data
    indices = np.arange(X_data.shape[0])
    np.random.shuffle(indices)
    X_data = X_data[indices]
    y_data = y_data[indices]
    # Without this the weights would be paired with the wrong samples.
    weights = weights[indices]

    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    while 1:
        start = batch_size*counter
        stop = batch_size*(counter+1)
        X_batch = np.array(X_data[start:stop])
        y_batch = one_hot(np.array(y_data[start:stop]), total_vocab)
        weights_batch = np.array(weights[start:stop])
        counter += 1
        yield X_batch,y_batch, weights_batch

        #restart counter to yield data in the next epoch as well
        if counter >= number_of_batches:
            counter = 0

# Batch size for the sample-weighted training in this cell.
batch_size=2048

def scheduler(epoch, lr):
    """Learning-rate schedule: keep ``lr`` for epochs 0 and 1, then multiply it by ``exp(-0.5)`` each epoch.

    Args:
        epoch: zero-based epoch index.
        lr: current learning rate.

    Returns:
        The new learning rate as a plain Python float (not a tensor), so it
        can be used wherever a float rate is required.
    """
    if epoch < 2:
        return float(lr)
    return float(lr * np.exp(-0.5))

# Stops training once val_loss has gone one epoch without improving (patience=1).
early_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)
# Applies `scheduler` to the learning rate and logs each update (verbose=1).
# NOTE(review): neither callback is passed to the training call later in this cell — confirm whether that is intended.
lr_callback = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

#represent output as one-hot encoded
def one_hot(seq,total_vocab):
    """Return a ``(len(seq), total_vocab)`` array of zeros with a 1 at ``[i][seq[i]]`` in every row."""
    table = np.zeros([len(seq), total_vocab])
    for position in range(len(seq)):
        hot_column = seq[position]
        table[position][hot_column] = 1
    return table
# Loss-based (adjustable) curriculum: after every epoch each training sample
# is re-weighted with the inverse of its own cross-entropy loss.
# Fixes relative to the earlier version of this cell:
#   * `optimizer` was never defined (NameError); it is now Adam with `lr`;
#   * the loss is called as loss(y_true, y_pred), not loss(pred, y_true);
#   * labels are handed to the generator as an array (a list cannot be
#     indexed with the shuffle indices);
#   * prediction runs in `batch_size` chunks, not one batch of the whole set;
#   * a small epsilon guards 1/loss against division by zero;
#   * the final evaluation uses model_predict1, since `model` is trained here
#     on raw index sequences (no embedding lookup in the generators);
#   * Model.fit with whole-number step counts replaces fit_generator.
individual_loss =tf.keras.losses.CategoricalCrossentropy(reduction=tf.compat.v1.losses.Reduction.NONE)
lr=0.001
optimizer = Adam(learning_rate=lr)
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer=optimizer)
train_y_arr = np.asarray(train_y)
weights = np.ones(len(train_x),)
val_weights = np.ones(len(val_x),)
epochs=20
step=10000      # samples per chunk when computing per-sample losses
loss_eps=1e-7   # keeps the inverse-loss weights finite
for i in range(epochs):
    model.fit(
        generator_sample_weights(train_x, train_y_arr, weights, batch_size, total_vocab),
        epochs=1,
        steps_per_epoch=max(1, int(np.ceil(train_x.shape[0]/batch_size))),
        validation_data=generator(val_x, val_y, batch_size, total_vocab),
        validation_steps=max(1, int(np.ceil(val_x.shape[0]/batch_size))))

    pred = model.predict(x=train_x, batch_size=batch_size)
    print(pred.shape)
    ind_loss=[]
    for j in range(0, len(train_x), step):
        y_true = one_hot(train_y_arr[j:j+step], total_vocab)
        ind_loss.extend(individual_loss(y_true, pred[j:j+step]).numpy())
    print(len(ind_loss))
    ind_loss = np.array(ind_loss)
    weights = 1/(ind_loss + loss_eps)

model_predict1(model,test_x,test_y1)
