In [1]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Masking, Dropout
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
inputs = []
labels = []

# Get the word embedding table as a df
word_embedding_df = pd.read_csv("pca_lookup_table.csv", header=None)

file = open("NN_input.txt")
sequence_list = []
for sequence in file:
  sequence = [int(x) for x in sequence.strip().strip('][').split(',')]
  sequence_list.append(sequence)
file.close()

for seq in sequence_list:
  # Replace the current integer with its corresponding vector in the word embedding table if > 0,
  # else use vector of all 0's
  inputs.append([list(word_embedding_df.loc[val - 1]) if val > 0 else [0] * 34 for val in seq[:-1]])
  # Store the last integer in each sequence as the label
  # one-hot
  labels.append([[1 if seq[-1] - 1 == i else 0 for i in range(7374)]])

# Convert the inputs and labels to numpy arrays
inputs = np.array(inputs, dtype=float)
labels = np.array(labels, dtype=float)

In [3]:
# Separate data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.2, random_state=0)
# Further separate the training data into training and validation data
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=0)

In [4]:
model = Sequential()

model.add(Masking(mask_value=[0] * 34, input_shape=x_train.shape[1:]))

model.add(SimpleRNN(3000, activation="relu", input_shape=x_train.shape[1:], return_sequences=True))
model.add(Dropout(0.2))

model.add(SimpleRNN(3000, activation="relu", input_shape=x_train.shape[1:], return_sequences=True))
model.add(Dropout(0.2))

model.add(SimpleRNN(3000, activation="relu", input_shape=x_train.shape[1:], return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(7374, activation="softmax", input_shape=x_train.shape[1:]))

model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

2021-12-06 08:39:11.599027: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking (Masking)           (None, 15, 34)            0         
                                                                 
 simple_rnn (SimpleRNN)      (None, 15, 3000)          9105000   
                                                                 
 dropout (Dropout)           (None, 15, 3000)          0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 15, 3000)          18003000  
                                                                 
 dropout_1 (Dropout)         (None, 15, 3000)          0         
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 3000)              18003000  
                                                                 
 dropout_2 (Dropout)         (None, 3000)              0

In [8]:
model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff4d4d8b790>

In [9]:
pred = model.predict(x_test)

In [13]:
pred_df = pd.DataFrame(pred, columns=[str(i) for i in range(1, 7375)])
pred_df.to_csv('RNN_pred.csv')

In [14]:
y_test_reshaped = y_test.reshape(9745, 7374)
y_test_labels = pd.DataFrame(y_test_reshaped, columns=[str(i) for i in range(1, 7375)])
y_test_labels.to_csv('RNN_y_test_labels.csv')

In [15]:
top_k = {1:0, 5:0, 10:0, 20:0}

In [17]:
i = 0
for vector in pred:
  prob_vec = list(vector).copy()

  prob_vec_sorted = prob_vec.copy()
  prob_vec_sorted.sort(reverse=True)

  top_20 = [prob_vec.index(prob) for prob in prob_vec_sorted[:20]]

  label = list(y_test[i][0]).index(1)

  i += 1
  if(label in top_20[:1]):
    top_k[1] += 1
    top_k[5] += 1
    top_k[10] += 1
    top_k[20] += 1
  elif(label in top_20[:5]):
    top_k[5] += 1
    top_k[10] += 1
    top_k[20] += 1
  elif(label in top_20[:10]):
    top_k[10] += 1
    top_k[20] += 1
  elif(label in top_20[:20]):
    top_k[20] += 1

In [18]:
top_k

{1: 613, 5: 1127, 10: 1506, 20: 2012}

In [20]:
for key in top_k.keys():
  acc = round(top_k[key]*100/len(y_test), 2)
  print(f'Top-{key} : {acc}%')

Top-1 : 6.29%
Top-5 : 11.56%
Top-10 : 15.45%
Top-20 : 20.65%


In [24]:
model2 = Sequential()

model2.add(Masking(mask_value=[0] * 34, input_shape=x_train.shape[1:]))

model2.add(SimpleRNN(10000, activation="relu", input_shape=x_train.shape[1:], return_sequences=True))
model2.add(Dropout(0.2))

model2.add(SimpleRNN(10000, activation="relu", input_shape=x_train.shape[1:], return_sequences=True))
model2.add(Dropout(0.2))

model2.add(SimpleRNN(10000, activation="relu", input_shape=x_train.shape[1:], return_sequences=False))
model2.add(Dropout(0.2))

model2.add(Dense(7374, activation="softmax", input_shape=x_train.shape[1:]))

model2.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

In [25]:
model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_1 (Masking)         (None, 15, 34)            0         
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 15, 10000)         100350000 
                                                                 
 dropout_3 (Dropout)         (None, 15, 10000)         0         
                                                                 
 simple_rnn_4 (SimpleRNN)    (None, 15, 10000)         200010000 
                                                                 
 dropout_4 (Dropout)         (None, 15, 10000)         0         
                                                                 
 simple_rnn_5 (SimpleRNN)    (None, 10000)             200010000 
                                                                 
 dropout_5 (Dropout)         (None, 10000)            

In [26]:
model2.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff41d58a5b0>

In [27]:
pred2 = model2.predict(x_test)

In [28]:
pred2_df = pd.DataFrame(pred2, columns=[str(i) for i in range(1, 7375)])
pred2_df.to_csv('RNN_pred2.csv')

In [29]:
top_k2 = {1:0, 5:0, 10:0, 20:0}

In [30]:
i = 0
for vector in pred2:
  prob_vec = list(vector).copy()

  prob_vec_sorted = prob_vec.copy()
  prob_vec_sorted.sort(reverse=True)

  top_20 = [prob_vec.index(prob) for prob in prob_vec_sorted[:20]]

  label = list(y_test[i][0]).index(1)

  i += 1
  if(label in top_20[:1]):
    top_k2[1] += 1
    top_k2[5] += 1
    top_k2[10] += 1
    top_k2[20] += 1
  elif(label in top_20[:5]):
    top_k2[5] += 1
    top_k2[10] += 1
    top_k2[20] += 1
  elif(label in top_20[:10]):
    top_k2[10] += 1
    top_k2[20] += 1
  elif(label in top_20[:20]):
    top_k2[20] += 1

In [31]:
top_k2

{1: 613, 5: 1126, 10: 1529, 20: 2069}

In [47]:
for key in top_k2.keys():
  acc = round(top_k2[key]*100/len(y_test), 2)
  print(f'Top-{key} : {acc}%')

Top-1 : 6.29%
Top-5 : 11.55%
Top-10 : 15.69%
Top-20 : 21.23%


In [49]:
model3 = Sequential()

model3.add(Masking(mask_value=[0] * 34, input_shape=x_train.shape[1:]))

model3.add(SimpleRNN(6000, activation="relu", input_shape=x_train.shape[1:], return_sequences=True))
model3.add(Dropout(0.2))

model3.add(SimpleRNN(4000, activation="relu", input_shape=x_train.shape[1:], return_sequences=True))
model3.add(Dropout(0.2))

model3.add(SimpleRNN(3000, activation="relu", input_shape=x_train.shape[1:], return_sequences=False))
model3.add(Dropout(0.2))

model3.add(Dense(7374, activation="softmax", input_shape=x_train.shape[1:]))

model3.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

In [50]:
model3.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_4 (Masking)         (None, 15, 34)            0         
                                                                 
 simple_rnn_12 (SimpleRNN)   (None, 15, 6000)          36210000  
                                                                 
 dropout_12 (Dropout)        (None, 15, 6000)          0         
                                                                 
 simple_rnn_13 (SimpleRNN)   (None, 15, 4000)          40004000  
                                                                 
 dropout_13 (Dropout)        (None, 15, 4000)          0         
                                                                 
 simple_rnn_14 (SimpleRNN)   (None, 3000)              21003000  
                                                                 
 dropout_14 (Dropout)        (None, 3000)             

In [51]:
model3.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff02bb13730>

In [52]:
pred3 = model3.predict(x_test)

In [53]:
top_k3 = {1:0, 5:0, 10:0, 20:0}

In [54]:
i = 0
for vector in pred3:
  prob_vec = list(vector).copy()

  prob_vec_sorted = prob_vec.copy()
  prob_vec_sorted.sort(reverse=True)

  top_20 = [prob_vec.index(prob) for prob in prob_vec_sorted[:20]]

  label = list(y_test[i][0]).index(1)

  i += 1
  if(label in top_20[:1]):
    top_k3[1] += 1
    top_k3[5] += 1
    top_k3[10] += 1
    top_k3[20] += 1
  elif(label in top_20[:5]):
    top_k3[5] += 1
    top_k3[10] += 1
    top_k3[20] += 1
  elif(label in top_20[:10]):
    top_k3[10] += 1
    top_k3[20] += 1
  elif(label in top_20[:20]):
    top_k3[20] += 1

In [55]:
top_k3

{1: 542, 5: 1055, 10: 1525, 20: 1931}

In [56]:
for key in top_k3.keys():
  acc = round(top_k3[key]*100/len(y_test), 2)
  print(f'Top-{key} : {acc}%')

Top-1 : 5.56%
Top-5 : 10.83%
Top-10 : 15.65%
Top-20 : 19.82%
