In [1]:
import data_prep
import LSTM

In [2]:
from keras import models, layers
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Masking, Flatten, Input, RepeatVector
from sklearn.model_selection import train_test_split

In [3]:
x, y = data_prep.get_input_vectors_and_labels("NN_input.txt", "vector.csv")

In [4]:
# Separate data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# Further separate the training data into training and validation data
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=0)

In [5]:
model = Sequential()

model.add(Masking(mask_value= [0]*34, input_shape=x_train.shape[1:]))

model.add(LSTM(3000, activation="relu", input_shape=x_train.shape[1:], return_sequences=True ))
model.add(Dropout(0.2))

model.add(LSTM(3000, activation="relu", input_shape=x_train.shape[1:], return_sequences=True ))
model.add(Dropout(0.2))

model.add(LSTM(3000, activation="relu", input_shape=x_train.shape[1:], return_sequences=False ))
model.add(Dropout(0.2))

model.add(Dense(7374, activation="softmax", input_shape=x_train.shape[1:]))

model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

In [6]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking (Masking)           (None, 15, 34)            0         
                                                                 
 lstm (LSTM)                 (None, 15, 3000)          36420000  
                                                                 
 dropout (Dropout)           (None, 15, 3000)          0         
                                                                 
 lstm_1 (LSTM)               (None, 15, 3000)          72012000  
                                                                 
 dropout_1 (Dropout)         (None, 15, 3000)          0         
                                                                 
 lstm_2 (LSTM)               (None, 3000)              72012000  
                                                                 
 dropout_2 (Dropout)         (None, 3000)              0

In [7]:
model.fit(x_train, y_train, epochs=4, validation_data=(x_val, y_val))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f448daedb10>

In [12]:
prediction = model.predict(x_test)

In [13]:
import pandas as pd

Save predictions and labels to a file in case they get lost

In [16]:
predict_df = pd.DataFrame(prediction,columns=[str(i) for i in range(1,7375)])
predict_df.to_csv('predictions.csv')

In [15]:
y_test_reshaped = y_test.reshape(9745, 7374)
y_test_labels = pd.DataFrame(y_test_reshaped, columns=[str(i) for i in range(1,7375)])
y_test_labels.to_csv('y_test_labels.csv')

In [17]:
top_k = {1: 0, 5: 0, 10: 0, 20: 0}

In [27]:
print(len(y_test) == len(prediction))

True


In [36]:
i = 0
for vector in prediction:
  # get the probability vector
  prob_vec = list(vector).copy()
  # make a copy of the prob. vector and sort it in desc. order
  prob_vec_sorted = prob_vec.copy()
  prob_vec_sorted.sort(reverse=True)
  # get the indices of the top 20 probilities in the probability vector
  top_20 = [ prob_vec.index(prob) for prob in prob_vec_sorted[:20] ]
  # get the index corresponding to the true label
  label = list(y_test[i][0]).index(1)

  i += 1
  if(label in top_20[:1]):
    top_k[1]  += 1
    top_k[5]  += 1
    top_k[10] += 1
    top_k[20] += 1
    continue
  elif(label in top_20[:5]):
    top_k[5]  += 1
    top_k[10] += 1
    top_k[20] += 1
    continue
  elif(label in top_20[:10]):
    top_k[10] += 1
    top_k[20] += 1
    continue
  elif(label in top_20):
    top_k[20] += 1



In [37]:
top_k

{1: 614, 5: 1120, 10: 1551, 20: 2129}

In [38]:
len(y_test)

9745

In [43]:
for key in top_k.keys():
  accuracy = round(top_k[key]*100/len(y_test), 2)
  print(f'Top-{key} : {accuracy}%')

Top-1 : 6.3%
Top-5 : 11.49%
Top-10 : 15.92%
Top-20 : 21.85%
