In [1]:
import numpy as np 
import pandas as pd 
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, Input, Model
from keras.layers import Dense, Embedding, LSTM, Activation

Using TensorFlow backend.


In [2]:
def get_input(filename):
  data = pd.read_csv(filename)
  return data['sequence'].values + data['q8'].values

def get_output(filename):
  output = []
  file = np.load(filename)
  for key in file:
    output.append(file[key])
  return output

def get_ngram_text(seqs, n=8):
    return np.array([[seq[i:i+n] for i in range(len(seq))] for seq in seqs])

In [3]:
train_input = get_input('train_input.csv')
train_output = get_output('train_output.npz')
test_input = get_input('test_input.csv')

In [4]:
# maxlen_seq = max([len(seq) for seq in train_input])
maxlen_seq = 128

input_grams = get_ngram_text(train_input)

tokenizer_encoder = Tokenizer()
tokenizer_encoder.fit_on_texts(input_grams)
input_data = tokenizer_encoder.texts_to_sequences(input_grams)
input_data = sequence.pad_sequences(input_data, maxlen=maxlen_seq, padding='post')

In [5]:
output_data = []
for output in train_output:
    output_data.append(np.average(output))

output_data = np.array(output_data)

n_words = len(tokenizer_encoder.word_index) + 1

In [6]:
sequence_text = Input(shape=(None, ))
embedding_layer = Embedding(input_dim=n_words, output_dim=128)(sequence_text)
x = LSTM(128)(embedding_layer)
x = Activation('relu')(x)
y = Dense(1, activation='relu')(x)

model = Model(sequence_text, y)
# model = Sequential()
# model.add(Embedding(input_dim=n_words, output_dim=100))
# model.add(LSTM(100))
# model.add(Activation('relu'))
# model.add(Dense(1, activation='relu'))

model.compile(optimizer='adam', loss='mean_squared_error')

# input_data = input_data.reshape((len(input_data), 1, 1382))

model.fit(input_data, output_data,
        epochs=20, validation_split=0.2, verbose=2)

  num_elements)


Train on 3643 samples, validate on 911 samples
Epoch 1/20
 - 189s - loss: 215.6754 - val_loss: 93.7914
Epoch 2/20
 - 183s - loss: 66.7312 - val_loss: 42.2645
Epoch 3/20
 - 183s - loss: 39.1871 - val_loss: 32.3220
Epoch 4/20
 - 1667s - loss: 34.4569 - val_loss: 31.2690
Epoch 5/20
 - 182s - loss: 33.9721 - val_loss: 31.2487
Epoch 6/20
 - 182s - loss: 33.9346 - val_loss: 31.2755
Epoch 7/20
 - 182s - loss: 33.9327 - val_loss: 31.2994
Epoch 8/20
 - 183s - loss: 33.9234 - val_loss: 31.2812
Epoch 9/20
 - 182s - loss: 33.9285 - val_loss: 31.2587
Epoch 10/20
 - 182s - loss: 33.9288 - val_loss: 31.2562
Epoch 11/20
 - 182s - loss: 33.9307 - val_loss: 31.2545
Epoch 12/20
 - 181s - loss: 33.9224 - val_loss: 31.2938
Epoch 13/20
 - 182s - loss: 33.9373 - val_loss: 31.3075
Epoch 14/20
 - 181s - loss: 33.9237 - val_loss: 31.2710
Epoch 15/20
 - 182s - loss: 33.9257 - val_loss: 31.2457
Epoch 16/20
 - 194s - loss: 33.9569 - val_loss: 31.2627
Epoch 17/20
 - 185s - loss: 33.9348 - val_loss: 31.2598
Epoch 18

<keras.callbacks.History at 0xb91b62e80>

In [7]:
test_input_grams = get_ngram_text(test_input)
output_data = tokenizer_encoder.texts_to_sequences(test_input_grams)
output_data = sequence.pad_sequences(output_data, maxlen=maxlen_seq, padding='post')

In [8]:
predictions = model.predict(output_data)

In [9]:
final_matrix = []
test_input = pd.read_csv('test_input.csv')

for index,row in test_input.iterrows():
    
    # For each test example, create a temp_matrix having all values same as predicted value
    temp_matrix = np.full((row['length'],row['length']),predictions[index][0])
    
    # Set the diagonal values to 0
    np.fill_diagonal(temp_matrix, 0)
    
    # Append to final_matrix
    final_matrix.append(temp_matrix)

In [10]:
np.savez('test_%d.npz'%5,*final_matrix)