In [104]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

import sentencepiece as spm
from keras.preprocessing.text import Tokenizer

from gensim.models import KeyedVectors
import re

In [105]:
cwd = Path(os.getcwd())
mainpath = cwd.parents[0] / "data/raw"
languagepaths = [x for x in mainpath.iterdir() if x.is_dir()]
languages = [x.name for x in languagepaths]

In [106]:
full_df = pd.DataFrame()

fieldnames = ['hit_id', 'sentence', 'start_offset', 'end_offset', 'target_word', 'native_annots', 
              'nonnative_annots', 'native_complex', 'nonnative_complex', 'gold_label', 'gold_prob']

relevant_cols = ['sentence', 'target_word', 'gold_label']

for lang in languagepaths:
    for datasource in lang.iterdir():
        source_data_type = datasource.stem.split('_')
        source = source_data_type[0]
        data_type = source_data_type[1]
        print(lang.name, source, data_type)
        source_df = pd.read_csv(datasource, sep='\t', header=None, names=fieldnames)
        relevant_df = source_df[relevant_cols]
        
        relevant_df.is_copy = False
        
        relevant_df['lang'] = lang.name
        relevant_df['source'] = source
        relevant_df['data_type'] = data_type
        
        
        full_df = full_df.append(relevant_df, ignore_index=True)
        

english News Dev
english News Test
english News Train
english WikiNews Dev
english WikiNews Test
english WikiNews Train
english Wikipedia Dev
english Wikipedia Test
english Wikipedia Train
french French Test
german German Dev
german German Test
german German Train
spanish Spanish Dev
spanish Spanish Test
spanish Spanish Train


In [107]:
full_df.loc[full_df['lang'] == 'spanish'].head()

Unnamed: 0,sentence,target_word,gold_label,lang,source,data_type
45035,Los Bronces de Riace conocidos también como Lo...,Los Bronces de Riace,1,spanish,Spanish,Dev
45036,Los Bronces de Riace conocidos también como Lo...,Bronces,1,spanish,Spanish,Dev
45037,Los Bronces de Riace conocidos también como Lo...,Riace,1,spanish,Spanish,Dev
45038,Los Bronces de Riace conocidos también como Lo...,griegas,1,spanish,Spanish,Dev
45039,Los Bronces de Riace conocidos también como Lo...,conocidos,0,spanish,Spanish,Dev


In [108]:
# test_langs = ['spanish', 'english', 'german','french']
# train_langs = ['spanish', 'english', 'german']

test_langs = ['english']
train_langs = ['english']

all_langs = set(test_langs + train_langs)

test_data = full_df.loc[(full_df['lang'].isin(test_langs)) & (full_df['data_type'] == 'Test' )]
dev_data = full_df.loc[(full_df['lang'].isin(test_langs)) & (full_df['data_type'] == 'Dev' )]

train_data = full_df.loc[(full_df['lang'].isin(train_langs)) & (full_df['data_type'] == 'Train' )]

# 50% split on French data as dev, 100% still to test
# Next week write about monolingual model
# Frequency of the target word in learner corpus

target = pd.concat([test_data, dev_data, train_data])

In [109]:
def get_sent_len_words(sentence):
    return len(sentence.split())

def get_sent_len_chars(sentence):
    return len(sentence)

def get_sent_len_chars_avg(sentence):
    return len(sentence)/len(sentence.split())

def get_num_target_words(words):
    result = len(words.split(' '))
    return result

def get_avg_word_len(words):
    num_words = len(words.split(' '))
    total_len_char = len(words)
    
    # Removing the spaces
    num_spaces = num_words - 1
    word_chars = total_len_char - num_spaces
    
    result = word_chars/num_words
    return result
        

In [110]:
target.is_copy = False
target['sent_len_w'] = target.apply(func= lambda row : get_sent_len_words( row['sentence'] ) , axis=1)
target['sent_len_c'] = target.apply(func= lambda row : get_sent_len_chars( row['sentence'] ) , axis=1)
target['sent_len_c_avg'] = target.apply(func= lambda row : get_sent_len_chars_avg( row['sentence'] ) , axis=1)
target['avg_target_len'] = target.apply(func= lambda row : get_avg_word_len( row['target_word'] ) , axis=1)
target['num_target_w'] = target.apply(func= lambda row :  get_num_target_words( row['target_word'] ) , axis=1)

target.head()

Unnamed: 0,sentence,target_word,gold_label,lang,source,data_type,sent_len_w,sent_len_c,sent_len_c_avg,avg_target_len,num_target_w
1764,"The teenage girl shot dead in Bellaghy, County...",teenage,0,english,News,Test,16,111,6.9375,7.0,1
1765,"The teenage girl shot dead in Bellaghy, County...",teenage girl,0,english,News,Test,16,111,6.9375,5.5,2
1766,"The teenage girl shot dead in Bellaghy, County...",Londonderry,1,english,News,Test,16,111,6.9375,11.0,1
1767,"The teenage girl shot dead in Bellaghy, County...",girl,0,english,News,Test,16,111,6.9375,4.0,1
1768,"The teenage girl shot dead in Bellaghy, County...",shot,0,english,News,Test,16,111,6.9375,4.0,1


In [111]:
target.describe()

Unnamed: 0,gold_label,sent_len_w,sent_len_c,sent_len_c_avg,avg_target_len,num_target_w
count,34879.0,34879.0,34879.0,34879.0,34879.0,34879.0
mean,0.413659,28.639325,170.933083,5.994051,6.721277,1.199633
std,0.492496,13.063224,78.405141,0.630687,2.285897,0.593245
min,0.0,3.0,16.0,2.947368,2.0,1.0
25%,0.0,20.0,116.0,5.578947,5.0,1.0
50%,0.0,27.0,158.0,5.982143,6.5,1.0
75%,1.0,35.0,209.0,6.393939,8.0,1.0
max,1.0,110.0,657.0,8.333333,22.0,11.0


In [112]:
train = target.loc[target['data_type'] == 'Train']
test = target.loc[target['data_type'] == 'Test']
dev = target.loc[target['data_type'] == 'Dev']

In [113]:
numeric_cols = target._get_numeric_data()

# Printing simple correlations
for colname in numeric_cols:
    if colname != 'gold_label':
        print("{}\t{:9.5f}".format(colname, target['gold_label'].corr(target[colname])))

# target['gold_label'].corr(target['B'])

sent_len_w	 -0.01439
sent_len_c	 -0.00577
sent_len_c_avg	  0.05746
avg_target_len	  0.38357
num_target_w	  0.21694


In [114]:
#TODO: Check this is correct. Is gold label of 1 Complex or Non-complex?
class_labels = ['Complex', 'Non-Complex']

train_data = train._get_numeric_data()
dev_data = dev._get_numeric_data()
test_data = test._get_numeric_data()

# Getting the train_features
train_features = train_data.drop('gold_label', axis=1)
train_feature_names = train_features.columns

train_ys = train_data['gold_label'].values
train_Xs = train_features.values

# Getting the dev_features
dev_features = dev_data.drop('gold_label', axis=1)
dev_feature_names = dev_features.columns

dev_ys = dev_data['gold_label'].values
dev_Xs = dev_features.values

#Getting the test_features
test_features = test_data.drop('gold_label', axis=1)
test_feature_names = test_features.columns

test_ys = test_data['gold_label'].values
test_Xs = test_features.values

In [115]:
print("train_Xs:\t{}\ntrain_ys:\t{}\ndev_Xs:\t\t{}\ndev_ys:\t\t{}\ntest_Xs:\t{}\ntest_ys:\t{}\n".format(str(train_Xs.shape), str(train_ys.shape), str(dev_Xs.shape), str(dev_ys.shape), str(test_Xs.shape), str(test_ys.shape)))

train_Xs:	(27299, 5)
train_ys:	(27299,)
dev_Xs:		(3328, 5)
dev_ys:		(3328,)
test_Xs:	(4252, 5)
test_ys:	(4252,)



In [116]:
# Normalizing the columns:

# Can't do this because we can't propagate the normalization factor as far as I can tell, so we're doing it a simpler way.
# train_Xs_norm = tf.keras.utils.normalize(train_Xs, axis=-1,order=2)
# dev_Xs_norm = tf.keras.utils.normalize(dev_Xs, axis=-1,order=2)
# test_Xs_norm = tf.keras.utils.normalize(test_Xs, axis=-1,order=2)

max_vals = np.max(train_Xs, axis=0)
min_vals = np.min(train_Xs, axis=0)
norm_factor = 1 / (max_vals - min_vals)
train_Xs_norm = norm_factor * (train_Xs - max_vals) + 1
dev_Xs_norm = norm_factor * (dev_Xs - max_vals) + 1
test_Xs_norm = norm_factor * (test_Xs - max_vals) + 1


train_ys_cat = keras.utils.to_categorical(train_ys)

if dev_ys.shape[0] != 0:
    dev_ys_cat = keras.utils.to_categorical(dev_ys)
    
test_ys_cat = keras.utils.to_categorical(test_ys)

In [117]:
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(5,)),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dense(2, activation=tf.nn.softmax)
])


# Obtained from here: https://www.kaggle.com/teasherm/keras-metric-for-f-score-tf-only
def f1_score(y_true, y_pred):
    y_true = tf.cast(y_true, "int32")
    y_pred = tf.cast(tf.round(y_pred), "int32") # implicit 0.5 threshold via tf.round
    y_correct = y_true * y_pred
    sum_true = tf.reduce_sum(y_true, axis=1)
    sum_pred = tf.reduce_sum(y_pred, axis=1)
    sum_correct = tf.reduce_sum(y_correct, axis=1)
    precision = sum_correct / sum_pred
    recall = sum_correct / sum_true
    f_score = 2 * ((precision * recall) / (precision + recall))
    f_score = tf.where(tf.is_nan(f_score), tf.zeros_like(f_score), f_score)
    return tf.reduce_mean(f_score)

model.compile(optimizer=tf.train.AdamOptimizer(), 
              loss='categorical_crossentropy',
              metrics=['accuracy', f1_score])

In [118]:
model.fit(train_Xs_norm, train_ys_cat, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras._impl.keras.callbacks.History at 0x246acce3710>

In [119]:
test_loss, test_acc, test_f1 = model.evaluate(test_Xs_norm, test_ys_cat)

training_string = ""
for lang_i in range(len(train_langs)):
    lang = train_langs[lang_i]
    
    format_string = "{}, "
    if lang_i == len(train_langs) - 2:
        format_string = "{} and "
    elif lang_i == len(train_langs) - 1:
        format_string = "{}."
        
    to_append = format_string.format(lang.capitalize())
    training_string += to_append
print("\nTrained on {}".format(training_string))

testing_string = ""
for lang_i in range(len(test_langs)):
    lang = test_langs[lang_i]
    
    format_string = "{}, "
    if lang_i == len(test_langs) - 2:
        format_string = "{} and "
    elif lang_i == len(test_langs) - 1:
        format_string = "{}."
        
    to_append = format_string.format(lang.capitalize())
    testing_string += to_append
    
print("\nTested on {}\n  (test sets)".format(testing_string))

print("\nTest accuracy:\t{}\nTest f1:\t{}".format(test_acc, test_f1))


Trained on English.

Tested on English.
  (test sets)

Test accuracy:	0.732596425155593
Test f1:	0.7325964252116651


In [120]:
bytepairpath = cwd.parents[0] / "data/external/bytepairencoding"

langs_to_langcodes = {'german':'de','english':'en','spanish':'es','french':'fr'}

langs_to_bytepair_filepaths = {k: bytepairpath / (v + ".wiki.bpe.op10000.model") for k,v in langs_to_langcodes.items()}
langs_to_model_filepaths = {k: bytepairpath / (v + ".wiki.bpe.op10000.d300.w2v.bin") for k,v in langs_to_langcodes.items()}

langs_to_vocab_filepaths = {k: bytepairpath / (v + ".wiki.bpe.op10000.vocab.txt") for k,v in langs_to_langcodes.items()}

In [144]:
# Byte pair encoding comes from here: https://github.com/bheinzerling/bpemb

def preprocess_text(text):
    # lowercase:
    lowered = text.lower()
    
    non_latin = re.sub(r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', "", lowered)
    
    # replace digits with 0
    no_zeros = re.sub("[0-9]+", "0", non_latin)
    
    # replace urls with <url>
    result = re.sub("((http|ftp|https):\/\/)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}", "<url>", no_zeros)
    
    return result

def get_bpe_embs(text, model, sent_piece_processor):
    preprocessed = preprocess_text(text)
    subwords = sp.EncodeAsPieces(preprocessed)
    filtered_subwords = list(filter(lambda x: x in model.vocab, subwords))
    if len(filtered_subwords) > 0:
        bpe_embs = model[filtered_subwords]
    else:
        bpe_embs = np.zeros((1, 300))
    return bpe_embs

def create_model_and_sent_piece(language):
    sp = spm.SentencePieceProcessor()
    sp.Load(str(langs_to_bytepair_filepaths[lang]))
    model = KeyedVectors.load_word2vec_format(langs_to_model_filepaths[lang], fvocab=langs_to_vocab_filepaths[lang], binary=True)
    return model, sp
    
# test = "ηὕρηκα Hey ążę"
# model, sp = create_model_and_sent_piece('english')
# get_bpe_embs(test, model, sp)

In [145]:
# This gets the appropriate embedding for a particular language:
# (Useful in the monolingual case)
monolingual_index_to_embed_index = {}

# Since there are loads of duplicates, to save on disk space,
# these are different from the indices above. 
# Original database index 439 > embed index 15 > the embed itself
# Original database index 440 > embed index 15 > the embed itself
emb_indices = {}
current_emb_index = -1

target_word_embs = {}

for lang in all_langs:
    print("Processing: {}".format(lang))
    
    lang_df = target.loc[target['lang'] == lang]

    model, sp = create_model_and_sent_piece(lang)
    
    
    last_emb = None
    last_sent = None
    for index, values in lang_df.iterrows():
        target_word = values[1]
        target_word_emb = get_bpe_embs(target_word, model, sp)
        target_word_embs[index] = target_word_emb
        
        sent = values[0]

        # This check just makes it quicker to process the many duplicate sentences.
        # (which happen to be all in a row)
        
        if last_sent != sent:
            last_sent = sent
            # We pass model and sp in so we don't have to load them many times.
            emb = get_bpe_embs(sent, model, sp)

            last_emb = emb
            
            current_emb_index += 1
            emb_indices[current_emb_index] = emb
            monolingual_index_to_embed_index[index] = current_emb_index
            
        else:
            emb = last_emb
            monolingual_index_to_embed_index[index] = current_emb_index
        
# for orig_index, emb_index in monolingual_index_to_embed_index.items():
#     print(orig_index, emb_index, target.loc[orig_index]['sentence'][:10])
        
        
        

Processing: english


In [146]:
# Adapted the keras model from here: https://keras.io/getting-started/sequential-model-guide/

from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.layers import Embedding
import random

# From here: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
target_lang = 'english'

# 64 divides evenly, but should work with uneven batch_sizes
batch_size = 64

def get_all_batches(all_indices, batch_size):
    random.shuffle(all_indices)

    num_samples = len(all_indices)
    
    num_batches_excl_final = (num_samples) // batch_size
    normal_batch_total = (num_batches_excl_final * batch_size)
    
    final_batch_size = num_samples - normal_batch_total
    
    batches = []
    for i in range(num_batches_excl_final):
        first_index = i*batch_size
        last_index = (i+1)*batch_size
        this_batch = all_indices[first_index:last_index]
        batches.append(this_batch)
    
    
    if final_batch_size != 0:
        final_batch = all_indices[normal_batch_total:]
        batches.append(final_batch)

    return batches
    
all_indices = list(monolingual_index_to_embed_index.keys())
all_train_indices = [x for x in all_indices if target.loc[x]['data_type'] == 'Train']
all_dev_indices = [x for x in all_indices if target.loc[x]['data_type'] == 'Dev']
all_test_indices = [x for x in all_indices if target.loc[x]['data_type'] == 'Test']
print("Num train indices: {}\nNum dev indices: {}\nNum test indices: {}\n".format(len(all_train_indices),len(all_dev_indices),len(all_test_indices)))

train_batches = get_all_batches(all_train_indices, batch_size)
dev_batches = get_all_batches(all_dev_indices, batch_size)
test_batches = get_all_batches(all_test_indices, batch_size)

print("Num train batches: {}\nNum dev batches: {}\nNum test batches: {}".format(len(train_batches),len(dev_batches),len(test_batches)))

# print(batches[0])

# print(emb_indices[monolingual_index_to_embed_index[34879]].shape)


Num train indices: 27299
Num dev indices: 3328
Num test indices: 4252

Num train batches: 427
Num dev batches: 52
Num test batches: 67


In [147]:
def get_gold_labels_from_batches(batches):
    all_batch_labels = []
    for batch in batches:
        batch_labels = np.array([target.loc[x]['gold_label'] for x in batch])
        all_batch_labels.append(batch_labels)
    return all_batch_labels
# Not sure which of these two to go with:

train_y_batches = get_gold_labels_from_batches(train_batches)
dev_y_batches = get_gold_labels_from_batches(dev_batches)
test_y_batches = get_gold_labels_from_batches(test_batches)

print(len(train_y_batches))
    
# train_ys = [target.loc[x]['gold_label'] for x in all_train_indices]
# dev_ys = [target.loc[x]['gold_label'] for x in all_dev_indices]
# test_ys = [target.loc[x]['gold_label'] for x in all_test_indices]

427


In [169]:
from numpy import inf

def pad_sequence(emb, max_len):
    
    # TODO: Check if this is the right thing to do with inf vals?
    emb[emb == -inf] = 0
    emb[emb == inf] = 0
    
    # Pre-padding the vector with 0s
    diff = max_len - emb.shape[0]
    padded_emb = np.pad(emb, [(diff, 0), (0, 0)], mode='constant', constant_values=0)
    
    return padded_emb

def get_padded_batches(unpadded_batches, emb_indices, target_index_to_embed_index):
    # Should be 300 for all embeddings
    emb_dim = emb_indices[0].shape[1]

    padded_batches = []

    progress = 0
    for batch in unpadded_batches:
        if progress % 50 == 0:
            print("Batch: {}".format(progress))

        # Each batch might have a different longest sequence
        longest_seq = 0

        for mono_index in batch:
            embed_index = monolingual_index_to_embed_index[mono_index]
            emb_len = emb_indices[embed_index].shape[0] 
            if emb_len > longest_seq:
                longest_seq = emb_len


        batch_location = 0
        # One emb_dim is for the sequence, and the other is for the target_word embedding
        this_batch = np.zeros( (len(batch) , longest_seq , emb_dim + emb_dim) )

        for mono_index in batch:
            emb = emb_indices[monolingual_index_to_embed_index[mono_index]]
            
            target_word_emb = target_word_embs[mono_index]
            seq_len = emb.shape[0]
            repeated_target_word_emb = np.repeat(target_word_emb, seq_len, axis = 0)
            
            
            padded_emb = pad_sequence(emb, longest_seq)
            padded_target_word_emb = pad_sequence(repeated_target_word_emb, longest_seq)
            
            joined = np.concatenate([padded_emb, padded_target_word_emb], axis=1)

            this_batch[batch_location] = joined


            batch_location += 1

        padded_batches.append(this_batch)
        
        progress += 1
    
    return padded_batches

In [170]:
print("Getting padded train batches...")
padded_train_batches = get_padded_batches(train_batches, emb_indices, monolingual_index_to_embed_index)
print("Getting padded dev batches...")
padded_dev_batches = get_padded_batches(dev_batches, emb_indices, monolingual_index_to_embed_index)
print("Getting padded test batches...")
padded_test_batches = get_padded_batches(test_batches, emb_indices, monolingual_index_to_embed_index)

Getting padded train batches...
Batch: 0


ValueError: [(-13, 0), (0, 0)] cannot contain negative values.

In [150]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM

# def get_output_shape(input_shape):
#     samples = input_shape[0]
#     seq_len = input_shape[1]
#     embedding_dim = input_shape[2]
#     # Final shape needs to be same as embedding: (samples, sequence_length, embedding_dim)
#     result_shape = (num_samples, seq_len, embedding_dim)
#     return result_shape

# def my_embedding_func():
#     return None

embed_dim = 300

model = Sequential()
# model.add(Lambda(my_embedding_func, output_shape = get_output_shape))
# model.add(Embedding(max_features, output_dim=256))
model.add(LSTM(128, input_shape=(None,embed_dim)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])


epochs = 1

# TODO: Make it generate different batches for different epochs
# for epoch in epochs etc etc

for batch_i in range(len(train_batches)):
    
    train_x_batch = padded_train_batches[batch_i]
    train_y_batch = train_y_batches[batch_i]
    
    if batch_i % 10 == 0:
        score = model.test_on_batch(train_x_batch, train_y_batch)
        print(batch_i, score)
    
    
    
    model.train_on_batch(train_x_batch, train_y_batch)
    
    

# model.fit(x_train, y_train, batch_size=16, epochs=10)
# score = model.evaluate(x_test, y_test, batch_size=16)


# expected input data shape: (batch_size, timesteps, data_dim)
# For us, batch_size: not sure, timesteps = sent_len, data_dim=300

# model = Sequential()
# model.add(LSTM(32, return_sequences=True,
#                input_shape=(timesteps, data_dim)))  # returns a sequence of vectors of dimension 32
# model.add(LSTM(32, return_sequences=True))  # returns a sequence of vectors of dimension 32
# model.add(LSTM(32))  # return a single vector of dimension 32
# model.add(Dense(num_classes, activation='softmax'))

# model.compile(loss='categorical_crossentropy',
#               optimizer='rmsprop',
#               metrics=['accuracy'])

# # Generate dummy training data
# x_train = np.random.random((1000, timesteps, data_dim))
# y_train = np.random.random((1000, num_classes))

# # Generate dummy validation data
# x_val = np.random.random((100, timesteps, data_dim))
# y_val = np.random.random((100, num_classes))

# model.fit(x_train, y_train,
#           batch_size=64, epochs=5,
#           validation_data=(x_val, y_val))

0 [0.7037779, 0.390625]
10 [0.68359673, 0.578125]
20 [0.6997217, 0.5625]
30 [0.6534382, 0.625]
40 [0.6767038, 0.640625]
50 [0.6645366, 0.625]
60 [0.70213246, 0.609375]
70 [0.6777805, 0.609375]
80 [0.68530816, 0.515625]
90 [0.6669874, 0.609375]
100 [0.6885803, 0.546875]
110 [0.6954412, 0.5625]
120 [0.69717336, 0.53125]
130 [0.7034153, 0.5]
140 [0.7159567, 0.53125]
150 [0.67704695, 0.578125]
160 [0.71377164, 0.46875]
170 [0.6777791, 0.578125]
180 [0.67641383, 0.625]
190 [0.6566706, 0.625]
200 [0.6842252, 0.546875]
210 [0.7009125, 0.546875]
220 [0.70228577, 0.484375]
230 [0.6682924, 0.59375]
240 [0.6923089, 0.5625]
250 [0.6458506, 0.640625]
260 [0.71540207, 0.46875]
270 [0.71391976, 0.546875]
280 [0.6479999, 0.640625]
290 [0.6670786, 0.59375]
300 [0.6607239, 0.625]
310 [0.7065612, 0.546875]
320 [0.6749668, 0.625]
330 [0.68700695, 0.5625]
340 [0.64383495, 0.65625]
350 [0.6707193, 0.578125]
360 [0.6537538, 0.640625]
370 [0.6885006, 0.546875]
380 [0.71701723, 0.46875]
390 [0.6775142, 0.57812

In [151]:
def pad_batch(batch, amount_to_pad):
    padded_batch = np.pad(batch, [(0, 0),(amount_to_pad, 0), (0, 0)], mode='constant', constant_values=0)
    return padded_batch

def join_batches(list_of_batches):
    record_seq = 0
    for batch in list_of_batches:
        seq_len = batch.shape[1]
        if seq_len > record_seq:
            record_seq = seq_len
            
    
    list_of_padded_batches = []        
    for batch in list_of_batches:
        seq_len = batch.shape[1]
        amount_to_pad = record_seq - seq_len
        if amount_to_pad > 0:
            padded_batch = pad_batch(batch, amount_to_pad)
            
        else:
            padded_batch = batch
            
        list_of_padded_batches.append(padded_batch)
          
    joined_batches = np.concatenate(list_of_padded_batches, axis=0)

    return joined_batches

def join_y_batches(dev_y_batches):
    joined_batches = np.concatenate(dev_y_batches, axis=0)
    return joined_batches

dev_xs = join_batches(padded_dev_batches)
dev_ys = join_y_batches(dev_y_batches)

test_xs = join_batches(padded_test_batches)
test_ys = join_y_batches(test_y_batches)





In [152]:
# NOTE: This is currently just using accuracy, would want to spit out predictions to use with the other evaluation system.

model.evaluate(dev_xs, dev_ys, batch_size=batch_size)



[0.6799369190747921, 0.5829326923076923]

In [94]:
# NOTE: This is currently just using accuracy, would want to spit out predictions to use with the other evaluation system.

model.evaluate(test_xs, test_ys, batch_size=batch_size)



[0.6790529597736369, 0.5797271873181649]