In [10]:
import numpy as np
import re
import itertools
from collections import Counter
from utils.preprocessing import clean_tweets
import os
from keras.models import load_model
import tensorflow as tf

In [None]:
# NOTE: this cell previously held the unfinished expression `tf.flags.` (a dangling
# attribute access), which is a SyntaxError and stops "Restart & Run All".
# No flags are defined or read anywhere else in this notebook, so nothing is needed here.

In [2]:
def load_data_test(path="./twitter-datasets/test_data.txt"):
    """
    Load the test tweets from a text file and tokenize them.

    Every line of the file is stripped of surrounding whitespace, passed
    through ``clean_tweets`` and finally split on single space characters.
    No labels are produced: only the tokenized text is returned.

    Args:
        path: Text file holding one tweet per line. Defaults to the test-set
            location used by this notebook.

    Returns:
        A list with one entry per line of the file; each entry is the list of
        tokens obtained by splitting the cleaned tweet on " ".
    """
    # The context manager closes the handle even if reading fails
    # (the previous version never closed the file).
    with open(path) as tweets_file:
        raw_lines = tweets_file.readlines()

    cleaned_tweets = [clean_tweets(line.strip()) for line in raw_lines]

    # Split on a literal space rather than arbitrary whitespace, so consecutive
    # spaces yield empty-string tokens and an empty tweet becomes [""].
    return [tweet.split(" ") for tweet in cleaned_tweets]

In [3]:
def create_vector(word, word_embeddings, word_vector_size, silent=True):
    """
    Build a random stand-in embedding for ``word`` and register it.

    The vector is drawn uniformly from [0.0, 1.0) with ``word_vector_size``
    components and is stored in ``word_embeddings`` under ``word`` (the mapping
    is mutated in place), so later lookups of the same word reuse it.

    Args:
        word: Key under which the new vector is stored.
        word_embeddings: Mutable mapping from word to vector; updated in place.
        word_vector_size: Number of components of the generated vector.
        silent: When false, print a notice naming the missing word.

    Returns:
        The newly generated vector (the same object that was stored).
    """
    # Used for words absent from the pretrained vectors (GloVe / Google vectors).
    placeholder = np.random.uniform(low=0.0, high=1.0, size=(word_vector_size,))
    word_embeddings[word] = placeholder

    if silent:
        return placeholder

    print("utils.py::create_vector => %s is missing" % word)
    return placeholder

In [4]:
def manipulate_dataset(dataset, word_embeddings, sequence_length=20, word_vector_size=200):
    """
    Turn tokenized, fixed-length sentences into a dense embedding array.

    Each word is looked up in ``word_embeddings``. A word that is missing gets
    a random vector from ``create_vector``, which also stores that vector in
    ``word_embeddings`` (so the mapping is mutated and repeated occurrences of
    the same unknown word share one vector).

    Args:
        dataset: Sequence of sentences, each a sequence of exactly
            ``sequence_length`` words.
        word_embeddings: Mutable mapping from word to a vector with
            ``word_vector_size`` components.
        sequence_length: Number of words per sentence (default 20).
        word_vector_size: Number of components per word vector (default 200).

    Returns:
        A float array of shape ``(len(dataset), sequence_length, word_vector_size)``.

    Raises:
        ValueError: Raised by NumPy when a sentence's stacked vectors do not
            match ``(sequence_length, word_vector_size)``.
    """
    # Every row is overwritten in the loop below, so uninitialized storage is fine.
    output_array = np.empty((len(dataset), sequence_length, word_vector_size))

    for i, sentence in enumerate(dataset):
        matrix_embedding = []
        for word in sentence:
            try:
                vector = word_embeddings[word]
            except KeyError:
                # Only a genuinely missing word triggers the random fallback;
                # any other error is a real bug and must surface.
                vector = create_vector(word, word_embeddings, word_vector_size, silent=True)
            matrix_embedding.append(vector)
        output_array[i] = matrix_embedding

    return output_array

In [5]:
def generate_word_embeddings(glove_dir='./glove.twitter.27B/', dim=200):
    """
    Parse a GloVe Twitter text file into a word -> vector dictionary.

    The file read is ``glove.twitter.27B.<dim>d.txt`` inside ``glove_dir``.
    Each non-blank line is split on whitespace: the first field is the word,
    the remaining fields are its coefficients.

    Args:
        glove_dir: Directory containing the GloVe text file. Defaults to the
            location used by this notebook.
        dim: Dimensionality encoded in the file name (default 200).

    Returns:
        A dict mapping each word to a 1-D ``float32`` NumPy array.
    """
    embeddings_index = {}
    print('Indexing word vectors.')
    glove_path = os.path.join(glove_dir, 'glove.twitter.27B.' + str(dim) + 'd.txt')

    # `with` closes the file even if a malformed line makes parsing raise.
    with open(glove_path) as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # A blank line has no word field; skip it instead of raising IndexError.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    return embeddings_index

In [6]:
def pad_sentences(sentences, padding_word="padding_word", sequence_length=20):
    """
    Force every sentence to exactly ``sequence_length`` tokens.

    Sentences shorter than ``sequence_length`` are right-padded with
    ``padding_word``; longer ones are truncated to their first
    ``sequence_length`` tokens. The target length is fixed by the argument,
    not derived from the longest sentence. Input lists are not modified.

    Args:
        sentences: Sequence of sentences, each a list of tokens.
        padding_word: Token appended to short sentences.
        sequence_length: Target number of tokens per sentence (default 20).

    Returns:
        A new list of new lists, each of length ``sequence_length``.

    Raises:
        ValueError: If ``sequence_length`` is negative.
    """
    if sequence_length < 0:
        raise ValueError("sequence_length must be non-negative")

    padded_sentences = []
    for sentence in sentences:
        # The slice truncates long sentences; a non-positive repeat count
        # yields [], so sentences already long enough receive no padding.
        num_padding = sequence_length - len(sentence)
        padded_sentences.append(sentence[:sequence_length] + [padding_word] * num_padding)
    return padded_sentences

In [None]:
# Inference preprocessing: tokenized tweets -> fixed-length sequences -> embedding array.
# Read, clean and tokenize the test tweets (one token list per line of the file).
x_test_text = load_data_test()
# Pad or truncate every tweet to pad_sentences' fixed sequence length (20 tokens).
x_test_text_padded = pad_sentences(x_test_text)
# Parse the 200-d GloVe Twitter text file into a word -> vector dict.
embeddings_words = generate_word_embeddings()
# Resulting array has shape (number of tweets, 20, 200). Words missing from the
# GloVe dict get random vectors, which are also inserted into embeddings_words.
x_test_embeddings = manipulate_dataset(x_test_text_padded,embeddings_words)

In [None]:
# Load the previously saved Keras model from 'my_first_model.h5'
# (a relative path, so it is resolved against the current working directory).
model = load_model('my_first_model.h5')
# Run inference on the embedded test tweets; verbose=0 turns off progress output.
# NOTE(review): presumably the model was trained on inputs of shape (20, 200) per
# sample, matching x_test_embeddings — confirm against the training notebook.
y_output = model.predict(x_test_embeddings, verbose=0)