In [1]:
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Convolution1D, MaxPooling1D, Embedding, Merge, Dropout
from keras.models import Model
from keras.layers.merge import Concatenate
import numpy as np
from keras.datasets import imdb
from keras.preprocessing import sequence
import re
import itertools
from collections import Counter
from utils.preprocessing import clean_tweets
import os

Using TensorFlow backend.
  return f(*args, **kwds)


In [None]:
# ---------------------- Parameters section -------------------
#
# Model type. See Yoon Kim's "Convolutional Neural Networks for Sentence Classification", Section 3.
# NOTE(review): model_type is not read by any cell visible in this notebook.
model_type = "CNN-non-static"  # CNN-rand|CNN-non-static|CNN-static

# Model hyperparameters
# NOTE(review): embedding_dim is not read by the visible cells; they hard-code 200 instead.
embedding_dim = 200
filter_sizes = (3, 8)      # one convolution branch is built per kernel size
num_filters = 10           # filters per convolution branch
dropout_prob = (0.5, 0.8)  # [0] is applied to the model input, [1] before the hidden dense layer
hidden_dims = 200          # units in the hidden dense layer

# Training parameters
# NOTE(review): batch_size is not read by the visible cells; the generators yield 10000-example chunks.
batch_size = 128
num_epochs = 5

# Preprocessing parameters
# NOTE(review): neither value is read by the visible cells. pad_sentences pads/truncates
# to 20 tokens and the model input shape is (20, 200), so sequence_length = 400 looks stale.
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
# NOTE(review): not read by the visible cells; train_word2vec is not defined in this notebook.
min_word_count = 1
context = 10

#
# ---------------------- Parameters end -----------------------

In [None]:
def load_data_and_labels():
    """
    Load the positive and negative training tweets and build one-hot labels.

    Reads ``./twitter-datasets/train_pos.txt`` and
    ``./twitter-datasets/train_neg.txt`` (one example per line), strips each
    line, passes it through ``clean_tweets`` and splits it on single spaces.

    Returns
    -------
    list
        ``[x_text, y]`` where ``x_text`` is a list of token lists (all positive
        examples first, then all negative ones) and ``y`` is the matching array
        of one-hot rows: ``[0, 1]`` for positive, ``[1, 0]`` for negative.
    """
    def _read_stripped_lines(path):
        # Context manager guarantees the handle is closed even if reading fails.
        with open(path) as handle:
            return [line.strip() for line in handle]

    # Load data from files
    positive_examples = _read_stripped_lines("./twitter-datasets/train_pos.txt")
    negative_examples = _read_stripped_lines("./twitter-datasets/train_neg.txt")

    # Clean, then split by words
    x_text = [clean_tweets(sent).split(" ") for sent in positive_examples + negative_examples]

    # Generate labels (same order as x_text: positives first)
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]

In [16]:
def pad_sentences(sentences, padding_word="padding_word", sequence_length=20):
    """
    Force every sentence to exactly ``sequence_length`` tokens.

    Sentences longer than ``sequence_length`` are truncated; shorter ones are
    right-padded with ``padding_word``. The input sentences are not modified.

    Parameters
    ----------
    sentences : iterable of list
        Tokenised sentences.
    padding_word : str
        Token appended to short sentences.
    sequence_length : int
        Target number of tokens per sentence (default 20).

    Returns
    -------
    list of list
        One new token list per input sentence, each of length ``sequence_length``.

    Raises
    ------
    ValueError
        If ``sequence_length`` is negative.
    """
    if sequence_length < 0:
        # A negative bound would make the slice below drop tokens from the end
        # instead of capping the length.
        raise ValueError("sequence_length must be non-negative, got %r" % (sequence_length,))

    padded_sentences = []
    for sentence in sentences:
        kept = list(sentence[:sequence_length])
        kept.extend([padding_word] * (sequence_length - len(kept)))
        padded_sentences.append(kept)
    return padded_sentences

In [8]:
def generate_word_embeddings(glove_dir='./glove.twitter.27B/', embedding_dim=200):
    """
    Read a GloVe text file into a ``word -> vector`` dictionary.

    The file ``glove.twitter.27B.<embedding_dim>d.txt`` inside ``glove_dir`` is
    parsed line by line; the first whitespace-separated field is the word and
    the remaining fields are its coefficients.

    Parameters
    ----------
    glove_dir : str
        Directory containing the GloVe file.
    embedding_dim : int
        Vector size; selects the file name and is the number of coefficients
        each kept line must carry.

    Returns
    -------
    dict
        Maps each word to a ``float32`` NumPy array of length ``embedding_dim``.
    """
    embeddings_index = {}
    print('Indexing word vectors.')
    path = os.path.join(glove_dir, 'glove.twitter.27B.' + str(embedding_dim) + 'd.txt')
    # `with` closes the file even when a line fails to parse; explicit UTF-8
    # avoids depending on the platform default encoding.
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Skip blank or ragged lines so no wrongly sized vector enters the index.
            if len(values) != embedding_dim + 1:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

In [10]:
def manipulate_dataset(dataset, word_embeddings, sequence_length=20, embedding_dim=200):
    """
    Turn tokenised sentences into a dense 3-D array of word vectors.

    Each word is looked up in ``word_embeddings``. A word that is missing gets
    a vector from ``create_vector``, which also stores it in
    ``word_embeddings`` (so the dictionary passed in is mutated).

    Parameters
    ----------
    dataset : sequence of list
        Sentences, each holding exactly ``sequence_length`` tokens.
    word_embeddings : dict
        Maps a word to a vector of length ``embedding_dim``.
    sequence_length : int
        Tokens per sentence (default 20).
    embedding_dim : int
        Size of each word vector (default 200).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(dataset), sequence_length, embedding_dim)``.

    Raises
    ------
    ValueError
        If a sentence does not contain exactly ``sequence_length`` tokens.
    """
    output_array = np.empty((len(dataset), sequence_length, embedding_dim))
    for i, sentence in enumerate(dataset):
        if len(sentence) != sequence_length:
            # Without this check a one-token sentence would silently broadcast
            # across all rows of output_array[i].
            raise ValueError(
                "sentence %d has %d tokens, expected %d" % (i, len(sentence), sequence_length)
            )
        matrix_embedding = []
        for word in sentence:
            try:
                vector = word_embeddings[word]
            except KeyError:
                # Only a missing key means "out of vocabulary"; any other error propagates.
                vector = create_vector(word, word_embeddings, embedding_dim, silent=True)
            matrix_embedding.append(vector)
        output_array[i] = matrix_embedding
    return output_array

In [13]:
def create_vector(word, word_embeddings, word_vector_size, silent=True):
    """
    Make a random stand-in vector for a word that has no embedding.

    The vector is drawn uniformly from [0, 1) with NumPy's global random
    generator, stored in ``word_embeddings`` under ``word`` (mutating the
    dictionary), and returned. When ``silent`` is falsy a message naming the
    missing word is printed.
    """
    placeholder = np.random.uniform(low=0.0, high=1.0, size=(word_vector_size,))
    # Cache it so later lookups of the same word reuse this exact vector.
    word_embeddings[word] = placeholder
    if silent:
        return placeholder
    print("utils.py::create_vector => %s is missing" % word)
    return placeholder

In [None]:
def split_and_shuffle(x, y, ratio, seed):
    """
    Shuffle ``x`` and ``y`` together, then split them into two parts.

    One permutation is drawn and applied to both inputs *before* splitting, so
    the held-out part is a random sample even when the input is ordered by
    class. The inputs themselves are left untouched.

    Parameters
    ----------
    x : list or numpy.ndarray
        Examples.
    y : list or numpy.ndarray
        Labels, same length as ``x``.
    ratio : float
        Fraction of examples placed in the first (training) part.
    seed : int
        Seed passed to ``np.random.seed`` (this re-seeds NumPy's global
        generator, as the function always did).

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y)``. Each part is a NumPy array if
        its source was one, otherwise a list.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length.
    """
    if len(x) != len(y):
        raise ValueError("x and y must have the same length, got %d and %d" % (len(x), len(y)))

    def _take(data, indices):
        # Fancy indexing copies arrays; lists are rebuilt element by element.
        if isinstance(data, np.ndarray):
            return data[indices]
        return [data[i] for i in indices]

    np.random.seed(seed)
    order = np.random.permutation(len(x))
    split_index = int(len(x) * ratio)
    train_idx, test_idx = order[:split_index], order[split_index:]
    return _take(x, train_idx), _take(x, test_idx), _take(y, train_idx), _take(y, test_idx)

In [None]:
# Tokenised tweets plus their one-hot labels.
sentences, labels = load_data_and_labels()

# Fixed-length token lists are the model-side inputs.
x = sentences_padded = pad_sentences(sentences)

# Collapse each one-hot row to its class index.
y = np.array(labels).argmax(axis=1)

In [None]:
# First 90% goes to training, the rest is held out; 33 is the shuffle seed.
x_train, x_test, y_train, y_test = split_and_shuffle(x, y, ratio=0.9, seed=33)

In [None]:
# Report how many examples landed in each partition (these are lengths, not full shapes).
for _label, _part in (
    ("x_train shape:", x_train),
    ("x_test shape:", x_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(_label, len(_part))

In [9]:
# Word -> vector dictionary read from the GloVe file; the generators and
# manipulate_dataset calls below read it as a global.
embeddings_words = generate_word_embeddings()

Indexing word vectors.


In [None]:
def generator(chunk_size=10000):
    """
    Endlessly yield ``(embedded_inputs, labels)`` chunks of the training data.

    Reads the globals ``x_train``, ``y_train`` and ``embeddings_words``. Each
    chunk of token lists is converted to word vectors with
    ``manipulate_dataset`` just before it is yielded, so only one chunk is
    held as a dense array at a time. After the last chunk (which may be
    shorter than ``chunk_size``) iteration restarts from the beginning.

    Parameters
    ----------
    chunk_size : int
        Number of examples per yielded chunk (default 10000).

    Raises
    ------
    ValueError
        On first iteration, if ``chunk_size`` is not positive or ``x_train``
        is empty; either case would otherwise loop forever without yielding.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be positive, got %r" % (chunk_size,))
    n_samples = len(x_train)
    if n_samples == 0:
        raise ValueError("x_train is empty; nothing to yield")

    while True:
        for start in range(0, n_samples, chunk_size):
            stop = start + chunk_size
            x_train_embeddings = manipulate_dataset(x_train[start:stop], embeddings_words)
            yield x_train_embeddings, y_train[start:stop]

In [None]:
def generator_validator(chunk_size=10000):
    """
    Endlessly yield ``(embedded_inputs, labels)`` chunks of the held-out data.

    Reads the globals ``x_test``, ``y_test`` and ``embeddings_words``. Each
    chunk of token lists is converted to word vectors with
    ``manipulate_dataset`` just before it is yielded. After the last chunk
    (which may be shorter than ``chunk_size``) iteration restarts from the
    beginning.

    Parameters
    ----------
    chunk_size : int
        Number of examples per yielded chunk (default 10000).

    Raises
    ------
    ValueError
        On first iteration, if ``chunk_size`` is not positive or ``x_test``
        is empty; either case would otherwise loop forever without yielding.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be positive, got %r" % (chunk_size,))
    n_samples = len(x_test)
    if n_samples == 0:
        raise ValueError("x_test is empty; nothing to yield")

    while True:
        for start in range(0, n_samples, chunk_size):
            stop = start + chunk_size
            x_test_embeddings = manipulate_dataset(x_test[start:stop], embeddings_words)
            yield x_test_embeddings, y_test[start:stop]

In [None]:
# Embed both partitions fully in memory.
# NOTE(review): the original first line was syntactically broken
# (`x_train[0,embeddings_words)`); it is restored here to mirror the x_test
# line. Each example becomes a 20 x 200 float64 matrix, so this can be very
# large for the whole training set -- the generator cells stream the same data
# chunk by chunk instead. Confirm whether this cell is still needed.
x_train_embeddings = manipulate_dataset(x_train, embeddings_words)
x_test_embeddings = manipulate_dataset(x_test, embeddings_words)

In [None]:
# One example is a (tokens, vector size) matrix: 20 x 200.
input_shape = (20, 200)
model_input = Input(shape=input_shape)

# First dropout stage acts directly on the embedded input.
z = Dropout(dropout_prob[0])(model_input)

In [None]:
# One conv -> max-pool -> flatten branch per kernel size, all fed by the same tensor z.
conv_blocks = []
for sz in filter_sizes:
    conv = Flatten()(
        MaxPooling1D(pool_size=2)(
            Convolution1D(
                filters=num_filters,
                kernel_size=sz,
                padding="valid",
                activation="relu",
                strides=1,
            )(z)
        )
    )
    conv_blocks.append(conv)

# A single branch needs no merge layer.
if len(conv_blocks) > 1:
    z = Concatenate()(conv_blocks)
else:
    z = conv_blocks[0]

# Second dropout stage, hidden layer, then a single sigmoid unit for the binary label.
z = Dense(hidden_dims, activation="relu")(Dropout(dropout_prob[1])(z))
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# One step consumes one chunk from the generator, and the generators yield
# 10000 examples per chunk. Step counts are therefore measured in chunks, not
# examples: the previous 180000 / 20000 values made every epoch loop over the
# data thousands of times.
model.fit_generator(
    generator(),
    steps_per_epoch=max(1, len(x_train) // 10000),
    epochs=num_epochs,
    validation_data=generator_validator(),
    validation_steps=max(1, len(x_test) // 10000),
    verbose=2,
)

In [5]:
def load_data_test(path="./twitter-datasets/test_data.txt"):
    """
    Load the unlabeled test tweets and split them into words.

    Each line of ``path`` is stripped, passed through ``clean_tweets`` and
    split on single spaces. No labels are produced.

    Parameters
    ----------
    path : str
        Text file with one example per line.

    Returns
    -------
    list of list
        One token list per line of the file, in file order.
    """
    # Context manager guarantees the handle is closed even if reading fails.
    with open(path) as handle:
        x_text = [line.strip() for line in handle]
    # Clean, then split by words
    return [clean_tweets(sent).split(" ") for sent in x_text]

In [17]:
# Tokenise the unlabeled test file and pad/truncate it like the training data.
x_test_text = load_data_test()
x_test_text_padded = pad_sentences(x_test_text)
# NOTE(review): this re-reads the GloVe file and rebinds embeddings_words, which
# discards any random vectors create_vector added for unknown words earlier in
# the session -- confirm the reload is intended.
embeddings_words = generate_word_embeddings()
# NOTE(review): x_test_embeddings is also the name used for the embedded
# held-out split in an earlier cell; from here on it refers to the test file.
x_test_embeddings = manipulate_dataset(x_test_text_padded,embeddings_words)

In [20]:
from keras.models import load_model
# Rebinds `model` to a network restored from disk.
# NOTE(review): no cell visible in this notebook writes 'my_first_model.h5';
# presumably it was saved in another session -- confirm it matches the
# architecture built above.
model = load_model('my_first_model.h5')

In [21]:
# Predict on the embedded test tweets with the loaded model.
# NOTE(review): if the loaded model ends in the single sigmoid unit defined
# above, yhat holds one score in [0, 1] per tweet -- confirm before thresholding.
yhat = model.predict(x_test_embeddings, verbose=0)