In [122]:
from keras.models import Model, Sequential
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM, Input
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization
import numpy as np
import random
import sys
import string

embeddings_path = "glove.840B.300d-char.txt"
embedding_dim = 300
batch_size = 128
lr = 0.001
lr_decay = 1e-4
maxlen = 10
step = 3

punctuation = '''!"'#$%&'()*,-./:;<>?@[]_'''
# + for padding -- should be mapped to index 0
chars = ["+"] + sorted(list(set(string.ascii_lowercase + punctuation + " ")))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
print(len(chars))

def vectorize(text, target_sequences=100):
    text = text.lower()
    sequences = []
    next_chars = []
    for i in range(0, len(text) - maxlen, step):
        sequences.append(text[i: i + maxlen])

    sequences = sequences[:target_sequences]
    if len(sequences) < target_sequences:
        for pad in range(target_sequences - len(sequences)):
            sequences.append("+" * maxlen)
    
    X = np.zeros((len(sequences), maxlen), dtype=np.int)
    for i, seq in enumerate(sequences):
        for t, char in enumerate(seq):
            try:
                X[i, t] = char_indices[char]
            except KeyError:
                X[i, t] = 0
    
    return X

def get_embeddings_matrix(embeddings_path):
    embedding_vectors = {}
    with open(embeddings_path, 'r') as f:
        for line in f:
            line_split = line.strip().split(" ")
            vec = np.array(line_split[1:], dtype=float)
            char = line_split[0]
            embedding_vectors[char] = vec

    embedding_matrix = np.zeros((len(chars), 300))
    for char, i in char_indices.items():
        embedding_vector = embedding_vectors.get(char)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


51


In [79]:
train_texts = ["This is very positive positive", "And this is negative negative"]
test_texts = ["Titive titivie ery positive hisis", "boohoo very negative negative egative gatie tive"]

X_train = np.array([vectorize(t) for t in train_texts])
X_test = np.array([vectorize(t) for t in train_texts])

In [82]:
embedding_matrix = get_embeddings_matrix(embeddings_path)

In [123]:
def get_model():
    sequence_input = Input(shape=(maxlen,), dtype='int32')
    embedding_layer = Embedding(5, embedding_dim, input_shape=(maxlen, step), weights=[embedding_matrix])
    embedded_sequences = embedding_layer(sequence_input)
    x = LSTM(256)(embedded_sequences)
    output = Dense(128, activation='relu')(x)
    model = Model(sequence_input, output)
    return model

        
model = get_model()
optimizer = Adam(lr=lr, decay=lr_decay)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer)

In [125]:
X_train.shape

(2, 100, 10)

In [124]:
model.fit(X_train,[1,0])

ValueError: Error when checking model input: expected input_21 to have 2 dimensions, but got array with shape (2, 100, 10)

In [None]:
print(string.punctuation)

In [110]:
'''This script loads pre-trained word embeddings (GloVe embeddings)
into a frozen Keras Embedding layer, and uses it to
train a text classification model on the 20 Newsgroup dataset
(classication of newsgroup messages into 20 different categories).

GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)

20 Newsgroup data can be found at:
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
'''

from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model


BASE_DIR = ''
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = BASE_DIR + '/20_newsgroup/'
MAX_SEQUENCE_LENGTH = 5
MAX_NB_WORDS = 5
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2

# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
f = open('glove.840B.300d-char.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

texts = ["This is a piasdfa sdf awer dfsome text wasdf can we laenr form caharactweres",
         " Another piece of asdf awe testxt asdf kjvacn you evne usnersadnth this?"]  # list of text samples
labels = [0, 1]

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

print('Preparing embedding matrix.')

# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Training model.')

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = LSTM(128, activation='relu')(embedded_sequences)

x = Dense(128, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop')

model.fit(x_train, y_train,
          batch_size=128,
          epochs=10)


Indexing word vectors.
Found 94 word vectors.
Processing text dataset
Found 2 texts.
Found 24 unique tokens.
Shape of data tensor: (2, 5)
Shape of label tensor: (2, 2)
Preparing embedding matrix.
Training model.
Epoch 1/10


AttributeError: 'ProgbarLogger' object has no attribute 'log_values'