In [None]:
# The following are commonly used methods and attributes of the Tokenizer class

# tokenizer = Tokenizer()
# tokenizer.fit_on_texts([])
# tokenizer.texts_to_matrix([])
# tokenizer.texts_to_sequences([])
# tokenizer.word_index
# tokenizer.word_counts
# tokenizer.document_count
# tokenizer.lower

In [4]:
from keras.preprocessing.text import Tokenizer

In [8]:
# Restrict the tokenizer to the most frequent words. Keras 2 renamed the
# constructor keyword `nb_words` to `num_words`; the old spelling is deprecated.
nb_words = 3  # maximum number of words to work with
tokenizer = Tokenizer(num_words=nb_words)

In [10]:
# Build the vocabulary from a small corpus (one string per document).
tokenizer.fit_on_texts([
    "The sun is shining in June!",
    "September is grey.",
    "Life is beautiful in August.",
    "I like it",
    "This and other things?",
])
# `word_index` holds the word -> integer index mapping learned from the corpus.
# Punctuation such as '!', '?' and '.' is stripped automatically: none of it
# appears in the printed vocabulary, and every key is lower-cased.
# NOTE(review): numerals are reportedly not filtered the same way - confirm
# against the Tokenizer `filters` default before relying on it.
print(tokenizer.word_index)

{'beautiful': 10, 'this': 15, 'life': 9, 'and': 16, 'august': 11, 'things': 18, 'september': 7, 'sun': 4, 'is': 1, 'june': 6, 'grey': 8, 'like': 13, 'i': 12, 'other': 17, 'in': 2, 'it': 14, 'the': 3, 'shining': 5}


In [11]:
# `tokenizer` was built with nb_words=3, and the output below contains only index 1 ('is'):
# every other word of this sentence has an index of 3 or more in word_index and is dropped.
tokenizer.texts_to_sequences(["June is beautiful and I like it!"])

[[1]]

In [5]:
# A Tokenizer built without arguments keeps the whole vocabulary, so no word
# of the test sentence is dropped.
tokenizer = Tokenizer()
texts = [
    "The sun is shining in June!",
    "September is grey.",
    "Life is beautiful in August.",
    "I like it",
    "This and other things?",
]
tokenizer.fit_on_texts(texts)

print(tokenizer.word_index)
# texts_to_matrix: one row per text whose length is the vocabulary size
# (useful for statistical modelling such as tf-idf).
print(tokenizer.texts_to_matrix(['June is beautiful and I like it!']))
# texts_to_sequences: one list per text whose length is the number of words in the sentence.
print(tokenizer.texts_to_sequences(["June is beautiful and I like it!"]))

{'beautiful': 10, 'this': 15, 'life': 9, 'and': 16, 'august': 11, 'things': 18, 'september': 7, 'sun': 4, 'is': 1, 'june': 6, 'grey': 8, 'like': 13, 'i': 12, 'other': 17, 'in': 2, 'it': 14, 'the': 3, 'shining': 5}
[[ 0.  1.  0.  0.  0.  0.  1.  0.  0.  0.  1.  0.  1.  1.  1.  0.  1.  0.
   0.]]
[[6, 1, 10, 16, 12, 13, 14]]


In [6]:
# word_counts: how many times each word occurred in the fitted texts
# (in the output below 'is' occurs 3 times and 'in' twice; every other word once)
print(tokenizer.word_counts)

OrderedDict([('the', 1), ('sun', 1), ('is', 3), ('shining', 1), ('in', 2), ('june', 1), ('september', 1), ('grey', 1), ('life', 1), ('beautiful', 1), ('august', 1), ('i', 1), ('like', 1), ('it', 1), ('this', 1), ('and', 1), ('other', 1), ('things', 1)])


In [16]:
# document_count: prints 5, matching the five sentences in `texts` that were passed to fit_on_texts
print(tokenizer.document_count)

5


In [17]:
# lower: prints True, consistent with the all-lower-case keys seen in word_index
print(tokenizer.lower)

True


# Basic Network

In [None]:
# Let's assume the following network learns a word represented by the labels 'y', and see whether the network can predict the same word or not

In [6]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten
from keras.layers.wrappers import TimeDistributed
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

# Bag-of-words baseline: one binary, vocabulary-sized row per text.
print(texts)
X = tokenizer.texts_to_matrix(texts)
# One binary label per text; only the first sentence is the positive class.
y = [1,0,0,0,0]
print(X)

# +1 because word indices start at 1, which leaves column 0 of X unused.
vocab_size = len(tokenizer.word_index) + 1

# Two-unit linear hidden layer followed by a sigmoid output for binary classification.
model = Sequential()
model.add(Dense(2, input_dim=vocab_size))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='rmsprop')

# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
# NOTE(review): with 5 samples, validation_split=0.2 presumably holds one sample out of training - confirm which one.
model.fit(X, y=y, batch_size=32, epochs=20, verbose=0, validation_split=0.2)

['The sun is shining in June!', 'September is grey.', 'Life is beautiful in August.', 'I like it', 'This and other things?']
[[ 0.  1.  1.  1.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.]
 [ 0.  1.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.]
 [ 0.  1.  1.  0.  0.  0.  0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  0.  0.
   0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  0.  0.  0.
   0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.
   1.]]




<keras.callbacks.History at 0x7f26deb042d0>

In [20]:
# Import numpy directly: keras.utils.np_utils only exposes `np` as an
# implementation detail, so importing it from there is fragile.
import numpy as np
# Round the sigmoid outputs to hard 0/1 predictions for comparison with y.
np.round(model.predict(X))

array([[ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.]], dtype=float32)

# Embedding

In [12]:
import numpy as np

# Toy embedding model:
#   input_dim    - size of the vocabulary (the indices 0 and 1 here)
#   output_dim   - size of the vector space into which each index is mapped
#   input_length - number of indices in each input row
model = Sequential()
model.add(Embedding(input_dim=2, output_dim=2, input_length=7))
model.compile(optimizer='rmsprop', loss='mse')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 7, 2)              4         
Total params: 4
Trainable params: 4
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Every binary element (0 or 1) of the length-7 input is mapped into a 2-D space.
# The result is a dense (non-sparse) array with one 2-element vector per input
# element, i.e. a 7x2 map for this single input row.
# In the output below all 0's are represented by [0.00923343,  0.03239873] and
# all 1's are represented by [0.01686737,  0.03767406].
model.predict(np.array([[0,1,0,1,1,0,0]]))

array([[[ 0.00923343,  0.03239873],
        [ 0.01686737,  0.03767406],
        [ 0.00923343,  0.03239873],
        [ 0.01686737,  0.03767406],
        [ 0.01686737,  0.03767406],
        [ 0.00923343,  0.03239873],
        [ 0.00923343,  0.03239873]]], dtype=float32)

# The Embedding class maps discrete labels (words) into a continuous vector space.
# On its own, this embedding does not take the semantic similarity of the words into account; its vectors are shaped only by the task the model is trained on. Hence the Embedding layer in Keras and word2vec (e.g. in gensim) are different ways of obtaining vectors for the labels.

In [22]:
# model.layers[0].get_weights()  # inspect the embedding table (Keras layers expose get_weights(), not get_value())

In [24]:
# X is still the binary texts_to_matrix output: one row per text, one column per vocabulary index.
X

array([[ 0.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
         1.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  1.,  1.,  1.]])

In [31]:
# Feed the binary bag-of-words rows through an Embedding layer. Each 0/1 entry of X
# is looked up as an index, so only rows 0 and 1 of the 3-row table are ever used.
model = Sequential()
model.add(Embedding(3, 10, input_length= X.shape[1] ))
# Flatten the (input_length x 10) embedding output into one vector for the classifier.
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop')
model.summary()
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
model.fit(X, y=y, batch_size=32, epochs=700, verbose=0, validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 19, 10)            30        
_________________________________________________________________
flatten_4 (Flatten)          (None, 190)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 191       
Total params: 221
Trainable params: 221
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x7f26cd187e90>

In [32]:
# The first four rows are reproduced almost exactly, but the last row is predicted
# as ~0.99 although its label in y is 0 - so the fit is not perfect.
# NOTE(review): presumably validation_split=0.2 kept that row out of training - confirm.
model.predict(X)

array([[  1.00000000e+00],
       [  9.82496928e-08],
       [  9.64378088e-08],
       [  9.14945897e-09],
       [  9.92534161e-01]], dtype=float32)

In [34]:
# Recurrent variant: Embedding -> LSTM(5) -> sigmoid output.
# NOTE(review): X is still the 0/1 texts_to_matrix output here, so the LSTM reads
# a row of binary flags rather than a sequence of word indices - confirm this is intended.
model = Sequential()
model.add(Embedding(vocab_size, 10))
model.add(LSTM(5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop')
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
model.fit(X, y=y,  epochs=500, verbose=0, validation_split=0.2)

<keras.callbacks.History at 0x7f26afdaeb10>

In [35]:
# In the output below all five predictions fall on the correct side of 0.5 for y = [1,0,0,0,0].
model.predict(X)

array([[ 0.93342745],
       [ 0.01597509],
       [ 0.01614392],
       [ 0.01633371],
       [ 0.02689865]], dtype=float32)

# Using pre-trained GloVe word vectors

In [37]:
# Read the whole GloVe file into memory as a word -> float32 vector lookup.
# Each line is split on whitespace: the first token is the word, the rest are its coefficients.
embeddings_index = {}
with open('/home/ramscrux7757/SPARK/glove/glove.6B.50d.txt') as f:
    for line in f:
        values = line.split()
        word, coefs = values[0], np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded {} word vectors'.format(len(embeddings_index)))

Loaded 400000 word vectors


In [None]:
# Load the GloVe vectors into a word -> float32 vector dictionary.
embeddings_index = {}
glove_data = '/Users/Swa/Desktop/AIML/Glove/glove.6B.50d.txt'
# The context manager closes the file even if a line fails to parse,
# which the previous explicit open()/close() pair did not guarantee.
with open(glove_data) as f:
    for line in f:
        # Each line is split on whitespace: first token is the word, the rest its coefficients.
        values = line.split()
        word = values[0]
        value = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = value

print('Loaded %s word vectors.' % len(embeddings_index))

In [43]:
# Width of each embedding vector; 50 matches the glove.6B.50d file loaded above.
embedding_dimension = 50
# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index

In [39]:
# Show the word -> index mapping that decides which row of the embedding matrix each word fills.
print(word_index)

{'beautiful': 10, 'this': 15, 'life': 9, 'and': 16, 'august': 11, 'things': 18, 'september': 7, 'sun': 4, 'is': 1, 'june': 6, 'grey': 8, 'like': 13, 'i': 12, 'other': 17, 'in': 2, 'it': 14, 'the': 3, 'shining': 5}


In [44]:
# Create a weight matrix for the words seen in the training docs.
# Its shape is (vocab_size x embedding_dimension): one row per word index,
# each row a vector of 50 elements in this case. Row 0 belongs to no word.
vocab_size = len(word_index)+1
embedding_matrix = np.zeros((vocab_size, embedding_dimension))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # no pre-trained vector for this word: its row stays all-zero
        continue
    embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)
print(embedding_matrix)

(19, 50)
[[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  6.18499994e-01   6.42539978e-01  -4.65519994e-01   3.75699997e-01
    7.48380005e-01   5.37389994e-01   2.2239

In [None]:
# Weight matrix with one row per word index (row 0 is unused), filled from the
# pre-trained vectors and truncated to embedding_dimension coefficients.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimension))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # words not found in embedding index will be all-zeros.
        continue
    embedding_matrix[i] = embedding_vector[:embedding_dimension]

In [45]:
# (number of word indices including the unused index 0, embedding_dimension)
embedding_matrix.shape

(19, 50)

In [46]:
# Embedding layer initialised from the pre-trained matrix: the vocabulary size and
# vector width are taken from the matrix shape, and the matrix itself is passed as
# the initial weights. input_length=12 is the sequence length the inputs are padded to.
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=12)

In [47]:
from keras.preprocessing.sequence import pad_sequences
# Replace the earlier bag-of-words X with one list of word indices per text.
X = tokenizer.texts_to_sequences(texts)
# Pad every row to 12 entries so it matches the embedding layer's input_length=12;
# in the output below the padding zeros are added on the left.
X = pad_sequences(X, maxlen=12)

In [50]:
# Each row holds the word indices of one text, zero-padded to length 12.
X

array([[ 0,  0,  0,  0,  0,  0,  3,  4,  1,  5,  2,  6],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  1,  8],
       [ 0,  0,  0,  0,  0,  0,  0,  9,  1, 10,  2, 11],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 12, 13, 14],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 15, 16, 17, 18]], dtype=int32)

In [48]:
# Classifier on top of the pre-trained embedding layer:
# Embedding -> Flatten -> sigmoid output.
model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# Freeze the pre-trained vectors (set before compile) so only the Dense layer is trained.
model.layers[0].trainable=False
model.compile(loss='binary_crossentropy', optimizer='rmsprop')
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
model.fit(X, y=y, batch_size=20, epochs=700, verbose=0, validation_split=0.2)

<keras.callbacks.History at 0x7f26afb26ed0>

In [49]:
# In the output below every row is close to its label in y = [1,0,0,0,0], including the last one (~0.0015).
model.predict(X)

array([[  1.00000000e+00],
       [  9.72407150e-08],
       [  9.93706735e-08],
       [  3.49633176e-08],
       [  1.48684252e-03]], dtype=float32)