In [1]:
import keras
from keras.datasets import reuters

Using TensorFlow backend.


In [2]:
# Load data
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
word_index = reuters.get_word_index(path="reuters_word_index.json")

In [3]:
# Having a look at the data
print(x_train[4])

[1, 8295, 111, 8, 25, 166, 40, 638, 10, 436, 22, 265, 9, 621, 575, 1080, 4742, 1149, 15874, 6, 438, 8295, 13, 102, 388, 15, 90, 67, 7, 197, 8295, 8, 4, 270, 416, 23, 527, 6, 15874, 4891, 4, 1055, 742, 16, 8, 36, 1480, 6, 2124, 100, 543, 5, 645, 362, 6, 2912, 4, 49, 8, 15874, 976, 124, 20, 5, 8295, 80, 9, 100, 362, 543, 395, 61, 44, 20, 8295, 8, 16, 40, 1276, 42, 1436, 166, 415, 6, 888, 4, 116, 9, 40, 3089, 4, 303, 163, 16, 64, 772, 13, 94, 156, 17, 12]


In [4]:
word_index['the']

1

In [5]:
index_to_word = {}
for key, value in word_index.items():
       index_to_word[value] = key

In [6]:
index_to_word[1]

'the'

In [7]:
print(' '.join([index_to_word[x] for x in x_train[4]]))

the bleached could mln at world as holding for include its i 3 start measures gnp 525 process ccb and nations bleached it 1985 do 000 april 0 a agreed bleached mln in ended cost cts must and ccb tenneco in winter 53 1 mln net diplomats and reorganization group 38 said 49 26 and plastics in this mln ccb field foreign is said bleached 10 3 group 26 38 producers had 4 is bleached mln 1 as equivalent not 145 world york and credits in 20 3 as permits in set board 1 share turnover it than growth pct dlrs


In [8]:
# Labels
y_train[4]

4

In [9]:
print(set(y_train))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45}


In [10]:
num_classes = max(y_train) + 1

In [11]:
print('# of Training Samples: {}'.format(len(x_train)))
print('# of Test Samples: {}'.format(len(x_test)))
print('# of Classes: {}'.format(num_classes))
# of Training Samples: 8982
# of Test Samples: 2246
# of Classes: 46

# of Training Samples: 8982
# of Test Samples: 2246
# of Classes: 46


Let's see what we want to do!
![this3](https://image.slidesharecdn.com/kpisummerschool2015wordembeddingsandneurallanguagemodeling1-150828091027-lva1-app6892/95/kpi-summer-school-2015-word-embeddings-and-neural-language-modeling-28-638.jpg?cb=1440753116)


In [12]:
# Different input dimensions!
print(len(x_train[0]), len(x_train[1]))

87 56


In [13]:
from keras.preprocessing.sequence import pad_sequences

sequence_len = 100

# pad text sequences
x_train = pad_sequences(x_train, maxlen=sequence_len)
x_test = pad_sequences(x_test, maxlen=sequence_len)

# pass labels to a fixed dimension
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [14]:
print(x_train[0])
print(len(x_train[0]))

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     1 27595 28842     8    43    10   447     5    25   207   270
     5  3095   111    16   369   186    90    67     7    89     5    19
   102     6    19   124    15    90    67    84    22   482    26     7
    48     4    49     8   864    39   209   154     6   151     6    83
    11    15    22   155    11    15     7    48     9  4579  1005   504
     6   258     6   272    11    15    22   134    44    11    15    16
     8   197  1245    90    67    52    29   209    30    32   132     6
   109    15    17    12]
100


In [15]:
print(y_train[0])
print(len(y_train[0]))

[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
46


In [16]:
from keras.models import Model
from keras.layers import Dense, Input, Embedding, Flatten

num_words = 50000
embedding_dim = 32

input = Input(shape=(sequence_len,), name="words_idx")
x = Embedding(num_words, output_dim=embedding_dim)(input)
x = Flatten()(x)
x = Dense(16)(x)
output = Dense(len(y_train[0]), activation='softmax')(x)

Instructions for updating:
Colocations handled automatically by placer.


![this4](https://image.slidesharecdn.com/kpisummerschool2015wordembeddingsandneurallanguagemodeling1-150828091027-lva1-app6892/95/kpi-summer-school-2015-word-embeddings-and-neural-language-modeling-28-638.jpg?cb=1440753116)

In [17]:
# compile model
model = Model([input], output)
model.compile(loss="categorical_crossentropy",
              optimizer='adam', metrics=['accuracy'])

In [18]:
# A look inside embeddings weights
before_embeddings = model.layers[1].get_weights()

In [19]:
before_embeddings

[array([[ 0.04601956, -0.01554308, -0.02021786, ...,  0.00869598,
         -0.03045226,  0.04847017],
        [ 0.02852594, -0.03818436,  0.02280318, ..., -0.0218235 ,
         -0.01591026,  0.00840474],
        [-0.01995652, -0.0424386 , -0.02714961, ...,  0.04274425,
         -0.01718838,  0.04180943],
        ...,
        [ 0.02374537,  0.01440113, -0.04720635, ...,  0.01661057,
         -0.0285208 , -0.03650192],
        [-0.01787205,  0.01878113, -0.02329276, ...,  0.01984921,
         -0.00031283,  0.02013868],
        [-0.03198608,  0.02111074, -0.03449867, ...,  0.00050364,
         -0.00879012,  0.04555333]], dtype=float32)]

In [20]:
batch_size = 32
epochs = 2

history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Instructions for updating:
Use tf.cast instead.
Train on 8083 samples, validate on 899 samples
Epoch 1/2
Epoch 2/2
Test loss: 1.5128490428041372
Test accuracy: 0.6273374888956388


In [21]:
# A look inside embeddings weights
model.layers[1].get_weights()

[array([[ 0.04778003, -0.02283379, -0.00449974, ..., -0.01279756,
         -0.01062095,  0.05110388],
        [-0.02302335, -0.04233605,  0.04907309, ..., -0.05940785,
         -0.00978484,  0.01989996],
        [ 0.00742607, -0.01621205, -0.00214382, ...,  0.01920804,
         -0.02073726,  0.02142042],
        ...,
        [ 0.02374537,  0.01440113, -0.04720635, ...,  0.01661057,
         -0.0285208 , -0.03650192],
        [-0.01787205,  0.01878113, -0.02329276, ...,  0.01984921,
         -0.00031283,  0.02013868],
        [-0.03198608,  0.02111074, -0.03449867, ...,  0.00050364,
         -0.00879012,  0.04555333]], dtype=float32)]

In [22]:
# first embedding
print(before_embeddings[0][0])
print(model.layers[1].get_weights()[0][0])

[ 0.04601956 -0.01554308 -0.02021786  0.0447003  -0.04753053 -0.01382623
 -0.02505436 -0.04954783 -0.01468536 -0.03577384 -0.03308809  0.04695566
  0.00676578  0.03421665  0.04425024 -0.00800001 -0.02503148  0.04108813
  0.04306828  0.00433546 -0.04491458  0.04180301  0.00239607 -0.00478263
  0.0045232  -0.00864415  0.00921013  0.00267029  0.03739797  0.00869598
 -0.03045226  0.04847017]
[ 0.04778003 -0.02283379 -0.00449974  0.0390374  -0.04742653  0.02518003
 -0.01547205 -0.04282432  0.01244631 -0.02246185 -0.03852264  0.04239924
 -0.01813188  0.02635528  0.03717105  0.03415433 -0.02073458  0.03327616
  0.02164897 -0.02178071 -0.04624386  0.02622519 -0.01788031  0.01164822
 -0.03788704  0.01609039 -0.015049    0.024016    0.02950859 -0.01279756
 -0.01062095  0.05110388]


In [23]:
# last embedding
print(before_embeddings[0][-1])
print(model.layers[1].get_weights()[0][-1])

[-0.03198608  0.02111074 -0.03449867  0.03452109 -0.04455796  0.02440829
 -0.04796584  0.04020821  0.03581233 -0.01820178  0.03445984 -0.03317568
  0.00663571  0.01838228 -0.04200111 -0.03165336 -0.00038855 -0.03517633
 -0.0132449   0.00412346  0.02558391  0.03398455  0.00598979 -0.04673339
 -0.04224164 -0.00108529  0.01757446 -0.04237891 -0.01370193  0.00050364
 -0.00879012  0.04555333]
[-0.03198608  0.02111074 -0.03449867  0.03452109 -0.04455796  0.02440829
 -0.04796584  0.04020821  0.03581233 -0.01820178  0.03445984 -0.03317568
  0.00663571  0.01838228 -0.04200111 -0.03165336 -0.00038855 -0.03517633
 -0.0132449   0.00412346  0.02558391  0.03398455  0.00598979 -0.04673339
 -0.04224164 -0.00108529  0.01757446 -0.04237891 -0.01370193  0.00050364
 -0.00879012  0.04555333]
