In [86]:
from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model
from keras.initializers import Constant
import pickle as pkl
from sklearn.externals import joblib

In [68]:
# Relative location of the pre-trained GloVe vectors.
BASE_DIR = '../Glove'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')

# Text-vectorisation and training hyper-parameters.
EMBEDDING_DIM = 50         # width of each word vector
MAX_NUM_WORDS = 20000      # vocabulary cap handed to the Tokenizer
MAX_SEQUENCE_LENGTH = 100  # every sequence is padded/truncated to this length
VALIDATION_SPLIT = 0.3     # fraction of the samples held out for validation

In [46]:
# Build a lookup table that maps every word in the pre-trained GloVe file
# to its embedding vector (a float32 numpy array).

print('Indexing word vectors.')

embeddings_index = {}
# The file name is derived from EMBEDDING_DIM so the loaded vectors always have
# the width expected by the embedding matrix that is filled from them later.
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.%dd.txt' % EMBEDDING_DIM)
# The GloVe text files are UTF-8; pass the encoding explicitly instead of
# relying on the platform default, which is not UTF-8 everywhere.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>".
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [62]:
# Sanity check: show the vector stored for one word of the index.
sample_word = 'food'
print(embeddings_index[sample_word])

[ 0.47222   -0.44545   -0.51833   -0.26818    0.44427   -0.25108
 -0.99282   -0.90198    1.8729     0.039081   0.14284    0.074878
  1.0543    -0.3203     1.0722     0.44323    0.0099484  0.15754
  0.51399   -0.77668    0.924      0.010958   0.58815    0.23078
 -0.34281   -0.88444   -0.31492    0.12661    1.1445     0.60775
  3.4344     0.63561   -0.13832    0.28045   -0.16181    0.77541
 -0.49888    0.4602     0.91799    0.29007    0.06884    0.59978
  0.53967   -0.061752   1.2975     0.92323   -0.80945    0.34932
  0.33934    0.25499  ]


In [66]:
# Load the pickled text samples and their labels.
# Context managers guarantee both file handles are closed after reading.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("Texts.pkl", "rb") as infile:
    texts = pkl.load(infile)
with open("score.pkl", "rb") as infile2:
    labels = pkl.load(infile2)

In [67]:
# Report container type, element type and size for the texts and the labels.
for collection in (texts, labels):
    print(type(collection))
    print(type(collection[0]))
    print(len(collection))

<class 'list'>
<class 'str'>
364171
<class 'list'>
<class 'int'>
364171


In [69]:
# Turn the raw texts into a padded 2D integer tensor and one-hot encode the labels.

# Tokenization: learn the vocabulary, then map every text to a list of word indices.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)
print('Found %s unique tokens.' % len(word_index))

# Pad (or truncate) every sequence so that all rows share the same length.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)

# One column per class.
labels = to_categorical(np.asarray(labels))
print('Shape of label tensor:', labels.shape)

Found 71626 unique tokens.
Shape of data tensor: (364171, 100)
Shape of label tensor: (364171, 2)


In [71]:
# Quick look at what the tokenizer produced.
for produced in (sequences, word_index):
    print(type(produced))
print(sequences[0])

<class 'list'>
<class 'dict'>
[14426, 28, 1082, 14, 369, 2405, 3182, 13462, 1117, 1214, 543, 103, 3775, 7624, 846, 793, 10259, 1817, 11796, 8, 171, 781, 1082, 1054, 2791, 1213, 1082, 1481, 1862, 369, 82, 258, 13462, 1407, 1661]


In [72]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]


In [73]:
# Assemble the weight matrix for the Embedding layer: row i holds the
# GloVe vector of the word whose tokenizer index is i.
print('Preparing embedding matrix.')
num_words = 1 + min(MAX_NUM_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word_index.items():
    if row <= MAX_NUM_WORDS:
        vector = embeddings_index.get(token)
        # Words missing from the GloVe index keep their all-zero row.
        if vector is not None:
            embedding_matrix[row] = vector


Preparing embedding matrix.


In [74]:
# Wrap the pre-trained vectors in an Embedding layer.
# trainable=False keeps the embeddings fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)


In [76]:
from keras import backend as K
# `set_image_dim_ordering('th')` is a legacy Keras 1 alias that is missing from
# later Keras releases; `set_image_data_format('channels_first')` is the
# equivalent supported call and selects the same global image data format.
K.set_image_data_format('channels_first')

In [80]:
# Build and compile a 1D convnet with global max pooling.
print('Configuring and compiling CNN model..')
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# The Embedding layer emits (batch, timesteps, embedding_dim), i.e. a
# channels-last layout. Passing data_format='channels_first' made Conv1D treat
# the timesteps as channels and slide along the embedding axis, while the
# pooling layers still pooled with their default channels-last layout.
# The convolutions therefore use the default layout here; padding='same' keeps
# the sequence length through each convolution so that the two pool-by-5
# stages still fit a 100-token input (100 -> 20 -> 4 timesteps).
x = Conv1D(128, 5, padding='same', activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, padding='same', activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, padding='same', activation='relu')(x)
# Collapse the time axis: one maximum per filter.
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# Two-way softmax matching the one-hot encoded labels.
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

Configuring and compiling model..


In [81]:
# Fit the most recently compiled model; the returned History object is
# left as the cell's last expression so the notebook displays it.
print("Training model...")
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=128,
)

Training model...
Train on 254920 samples, validate on 109251 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f856c679668>

In [84]:
# Build and compile an LSTM classifier on top of the frozen embedding layer.
print('Configuring and compiling LSTM model..')
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# return_sequences=False: only the final LSTM output feeds the dense head.
recurrent = LSTM(units=100, dropout=0.2, recurrent_dropout=0.2,
                 return_sequences=False)
hidden = Dense(units=128, activation='relu')
classifier = Dense(units=2, activation='softmax')
preds = classifier(hidden(recurrent(embedded_sequences)))

model = Model(inputs=sequence_input, outputs=preds)
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

Configuring and compiling LSTM model..


In [85]:
# Fit the most recently compiled model; the returned History object is
# left as the cell's last expression so the notebook displays it.
print("Training model...")
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=128,
)

Training model...
Train on 254920 samples, validate on 109251 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f856c0f85c0>

In [87]:
# Persist the current `model` object to disk with joblib.
# NOTE(review): Keras provides `model.save(...)` as its native way to store a
# model; consider switching once it is confirmed what loads this file.
filename = 'lstm_model.sav'
joblib.dump(value=model, filename=filename)

['lstm_model.sav']