In [3]:
import pandas as pd
import numpy as np

from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, GRU, TimeDistributed, Dense


  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  import pandas._libs.tslibs.offsets as liboffsets
  from pandas._libs import algos as libalgos, ops as libops
  from pandas._libs.interval import (
  from pandas._libs import internals as libinternals
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import (lib, reduction,
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from pandas._libs import algos, lib, writers as libwriters
  from ._conv import register_converters as _register_converters
  from . import h5a, h5d, h5ds, h5f, h5fd, h5g, h5r, h5s, h5t, h5p, h5z
  from .. import h5g, h5i, h5o, h5r, h5t, h5l, h5p
Using TensorFlow backend.
  from ._solve_toepli

In [4]:
# Load the review dataset; only the `text` and `stars` columns are read below.
df = pd.read_csv("yelp_2013.csv")
# Uncomment to iterate quickly on a random subset of 5000 rows:
# df = df.sample(5000)

# Raw review texts are the model inputs.
X = df["text"].values
# Shift the star ratings down by one so labels start at 0, then one-hot encode
# into 5 classes (assumes stars are integers in 1..5 -- TODO confirm).
Y = to_categorical(df["stars"].values - 1, num_classes=5)

# Hyper-parameters
MAX_NUM_WORDS = 30000    # vocabulary cap handed to the Tokenizer (num_words)
EMBEDDING_DIM = 200      # width of each row of the embedding matrix
VALIDATION_SPLIT = 0.1   # fraction of samples held out for validation
TEST_SPLIT = 0.1         # fraction of samples held out for testing
NUM_FILTERS = 50         # number of units given to every GRU layer
MAX_LEN = 512            # length every sequence is padded/truncated to
Batch_size = 100         # mini-batch size for fit/evaluate
EPOCHS = 10              # number of training epochs




In [5]:
#shuffle the data
# Seeding NumPy's global RNG makes the permutation (and therefore the split)
# repeatable. Note that X and Y are overwritten, so re-running this cell
# shuffles the already-shuffled arrays again.
indices = np.arange(X.shape[0])
np.random.seed(2018)
np.random.shuffle(indices)
X=X[indices]
Y=Y[indices]

#training set, validation set and testing set
# Despite its name, nb_validation_samples_val is the size of the whole holdout
# (validation + test); nb_validation_samples_test is the test part alone.
nb_validation_samples_val = int((VALIDATION_SPLIT + TEST_SPLIT) * X.shape[0])
nb_validation_samples_test = int(TEST_SPLIT * X.shape[0])

# Slice with explicit non-negative boundaries. Negative offsets silently break
# when a count is 0: X[-0:] is the *whole* array and X[:-0] is empty.
num_samples = X.shape[0]
train_end = num_samples - nb_validation_samples_val
val_end = num_samples - nb_validation_samples_test

x_train = X[:train_end]
y_train = Y[:train_end]
x_val = X[train_end:val_end]
y_val = Y[train_end:val_end]
x_test = X[val_end:]
y_test = Y[val_end:]

In [6]:
# Sanity check: display one row of the one-hot encoded training labels.
y_train[3]

array([0., 0., 1., 0., 0.])

In [7]:
# Build the vocabulary with a Keras Tokenizer capped at MAX_NUM_WORDS.
# The tokenizer is fitted on every review in df (train, validation and test).
tokenizer1 = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer1.fit_on_texts(df.text)
vocab = tokenizer1.word_index  # maps word -> integer index

# Convert each split's raw texts into lists of word ids.
x_train_word_ids, x_test_word_ids, x_val_word_ids = map(
    tokenizer1.texts_to_sequences, (x_train, x_test, x_val))

In [8]:
# Display the first raw training review (compare with its word ids in the next cell).
x_train[0]

"we've bought two vehicles from saul aguirre and can't recommend him enough!  had some service done recently and service adviser mike d. was very professional and attentive, and kept in excellent contact. service manager also was quick to address my initial concerns when i arrived."

In [9]:
# Display the tokenizer's word-id sequence for the first training review.
x_train_word_ids[0]

[890,
 690,
 129,
 4226,
 48,
 2,
 202,
 190,
 250,
 226,
 23,
 62,
 47,
 251,
 681,
 2,
 47,
 18717,
 2443,
 787,
 6,
 35,
 610,
 2,
 543,
 2,
 623,
 11,
 241,
 1746,
 47,
 390,
 70,
 6,
 382,
 5,
 2466,
 12,
 2496,
 2984,
 50,
 3,
 478]

In [10]:
# Bring every word-id sequence of every split to exactly MAX_LEN entries.
x_train_padded_seqs, x_test_padded_seqs, x_val_padded_seqs = (
    pad_sequences(word_ids, maxlen=MAX_LEN)
    for word_ids in (x_train_word_ids, x_test_word_ids, x_val_word_ids)
)

In [11]:
# Print the first padded training sequence to inspect the result of pad_sequences.
print(x_train_padded_seqs[0])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [12]:
#slice sequences into many subsequences
def slice_into_subsequences(padded_seqs, num_slices=8):
    """Cut every padded sequence into a two-level nest of equal-length slices.

    Each row of ``padded_seqs`` is split into ``num_slices`` equal parts, and
    each of those parts is split again into ``num_slices`` equal sub-parts.

    Args:
        padded_seqs: 2-D array holding one padded word-id sequence per row.
        num_slices: number of equal pieces produced at each of the two levels.

    Returns:
        A list with one entry per row. Each entry is a list of ``num_slices``
        lists, each of which holds ``num_slices`` 1-D arrays.

    Raises:
        ValueError: raised by ``np.split`` when a sequence (or a first-level
            part) cannot be divided into ``num_slices`` equal pieces.
    """
    return [
        [np.split(part, num_slices) for part in np.split(seq, num_slices)]
        for seq in padded_seqs
    ]


# One shared helper replaces three copy-pasted loops, so the splits cannot drift apart.
x_test_padded_seqs_split = slice_into_subsequences(x_test_padded_seqs)
x_val_padded_seqs_split = slice_into_subsequences(x_val_padded_seqs)
x_train_padded_seqs_split = slice_into_subsequences(x_train_padded_seqs)

In [13]:
# Display the nested subsequence structure produced for the first training sample.
x_train_padded_seqs_split[0]

[[array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)],
 [array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)],
 [array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
  array([0, 0, 0, 0, 0, 0, 0, 0], dtyp

In [14]:
#load pre-trained GloVe word embeddings
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("Using GloVe embeddings")
glove_path = 'glove.6B.200d.txt'
embeddings_index = {}  # word -> float32 coefficient vector
# The context manager closes the file even if parsing a line raises.
with open(glove_path) as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # First token is the word, the remaining tokens are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Using GloVe embeddings
Found 400001 word vectors.


In [15]:
#use pre-trained GloVe word embeddings to initialize the embedding layer
# Every row starts out random; rows whose word has a GloVe vector are
# overwritten below, so words missing from GloVe keep their random values.
embedding_matrix = np.random.random((MAX_NUM_WORDS + 1, EMBEDDING_DIM))
for word, i in vocab.items():
    if i >= MAX_NUM_WORDS:
        # Indices at or beyond the vocabulary cap are left untouched.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# The padded sequences are sliced 8 x 8 ways, so the innermost subsequence has
# MAX_LEN // 64 tokens. Floor division keeps this an int on Python 3 as well
# (plain `/` would produce a float there).
embedding_layer = Embedding(MAX_NUM_WORDS + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_LEN // 64,
                            trainable=True)


In [16]:
#build model
print("Build Model")

# Length of the innermost subsequence after the 8 x 8 slicing. Floor division
# keeps the Input shapes integral on Python 3 (plain `/` would give a float).
subseq_len = MAX_LEN // 64

# Level 1: embed one innermost subsequence and encode it with a GRU.
input1 = Input(shape=(subseq_len,), dtype='int32')
embed = embedding_layer(input1)
gru1 = GRU(NUM_FILTERS,recurrent_activation='sigmoid',activation=None,return_sequences=False)(embed)
Encoder1 = Model(input1, gru1)

# Level 2: apply Encoder1 to each of 8 subsequences, then encode the 8 results.
input2 = Input(shape=(8,subseq_len,), dtype='int32')
embed2 = TimeDistributed(Encoder1)(input2)
gru2 = GRU(NUM_FILTERS,recurrent_activation='sigmoid',activation=None,return_sequences=False)(embed2)
Encoder2 = Model(input2,gru2)

# Level 3: apply Encoder2 to each of 8 groups, encode, and classify into 5 classes.
input3 = Input(shape=(8,8,subseq_len), dtype='int32')
embed3 = TimeDistributed(Encoder2)(input3)
gru3 = GRU(NUM_FILTERS,recurrent_activation='sigmoid',activation=None,return_sequences=False)(embed3)
preds = Dense(5, activation='softmax')(gru3)
model = Model(input3, preds)

Build Model


In [17]:
# summary() writes the table itself and returns None, so wrapping it in print
# only appended a stray "None" after each table.
Encoder1.summary()
Encoder2.summary()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 8)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 8, 200)            6000200   
_________________________________________________________________
gru_1 (GRU)                  (None, 50)                37650     
Total params: 6,037,850
Trainable params: 6,037,850
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 8, 8)              0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 8, 50)             6037850   
_____________________________________________________________

In [18]:
# Optimise with Adam, all hyper-parameters spelled out explicitly.
from keras.optimizers import Adam
opt = Adam(
    lr=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
)

# Accuracy is tracked under the name 'acc'; the checkpoint callback created
# later monitors its validation counterpart 'val_acc'.
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [19]:
#save the best model on validation set
import os
from keras.callbacks import ModelCheckpoint
savebestmodel = 'save_model/SRNN(8,2)_yelp2013.h5'

# Make sure the target directory exists before training starts, so the first
# checkpoint write cannot fail on a missing folder. (isdir + makedirs rather
# than exist_ok=True keeps this working on Python 2.)
save_dir = os.path.dirname(savebestmodel)
if save_dir and not os.path.isdir(save_dir):
    os.makedirs(save_dir)

# Keep only the weights of the epoch with the highest validation accuracy.
checkpoint = ModelCheckpoint(savebestmodel, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks=[checkpoint]

In [20]:
# Train on the nested subsequence arrays; the checkpoint callback saves the
# best model seen on the validation data.
# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
model.fit(np.array(x_train_padded_seqs_split), y_train,
          validation_data = (np.array(x_val_padded_seqs_split), y_val),
          epochs = EPOCHS,
          batch_size = Batch_size,
          callbacks = callbacks,
          verbose = 1)

  


Train on 374887 samples, validate on 46861 samples
Epoch 1/10
 21500/374887 [>.............................] - ETA: 4:55 - loss: 1.2235 - acc: 0.4639

KeyboardInterrupt: 

In [None]:
#use the best model to evaluate on test set
from keras.models import load_model
# Reload the checkpoint written by the ModelCheckpoint callback during training.
best_model= load_model(savebestmodel)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(best_model.evaluate(np.array(x_test_padded_seqs_split),y_test,batch_size=Batch_size))