In [1]:
import tensorflow as tf
import keras
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
import os

Using TensorFlow backend.


In [5]:
# Load only the star rating and review body for the first 500,000 rows of the review dump.
reviews = pd.read_csv(
    'data/review.csv',
    usecols=['stars', 'text'],
    nrows=500000,
)

In [12]:
# Drop neutral 3-star reviews: the sentiment label derived from `stars` only
# separates ratings above 3 from the rest, so 3-star rows carry no clear signal.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so adding a column to it afterwards does not raise SettingWithCopyWarning.
reviews = reviews[reviews['stars'] != 3].copy()

In [13]:
# Binary label per review: 1 when the rating is above three stars, otherwise 0.
reviews["sentiment"] = reviews['stars'].apply(lambda stars: int(stars > 3))
reviews.head()

Unnamed: 0,stars,text,sentiment
0,1.0,Total bill for this horrible service? Over $8G...,0
1,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,1
2,5.0,I have to say that this office really has it t...,1
3,5.0,Went in for a lunch. Steak sandwich was delici...,1
4,1.0,Today was my second out of three sessions I ha...,0


In [22]:
# Pull the review bodies and their binary labels out of the frame as NumPy arrays.
text, sentiment = (reviews[column].values for column in ('text', 'sentiment'))
print(sentiment)

[0 1 1 ... 1 1 1]


In [28]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

MAX_NUM_WORDS = 1000  # vocabulary cap passed to the Tokenizer (also the number of rows in the embedding matrix)
MAX_SEQUENCE_LENGTH = 100  # every review is padded/truncated to this many tokens


# Fit the vocabulary on the review texts, then encode each review as a list of word indices.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

# word_index lists every distinct token seen while fitting, not just the
# MAX_NUM_WORDS that are kept, so the count printed here can exceed the cap.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Bring all sequences to one fixed length so they stack into a 2-D array.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the labels straight from the DataFrame column instead of from
# the `sentiment` variable this cell overwrites: re-running the cell would
# otherwise one-hot encode labels that are already one-hot encoded.
sentiment = to_categorical(reviews['sentiment'].values)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', sentiment.shape)

Found 180883 unique tokens.
Shape of data tensor: (444554, 100)
Shape of label tensor: (444554, 2)


In [31]:
split = 0.2  # fraction of the samples held out for validation

# Shuffle features and labels with the SAME permutation so every row keeps its
# own label. (The labels were previously assigned to a misspelled name, which
# left `sentiment` unshuffled and misaligned with the shuffled `data`.)
# A fixed seed keeps the train/validation split reproducible across kernel restarts.
rng = np.random.RandomState(42)
indices = rng.permutation(data.shape[0])
data = data[indices]
sentiment = sentiment[indices]

nb_validation_samples = int(split * data.shape[0])
# Split at an explicit positive index: slicing with -nb_validation_samples
# would return an empty training set when nb_validation_samples is 0.
nb_train_samples = data.shape[0] - nb_validation_samples

x_train = data[:nb_train_samples]
y_train = sentiment[:nb_train_samples]
x_val = data[nb_train_samples:]
y_val = sentiment[nb_train_samples:]

In [41]:
from keras import Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout

# Sentiment classifier: learned word embeddings -> one LSTM layer -> 2-way output.
model = Sequential()
# input_dim is MAX_NUM_WORDS, the same cap given to the Tokenizer, so the
# embedding matrix has one row per word index the tokenizer can emit.
model.add(Embedding(input_dim=MAX_NUM_WORDS, output_dim=128, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
# The labels are one-hot over two mutually exclusive classes, so the output is
# a softmax: the two scores form a single probability distribution. Two
# independent sigmoid units would not be constrained to sum to 1.
model.add(Dense(2, activation='softmax'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 128)          128000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 258       
Total params: 259,842
Trainable params: 259,842
Non-trainable params: 0
_________________________________________________________________


In [42]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [43]:
# Train for three epochs in mini-batches of 32, passing the held-out split as validation data.
result = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=3,
    batch_size=32,
)

Instructions for updating:
Use tf.cast instead.
Train on 355644 samples, validate on 88910 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
