In [1]:
import pandas as pd

In [3]:
data = pd.read_csv("E:\Product review Analysis\Reviews.csv")
data = data.sample(frac=1)[:40000]

In [4]:
data.columns = map(lambda x:x.lower(), list(data))
data["text"] = data["summary"] + " "+ data["text"]
data = data[["text", "score"]]

data.loc[data.score<3, "score"] = -1
data.loc[data.score==3, "score"] = 0
data.loc[data.score>3, "score"] = 1

data.head(5)


Unnamed: 0,text,score
218723,"Tasty and Healthy Many snacks labeled ""healthy...",1
205224,MINT FOR ALL REASONS! Terrific organic mint le...,1
134672,Nice to have litter delivered to your door! My...,1
367167,Very nice basket I was concerned ordering a gi...,1
133030,Robost and Sweet! Odd Yet Delicious Combinatio...,1


In [5]:
data.tail(5)

Unnamed: 0,text,score
362560,good K cups I like this coffee from Green Moun...,1
242252,Nice Indian Food Substitute! Indian food is a ...,1
151499,"Fine, but others are better It's better than a...",0
322209,Surf & Turf - Valentines Dinner (NY Strip Stea...,1
128360,Am I missing something? This olive oil is supp...,-1


In [13]:
reviews = []

In [14]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import random


In [15]:
random.seed(1000)

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer('[a-zA-Z0-9]\w+')
reviews = [tokenizer.tokenize(str(i).lower()) for i in data["text"]]

In [None]:
labels =[]
for i in data["score"]:
    labels.append(i)

In [9]:
reviews_ = []
for word in reviews:
    lemmatized = lemmatizer.lemmatize(str(word))
    reviews_.append(lemmatized)

In [None]:
vector_size = 256
window = 5
reviews = reviews_

In [17]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

import time

word2vec_model = 'word2vec.model'

print('Generating Word2Vec Vectors ..')

start = time.time()
model = Word2Vec(sentences=reviews, size=vector_size, window=window, negative=20, iter=50, workers=4)
print('Word2Vec Created in {} seconds.'.format(time.time() - start))
model.save(word2vec_model)
print('Word2Vec Model saved at {}'.format(word2vec_model))


Generating Word2Vec Vectors ..
Word2Vec Created in 269.93200278282166 seconds.
Word2Vec Model saved at word2vec.model


In [19]:
model = Word2Vec.load(word2vec_model)

In [20]:
x_vectors = model.wv

In [21]:
len(labels), len(reviews)

(40000, 40000)

In [22]:
import numpy as np
import keras.backend as K

train_size = int(0.9*(len(reviews)))
test_size = int(0.1*(len(reviews)))

max_no_tokens = 15

indexes = set(np.random.choice(len(reviews), train_size + test_size, replace=False))

x_train = np.zeros((train_size, max_no_tokens, vector_size), dtype=K.floatx())
y_train = np.zeros((train_size, 2), dtype=np.int32)

x_test = np.zeros((test_size, max_no_tokens, vector_size), dtype=K.floatx())
y_test = np.zeros((test_size, 2), dtype=np.int32)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [23]:
for i, index in enumerate(indexes):
    for t, token in enumerate(reviews[index]):
        if t >= max_no_tokens:
            break
      
        if token not in x_vectors:
            continue
    
        if i < train_size:
            x_train[i, t, :] = x_vectors[token]
        else:
            x_test[i - train_size, t, :] = x_vectors[token]

  
    if i < train_size:
        y_train[i, :] = [ 1.0 , 0.0 ] if labels[index] == 0 else [ 0.0 , 1.0 ]
    else:
        y_test[i - train_size, :] = [1.0, 0.0] if labels[index] == 0 else [0.0, 1.0]

In [24]:
x_train.shape, y_test.shape
batch_size = 50
no_epochs = 10

In [25]:
from keras.models import Sequential
from keras.layers import Conv1D, Dropout, Dense, Flatten, LSTM, MaxPooling1D, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, TensorBoard


model = Sequential()

model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same',
                 input_shape=(max_no_tokens, vector_size)))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=3))

model.add(Bidirectional(LSTM(512, dropout=0.2, recurrent_dropout=0.3)))

model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.25))
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.25))

model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001, decay=1e-6), metrics=['accuracy'])

tensorboard = TensorBoard(log_dir='logs/', histogram_freq=0, write_graph=True, write_images=True)

model.summary()


Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 15, 32)            24608     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 15, 32)            3104      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 15, 32)            3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 5, 32)             0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1024)              2232320   
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
________________________________________________

In [26]:
model.fit(x_train, y_train, batch_size=batch_size, shuffle=True, epochs=no_epochs,
         validation_data=(x_test, y_test), callbacks=[tensorboard, EarlyStopping(min_delta=0.0001, patience=3)])

Instructions for updating:
Use tf.cast instead.
Train on 36000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.callbacks.callbacks.History at 0x18a518923c8>

In [27]:
model.metrics_names

['loss', 'accuracy']

In [28]:
model.evaluate(x=x_test, y=y_test, batch_size=32, verbose=1)



[0.2192348960042, 0.9259999990463257]

In [29]:
model.save('Product review analysis- bidirectionals lstm RNN-200k.model')