In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM, Bidirectional

Using TensorFlow backend.


In [2]:
# Load the user-review CSV from the working directory and preview the first rows.
dataset = pd.read_csv(filepath_or_buffer='googleplaystore_user_reviews.csv')
dataset.head(n=5)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [3]:
# Drop rows containing any missing value once, then derive both the texts and
# the labels from that single filtered frame so they are guaranteed to stay
# row-aligned (the original recomputed dataset.dropna() three times).
reviews_clean = dataset.dropna()
x = reviews_clean['Translated_Review']

# Binary target: 1.0 where Sentiment == 'Positive', 0.0 for every other label.
mask = reviews_clean['Sentiment'] == 'Positive'
y = mask.values.astype(float)

In [4]:
# Inspect the review texts that remain after dropping incomplete rows.
x

0        I like eat delicious food. That's I'm cooking ...
1          This help eating healthy exercise regular basis
3               Works great especially going grocery store
4                                             Best idea us
5                                                 Best way
                               ...                        
64222    Most ads older many agents ..not much owner po...
64223    If photos posted portal load, fit purpose. I'm...
64226    Dumb app, I wanted post property rent give opt...
64227    I property business got link SMS happy perform...
64230    Useless app, I searched flats kondapur, Hydera...
Name: Translated_Review, Length: 37427, dtype: object

In [5]:
# Strip punctuation with a regular expression. `regex=True` is passed
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently leave all punctuation in place.
x = x.str.replace(r'[.,\/#!$%\^&\*;:{}=\-_`~()]', "", regex=True)
# Lower-case so that differently-cased spellings share one token.
x = x.str.lower()
x

0        i like eat delicious food that's i'm cooking f...
1          this help eating healthy exercise regular basis
3               works great especially going grocery store
4                                             best idea us
5                                                 best way
                               ...                        
64222    most ads older many agents not much owner post...
64223    if photos posted portal load fit purpose i'm s...
64226    dumb app i wanted post property rent give opti...
64227    i property business got link sms happy perform...
64230    useless app i searched flats kondapur hyderaba...
Name: Translated_Review, Length: 37427, dtype: object

In [6]:
# Build the vocabulary from the cleaned review texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)

# +1 because Tokenizer indices start at 1; index 0 is reserved for padding.
vocab_size = len(tokenizer.word_index) + 1
x_token = tokenizer.texts_to_sequences(x)

# Measure the longest review in *tokens*, not in whitespace-separated words:
# the Tokenizer applies its own character filters and splitting, so its token
# count can differ from len(s.split()). Using the tokenised lengths guarantees
# pad_sequences never has to truncate a training review.
max_len = max(len(seq) for seq in x_token)

# Right-pad every sequence with zeros up to max_len.
x_pad = pad_sequences(x_token, maxlen=max_len, padding='post')

In [7]:
# Binary sentiment classifier: embedding -> bidirectional LSTM -> sigmoid unit.
model = Sequential()
# mask_zero=True marks index 0 (the padding value) as "no token" so the LSTM
# skips the padded timesteps instead of processing them as real input.
# vocab_size already includes the extra slot for index 0.
model.add(Embedding(vocab_size, 50, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(64, dropout=0.2)))
# Single sigmoid output: score for the positive class (label 1).
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 345, 50)           1273150   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               58880     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 1,332,159
Trainable params: 1,332,159
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train for three epochs in mini-batches of 128, holding out 20% of the
# samples for validation.
model.fit(
    x=x_pad,
    y=y,
    batch_size=128,
    epochs=3,
    validation_split=0.2,
    shuffle=True,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 29941 samples, validate on 7486 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x1b0f1304f88>

In [15]:
# Two hand-written reviews used as a quick sanity check of the trained model.
test = [
    'this is a really good product i love it',
    'i hate this and i will not buy it',
]
test1, test2 = test

# Encode with the tokenizer fitted on the training reviews, then right-pad to
# the same length the model's input layer was built with.
test_token = tokenizer.texts_to_sequences(test)
test_pad = pad_sequences(test_token, maxlen=max_len, padding='post')

In [28]:
predictions = model.predict(test_pad)

# The model emits one sigmoid score per input, shape (n, 1); flatten it so each
# comparison is against a plain scalar rather than a one-element array.
# Label 0 was assigned to every training row whose Sentiment was not
# 'Positive', so 'negative' here effectively means "not positive".
sentiment = ['positive' if score > 0.5 else 'negative' for score in predictions.ravel()]
sentiment


['positive', 'Negetive']