In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM
from tensorflow.keras.utils import to_categorical
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hseth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
data_loc = "data/Reviews.csv"

In [3]:
df = pd.read_csv(data_loc)

In [4]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
df = df[:10000] # taking only top 10K 
df.shape

(10000, 10)

In [6]:
corpus = []
for i in range(0 , 10000):
    review = re.sub('[^a-zA-Z]',' ',df['Text'][i])
    review = review.lower() # converting to lower
    review = review.split() # spliting words
    ps = PorterStemmer() # stemming api to remove ing,es,s etc
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review) # recreating sentence using processed words
    corpus.append(review)

In [7]:
corpus = pd.DataFrame(corpus,columns=['Reviews'])
corpus.head()

Unnamed: 0,Reviews
0,bought sever vital can dog food product found ...
1,product arriv label jumbo salt peanut peanut a...
2,confect around centuri light pillowi citru gel...
3,look secret ingredi robitussin believ found go...
4,great taffi great price wide assort yummi taff...


In [8]:
# joining o/p
results = corpus.join(df[['Score']])
results.head()

Unnamed: 0,Reviews,Score
0,bought sever vital can dog food product found ...,5
1,product arriv label jumbo salt peanut peanut a...,1
2,confect around centuri light pillowi citru gel...,4
3,look secret ingredi robitussin believ found go...,2
4,great taffi great price wide assort yummi taff...,5


In [9]:
# now for sentiment analysis we are going to use score and convert to pos or neg
results.dropna(inplace=True)
results[results['Score']!=3] # taking non neutral reviews
results['Positivity'] = np.where(results['Score'] > 3,1,0)
results.drop(['Score'], axis = 1, inplace = True)
results.head()

Unnamed: 0,Reviews,Positivity
0,bought sever vital can dog food product found ...,1
1,product arriv label jumbo salt peanut peanut a...,0
2,confect around centuri light pillowi citru gel...,1
3,look secret ingredi robitussin believ found go...,0
4,great taffi great price wide assort yummi taff...,1


In [10]:
# tokenizing the sentences
max_features =30000
tokenizer = Tokenizer(num_words=max_features,split=" ")
tokenizer.fit_on_texts(results['Reviews'].values)
X = tokenizer.texts_to_sequences(results['Reviews'].values)
print(X)
X = pad_sequences(X)

[[66, 203, 3230, 181, 43, 16, 7, 57, 6, 104, 7, 48, 2, 1718, 508, 357, 168, 35, 5858, 1950, 882, 7, 35], [7, 221, 380, 2335, 133, 352, 352, 147, 126, 112, 2336, 124, 2337, 1005, 1809, 2631, 7, 2335], [3437, 228, 5859, 211, 7535, 1060, 2338, 398, 182, 7536, 418, 661, 781, 3686, 534, 198, 38, 661, 350, 1524, 918, 4, 186, 47, 494, 92, 1993, 1546, 469, 7537, 5860, 5861, 5862, 92, 7538, 4955, 432, 1309, 1428, 5861], [48, 2256, 80, 5863, 342, 57, 88, 384, 1638, 843, 633, 24, 6, 53, 603, 120, 4, 2101], [12, 2186, 12, 26, 1501, 1212, 494, 2186, 556, 407, 2186, 764, 268], [88, 1429, 1525, 2186, 24, 670, 420, 32, 2186, 69, 77, 4, 1484, 1638, 843, 4425, 832, 640, 377, 728, 74, 25, 453, 262, 535, 4, 304, 721, 60, 234, 247, 154, 83, 239, 15, 47, 39, 2186, 621, 92], [7539, 2186, 12, 4, 361, 918, 322, 573, 665, 36, 600, 322, 852, 624, 557, 193, 465, 7540, 15, 186, 47, 322, 161, 2339, 2049, 666, 312, 11], [2186, 6, 361, 918, 4, 306, 15, 117, 47, 18, 399], [106, 683, 1894, 87, 27, 722, 11, 2102, 228, 3

In [11]:
Y = pd.get_dummies(results['Positivity']).values
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,random_state = 42)
print(Xtrain.shape,Ytrain.shape)
print(Xtest.shape,Ytest.shape)

(7500, 932) (7500, 2)
(2500, 932) (2500, 2)


In [12]:
# deep learning model
embedded_dim = 150
lstm_out = 200
model =Sequential()
model.add(Embedding(max_features,embedded_dim,input_length=X.shape[1]))
model.add(LSTM(lstm_out))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 932, 150)          4500000   
_________________________________________________________________
lstm (LSTM)                  (None, 200)               280800    
_________________________________________________________________
dense (Dense)                (None, 2)                 402       
Total params: 4,781,202
Trainable params: 4,781,202
Non-trainable params: 0
_________________________________________________________________


In [14]:
# training the model 
model.fit(Xtrain,Ytrain,epochs=10,batch_size=32,verbose=1)

Train on 7500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x16230cf3788>

In [15]:
score,acc = model.evaluate(Xtest, Ytest, verbose = 2, batch_size = 32)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

2500/2500 - 3s - loss: 1.1584 - accuracy: 0.8344
score: 1.16
acc: 0.83


 testing on review

In [74]:
text = ["The Food we had enjoyed at the time of dinner.It was really delicious taste with great quality, everything had unique taste which we had ordered, nice arrangement and services from the staff while eating, we found nothing bad about this hotel."]
review = re.sub('[^a-zA-Z]',' ',text[0])
review = review.lower() # converting to lower
review = review.split() # spliting words
ps = PorterStemmer() # stemming api to remove ing,es,s etc
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review) # recreating sentence using processed words

In [75]:
reviews=[]
reviews.append(review)
reviews

['food enjoy time dinner realli delici tast great qualiti everyth uniqu tast order nice arrang servic staff eat found noth bad hotel']

In [76]:
reviews = tokenizer.texts_to_sequences(reviews)
reviews = pad_sequences(reviews,maxlen=932)

[[16, 69, 22, 448, 20, 84, 3, 12, 104, 346, 815, 3, 24, 72, 4443, 471, 2899, 27, 57, 283, 123, 2482]]


In [77]:
pred = model.predict(reviews)

In [78]:
print(pred)

[[2.9491412e-06 9.9999702e-01]]


In [79]:
if pred[0][0] < pred[0][1]:
    print("pos")
else:
    print("neg")

pos
