In [28]:
import re
import string
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from gensim.models import Word2Vec, KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import LSTM, Bidirectional, Dropout, Dense, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras import Model, Input
from keras.callbacks import ReduceLROnPlateau, EarlyStopping


## Pre-processing

In [32]:
data = pd.read_csv("Restaurant reviews.csv")

In [33]:
data = data.drop(['Restaurant', 'Reviewer','Metadata', 'Time', 'Pictures', '7514'], axis=1)

In [34]:
data = data.dropna()

In [35]:
data = data[data.Rating != 'Like']

In [36]:
data['Rating'] = data['Rating'].astype('float')

In [28]:
url_pattern = re.compile(r'https?://\S+|www\.\S+')
punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
newline_pattern = re.compile('\n')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tag_dict = {'J': wordnet.ADJ,'N': wordnet.NOUN,'V': wordnet.VERB,'R': wordnet.ADV}

In [29]:
def preprocess_text(text):
    text = str(text).lower()
    clean_text = re.sub('https?://\S+|www\.\S+', '', text) #URLs
    clean_text = re.sub('[%s]' % re.escape(string.punctuation), '', clean_text) #Punctuations
    clean_text = re.sub('\n', '', clean_text) #Backslash n
    clean_text = [word for word in clean_text.split(' ') if word not in stop_words] #Remove stopwords
    # clean_text =" ".join(clean_text)
    # tokens = clean_text.split()
    pos_tags = nltk.pos_tag(clean_text)
    wordnet_tags = [(token, tag_dict.get(tag[0].upper(), wordnet.NOUN)) for token, tag in pos_tags]
    tokens = [lemmatizer.lemmatize(token, tag) for token, tag in wordnet_tags] #Lemmatize
    return " ".join(tokens)

In [30]:
data['Clean_Review'] = data['Review'].apply(preprocess_text)

In [31]:
data

Unnamed: 0,Review,Rating,Clean_Review
0,"The ambience was good, food was quite good . h...",5.0,ambience good food quite good saturday lunch ...
1,Ambience is too good for a pleasant evening. S...,5.0,ambience good pleasant even service prompt foo...
2,A must try.. great food great ambience. Thnx f...,5.0,must try great food great ambience thnx servic...
3,Soumen das and Arun was a great guy. Only beca...,5.0,soumen das arun great guy behavior sincerety g...
4,Food is good.we ordered Kodi drumsticks and ba...,5.0,food goodwe order kodi drumstick basket mutton...
...,...,...,...
9995,Madhumathi Mahajan Well to start with nice cou...,3.0,madhumathi mahajan well start nice courteous s...
9996,This place has never disappointed us.. The foo...,4.5,place never disappoint u food courteous staff ...
9997,"Bad rating is mainly because of ""Chicken Bone ...",1.5,bad rating mainly chicken bone find veg food a...
9998,I personally love and prefer Chinese Food. Had...,4.0,personally love prefer chinese food couple tim...


In [33]:
data['Sentiment'] = (data['Rating'] > 3).astype('int')

In [52]:
data.drop(['Review', 'Rating'], axis=1, inplace=True)

In [None]:
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

In [None]:
data['tokens'] = data['Clean_Review'].map(lambda x: word_tokenize(x)) 

In [85]:
data.to_csv('Cleaned_dataset.csv', index=False)

In [61]:
pdata = data[data['Sentiment'] == 1]
ndata = data[data['Sentiment'] == 0]

In [64]:
import collections
all_words = []


for line in list(pdata['Clean_Review']):
  words = line.split()
  for word in words:
    all_words.append(word.lower())
    
collections.Counter(all_words).most_common(15)

[('good', 4922),
 ('place', 4254),
 ('food', 3704),
 ('chicken', 1804),
 ('service', 1730),
 ('taste', 1662),
 ('great', 1488),
 ('order', 1441),
 ('try', 1424),
 ('visit', 1279),
 ('one', 1265),
 ('best', 1241),
 ('time', 1229),
 ('really', 1220),
 ('ambience', 1195)]

In [65]:
import collections
all_words = []


for line in list(ndata['Clean_Review']):
  words = line.split()
  for word in words:
    all_words.append(word.lower())
    
collections.Counter(all_words).most_common(15)

[('food', 2088),
 ('order', 1906),
 ('good', 1790),
 ('place', 1557),
 ('chicken', 1093),
 ('taste', 1084),
 ('bad', 1059),
 ('service', 898),
 ('go', 745),
 ('like', 711),
 ('time', 699),
 ('get', 660),
 ('one', 649),
 ('restaurant', 626),
 ('even', 585)]

## Training

In [2]:
data = pd.read_csv("Cleaned_dataset.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9940 entries, 0 to 9939
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Review        9940 non-null   object 
 1   Rating        9940 non-null   float64
 2   Clean_Review  9940 non-null   object 
 3   Sentiment     9940 non-null   int64  
 4   tokens        9940 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 388.4+ KB


In [4]:
# sentence_bow = vectorizer.fit_transform(data['Clean_Review'])
x_train, x_test, y_train, y_test = train_test_split(data['Clean_Review'], data['Sentiment'], test_size=0.2, random_state=42)

In [57]:
reviews = data['Clean_Review']
reviews_list = []
for i in range(len(reviews)):
    reviews_list.append(reviews[i])

sentiment = data['Sentiment']
y = []
for i in range(len(sentiment)):
  y.append(sentiment[i])

In [58]:
reviews_list

['ambience good food quite good  saturday lunch  cost effective good place sate brunch one also chill friend parentswaiter soumen das really courteous helpful',
 'ambience good pleasant even service prompt food good good experience soumen das  kudos service',
 'must try great food great ambience thnx service pradeep subroto personal recommendation penne alfredo pasta  also music background amazing',
 'soumen das arun great guy behavior sincerety good food course would like visit place',
 'food goodwe order kodi drumstick basket mutton biryani good thanks pradeep serve well enjoy ambience also good',
 'ambiance good service good food apradeecp subro best servicefood good papiya good hostess ur caption good 4star restaurant',
 'nice place ambience different food order tasty service also gud worth visit reasonable well really must visit place',
 'well read many review finally visit placeambience good come food crispy corn nice tawa fish ok basket biryani disappoint u biryani ok flattering

In [21]:
def lstm_model(input_shape):

  X_indices = Input(input_shape)

  embeddings = embedding_layer(X_indices)

  X = LSTM(128, return_sequences=True)(embeddings)

  X = Dropout(0.6)(X)

  X = LSTM(128, return_sequences=True)(X)

  X = Dropout(0.6)(X)

  X = LSTM(128)(X)

  X = Dense(1, activation='sigmoid')(X)

  model = Model(inputs=X_indices, outputs=X)

  return model

In [6]:
def conv1d_model(input_shape):

  X_indices = Input(input_shape)

  embeddings = embedding_layer(X_indices)

  X = Conv1D(512,3,activation='relu')(embeddings)
  
  X = MaxPooling1D(3)(X)

  X = Conv1D(256,3,activation='relu')(X)
  
  X = MaxPooling1D(3)(X)

  X = Conv1D(256,3,activation='relu')(X)
  X = Dropout(0.8)(X)
  X = MaxPooling1D(3)(X)

  X = GlobalMaxPooling1D()(X)

  X = Dense(256, activation='relu')(X)
  X = Dense(1, activation='sigmoid')(X)

  model = Model(inputs=X_indices, outputs=X)

  return model
                

In [None]:
callbacks = [EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

## Word Embeddings

### Bag of Words

In [50]:
vectorizer = CountVectorizer(max_features=10000)
x_train_bow = vectorizer.fit_transform(x_train)
x_test_bow = vectorizer.transform(x_test)

In [54]:
x_train_bow

<7963x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 174996 stored elements in Compressed Sparse Row format>

In [55]:
clf_bow = RandomForestClassifier()
clf_bow.fit(x_train_bow, y_train)

In [56]:
y_pred_bow = clf_bow.predict(x_test_bow)
accuracy_bow = accuracy_score(y_test, y_pred_bow)
print("Accuracy using Bag of Words:", accuracy_bow)

Accuracy using Bag of Words: 0.8598694123556002


### Tf-idf vectorizer

In [53]:
vectorizer_tfidf = TfidfVectorizer(max_features=10000, lowercase=True)
x_train_tfidf = vectorizer_tfidf.fit_transform(x_train)
x_test_tfidf = vectorizer_tfidf.transform(x_test)

In [54]:
clf_tfidf = RandomForestClassifier()
clf_tfidf.fit(x_train_tfidf, y_train)

In [55]:
y_pred_tfidf = clf_tfidf.predict(x_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print("Accuracy using TF IDF:", accuracy_tfidf)

Accuracy using TF IDF: 0.8455734406438632


### GloVe

In [42]:
tokenizer = Tokenizer(num_words=10000, oov_token="<oov>")
tokenizer.fit_on_texts(x_train)
words_to_index = tokenizer.word_index

In [8]:
len(words_to_index)

21183

In [9]:
with open('glove.6B.50d.txt', 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)

In [10]:
maxLen = 200

vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['the'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index, :] = embedding_vector

In [11]:
x_train_glove = tokenizer.texts_to_sequences(x_train)
x_test_glove = tokenizer.texts_to_sequences(x_test)

x_test_glove = pad_sequences(x_test_glove, maxlen=maxLen, padding='post')
x_train_glove = pad_sequences(x_train_glove, maxlen=maxLen, padding='post')

In [37]:
len(emb_matrix)

21183

In [14]:
embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)

In [17]:
model = lstm_model((maxLen,))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding (Embedding)       (None, 200, 50)           1059150   
                                                                 
 lstm (LSTM)                 (None, 200, 128)          91648     
                                                                 
 dropout (Dropout)           (None, 200, 128)          0         
                                                                 
 lstm_1 (LSTM)               (None, 200, 128)          131584    
                                                                 
 dropout_1 (Dropout)         (None, 200, 128)          0         
                                                                 
 lstm_2 (LSTM)               (None, 128)               131584

In [18]:
model_1d = conv1d_model((maxLen,))
model_1d.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding (Embedding)       (None, 200, 50)           1059150   
                                                                 
 conv1d (Conv1D)             (None, 198, 512)          77312     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 66, 512)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 64, 256)           393472    
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 21, 256)          0         
 1D)                                                       

In [20]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(x_train_glove, y_train, epochs=15, validation_data = (x_test_glove, y_test), callbacks=callbacks)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x23b7fa0c430>

In [19]:
model_1d.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_1d.fit(x_train_glove, y_train, epochs=15, validation_data = (x_test_glove, y_test), callbacks=callbacks)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x23a81d527c0>

In [66]:
model_1d.save_weights('glove_weights_con1vd.hdf5')
model.save_weights('glove_weights_lstm.hdf5')

In [None]:
x_train, x_test, y_train, y_test = train_test_split(pd.DataFrame(data['tokens']), pd.DataFrame(data['Sentiment']), test_size=0.2, random_state=42)

In [80]:
class W2vVectorizer(object):
    """Class to generate mean word embeddings from word vectors. This class is
    quoted from Flatiron School Curriculum Learn.co Mod 4 appendix"""
    def __init__(self, w2v):
        # Takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(word_to_vec_map))])
    
    # Note: Even though it doesn't do anything, 
    # it's required that this object implement a fit method or else
    # it can't be used in a scikit-learn pipeline  
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        # X should be a series of lists of tokens
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])
     

In [88]:
x_train

715     tried 5 high rating shawarma place hyderabad b...
809     good taste food nice ambiencestaff really frie...
3205    look north indian food must say liked alot amb...
7532    order special chicken biryani awsomeand good c...
6435                                        fast delivery
                              ...                        
5734                                      awesome service
5191                            reach delivery time thanq
5390        superb taste little delay get product deliver
860     amaze ipl offer600 4 pint  french friesnice am...
7270    absolutely delighted overall experience shop e...
Name: Clean_Review, Length: 7952, dtype: object

In [100]:
vectorizer = W2vVectorizer(word_to_vec_map)
x_train_glove = vectorizer.transform(x_train.tokens)

In [101]:
x_test_glove = vectorizer.transform(x_test.tokens)

In [102]:
clf_glove = RandomForestClassifier()
clf_glove.fit(x_train_glove, y_train)

  return fit_method(estimator, *args, **kwargs)


In [103]:
y_pred_glove = clf_glove.predict(x_test_glove)
accuracy_glove = accuracy_score(y_test, y_pred_glove)
print("Accuracy using Glove:", accuracy_glove)

Accuracy using Glove: 0.790241448692153


### Word2Vec

In [7]:
Embedding_dimensions = 100

Word2vec_train_data = list(map(lambda x: x.split(), x_train))

In [8]:
Word2vec_train_data

[['tried',
  '5',
  'high',
  'rating',
  'shawarma',
  'place',
  'hyderabad',
  'best',
  'fill',
  'place',
  'simply',
  'amaze',
  'special',
  'rumali',
  'best'],
 ['good',
  'taste',
  'food',
  'nice',
  'ambiencestaff',
  'really',
  'friendlysandipmusic',
  'really',
  'good',
  'atmosphere',
  'energeticits',
  'good',
  'place',
  'njoy',
  'friend',
  'family'],
 ['look',
  'north',
  'indian',
  'food',
  'must',
  'say',
  'liked',
  'alot',
  'ambience',
  'good',
  'service',
  'good',
  'regular',
  'place',
  'go',
  'place',
  'would',
  'say',
  'know',
  'miss',
  'place',
  'try',
  'schezwan',
  'fry',
  'rice',
  'kadai',
  'paneer',
  'roti',
  'friend',
  'plan',
  'try',
  'buffet',
  'next',
  'time',
  '45',
  'side',
  '❤😊'],
 ['order',
  'special',
  'chicken',
  'biryani',
  'awsomeand',
  'good',
  'crevice',
  'tasty',
  'good',
  'ambience',
  '35',
  'food',
  '55',
  'service',
  '55',
  'expense',
  'good'],
 ['fast', 'delivery'],
 ['pathetic',
 

In [9]:
word2vec_model = Word2Vec(Word2vec_train_data,
                 vector_size=100,
                 workers=8,
                 min_count=5)

In [10]:
maxLen = 200
tokenizer = Tokenizer(num_words=10000, oov_token="<oov>")
tokenizer.fit_on_texts(x_train)

In [11]:
words_to_index = tokenizer.word_index
vocab_len = len(words_to_index)

In [12]:
x_train_w2v = tokenizer.texts_to_sequences(x_train)
x_test_w2v = tokenizer.texts_to_sequences(x_test)

x_test_w2v = pad_sequences(x_test_w2v, maxlen=maxLen, padding='post')
x_train_w2v = pad_sequences(x_train_w2v, maxlen=maxLen, padding='post')

In [14]:
x_train_w2v.shape

(7952, 200)

In [15]:
emb_matrix = np.zeros((vocab_len, Embedding_dimensions))

for word, token in tokenizer.word_index.items():
    if word2vec_model.wv.__contains__(word):
        emb_matrix[token] = word2vec_model.wv.__getitem__(word)

In [17]:
emb_matrix.shape

(21184, 100)

In [18]:
embedding_layer = Embedding(input_dim=vocab_len, output_dim=Embedding_dimensions, input_length=maxLen, weights = [emb_matrix], trainable=False)

In [22]:
model = lstm_model((maxLen,))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding (Embedding)       (None, 200, 100)          2118400   
                                                                 
 lstm_1 (LSTM)               (None, 200, 128)          117248    
                                                                 
 dropout (Dropout)           (None, 200, 128)          0         
                                                                 
 lstm_2 (LSTM)               (None, 200, 128)          131584    
                                                                 
 dropout_1 (Dropout)         (None, 200, 128)          0         
                                                                 
 lstm_3 (LSTM)               (None, 128)               131584

In [25]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(x_train_w2v, y_train, epochs=15, validation_data = (x_test_w2v, y_test), callbacks=callbacks)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15


<keras.callbacks.History at 0x1d9043381f0>

### Doc2Vec

In [29]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(data['tokens'])]

In [55]:
doc2vec_model = Doc2Vec(documents, vector_size=50, window=3, min_count=1, workers=4)

In [37]:
data['tokens'][0]

"['ambience', 'good', 'food', 'quite', 'good', 'saturday', 'lunch', 'cost', 'effective', 'good', 'place', 'sate', 'brunch', 'one', 'also', 'chill', 'friend', 'parentswaiter', 'soumen', 'das', 'really', 'courteous', 'helpful']"

In [None]:
data['tokens'] = [simple_preprocess(line, deacc=True) for line in data['Clean_Review']]

In [69]:
x_train, x_test, y_train, y_test = train_test_split(pd.DataFrame(data['token']), pd.DataFrame(data['Sentiment']), test_size=0.2, random_state=42)

In [72]:
x_train_d2v = []
for index, row in x_train.iterrows():
    model_vector = doc2vec_model.infer_vector(row['token'])
    x_train_d2v.append(model_vector)

In [73]:
x_test_d2v = []
for index, row in x_test.iterrows():
    model_vector = doc2vec_model.infer_vector(row['token'])
    x_test_d2v.append(model_vector)

In [77]:
x_train_d2v = np.array(x_train_d2v)
x_test_d2v = np.array(x_test_d2v)

In [78]:
x_train_d2v.shape

(7952, 50)

In [61]:
data

Unnamed: 0,Review,Rating,Clean_Review,Sentiment,tokens
0,"The ambience was good, food was quite good . h...",5.0,ambience good food quite good saturday lunch ...,1,"['ambience', 'good', 'food', 'quite', 'good', ..."
1,Ambience is too good for a pleasant evening. S...,5.0,ambience good pleasant even service prompt foo...,1,"['ambience', 'good', 'pleasant', 'even', 'serv..."
2,A must try.. great food great ambience. Thnx f...,5.0,must try great food great ambience thnx servic...,1,"['must', 'try', 'great', 'food', 'great', 'amb..."
3,Soumen das and Arun was a great guy. Only beca...,5.0,soumen das arun great guy behavior sincerety g...,1,"['soumen', 'das', 'arun', 'great', 'guy', 'beh..."
4,Food is good.we ordered Kodi drumsticks and ba...,5.0,food goodwe order kodi drumstick basket mutton...,1,"['food', 'goodwe', 'order', 'kodi', 'drumstick..."
...,...,...,...,...,...
9935,Madhumathi Mahajan Well to start with nice cou...,3.0,madhumathi mahajan well start nice courteous s...,0,"['madhumathi', 'mahajan', 'well', 'start', 'ni..."
9936,This place has never disappointed us.. The foo...,4.5,place never disappoint u food courteous staff ...,1,"['place', 'never', 'disappoint', 'u', 'food', ..."
9937,"Bad rating is mainly because of ""Chicken Bone ...",1.5,bad rating mainly chicken bone find veg food a...,0,"['bad', 'rating', 'mainly', 'chicken', 'bone',..."
9938,I personally love and prefer Chinese Food. Had...,4.0,personally love prefer chinese food couple tim...,1,"['personally', 'love', 'prefer', 'chinese', 'f..."


In [79]:
clf_d2v = RandomForestClassifier()
clf_d2v.fit(x_train_d2v, y_train)

  return fit_method(estimator, *args, **kwargs)


In [80]:
y_pred_d2v = clf_d2v.predict(x_test_d2v)
accuracy_d2v = accuracy_score(y_test, y_pred_d2v)
print("Accuracy using d2v:", accuracy_d2v)

Accuracy using d2v: 0.6373239436619719
