In [1]:
# importing all the necessary library

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D, MaxPooling1D, Conv1D, Bidirectional
from tensorflow.keras.layers import Embedding
from nltk.tokenize import word_tokenize
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
df = pd.read_csv(r'zomato_reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,rating,review
0,0,5,nice
1,1,5,"best biryani , so supportive staff of outlet ,..."
2,2,4,delivery boy was very decent and supportive.👌👍
3,3,1,"worst biryani i have tasted in my life, half o..."
4,4,5,all food is good and tasty . will order again ...


In [3]:
print("Number of rows in data:", df.shape[0])
print("Number of columns in data:", df.shape[1])

Number of rows in data: 5479
Number of columns in data: 3


In [4]:
df.rating.value_counts()

5    2288
1    1891
3     474
4     458
2     368
Name: rating, dtype: int64

In [5]:
# Check the null values
df.isnull().sum()

Unnamed: 0    0
rating        0
review        1
dtype: int64

In [6]:
# drop a row contain a null value
df = df.dropna()

In [7]:
df.isnull().sum()

Unnamed: 0    0
rating        0
review        0
dtype: int64

In [8]:
#We need only two columns rating and review, so we will drop other columns

df=df.drop(columns=['Unnamed: 0'])

* We must convert the rating, which has a value range of 1 to 5, in order to do an easy sentiment analysis.
* We shall treat ratings of 1, 2, and 3 as 0 (negative) and 1 (positive).

### this is code we here seprate 1,2,3 as neagative and 4,5 as positive

In [9]:
df.rating=df.rating.replace([1,2,3],0)
df.rating=df.rating.replace([4,5],1)

In [10]:
df.head()

Unnamed: 0,rating,review
0,1,nice
1,1,"best biryani , so supportive staff of outlet ,..."
2,1,delivery boy was very decent and supportive.👌👍
3,0,"worst biryani i have tasted in my life, half o..."
4,1,all food is good and tasty . will order again ...


In [11]:
df.rating.value_counts()

1    2745
0    2733
Name: rating, dtype: int64

In [12]:
X = df["review"]
y = df["rating"]

In [13]:
X.head()

0                                                 nice
1    best biryani , so supportive staff of outlet ,...
2       delivery boy was very decent and supportive.👌👍
3    worst biryani i have tasted in my life, half o...
4    all food is good and tasty . will order again ...
Name: review, dtype: object

## Text Preprocessing

We must first eliminate the unwanted words from our review column using text preprocessing.
Here, I've completed some of the simplest text preprocessing steps.

In [14]:
def stringprocess(text):
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    
    return text

In [15]:
from string import digits
import string
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def  textpreprocess(text):
    
    text = map(lambda x: x.lower(), text) # Lower case
    text = map(lambda x: re.sub(r"https?://\S+|www\.\S+", "", x), text) # Remove Links
    text = map(lambda x: re.sub(re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"),"", x), text) # Remove html tags
    text = map(lambda x: re.sub(r'[^\x00-\x7f]',r' ', x), text) # Remove non-ASCII characters 
    # Remove special special characters, including symbols, emojis, and other graphic characters

    emoji_pattern = re.compile(
            '['
            u'\U0001F600-\U0001F64F'  # emoticons
            u'\U0001F300-\U0001F5FF'  # symbols & pictographs
            u'\U0001F680-\U0001F6FF'  # transport & map symbols
            u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
            u'\U00002702-\U000027B0'
            u'\U000024C2-\U0001F251'
            ']+',
            flags=re.UNICODE)

    text = map(lambda x: emoji_pattern.sub(r'', x), text) 
    text = map(lambda x: x.translate(str.maketrans('', '', string.punctuation)), text) # Remove punctuations
    
#     text = text.apply(lambda x: TextBlob(x).correct()) # Spelling correction
    
    remove_digits = str.maketrans('', '', digits)
    text = [i.translate(remove_digits) for i in text]
    text = [w for w in text if not w in stop_words]
    text = ' '.join([lemmatizer.lemmatize(w) for w in text])
    text = text.strip()
    return text

In [16]:
import nltk
#nltk.download('omw-1.4')

In [17]:
X = X.apply(lambda x: stringprocess(x))
word_tokens = X.apply(lambda x: word_tokenize(x))

preprocess_text = word_tokens.apply(lambda x: textpreprocess(x))

This dataset has categorical labels. Machines can only understand numerical data. Utilizing the factorise() function, change the categorical variables to numeric values. This gives back a category index and an array of numerical values.

In [18]:
## Train and Test (validation) for this dataset

In [19]:
training_portion = 0.9
train_size = int(len(preprocess_text) * training_portion)

train_data = preprocess_text[0: train_size]
train_labels = np.array(y[0: train_size])

validation_data = preprocess_text[train_size:]
validation_labels = np.array(y[train_size:])


print(len(train_data))
print(len(train_labels))
print(len(validation_data))
print(len(validation_labels))

4930
4930
548
548


In [20]:
vocab_size = 5000
oov_tok = '<OOV>'

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(preprocess_text)
word_index = tokenizer.word_index
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 'br': 2,
 'good': 3,
 'food': 4,
 'taste': 5,
 'order': 6,
 'bad': 7,
 'quality': 8,
 'pizza': 9,
 'nice': 10}

In [21]:
train_sequences = tokenizer.texts_to_sequences(train_data)
print(train_sequences[10])

[71, 411, 1212, 5, 18, 723, 313, 2233]


In [22]:
print(train_sequences[11])

[3, 5, 150, 896, 897, 1510, 118, 266, 3, 4, 56]


In [23]:
embedding_dim = 32
max_length = 70
trunc_type = 'post'  # remove or truncate last words in sentences if max_length > 50 ans "post" defined last at sentence
padding_type = 'post'

In [24]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(len(train_sequences[1]))
print(len(train_padded[1]))
print(train_padded.shape)

12
70
(4930, 70)


In [25]:
validation_sequences = tokenizer.texts_to_sequences(validation_data)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(len(validation_sequences))
print(validation_padded.shape)

548
(548, 70)


In [26]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_data(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_data(train_padded[10]))
print('---')
print(train_data[10])

full paisa vasool taste best bawarchi aur everest ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
---
full paisa vasool taste best bawarchi aur everest


Our machine learning model should be able to understand the text data that we have. In essence, we have to turn the text into a collection of vector embeddings. Word embeddings are a wonderful method to show how the words in a text relate to one another.

To accomplish this, we first assign a different number to each of the distinctive words before replacing the word in question with the number.

Let's use Tokenizer to tokenize every word in the text. Tokenization is the process of disassembling a text's words and phrases into discrete units called tokens.

![1_sAJdxEsDjsPMioHyzlN3_A.png](attachment:c333147e-555f-47f1-b18e-7a521e7c6c32.png)


In [29]:
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=70))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(32,return_sequences=True)))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.4))
model.add(LSTM(32,return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 70, 128)           640000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 70, 128)           49280     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 35, 128)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 35, 64)            41216     
_________________________________________________________________
lstm_4 (LSTM)                (None, 35, 64)            33024     
_________________________________________________________________
dropout_2 (Dropout)          (None, 35, 64)            0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 32)               

![image.png](attachment:58092bf1-a45f-4416-a269-af735cf635b9.png)

In [30]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

num_epochs = 20
history = model.fit(train_padded, train_labels, epochs=num_epochs, verbose=2, validation_data=(validation_padded, validation_labels))

Train on 4930 samples, validate on 548 samples
Epoch 1/20
4930/4930 - 22s - loss: 0.6936 - acc: 0.5012 - val_loss: 0.6932 - val_acc: 0.3923
Epoch 2/20
4930/4930 - 18s - loss: 0.6929 - acc: 0.5176 - val_loss: 0.7027 - val_acc: 0.3668
Epoch 3/20
4930/4930 - 18s - loss: 0.6903 - acc: 0.5331 - val_loss: 0.7206 - val_acc: 0.3960
Epoch 4/20
4930/4930 - 18s - loss: 0.6447 - acc: 0.6138 - val_loss: 0.7546 - val_acc: 0.4380
Epoch 5/20
4930/4930 - 18s - loss: 0.5558 - acc: 0.7004 - val_loss: 0.9823 - val_acc: 0.4562
Epoch 6/20
4930/4930 - 17s - loss: 0.4779 - acc: 0.7434 - val_loss: 0.9508 - val_acc: 0.5493
Epoch 7/20
4930/4930 - 17s - loss: 0.4356 - acc: 0.7773 - val_loss: 1.1707 - val_acc: 0.5073
Epoch 8/20
4930/4930 - 17s - loss: 0.3969 - acc: 0.7923 - val_loss: 1.2240 - val_acc: 0.4763
Epoch 9/20
4930/4930 - 17s - loss: 0.3709 - acc: 0.8112 - val_loss: 1.3356 - val_acc: 0.4398
Epoch 10/20
4930/4930 - 21s - loss: 0.3592 - acc: 0.8219 - val_loss: 1.4848 - val_acc: 0.4690
Epoch 11/20
4930/4930 

In [31]:
string = ['worst biryani i have tasted in my life and food quality was also bad']
#vectorizing the tweet by the pre-fitted tokenizer instance
token = tokenizer.texts_to_sequences(string)
#padding the tweet to have exactly the same shape as `embedding_2` input
token_list = pad_sequences(token, maxlen=max_length-1, padding=padding_type, truncating=trunc_type)
# print(token_list)
sentiment = model.predict(token_list,batch_size=2,verbose = 2)[0]
print(string)
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

ValueError: Error when checking input: expected embedding_1_input to have shape (70,) but got array with shape (69,)

#### 