In [None]:
# Before starting the implementation try understanding this article
# https://www.linkedin.com/pulse/understanding-batch-normalization-layer-group-implementing-pasha-s/

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Dataset-SA.csv")
df.head()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral


In [3]:
df = df[['Review','Sentiment']]
df = df.dropna()
df.shape

(180388, 2)

In [4]:
df['Review'] = [str(text) for text in df['Review']]

In [5]:
from sklearn.preprocessing import LabelEncoder
# Create an instance of LabelEncoder
label_encoder = LabelEncoder()
# Fit the label encoder on the data and transform the data
encoded_data = label_encoder.fit_transform(df['Sentiment'])
df['Sentiment_Coded'] = encoded_data

In [6]:
# Create Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=1000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    #oov_token="<OOV>",
    analyzer=None,
    )
tokenizer.fit_on_texts(df['Review'])

In [7]:
max_sequence_length = max([len(i.split()) for i in df['Review']])
max_sequence_length

22

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df['Review'], df['Sentiment_Coded'], test_size=0.2, random_state=42, stratify=df['Sentiment_Coded'])

In [9]:
x_train, x_test, y_train, y_test  = list(x_train), list(x_test), list(y_train), list(y_test)

In [10]:
sequences_train = tokenizer.texts_to_sequences(x_train)
sequences_test =  tokenizer.texts_to_sequences(x_test)

In [11]:
# Pad Sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
pad_sequence_train = pad_sequences(sequences_train, maxlen=max_sequence_length)
pad_sequence_test = pad_sequences(sequences_test, maxlen=max_sequence_length)

In [12]:
vocab_size = len(tokenizer.index_word)
output_size = len(df['Sentiment'].unique())
vocab_size, output_size

(1320, 3)

In [13]:
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, LSTM, Bidirectional, Dropout, LayerNormalization, BatchNormalization
model = Sequential()
model.add(Input(shape=(max_sequence_length,)))
model.add(Embedding(input_dim=vocab_size+1, output_dim=128, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(units=128)))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(64,activation='relu'))
model.add(BatchNormalization())
model.add(Dense(32,activation='relu'))
model.add(BatchNormalization())
model.add(Dense(output_size,activation='sigmoid'))
model.compile(optimizer='adam', loss = 'categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 22, 128)           169088    
                                                                 
 bidirectional (Bidirection  (None, 22, 256)           263168    
 al)                                                             
                                                                 
 batch_normalization (Batch  (None, 22, 256)           1024      
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 22, 256)           0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 256)               394240    
 onal)                                                           
                                                        

In [15]:
model.fit(pad_sequence_train, y_train_categorical, epochs = 1, batch_size = 1028, validation_data=(pad_sequence_test, y_test_categorical))



<keras.src.callbacks.History at 0x2b5b1064610>

In [17]:
text= x_test[0]
sequences_inference = tokenizer.texts_to_sequences([text])
padded_sequences_inference = pad_sequences(sequences_inference,maxlen=max_sequence_length)

In [18]:
sequences_inference

[[32, 33, 6]]

In [19]:
model.predict(padded_sequences_inference)



array([[0.27361867, 0.26205122, 0.69570553]], dtype=float32)

In [20]:
model.predict(padded_sequences_inference[0].reshape(1,22))



array([[0.27361867, 0.26205122, 0.69570553]], dtype=float32)