In [33]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import keras
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import math
import nltk

In [34]:
data = pd.read_csv('IMDB Dataset.csv')

In [61]:
data

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode y...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there family little boy jake think t...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
49995,thought movie right good job wasnt creative or...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary school nu...,negative
49998,im going disagree previous comment side maltin...,negative


# Data Preprocessing
Remove HTML tags, URLs, and non-alphanumeric characters from the reviews

In [63]:
def remove_tags(string):
    # Remove HTML tags
    result = re.sub(r'<[^>]*>', '', string)
    # Remove URLs
    result = re.sub(r'https?://\S+', '', result)
    # Remove non-alphanumeric characters
    result = re.sub(r'[^a-zA-Z0-9\s]', '', result)
    result = result.lower()
    return result
data['review']=data['review'].apply(lambda cw : remove_tags(cw))
data

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode y...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there family little boy jake think t...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
49995,thought movie right good job wasnt creative or...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary school nu...,negative
49998,im going disagree previous comment side maltin...,negative


# Remove Stopwords

In [37]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data['review'] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shrikantvarma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Lemmatization
We now perform lemmatization on the text. Lemmatization is a useful technique in NLP to obtain the root form of words, known as lemmas. For example, the lemma of the words reading, reads, read is read. This helps save unnecessary computational overhead in trying to decipher entire words, as the meanings of most words are well-expressed by their separate lemmas. We perform lemmatization using the WordNetLemmatizer() from nltk. The text is first broken into individual words using the WhitespaceTokenizer() from nltk. We write a function lemmatize_text to perform lemmatization on the individual tokens.

In [38]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shrikantvarma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [39]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st
data['review'] = data.review.apply(lemmatize_text)
data

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode y...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there family little boy jake think t...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
49995,thought movie right good job wasnt creative or...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary school nu...,negative
49998,im going disagree previous comment side maltin...,negative


In [65]:
s = 0.0
for i in data['review']:
    word_list = i.split()
    s = s + len(word_list)
print("Average length of each review : ",s/data.shape[0])
pos = 0
for i in range(data.shape[0]):
    if data.iloc[i]['sentiment'] == 'positive':
        pos = pos + 1
neg = data.shape[0]-pos
print("Percentage of reviews with positive sentiment is "+str(pos/data.shape[0]*100)+"%")
print("Percentage of reviews with negative sentiment is "+str(neg/data.shape[0]*100)+"%")


Average length of each review :  119.79238
Percentage of reviews with positive sentiment is 50.0%
Percentage of reviews with negative sentiment is 50.0%


# Encoding Labels and Making Train-Test Splits


In [41]:
reviews = data['review'].values
labels = data['sentiment'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)


In [67]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels)

The following code sets up hyperparameters for text preprocessing, including vocabulary size, embedding dimensions, and sequence length, then initializes and fits a tokenizer on the training sentences to create a word index. It converts the training and test sentences into sequences of tokens based on this word index and pads these sequences to a uniform length. The padding is done at the end of each sequence to ensure they all have the same length, which is necessary for consistent input size in neural network models.

In [68]:
# Hyperparameters of the model
vocab_size = 3000 # choose based on statistics
oov_tok = ''
embedding_dim = 100
max_length = 200 # choose based on statistics, for example 150 to 200
padding_type='post'
trunc_type='post'
# tokenize sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

In [53]:
# model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
# compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# model summary
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 200, 100)          300000    
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              84480     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 24)                3096      
                                                                 
 dense_5 (Dense)             (None, 1)                 25        
                                                                 
Total params: 387,601
Trainable params: 387,601
Non-trainable params: 0
_________________________________________________________________


2024-05-27 13:13:32.591512: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-05-27 13:13:32.591939: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-05-27 13:13:32.592516: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [55]:
num_epochs = 5
history = model.fit(train_padded, train_labels, 
                    epochs=num_epochs, verbose=1, 
                    validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [69]:
# reviews on which we need to predict
sentence = ["The movie was very touching and heart whelming", 
            "I have never seen an awesome movie like this", 
            "the movie plot was great but it had terrible acting",
           "I thought the movie would be boring",
           "I thought the movie would be boring, but it was surprisingly good!",
               "The movie was an absolute masterpiece, with stunning visuals and a compelling storyline.",
    "I loved every minute of the film; the acting was top-notch, and the soundtrack was beautiful.",
    "The plot was incredibly dull, and the characters were poorly developed. A complete waste of time.",
    "I found the movie to be a predictable and uninspired rehash of better films.",
    "The movie had an interesting concept, but the execution fell flat, and the pacing was all over the place.",
    "While the cinematography was breathtaking, the plot was convoluted and hard to follow.",
    "This film exceeded all my expectations with its brilliant script and stellar performances. A must-watch!",
    "An emotionally gripping story that kept me on the edge of my seat. Highly recommend it.",
    "The special effects were outdated, and the dialogue was cringeworthy. I couldn't wait for it to end.",
    "Despite a few decent scenes, the overall experience was boring and forgettable.",
    "The first half of the movie was engaging, but it lost momentum in the second half and ended on a weak note.",
    "I appreciated the artistic direction, but the acting was subpar, and the story lacked coherence."
]
# convert to a sequence
sequences = tokenizer.texts_to_sequences(sentence)
# pad the sequence
padded = pad_sequences(sequences, padding='post', maxlen=max_length)
# Get labels based on probability 1 if p>= 0.5 else 0
prediction = model.predict(padded)
pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
for i in range(len(sentence)):
    print(sentence[i])
    if pred_labels[i] == 1:
        s = 'Positive'
    else:
        s = 'Negative'
    print("Predicted sentiment : ",s)

The movie was very touching and heart whelming
Predicted sentiment :  Positive
I have never seen an awesome movie like this
Predicted sentiment :  Positive
the movie plot was great but it had terrible acting
Predicted sentiment :  Positive
I thought the movie would be boring
Predicted sentiment :  Positive
I thought the movie would be boring, but it was surprisingly good!
Predicted sentiment :  Positive
The movie was an absolute masterpiece, with stunning visuals and a compelling storyline.
Predicted sentiment :  Positive
I loved every minute of the film; the acting was top-notch, and the soundtrack was beautiful.
Predicted sentiment :  Negative
The plot was incredibly dull, and the characters were poorly developed. A complete waste of time.
Predicted sentiment :  Positive
I found the movie to be a predictable and uninspired rehash of better films.
Predicted sentiment :  Negative
The movie had an interesting concept, but the execution fell flat, and the pacing was all over the place.
P