In [1]:
import pandas as pd

In [3]:
data = pd.read_csv('IMDB Dataset.csv')

In [17]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [19]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [21]:
data.shape

(50000, 2)

In [23]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [29]:
data['sentiment'] = data['sentiment'].map({'positive':1, 'negative':0})

In [37]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. <br /><br />the...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


In [39]:
# Lowercasing
data['review'] = data['review'].str.lower()

In [41]:
import re

In [43]:
# removing HTML Tags
def tags_removal(text):
    return re.sub(r'<.*?>', '', text)

In [47]:
data['review'] = data['review'].apply(tags_removal)

In [49]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


In [51]:
# removing Punctuation
import string

In [75]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [79]:
data['review'] = data['review'].apply(remove_punctuation)

In [71]:
data.drop(columns=['n_review'], inplace=True)

In [65]:
type(data['review'][0])

str

In [81]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [97]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [89]:
# stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [91]:
def stemming_on(text):
    return ' '.join([ps.stem(word) for word in text.split()])

In [93]:
data['review'].apply(stemming_on)

0        one of the other review ha mention that after ...
1        a wonder littl product the film techniqu is ve...
2        i thought thi wa a wonder way to spend time on...
3        basic there a famili where a littl boy jake th...
4        petter mattei love in the time of money is a v...
                               ...                        
49995    i thought thi movi did a down right good job i...
49996    bad plot bad dialogu bad act idiot direct the ...
49997    i am a cathol taught in parochi elementari sch...
49998    im go to have to disagre with the previou comm...
49999    no one expect the star trek movi to be high ar...
Name: review, Length: 50000, dtype: object

In [94]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [177]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [179]:
print(train_data.shape)
print(test_data.shape)

(40000, 2)
(10000, 2)


In [181]:
tokenizer = Tokenizer(num_words=5000) #most repeated 5000 words in corpus
tokenizer.fit_on_texts(train_data['review']) #this func fit the tokenizer on review and a vocab dict will be created and ranked
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['review']), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['review']), maxlen=200)

In [183]:
X_train[0].shape

(200,)

In [185]:
X_test

array([[   0,    0,    0, ...,    8,  970, 2952],
       [ 133,    3, 3696, ...,   63,   46,    9],
       [   0,    0,    0, ...,   50, 1056,   89],
       ...,
       [   0,    0,    0, ...,  121,  194, 3319],
       [   0,    0,    0, ..., 1044,    1, 2209],
       [   0,    0,    0, ...,    1,  334,   27]])

In [187]:
Y_train = train_data['sentiment']
Y_test = test_data['sentiment']

In [188]:
Y_train

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64

## LSTM Model

In [190]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))



In [191]:
model.summary()

In [192]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [193]:
model.fit(X_train, Y_train, epochs=5, batch_size=100, validation_split=0.2)

Epoch 1/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 194ms/step - accuracy: 0.7181 - loss: 0.5391 - val_accuracy: 0.8431 - val_loss: 0.3615
Epoch 2/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 184ms/step - accuracy: 0.8484 - loss: 0.3563 - val_accuracy: 0.8683 - val_loss: 0.3221
Epoch 3/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 182ms/step - accuracy: 0.8791 - loss: 0.3032 - val_accuracy: 0.8594 - val_loss: 0.3398
Epoch 4/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 191ms/step - accuracy: 0.8864 - loss: 0.2831 - val_accuracy: 0.8709 - val_loss: 0.3223
Epoch 5/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 186ms/step - accuracy: 0.9027 - loss: 0.2513 - val_accuracy: 0.8461 - val_loss: 0.3553


<keras.src.callbacks.history.History at 0x287f684f750>

## Evaluation

In [195]:
loss, accuracy = model.evaluate(X_test, Y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.8475 - loss: 0.3550
Test Loss: 0.3538897633552551
Test Accuracy: 0.8485000133514404


### Predictive System for a new review

In [197]:
def predict_sentiment(review):
    # tokenize and pad the review
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=200)
    prediction = model.predict(padded_sequence)
    sentiment = 'positive' if prediction[0][0] > 0.5 else 'negative'
    return sentiment

In [198]:
new_review = 'This is a bad movie'
sentiment = predict_sentiment(new_review)
sentiment

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 235ms/step


'negative'

In [199]:
new_review = 'This is a good movie'
sentiment = predict_sentiment(new_review)
sentiment

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step


'positive'