In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('IMDB Dataset.csv')

In [199]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
X = df.iloc[:, 0].to_numpy()
y = df.iloc[:, 1].to_numpy()

In [6]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [7]:
## Input tokenization and embedding
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
import re
def tokenize_preprocessing(data):
    corpus = []
    for sent in data:
        review = re.sub('[^a-zA-Z0-9]', ' ', sent)
        review = review.lower()
        review = review.split()
        review = [ps.stem(word) for word in review if word not in stop_words]
        review = ' '.join(review)
        corpus.append(review)

    return corpus

corpus = tokenize_preprocessing(X)

## Dataset loading and Pytorch

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader, Dataset
import numpy as np

In [11]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [13]:
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

In [14]:
vocab = build_vocab_from_iterator(
    yield_tokens(corpus),
    specials=["<unk>", "<pad>"]
)
vocab.set_default_index(vocab["<unk>"])

In [21]:
# Find the maximum length
max_len = max(len(sentence.split()) for sentence in corpus)

In [171]:
import tensorflow as tf
from tensorflow import keras


In [172]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
vocab = tokenizer.word_index

In [173]:
input_sequences = []

for sentence in corpus:
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    # text_indices = token_list + [vocab['<PAD>']] * (max_len - len(token_list))
    input_sequences.append(token_list)

In [174]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

In [180]:
X = padded_sequences

In [179]:
voc_size = len(vocab)+1
voc_size

70763

In [184]:
from keras import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional

model = Sequential()
model.add(Embedding(vocab_size, embed_size, input_length=max_len))
model.add(Bidirectional(LSTM(150)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [185]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 1455, 300)         21229200  
                                                                 
 bidirectional_3 (Bidirecti  (None, 300)               541200    
 onal)                                                           
                                                                 
 dropout_3 (Dropout)         (None, 300)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 301       
                                                                 
Total params: 21770701 (83.05 MB)
Trainable params: 21770701 (83.05 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [186]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [187]:
model.fit(X_train, y_train, epochs=5, validation_split=0.2, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x78c18ab21960>

In [188]:
predictions = model.predict(X_test)



In [189]:
predictions = [np.round(prediction) for prediction in predictions]

In [190]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.8787
Precision: 0.8768715524034673
Recall: 0.8833101805913872
F1 Score: 0.8800790904597133


In [204]:
def prediction_making(data, model):
    corpus = tokenize_preprocessing(data)
    input_sequences = []
    for sentence in corpus:
        token_list = tokenizer.texts_to_sequences([sentence])[0]
        # text_indices = token_list + [vocab['<PAD>']] * (max_len - len(token_list))
        input_sequences.append(token_list)
    padded_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')
    X = padded_sequences
    predictions = model.predict(X)
    predictions = [np.round(prediction) for prediction in predictions]
    return predictions

In [205]:
data = [
    "This is a nice movie.",
    "What a waste of time",
    "How can someone watch anything like this, this was such a, uff",
    "I was attached to my seat the whole time and couldn't take my eyes off for a single second also"
]

In [206]:
prediction = prediction_making(data, model)



In [207]:
prediction

[array([1.], dtype=float32),
 array([0.], dtype=float32),
 array([0.], dtype=float32),
 array([1.], dtype=float32)]

In [200]:
model.save('sentiment_analysis_model.h5')

  saving_api.save_model(


In [208]:
from tensorflow.keras.models import load_model

model = load_model('sentiment_analysis_model.h5')

In [209]:
def predict_statement(sentence):
  sentence = [sentence]
  return "Positive" if prediction_making(sentence, model)[0] == 1 else "Negative"

In [211]:
import gradio as gr
iface = gr.Interface(
    fn=predict_statement,
    inputs=gr.Textbox(lines=2, placeholder="Enter statement here..."),
    outputs="text",
    title="Sentiment Analysis",
    description="Enter a statement and get a sentiment prediction."
)

In [212]:
iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://29bd82e2d4c263d2a3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


