In [8]:
import os
import torch

In [10]:
# Check device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [12]:
# Import Libraries
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.tokenize import word_tokenize
import urllib.request
import tarfile

In [14]:
# Check Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Fix random seeds
torch.manual_seed(7)
np.random.seed(7)


Using device: cuda


In [46]:
# Download NLTK punkt tokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\souvik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
# Download IMDB dataset if not exists
if not os.path.exists('aclImdb'):
    print("Downloading IMDB dataset...")
    urllib.request.urlretrieve(
        'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
        'aclImdb_v1.tar.gz'
    )
    with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
        tar.extractall()

In [16]:
# Read IMDB dataset
def load_imdb_data(data_path):
    texts, labels = [], []
    for label_type in ['pos', 'neg']:
        dir_path = os.path.join(data_path, label_type)
        for file_name in os.listdir(dir_path):
            with open(os.path.join(dir_path, file_name), encoding='utf-8') as f:
                texts.append(f.read())
                labels.append(1 if label_type == 'pos' else 0)
    return texts, labels

In [18]:
train_texts, train_labels = load_imdb_data('aclImdb/train')
test_texts, test_labels = load_imdb_data('aclImdb/test')

In [20]:
# Preprocessing: Tokenization and Cleaning
def preprocess(text):
    text = text.lower()
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    tokens = word_tokenize(text)
    return tokens

train_tokens = [preprocess(text) for text in train_texts]
test_tokens = [preprocess(text) for text in test_texts]

In [22]:
# Build Vocabulary
all_tokens = train_tokens + test_tokens
vocab = {"<pad>": 0, "<unk>": 1}
for tokens in all_tokens:
    for token in tokens:
        if token not in vocab:
            vocab[token] = len(vocab)


In [74]:
# Save the vocabulary to a file
torch.save(vocab, 'vocab.pt')  # Save vocab.pt

print("Vocabulary saved to vocab.pt")
# Load the saved vocabulary
#vocab = torch.load('vocab.pt')

Vocabulary saved to vocab.pt


In [24]:
# Encode sequences
def encode(tokens, vocab):
    return [vocab.get(token, vocab["<unk>"]) for token in tokens]


In [26]:
train_sequences = [encode(tokens, vocab) for tokens in train_tokens]
test_sequences = [encode(tokens, vocab) for tokens in test_tokens]

In [28]:
# Create custom Dataset
class IMDBDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)


In [30]:
# Pad batches
def collate_fn(batch):
    sequences, labels = zip(*batch)
    sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    return sequences_padded.to(device), labels.to(device)


In [32]:
# Create DataLoaders
batch_size = 64
train_dataset = IMDBDataset(train_sequences, train_labels)
test_dataset = IMDBDataset(test_sequences, test_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [34]:
# Define LSTM Model
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        hidden = self.dropout(hidden[-1])
        output = self.fc(hidden)
        return output

In [48]:
# Improved LSTM Model
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=1, bidirectional=True):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional
        )
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.embedding(x)                          # (batch_size, seq_len, embed_dim)
        lstm_out, (hidden, _) = self.lstm(x)           # hidden: (num_layers * num_directions, batch, hidden_dim)

        # Concatenate final forward and backward hidden states
        if self.lstm.bidirectional:
            hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)  # (batch, hidden_dim * 2)
        else:
            hidden = hidden[-1]                                  # (batch, hidden_dim)

        hidden = self.dropout(hidden)
        output = self.fc(hidden)
        return output


In [50]:
# Instantiate model
vocab_size = len(vocab)
embed_dim = 128
hidden_dim = 128
output_dim = 2

In [52]:
model = SentimentLSTM(vocab_size, embed_dim, hidden_dim, output_dim).to(device)

In [54]:
!nvidia-smi

Sun May  4 20:59:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 561.19                 Driver Version: 561.19         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1650 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   48C    P8              1W /   30W |    3913MiB /   4096MiB |     12%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [56]:
# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [58]:
import time

num_epochs = 10
for epoch in range(num_epochs):
    start_time = time.time()  # Start timer
    
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    for sequences, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(sequences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (preds == labels).sum().item()

    end_time = time.time()  # End timer
    epoch_time = end_time - start_time
    minutes = int(epoch_time // 60)
    seconds = int(epoch_time % 60)

    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {running_loss/len(train_loader):.4f} - Accuracy: {100*correct/total:.2f}% - Time: {minutes}m {seconds}s")

print("Training Complete!")


Epoch [1/10] - Loss: 0.6429 - Accuracy: 61.94% - Time: 1m 33s
Epoch [2/10] - Loss: 0.5675 - Accuracy: 71.80% - Time: 1m 30s
Epoch [3/10] - Loss: 0.6076 - Accuracy: 65.59% - Time: 1m 30s
Epoch [4/10] - Loss: 0.4904 - Accuracy: 76.49% - Time: 1m 31s
Epoch [5/10] - Loss: 0.3280 - Accuracy: 87.20% - Time: 1m 30s
Epoch [6/10] - Loss: 0.2120 - Accuracy: 92.47% - Time: 1m 29s
Epoch [7/10] - Loss: 0.1436 - Accuracy: 95.33% - Time: 1m 32s
Epoch [8/10] - Loss: 0.0983 - Accuracy: 97.06% - Time: 1m 31s
Epoch [9/10] - Loss: 0.0673 - Accuracy: 98.16% - Time: 1m 29s
Epoch [10/10] - Loss: 0.0467 - Accuracy: 98.81% - Time: 1m 31s
Training Complete!


In [60]:
# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for sequences, labels in test_loader:
        outputs = model(sequences)
        _, preds = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (preds == labels).sum().item()

print(f"Test Accuracy: {100*correct/total:.2f}%")

Test Accuracy: 83.81%


In [62]:
# Save model
torch.save(model.state_dict(), "sentiment_lstm_new.pth")
print("Model saved as 'sentiment_lstm_new.pth'")


Model saved as 'sentiment_lstm_new.pth'


In [64]:
# Predict function
def predict(text):
    input_tensor = preprocess_and_encode(text, vocab).to(device)
    with torch.no_grad():
        output = model(input_tensor)
        probs = torch.softmax(output, dim=1)
        pred = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred].item()
    sentiment = "Positive" if pred == 1 else "Negative"
    return sentiment, confidence

In [70]:
text = "This movie was absolutely fantastic!"
sentiment, confidence = predict(text)
print(f"Predicted: {sentiment} ({confidence*100:.2f}% confidence)")


Predicted: Positive (88.15% confidence)


In [68]:
import re
from nltk.tokenize import word_tokenize

# Preprocessing and token encoding function
def preprocess_and_encode(text, vocab, max_len=300):
    text = text.lower()
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    tokens = word_tokenize(text)
    ids = [vocab.get(token, vocab["<unk>"]) for token in tokens[:max_len]]
    tensor = torch.tensor(ids, dtype=torch.long).unsqueeze(0)  # Add batch dimension
    return tensor

In [81]:
# app.py
import streamlit as st
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
from nltk.tokenize import word_tokenize
import nltk
import os

# Download tokenizer
nltk.download('punkt')

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load vocab and model
vocab_path = "vocab.pt"
model_path = "sentiment_lstm_new.pth"

if not os.path.exists(vocab_path) or not os.path.exists(model_path):
    st.error("Model or vocabulary file not found.")
    st.stop()

vocab = torch.load(vocab_path)

# Model definition
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=1, bidirectional=True):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        if self.lstm.bidirectional:
            hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            hidden = hidden[-1]
        hidden = self.dropout(hidden)
        return self.fc(hidden)

# Load model (set output_dim = 2 for binary classification)
vocab_size = len(vocab)
embed_dim = 128
hidden_dim = 128
output_dim = 2  # Binary classification (Positive/Negative)

model = SentimentLSTM(vocab_size, embed_dim, hidden_dim, output_dim).to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

# Preprocessing
def preprocess_and_encode(text, vocab, max_len=300):
    text = text.lower()
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    tokens = word_tokenize(text)
    ids = [vocab.get(token, vocab["<unk>"]) for token in tokens[:max_len]]
    tensor = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    return tensor

# Prediction
def predict(text):
    input_tensor = preprocess_and_encode(text, vocab).to(device)
    with torch.no_grad():
        output = model(input_tensor)
        probs = F.softmax(output, dim=1)
        pred = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred].item()
    sentiment_map = {0: "Negative", 1: "Positive"}
    return sentiment_map[pred], confidence

# Streamlit GUI
st.title("Sentiment Analysis App")
st.write("Enter a sentence and get sentiment prediction:")

user_input = st.text_area("Input Text")

if st.button("Predict Sentiment"):
    if user_input.strip():
        sentiment, confidence = predict(user_input)
        st.markdown(f"### Sentiment: {sentiment}")
        st.markdown(f"**Confidence:** {confidence*100:.2f}%")
    else:
        st.warning("Please enter some text to analyze.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\souvik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  vocab = torch.load(vocab_path)
  model.load_state_dict(torch.load(model_path, map_location=device))


In [83]:
# Load vocab and model
vocab_path = "vocab.pt"
model_path = "sentiment_lstm_new.pth"

# Ensure the files exist
if not os.path.exists(vocab_path) or not os.path.exists(model_path):
    st.error("Model or vocabulary file not found.")
    st.stop()

# Load vocab and model weights only (avoid executing unsafe code)
vocab = torch.load(vocab_path)  # Assuming vocab is just a dictionary, no need for weights_only=True here

# Load model architecture and weights
vocab_size = len(vocab)
embed_dim = 128
hidden_dim = 128
output_dim = 2  # Binary classification (Positive/Negative)

model = SentimentLSTM(vocab_size, embed_dim, hidden_dim, output_dim).to(device)

# Use weights_only=True to avoid loading unnecessary objects
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))

model.eval()


  vocab = torch.load(vocab_path)  # Assuming vocab is just a dictionary, no need for weights_only=True here


SentimentLSTM(
  (embedding): Embedding(166760, 128)
  (lstm): LSTM(128, 128, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)