In [1]:
import nltk
import string
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english')).union(set(ENGLISH_STOP_WORDS))

punctuation = string.punctuation

df = pd.read_csv('IMDB Dataset.csv')

def preprocess_text(text):
   
    tokens = word_tokenize(text.lower())
    
    filtered_tokens = [word for word in tokens if word not in stop_words and word not in punctuation]

    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemmatized_tokens)

df['preprocessed_review'] = df['review'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chirag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chirag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chirag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
glove_path = 'glove.6B.100d.txt'  
embeddings_index = {}

with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

def get_review_embedding(review):
   
    words = review.split()
    
    vectors = [embeddings_index.get(word) for word in words if word in embeddings_index]
    if not vectors:  
        return np.zeros(100)  
    return np.mean(vectors, axis=0)


df['embedding'] = df['preprocessed_review'].apply(get_review_embedding)


In [3]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

X = np.stack(df['embedding'].values)
y = df['sentiment'].values


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)


train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)



In [4]:
# Define a simple RNN model
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size)  
        out, _ = self.rnn(x.unsqueeze(1), h0)  
        out = self.fc(out[:, -1, :])  
        return out


input_size = 100  
hidden_size = 50 
output_size = 2  
learning_rate = 0.01
num_epochs = 10


model = SimpleRNN(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


for epoch in range(num_epochs):
    model.train()
    for i, (inputs, labels) in enumerate(train_loader):
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f'Test Accuracy: {100 * correct / total:.2f}%')


Epoch [1/10], Loss: 0.0659
Epoch [2/10], Loss: 1.5526
Epoch [3/10], Loss: 0.3042
Epoch [4/10], Loss: 0.6330
Epoch [5/10], Loss: 0.0284
Epoch [6/10], Loss: 0.1007
Epoch [7/10], Loss: 0.8189
Epoch [8/10], Loss: 0.3127
Epoch [9/10], Loss: 2.2959
Epoch [10/10], Loss: 0.0655
Test Accuracy: 78.90%


In [5]:

# Define a simple LSTM model
class SimpleLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(SimpleLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)  # Hidden state
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)  # Cell state
        
        # Forward propagate LSTM
        out, _ = self.lstm(x.unsqueeze(1), (h0, c0))  
        out = self.fc(out[:, -1, :]) 
        return out


input_size = 100  # GloVe embedding size
hidden_size = 50  # LSTM hidden size
output_size = 2   # Number of output classes (positive, negative)
learning_rate = 0.01
num_epochs = 10
num_layers = 1    # Number of LSTM layers



model = SimpleLSTM(input_size, hidden_size, output_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


for epoch in range(num_epochs):
    model.train()
    for i, (inputs, labels) in enumerate(train_loader):
       
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f'Test Accuracy: {100 * correct / total:.2f}%')


Epoch [1/10], Loss: 0.0608
Epoch [2/10], Loss: 0.5858
Epoch [3/10], Loss: 0.3887
Epoch [4/10], Loss: 0.1999
Epoch [5/10], Loss: 0.1765
Epoch [6/10], Loss: 1.3982
Epoch [7/10], Loss: 0.5605
Epoch [8/10], Loss: 0.6674
Epoch [9/10], Loss: 1.3666
Epoch [10/10], Loss: 0.4021
Test Accuracy: 78.23%


In [6]:
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


df = pd.read_csv('IMDB Dataset.csv')


df = df.sample(frac=0.1, random_state=42)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return tokens

df['processed'] = df['review'].apply(preprocess)

vocab = {}
word_count = 1
encoded_reviews = []

for review in df['processed']:
    encoded_review = []
    for word in review:
        if word not in vocab:
            vocab[word] = word_count
            word_count += 1
        encoded_review.append(vocab[word])
    encoded_reviews.append(encoded_review)

max_length = max(len(review) for review in encoded_reviews)
encoded_reviews = [review + [0] * (max_length - len(review)) for review in encoded_reviews]

df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

X = torch.tensor(encoded_reviews, dtype=torch.long)
y = torch.tensor(df['sentiment'].values, dtype=torch.long)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

class LSTMWithEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers=1):
        super(LSTMWithEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)  # Hidden state
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)  # Cell state
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])  
        return out


vocab_size = len(vocab) + 1 
embedding_dim = 100
hidden_size = 50
output_size = 2  
learning_rate = 0.01
num_epochs = 10
num_layers = 1

model = LSTMWithEmbedding(vocab_size, embedding_dim, hidden_size, output_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    model.train()
    for i, (inputs, labels) in enumerate(train_loader):
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f'Test Accuracy: {100 * correct / total:.2f}%')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chirag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chirag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chirag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch [1/10], Loss: 0.6943
Epoch [2/10], Loss: 0.7491
Epoch [3/10], Loss: 0.6599
Epoch [4/10], Loss: 0.6977
Epoch [5/10], Loss: 0.7021
Epoch [6/10], Loss: 0.6932
Epoch [7/10], Loss: 0.6944
Epoch [8/10], Loss: 0.7589
Epoch [9/10], Loss: 0.6932
Epoch [10/10], Loss: 0.6955
Test Accuracy: 50.60%


In [7]:
class RNNWithEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers=1):
        super(RNNWithEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)  # Hidden state
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])  # Use the last output of RNN
        return out

# Hyperparameters
vocab_size = len(vocab) + 1  # Add 1 for padding index
embedding_dim = 100
hidden_size = 50
output_size = 2  # Binary classification (positive/negative)
learning_rate = 0.01
num_epochs = 10
num_layers = 1

# Initialize model, loss function, and optimizer
model = RNNWithEmbedding(vocab_size, embedding_dim, hidden_size, output_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for i, (inputs, labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Testing loop
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f'Test Accuracy: {100 * correct / total:.2f}%')

Epoch [1/10], Loss: 0.7265
Epoch [2/10], Loss: 1.1587
Epoch [3/10], Loss: 0.2542
Epoch [4/10], Loss: 0.7119
Epoch [5/10], Loss: 0.7443
Epoch [6/10], Loss: 0.3377
Epoch [7/10], Loss: 0.6932
Epoch [8/10], Loss: 0.6655
Epoch [9/10], Loss: 0.7601
Epoch [10/10], Loss: 0.9419
Test Accuracy: 49.40%
