In [2]:
import numpy as np 
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer

df=pd.read_csv('review1.csv') # name of the csv file is 'review.csv'

text = df['text'] #extracting the reviews
label = df['score'] #extracting the ratings

text[0]

len(label)

nltk.download('wordnet')  # Download WordNet data
lemmatizer = WordNetLemmatizer()
lemmatized_text = []
for review in text:
    lemmatized_words = [lemmatizer.lemmatize(word) for word in review.split()]
    lemmatized_text.append(" ".join(lemmatized_words))

#Tokenize â€” Create Vocab to Int mapping dictionary
from collections import Counter
all_text2 = ' '.join(text)
# create a list of words
words = all_text2.split()
# Count all the words using Counter Method
count_words = Counter(words)

total_words = len(words)
sorted_words = count_words.most_common(total_words)

#In order to create a vocab to int mapping dictionary
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

vocab_to_int

#encoding of reviews (replace words in our reviews by integers)
reviews_int = []
for review in text:
    r = [vocab_to_int[w] for w in review.split()]
    reviews_int.append(r)
print (reviews_int[0:3])

labels = np.array(label)
#converting to binary class
l=[]
for i in labels:
  if(i==0 or i==1 or i==2):
    i=0
  elif(i==3 or i==4 or i==5):
    i=1
  l.append(i)

label = np.array(l)

def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

seq_len = 200  # The length that the sentences will be padded/shortened to

reviews = pad_input(reviews_int, seq_len)



reviews[0]

#80% train, 10% test & 10% validation
split_frac = 0.8
len_feat = len(reviews)
train_x = reviews[0:int(split_frac*len_feat)]
train_y = label[0:int(split_frac*len_feat)]
remaining_x = reviews[int(split_frac*len_feat):]
remaining_y = label[int(split_frac*len_feat):]
valid_x = remaining_x[0:int(len(remaining_x)*0.5)]
valid_y = remaining_y[0:int(len(remaining_y)*0.5)]
test_x = remaining_x[int(len(remaining_x)*0.5):]
test_y = remaining_y[int(len(remaining_y)*0.5):]

import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

batch_size = 64

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size,drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size,drop_last=True)

# obtain one batch of training data
for sample_x, sample_y in train_loader:
    print('Sample input size: ', sample_x.size()) # batch_size, seq_length
    print('Sample input: \n', sample_x)
    print()
    print('Sample label size: ', sample_y.size()) # batch_size
    print('Sample label: \n', sample_y)
    break  # Break after printing the first batch to avoid printing the entire dataset


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vedhanshi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[[3715, 67, 199, 8, 420, 22, 29, 2, 19, 327, 5, 3084, 9, 923, 5, 409, 2, 105, 41, 1, 14, 7, 47, 32, 3, 679, 5962, 211, 22, 679, 2, 449, 15, 1133, 11, 4, 2613, 107, 6, 726, 1, 336, 186, 315, 2902, 15, 217, 3, 9326, 527, 350, 2, 3716, 21, 4, 3441, 40, 2822, 68, 7703, 14, 9, 35, 540, 3, 94, 257, 17, 7703, 1152, 12968, 336, 5, 1325, 40, 861, 1022, 94, 257, 17, 120, 337, 168, 71, 7703, 19, 1, 4106, 8, 4612, 21, 140, 336, 309, 7, 13, 519, 11, 2, 46, 105, 108, 12, 14, 5, 345, 168, 15, 758, 53, 20, 197, 11, 747, 7, 52, 35, 205, 350, 406, 3715, 106, 99, 1296, 1196, 6, 68, 151, 199, 3, 67, 250, 12969, 21, 9, 7704, 220, 4107, 14, 2, 327, 2, 83, 279, 4, 27, 8, 9, 329, 5, 2384, 4325, 2, 37, 99, 5963, 16, 2, 41, 12, 6, 1, 1758, 1555, 181, 2, 229, 16, 4, 1804, 8, 14, 11, 562, 5, 171, 230, 2, 656, 103, 2, 25, 250, 18, 25, 4, 2549, 61, 440, 12, 3, 12970, 5964, 12971, 444, 1296, 29, 242, 1133, 4, 32, 47, 211, 2, 299, 17, 1, 3899, 917, 12972, 37, 318, 8, 4, 2823, 188, 113, 1, 243, 257, 12, 19, 4, 32, 188

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]
        output = self.fc(lstm_out)
        output = self.sigmoid(output)
        return output.squeeze(1)

# Define parameters
vocab_size = len(vocab_to_int) + 1
embedding_dim = 128
hidden_dim = 256
output_dim = 1

# Instantiate the model
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
total_correct = 0
total_samples = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = torch.round(outputs)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print('Test Accuracy: {:.2f}%'.format(accuracy * 100))


Test Accuracy: 86.29%


In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class GRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        gru_out, _ = self.gru(embedded)
        gru_out = gru_out[:, -1, :]
        output = self.fc(gru_out)
        output = self.sigmoid(output)
        return output.squeeze(1)

# Define parameters
vocab_size = len(vocab_to_int) + 1
embedding_dim = 128
hidden_dim = 256
output_dim = 1

# Instantiate the model
model = GRUModel(vocab_size, embedding_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
total_correct = 0
total_samples = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = torch.round(outputs)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print('Test Accuracy: {:.2f}%'.format(accuracy * 100))


Test Accuracy: 90.22%
