In [31]:
import pandas as pd

In [32]:
df = pd.read_csv("Dataset-SA.csv")
df.head()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral


In [33]:
df = df[['Review','Sentiment']]
df = df.dropna()
df.shape

(180388, 2)

In [34]:
df['Review'] = [str(text) for text in df['Review']]

In [35]:
from sklearn.preprocessing import LabelEncoder
# Create an instance of LabelEncoder
label_encoder = LabelEncoder()
# Fit the label encoder on the data and transform the data
encoded_data = label_encoder.fit_transform(df['Sentiment'])
df['Sentiment_Coded'] = encoded_data

In [36]:
# Create Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=1000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    #oov_token="<OOV>",
    analyzer=None,
    )
tokenizer.fit_on_texts(df['Review'])

In [37]:
max_sequence_length = max([len(i.split()) for i in df['Review']])
max_sequence_length

22

In [38]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df['Review'], df['Sentiment_Coded'], test_size=0.2, random_state=42, stratify=df['Sentiment_Coded'])

In [39]:
x_train, x_test, y_train, y_test  = list(x_train), list(x_test), list(y_train), list(y_test)

In [40]:
len(x_train), len(x_test), len(y_train), len(y_test)

(144310, 36078, 144310, 36078)

In [41]:
sequences_train = tokenizer.texts_to_sequences(x_train)
sequences_test =  tokenizer.texts_to_sequences(x_test)

In [42]:
# Pad Sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
pad_sequence_train = pad_sequences(sequences_train, maxlen=max_sequence_length)
pad_sequence_test = pad_sequences(sequences_test, maxlen=max_sequence_length)

In [43]:
vocab_size = len(tokenizer.index_word)
output_size = len(df['Sentiment'].unique())
vocab_size, output_size

(1320, 3)

In [44]:
import numpy as np
num_classes = 3
one_hot_labels_y_train = np.eye(num_classes)[y_train]
one_hot_labels_y_test = np.eye(num_classes)[y_test]

In [45]:
import torch                                        # root package
from torch.utils.data import Dataset, DataLoader
class TextClassificationDataset(Dataset):
    def __init__(self, sequences,labels):
        self.sequences = sequences
        self.labels = labels
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self,idx):
        return self.sequences[idx], self.labels[idx]

In [46]:
train_dataset = TextClassificationDataset(pad_sequence_train, one_hot_labels_y_train)
test_dataset = TextClassificationDataset(pad_sequence_test,one_hot_labels_y_test)

In [47]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle= True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [48]:
embedding_dim = 100 # Dimension of word embeddings
hidden_dim = 128 # Number of LSTM units
output_dim = 3 # Number of classes
num_layers = 2 # Number of LSTM layers
dropout = 0.5 # Dropout rate
max_length = 100 # maximum length of sequence

In [49]:
import torch.nn as nn
import torch.nn.functional as F
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, dropout):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings = vocab_size+1, embedding_dim = embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, output_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self,x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        hidden = self.dropout(hidden[-1,:,:])
        x = F.relu(self.fc1(hidden))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return x
model = TextClassificationModel(vocab_size,embedding_dim, hidden_dim, output_dim, num_layers, dropout)

In [50]:
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

In [51]:
epochs = 1
from tqdm import tqdm
for epoch in tqdm(range(epochs)):
    for inputs, labels in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, torch.argmax(labels, axis=1))
        loss.backward()
        optimizer.step()

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]
  0%|                                                                                         | 0/4510 [00:00<?, ?it/s][A
  0%|                                                                                 | 3/4510 [00:00<02:32, 29.64it/s][A
  0%|▏                                                                                | 7/4510 [00:00<02:15, 33.31it/s][A
  0%|▏                                                                               | 11/4510 [00:00<02:32, 29.42it/s][A
  0%|▏                                                                               | 14/4510 [00:00<02:32, 29.53it/s][A
  0%|▎                                                                               | 17/4510 [00:00<02:38, 28.40it/s][A
  0%|▎                                                                               | 20/4510 [00:00<02:37, 28.55it/s][A
  1%|▍             

In [52]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == torch.argmax(labels, axis=1)).sum().item()

accuracy = correct/total
print("Test Accuracy", accuracy)

Test Accuracy 0.9119685126669993
