In [1]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('../content/drive/MyDrive/Reviews.csv')

df = df[['Text','Score']]
df['Score'].isnull().sum()
df['Score'].isnull().sum()
df.drop_duplicates(subset=['Text','Score'],keep='first',inplace=True)

def set_sent(score):
    if score<=2:
        return -1
    elif score==3:
        return 0
    else:
        return 1

df['sentiment']=df['Score'].apply(set_sent)
df = df[['Text','sentiment']]

# Separate into three sentiment groups
df_neg1 = df[df['sentiment']==-1]
df_0 = df[df['sentiment']==-0]
df_1 = df[df['sentiment']==1]

n = 5000

df_neg1 = df_neg1.sample(n=n, random_state=42, replace=False)
df_0 = df_0.sample(n=n, random_state=42, replace=False)
df_1 = df_1.sample(n=10000, random_state=42, replace=False)
# print("df_neg1: ", df_neg1)
# print("df_0: ", df_0)
# print("df_1: ", df_1)

sub_df = pd.concat([df_neg1, df_0, df_1], axis=0)
X = sub_df['Text']
y = sub_df['sentiment']



In [4]:
def tokenize(sentence):
    return nltk.word_tokenize(sentence)

def stem(word):
    return stemmer.stem(word.lower())

def bag_of_words(tokenized_sentence, words):

    # stem each word
    sentence_words = [stem(word) for word in tokenized_sentence]
    # initialize bag with 0 for each word
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, w in enumerate(words):
        if w in sentence_words: 
            bag[idx] = 1

    return bag

In [5]:
sentiments = []
all_words = []
xy = []

for sentiment in y:
    sentiments.append(sentiment)

i = 0

for text in X:
    w = tokenize(text)
    all_words.extend(w)
    xy.append((w, sentiments[i]))
    i += 1


ignore_words = ['?', '!', '.', ',']

all_words = [stem(w) for w in all_words if w not in ignore_words]
all_words = sorted(set(all_words))
sentiments = sorted(set(sentiments))

In [6]:
# train val and test split
train, test = train_test_split(xy, test_size=0.2, random_state=42, shuffle=True)

test, val = train_test_split(test, test_size=0.5, random_state=42)

print(test[0])

(['I', 'thought', 'this', 'fish', 'oil', 'would', 'be', 'a', 'great', 'supplement', 'for', 'my', 'cats', 'after', 'one', 'of', 'them', 'began', 'to', 'overgroom', 'and', 'created', 'bald', 'patches', 'on', 'her', 'stomach', 'and', 'legs', '.', 'All', 'four', 'of', 'my', 'cats', 'refused', 'any', 'type', 'of', 'food', 'with', 'this', 'fish', 'oil', 'added', '.', 'I', 'tried', 'for', 'many', 'days', 'in', 'many', 'ways', '.', 'Finally', 'switched', 'back', 'from', 'a', 'newer', 'dry', 'food', 'I', 'thought', 'would', 'be', 'better', 'for', 'them', ',', 'back', 'to', 'their', 'old', 'dry', 'food', ',', 'which', 'obviously', 'had', 'more', 'oils', 'in', 'the', 'ingredients', '.', 'Bald', 'cat', 'is', 'getting', 'her', 'fur', 'back', '.'], -1)


In [7]:
def make_into_bag (df):
  X = [] 
  y = [] 

  for (pattern_sentence, tag) in df:
      bag = bag_of_words(pattern_sentence, all_words)
      X.append(bag)

      label = sentiments.index(tag)
      y.append(label)

  X = np.array(X)
  y = np.array(y)
  return X, y

X_train, y_train = make_into_bag(train)
X_test, y_test = make_into_bag(test)
X_val, y_val = make_into_bag(train)


In [8]:
class MainData(Dataset):
    def __init__(self, X_data, y_data):
        self.n_samples = len(X_data)
        self.x_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.n_samples

In [9]:
num_epochs = 10
batch_size = 32
learning_rate = 0.01
input_size = len(X_train[0])
hidden_size = 64
output_size = len(sentiments)

train_loader = DataLoader(dataset=MainData(X_train, y_train), batch_size=batch_size, shuffle=True, num_workers=0)
test_loader = DataLoader(dataset=MainData(X_test, y_test), batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(dataset=MainData(X_val, y_val), batch_size=batch_size, shuffle=True, num_workers=0)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.l1 = nn.Linear(input_size, 32) 
        self.l2 = nn.Linear(32, 64) 
        self.l3 = nn.Linear(64, 256)
        self.l4 = nn.Linear(256, 512)
        self.l5 = nn.Linear(512, 256)
        self.l6 = nn.Linear(256, 64)
        self.l7 = nn.Linear(64, 32)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.l2(out)
        out = self.relu(out)
        out = self.l3(out)
        out = self.relu(out)
        out = self.l4(out)
        out = self.relu(out)
        out = self.l5(out)
        out = self.relu(out)
        out = self.l6(out)
        out = self.relu(out)
        out = self.l7(out)
        # no activation and no softmax at the end
        return out

model = Net(input_size, hidden_size, output_size).to(device)

# Loss and optimizer (optimizer can be played around)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_loss, val_loss = [], []
train_acc, val_acc = [], []

# Train the model
for epoch in range(num_epochs):

    model.train()
    running_loss = 0.
    correct, total = 0, 0 

    for (words, labels) in train_loader:
        words = words.to(device)
        labels = labels.to(dtype=torch.long).to(device)
        
        # Forward pass
        outputs = model(words)
        # if y would be one-hot, we must apply
        # labels = torch.max(labels, 1)[1]
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
            
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    train_loss.append(running_loss / len(train_loader))
    train_acc.append(correct/total)

    model.eval()
    running_loss = 0.
    correct, total = 0, 0

    for (words, labels) in val_loader:
        words = words.to(device)
        labels = labels.to(dtype=torch.long).to(device)

        output = model(words)
        loss = criterion(outputs, labels)

        running_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    val_loss.append(running_loss / len(val_loader))
    val_acc.append(correct/total)
        
    if (epoch+1) % 100 == 0:
        print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


    print(f"Epoch {epoch+1}:")
    print(f"Training Loss:", round(train_loss[epoch], 3))
    print(f"Validation Loss:", round(val_loss[epoch], 3))
    print(f"Training Accuracy:", round(train_acc[epoch], 3))
    print(f"Validation Accuracy:", round(val_acc[epoch], 3))
    print()


Epoch 1:
Training Loss: 0.87
Validation Loss: 1.431
Training Accuracy: 0.593
Validation Accuracy: 0.392

Epoch 2:
Training Loss: 0.638
Validation Loss: 2.35
Training Accuracy: 0.721
Validation Accuracy: 0.384

Epoch 3:
Training Loss: 0.504
Validation Loss: 2.72
Training Accuracy: 0.796
Validation Accuracy: 0.413

Epoch 4:
Training Loss: 0.438
Validation Loss: 3.741
Training Accuracy: 0.836
Validation Accuracy: 0.401

Epoch 5:
Training Loss: 0.347
Validation Loss: 4.146
Training Accuracy: 0.873
Validation Accuracy: 0.356

Epoch 6:
Training Loss: 0.303
Validation Loss: 2.923
Training Accuracy: 0.897
Validation Accuracy: 0.371

Epoch 7:
Training Loss: 0.273
Validation Loss: 6.133
Training Accuracy: 0.909
Validation Accuracy: 0.387

Epoch 8:
Training Loss: 0.244
Validation Loss: 4.191
Training Accuracy: 0.923
Validation Accuracy: 0.346

Epoch 9:
Training Loss: 0.224
Validation Loss: 8.973
Training Accuracy: 0.931
Validation Accuracy: 0.397

Epoch 10:
Training Loss: 0.223
Validation Loss: 4