# Sentiment Analysis

In [1]:
import torch
import torch.nn as nn
from torchtyping import TensorType

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

list_of_strings = []
list_of_labels = []

import csv
with open('EcoPreprocessed.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
      list_of_strings.append(row[1])
      list_of_labels.append(float(row[2]))

In [3]:
def get_dataset(list_of_strings):
        words = set()
        for sentence in list_of_strings:
            for word in sentence.split():
                words.add(word)

        vocabulary_size = len(words)
        
        sorted_list = sorted(list(words))
        word_to_int = {}
        for i, c in enumerate(sorted_list):
            word_to_int[c] = i + 1

        def encode(sentence):
            integers = []
            for word in sentence.split():
                integers.append(word_to_int[word])
            return integers

        tensors = []
        for sentence in list_of_strings:
            tensors.append(torch.tensor(encode(sentence)))

        return nn.utils.rnn.pad_sequence(tensors, batch_first=True), vocabulary_size + 1, word_to_int

In [4]:
training_dataset, vocab_size, word_to_int = get_dataset(list_of_strings)
training_labels = torch.unsqueeze(torch.tensor(list_of_labels), dim = -1)

In [5]:
class Solution(nn.Module):
    def __init__(self, vocabulary_size: int, embedding_dimension: int):
        super().__init__()
        torch.manual_seed(0)
        self.embedding_layer = nn.Embedding(vocabulary_size, embedding_dimension)
        self.linear_layer = nn.Linear(embedding_dimension, 1)
        self.tanh = nn.Tanh()
        pass

    def forward(self, x: TensorType[int]) -> TensorType[float]:
        embed = self.embedding_layer(x)
        average = torch.mean(embed, axis=1)
        projected = self.linear_layer(average)
        return self.tanh(projected)
        pass

In [14]:
embedding_dimension = 256
model = Solution(vocab_size, embedding_dimension)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

for i in range(1000):
  randperm = torch.randperm(len(training_dataset))
  training_dataset, training_labels = training_dataset[randperm], training_labels[randperm]
  mini_batch = training_dataset[:150]
  mini_batch_labels = training_labels[:150]
  pred = model(mini_batch)
  optimizer.zero_grad()
  loss = loss_function(pred, mini_batch_labels)
  if i % 100 == 0:
    print(loss.item())
  loss.backward()
  optimizer.step()

1.1428980827331543
0.10114537924528122
0.11631269007921219
0.11385755240917206
0.13156601786613464
0.1279372125864029
0.08182553201913834
0.07628636807203293
0.09965605288743973
0.08477086573839188


In [19]:
example_one = "worst movie ever"

example_two = "best movie ever"

example_three = "weird movie"

examples = [example_one] + [example_two] + [example_three]

# Let's encode these strings as numbers using the dictionary from earlier
var_len = []
for example in examples:
  int_version = []
  for word in example.split():
    int_version.append(word_to_int[word])
  var_len.append(torch.tensor(int_version))

testing_tensor = torch.nn.utils.rnn.pad_sequence(var_len, batch_first=True)
model.eval()

print(model(testing_tensor).tolist())

[[-0.9999973773956299], [0.9999677538871765], [-0.09320424497127533]]
