In [1]:
#importing relevant modules
import matplotlib.pyplot as plt
from collections import Counter
import re
import math
import urllib.request
import numpy as np

In [2]:
#Downloading the tab-separated review file and saving it next to the notebook
url1 = "https://raw.githubusercontent.com/cbannard/lela60331_24-25/refs/heads/main/coursework/Compiled_Reviews.txt"
urllib.request.urlretrieve(url1, "Compiled_Reviews.txt")

#reading every line after the first one (presumably a header row) and
#splitting it on tabs after stripping trailing whitespace
with open("Compiled_Reviews.txt", encoding="utf-8") as f:
    rows = [line.rstrip().split('\t') for line in f.readlines()[1:]]

#column 0 holds the review text, which is stored lower-cased
reviews = [row[0].lower() for row in rows]
#columns 1-3 are stored unchanged as sentiment, product type and helpfulness
sentiment_ratings = [row[1] for row in rows]
product_types = [row[2] for row in rows]
helpfulness_ratings = [row[3] for row in rows]



In [3]:
#fetching an English stop-word list from github and saving it locally
url2 =  "https://raw.githubusercontent.com/Alir3z4/stop-words/master/english.txt"
file_path = "english.txt"
urllib.request.urlretrieve(url2, file_path)

#one stop word per line; each entry is lower-cased as it is read
with open(file_path, "r", encoding="utf-8") as file:
    stop_words = [entry.lower() for entry in file.read().splitlines()]

In [None]:
#tokenising and removing punctuation and stopwords

#A token is a run of characters that are not a space or one of . , ? " : ( ) -
#The pattern is a raw string so that no backslash escape is interpreted by
#Python before the regex engine sees it (the old "\." was an invalid string escape).
TOKEN_PATTERN = re.compile(r'[^ .,?":()-]+')

#a set gives constant-time membership tests; the list was scanned for every token
stop_word_set = set(stop_words)

tokenised_sentences = [
    [word for word in TOKEN_PATTERN.findall(text) if word not in stop_word_set]
    for text in reviews]

#Creating a vocabulary

#Creating a flat token list from the tokenised sentences
tokens = [word for sentence in tokenised_sentences for word in sentence]

#counting tokens
counts = Counter(tokens)

#all tokens ordered from most to least frequent (ties keep first-seen order);
#built with a generator so an empty corpus yields an empty tuple instead of an IndexError
so = tuple(word for word, _ in counts.most_common())

#creates a vocabulary of (at most) the 5000 most frequent words
type_list = so[0:5000]

In [None]:
#Encoding the words

#creates a matrix with one row per review and one column per vocabulary word
M = np.zeros((len(reviews), len(type_list)))

#maps every vocabulary word to its column so each token is looked up in constant time
type_index = {t: j for j, t in enumerate(type_list)}

#Marks M[i, j] = 1 when vocabulary word j occurs in review i.
#The tokens come from tokenised_sentences, i.e. the same tokenisation that was used
#to build the vocabulary. Splitting on spaces only (as before) left punctuation
#attached to words ("great." vs "great"), so those occurrences were never matched.
for i, sentence in enumerate(tokenised_sentences):
  for t in set(sentence):
    j = type_index.get(t)
    if j is not None:
      M[i, j] = 1

In [5]:
#Splitting the data into sets

#fixed seed and a dedicated generator so the same split is produced on every run
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

#Randomly separating the data - 80% for the training set and 20% for the testing set and not allowing repeats
n_reviews = len(reviews)
train = split_rng.choice(n_reviews, int(n_reviews*0.8), replace=False)
#every index that was not drawn for training, sorted so the order is deterministic
test = sorted(set(range(n_reviews)) - set(train.tolist()))

#sanity check: the two sets together cover every review exactly once
assert len(train) + len(test) == n_reviews

#creating matrices based on the training and testing sets
M_train = M[train,]
M_test = M[test,]

#creating labels for the training and test set.
SR_train = [sentiment_ratings[i] for i in train]
SR_test = [sentiment_ratings[i] for i in test]

In [None]:
#Multilayer neural network
#One hidden ReLU layer followed by a single sigmoid output, trained with
#full-batch gradient descent on the binary cross-entropy loss.

#the input size is read from the training matrix so the weights always match it,
#even when the vocabulary ended up with fewer than 5000 words
num_features = M_train.shape[1]
#number of iterations and the learning rate
n_iters = 2500
lr = 0.1
#encodes the sentiment labels. 1 for pos and 0 for neg (converted once, not per iteration)
y = np.array([1 if label == "positive" else 0 for label in SR_train]).flatten()
#sets the total number of labels
num_samples = len(y)
#creates an empty list for the loss
logistic_loss = []
#small constant added inside the logarithms to avoid log(0)
eps = 0.00001

#setting the number of nodes in the hidden layer
hidden_layer = 3

#initialising the layers of weights
weights_0_1 = np.random.rand(num_features, hidden_layer)
weights_1_2 = np.random.rand(hidden_layer, 1)

#initialising the bias values
bias_0_1 = np.zeros((1, hidden_layer))
bias_1_2 = np.zeros((1, 1))


for i in range(n_iters):

  #forward pass
  #hidden layer
  z_1 = np.dot(M_train, weights_0_1) + bias_0_1
  #ReLU activation
  hidden_relu = np.maximum(0, z_1)

  #output layer
  z_2 = np.dot(hidden_relu, weights_1_2) + bias_1_2
  #sigmoid activation, flattened to match the shape of y
  q = (1/(1 + np.exp(-z_2))).flatten()

  #binary cross-entropy loss
  loss = -np.mean(y * np.log(q + eps) + (1 - y) * np.log(1 - q + eps))
  logistic_loss.append(loss)

  #backwards pass
  #error at the output layer
  error_output = (q - y).reshape(-1, 1)
  #error of the hidden layer; (z_1 > 0) is the derivative of ReLU
  error_hidden = np.dot(error_output, weights_1_2.T) * (z_1 > 0)

  #calculating gradients for second layer weights and bias
  dw_1_2 = np.dot(hidden_relu.T, error_output) / num_samples
  db_1_2 = np.mean(error_output)

  #calculating gradients for first layer weights and bias
  dw_0_1 = np.dot(M_train.T, error_hidden) / num_samples
  #one bias gradient per hidden unit: average over the samples only (axis 0).
  #Averaging over all elements gave every hidden unit the same, wrongly scaled update.
  db_0_1 = np.mean(error_hidden, axis=0, keepdims=True)

  #updating weights and biases

  weights_1_2 -= lr * dw_1_2
  bias_1_2 -= lr * db_1_2

  weights_0_1 -= lr * dw_0_1
  bias_0_1 -= lr * db_0_1

#converting the predicted probabilities of the last forward pass to binary
#(only the final value was ever kept, so this is done once after the loop)
y_pred = [int(ql > 0.5) for ql in q]


#plotting the loss per iteration (the first iteration is left out)
plt.plot(range(1,n_iters), logistic_loss[1:])
plt.xlabel("Iterations")
plt.ylabel("Loss")

In [None]:
#testing true values against predictions

#output of the hidden layer with the test set
layer_1 = np.maximum(M_test.dot(weights_0_1) + bias_0_1, 0)
#output of the output layer with the test set
layer_2 = layer_1.dot(weights_1_2) + bias_1_2
#sigmoid activation; flattened so that each element is a scalar probability.
#Without this every element is a 1-element array, and converting such an
#array with int() is deprecated in recent NumPy versions.
q = (1/(1+np.exp(-layer_2))).flatten()

#convert predictions to binary
y_test_pred = [int(prob > 0.5) for prob in q]
print(y_test_pred)
#create true value labels
y_test=[int(l == "positive") for l in SR_test]
print(y_test)

In [None]:
#Testing accuracy, precision, recall

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    Keeps the metrics defined when, for example, the model predicts no
    positives at all, instead of stopping with a ZeroDivisionError.
    """
    return numerator / denominator if denominator else 0.0


#list of correct predictions (1 where prediction and true label agree)
acc_test=[int(yp == yt) for yp, yt in zip(y_test_pred, y_test)]
#calculating accuracy
print(f'accuracy: {_safe_ratio(sum(acc_test), len(acc_test))}')


#turning the binary predictions back into string labels
labels_test_pred=["positive" if s == 1 else "negative" for s in y_test_pred]

#calculating true pos, true neg, false pos, false neg.
label_pairs = list(zip(labels_test_pred, SR_test))
true_positives=sum(int(yp == "positive" and yt == "positive") for yp, yt in label_pairs)
true_negatives=sum(int(yp == "negative" and yt == "negative") for yp, yt in label_pairs)
false_positives=sum(int(yp == "positive" and yt == "negative") for yp, yt in label_pairs)
false_negatives=sum(int(yp == "negative" and yt == "positive") for yp, yt in label_pairs)


#calculating precision and recall (0.0 when the denominator is zero)
precision = _safe_ratio(true_positives, true_positives + false_positives)
recall = _safe_ratio(true_positives, true_positives + false_negatives)
print(f'precision: {precision}')
print(f'recall: {recall}')

In [None]:
#Identifying positive and negatively weighted tokens

#combining the two weight layers into one value per vocabulary word.
#Only the weight matrices are multiplied; the ReLU and the biases are not
#taken into account, so this is a linear approximation of each word's influence.
all_weights = np.dot(weights_0_1, weights_1_2).flatten()

#column indices ordered from lowest to highest combined weight (computed once)
weight_order = np.argsort(all_weights)

#the 20 highest and the 20 lowest weighted tokens
postokens = [type_list[x] for x in weight_order[::-1][:20]]
negtokens = [type_list[x] for x in weight_order[:20]]

#one print per list: a single-quoted f-string cannot span two lines
print(f'Positive tokens: {postokens}')
print(f'Negative tokens: {negtokens}')