In [2]:
#importing relevant modules
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import re
import math
import urllib.request

In [None]:
# Fetch the review corpus and save it next to the notebook.
url1 = "https://raw.githubusercontent.com/cbannard/lela60331_24-25/refs/heads/main/coursework/Compiled_Reviews.txt"
urllib.request.urlretrieve(url1, "Compiled_Reviews.txt")

# One list per column of the file; position i in every list belongs to the same line.
reviews = []
sentiment_ratings = []
product_types = []
helpfulness_ratings = []

# Column order in the file: fields[0] .. fields[3] go to these lists, in this order.
columns = (reviews, sentiment_ratings, product_types, helpfulness_ratings)

# The first line is skipped (presumably a header row - confirm against the file).
# Every other line has its trailing whitespace removed and is split on tabs.
with open("Compiled_Reviews.txt", encoding="utf-8") as f:
  for row in f.readlines()[1:]:
    fields = row.rstrip().split('\t')
    for idx, column in enumerate(columns):
      column.append(fields[idx])

In [None]:
# Split every review into tokens: a token is a maximal run of characters
# other than the space character.
token_pattern = re.compile("[^ ]+")
tokenised_sentences = [token_pattern.findall(text) for text in reviews]

# Flatten the per-review token lists into one long list of tokens.
tokens = [tok for sentence in tokenised_sentences for tok in sentence]

# Frequency of each distinct token over the whole corpus.
counts = Counter(tokens)

# (token, count) pairs from most to least frequent. The sort is stable, so
# tokens with equal counts stay in first-seen order.
ranked_pairs = sorted(counts.items(), key=lambda pair: -pair[1])

# Keep only the tokens, still in frequency order.
so = tuple(zip(*ranked_pairs))[0]

# The vocabulary is the 5000 most frequent tokens (fewer if the corpus has fewer types).
type_list = so[:5000]


In [None]:
# Encoding the reviews as binary bag-of-words vectors

# Map each vocabulary word to its column index. Looking a token up in this dict
# replaces scanning the review's token list once for every vocabulary word.
type_index = {word: col for col, word in enumerate(type_list)}

# One row per review and one column per vocabulary word, all zeros to start.
M = np.zeros((len(reviews), len(type_list)))

# M[i, j] becomes 1 when vocabulary word j occurs at least once in review i.
# The loop variable is named `tok` so the corpus-wide `tokens` list is not overwritten.
for i, rev in enumerate(reviews):
  for tok in re.findall("[^ ]+", rev):
    col = type_index.get(tok)
    if col is not None:
      M[i, col] = 1

In [None]:
# Splitting the data into a training set (80%) and a test set (20%)

# A fixed seed makes the split identical on every fresh run of the notebook.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

n_reviews = len(reviews)

# Training indices: 80% of the reviews, drawn at random without repeats.
train = split_rng.choice(n_reviews, int(n_reviews*0.8), replace=False)
# Test indices: every review not in the training set, sorted so the order
# does not depend on set iteration order.
test = sorted(set(range(n_reviews)) - set(train.tolist()))

# The two index sets must partition the reviews exactly.
assert len(train) + len(test) == n_reviews

# Rows of the feature matrix for each set.
M_train = M[train,]
M_test = M[test,]

# Sentiment labels for each set, in the same order as the matrix rows.
SR_train = [sentiment_ratings[i] for i in train]
SR_test = [sentiment_ratings[i] for i in test]

In [None]:
# Logistic regression trained with batch gradient descent

# Encode the sentiment labels: 1 for "positive", 0 for anything else.
# The comparison must use the loop variable `l`; comparing the literal 1 with a
# string is always False, which would make every label 0.
y = np.array([int(l == "positive") for l in SR_train])
# One weight per column of the training matrix (one per vocabulary word).
num_features = M_train.shape[1]
# Number of training examples.
num_samples = len(y)
# Random starting values for the weights and the bias.
weights = np.random.rand(num_features)
bias = np.random.rand(1)
# Number of gradient-descent steps and the step size.
n_iters = 1000
lr = 0.4
# Small constant added inside the logs so log(0) is never evaluated.
eps = 0.00001
# Loss recorded at every iteration.
logistic_loss = []

for i in range(n_iters):
  # Forward pass: linear score, then the sigmoid to get P(positive).
  z = M_train.dot(weights)+bias
  q = 1/(1+np.exp(-z))

  # Cross-entropy loss (base-2 logs) summed over the training set.
  loss = -np.sum(y*np.log2(q+eps) + (1-y)*np.log2(1-q+eps))
  logistic_loss.append(loss)

  # Gradient of the loss with respect to the weights and the bias.
  dw = (q-y).dot(M_train)/num_samples
  db = np.sum(q-y)/num_samples

  # Gradient-descent update.
  weights = weights - lr*dw
  bias = bias - lr*db

# Training-set predictions (1 or 0) from the last forward pass.
y_pred = [int(ql > 0.5) for ql in q]

# Plot the loss, leaving out the first iteration.
plt.plot(range(1,n_iters), logistic_loss[1:])
plt.xlabel("Iterations")
plt.ylabel("Loss")

print(loss)

In [None]:
# Predictions for the test set

# Linear score of every test review under the trained weights and bias.
z = M_test.dot(weights)+bias
# Sigmoid turns each score into a probability.
q = 1/(1+np.exp(-z))

# Binary prediction per test review: 1 when the probability is above 0.5, otherwise 0.
y_test_pred = [1 if prob > 0.5 else 0 for prob in q]


In [None]:
# Accuracy on the test set

# True labels of the test reviews: 1 for "positive", 0 otherwise.
y_test = [int(label == "positive") for label in SR_test]
# 1 where the predicted label equals the true label, 0 where it does not.
acc_test = [int(y_test_pred[k] == y_test[k]) for k in range(len(y_test_pred))]
# Share of test reviews that were classified correctly.
n_correct = sum(acc_test)
Accuracy = n_correct/len(acc_test)

print(f'Accuracy: {Accuracy}')

In [None]:
# Precision and recall on the test set

# Predicted sentiment labels for the test set, as strings.
SR_test_pred = ["positive" if s == 1 else "negative" for s in y_test_pred]

# Count the four outcomes in one pass over (predicted, actual) pairs.
true_positives = 0
true_negatives = 0
false_positives = 0
false_negatives = 0
for predicted, actual in zip(SR_test_pred, SR_test):
  if predicted == "positive" and actual == "positive":
    true_positives += 1
  elif predicted == "negative" and actual == "negative":
    true_negatives += 1
  elif predicted == "positive" and actual == "negative":
    false_positives += 1
  elif predicted == "negative" and actual == "positive":
    false_negatives += 1

# Precision: share of predicted positives that are truly positive.
# Recall: share of true positives that were predicted positive.
# When a denominator is zero (nothing predicted positive, or no positive reviews
# in the test set) the metric is reported as 0.0 instead of raising ZeroDivisionError.
predicted_positive = true_positives + false_positives
actual_positive = true_positives + false_negatives
precision = true_positives/predicted_positive if predicted_positive else 0.0
recall = true_positives/actual_positive if actual_positive else 0.0
print(f'Precision: {precision}')
print(f'Recall: {recall}')

In [None]:
# Vocabulary indices ordered from the lowest weight to the highest.
weight_order = np.argsort(weights)

# The 10 words with the lowest weights (pushing the prediction towards negative).
neg_weights = [type_list[idx] for idx in weight_order[:10]]

# The 10 words with the highest weights (pushing the prediction towards positive), highest first.
pos_weights = [type_list[idx] for idx in weight_order[::-1][:10]]

print(f'The top 10 negative weighted words are {neg_weights} and the top 10 of positively weighted words are {pos_weights}')