In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install tensorflow==2.9.0
!pip install keras==2.9.0
!pip install -U gensim
from gensim.models import Word2Vec

import tensorflow as tf

import os

import nltk
from nltk import * 
from nltk.stem import WordNetLemmatizer
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
from nltk.corpus import stopwords

import re

from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras import regularizers
from keras.layers import Embedding
from keras.models import load_model, Sequential
from keras import optimizers
from keras.layers import Dropout
from keras import layers
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint

import numpy as np

from sklearn.model_selection import train_test_split


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
path = "/content/drive/MyDrive/product_reviews"

reviews = []
files = []

for p in os.listdir(path):
  with open(path +"/" +p, "r") as f:
    if p != "README.txt":
      files.append(p)
      reviews.append(f.read().split("[t]"))

reviews = [review for product in reviews for review in product]

In [4]:
vecs = []
labels = []
stops = Counter(stopwords.words())
lm = WordNetLemmatizer()
for index, review in enumerate(reviews):
  # use regex to remove all unneccessary tags
  reviews[index] = re.sub("##", "", reviews[index])
  reviews[index] = re.sub("\\n", "", reviews[index])
  reviews[index] = reviews[index].lower()
  reviews[index] = re.sub("(\[cs\])", " ", reviews[index])
  reviews[index] = re.sub("(\[u\])", " ", reviews[index])
  reviews[index] = re.sub("(\[t\])", "", reviews[index])
  reviews[index] = re.sub("(\[s\])", " ", reviews[index])
  reviews[index] = re.sub("(\[cc\])", " ", reviews[index])
  reviews[index] = re.sub("(\[p\])", " ", reviews[index])

  # only count those reviews that indicate sentiment
  scores = [int(s[0][1:-1]) for s in re.findall("(\[(\+|\-)\d\])", reviews[index])]
  if len(scores) != 0:
    # remove tags, non-alphanum chars, and redundant spacing
    reviews[index] = re.sub("(\[(\+|\-)\d\])", " ", reviews[index])
    reviews[index] = re.sub("[^A-Za-z0-9' ]+", " ", reviews[index])
    reviews[index] = re.sub("\s\s+", " ", reviews[index])

    # remove stopwords
    reviews[index] = [word for word in reviews[index].split(" ") if word not in stops]

    # lemmatize each word
    reviews[index] = [lm.lemmatize(word) for word in reviews[index]]

    # reduce time steps by trimming off really long reviews
    if len(reviews[index]) < 300:
      # append cleaned review to feature vector list
      vecs.append(reviews[index])
      # assign class labels
      if numpy.mean(scores) > 0:
        labels.append(1)
      else:
        labels.append(0)
    

In [6]:
# initialize fold size given 5 folds, initialize performance metrics
fold_size = int(len(vecs) / 5)
loss = []
accuracy = []

# build and train word2vec encoding model
word_model = Word2Vec(min_count=1,window=5,vector_size=32, sg=1)
word_model.build_vocab(vecs)
word_model.train(vecs, total_examples=word_model.corpus_count, epochs=7, report_delay=1)

#average each word vector to create 1 dimensional sentence vector based on encoding
sentence_embeddings = []
for ind, sentence in enumerate(vecs):
  embedding = np.array([np.mean(word_model.wv[word]) for word in sentence])
  sentence_embeddings.append(embedding)

for i in range(5):
  # pad sequences so all sequences are same length
  sequences = pad_sequences(sentence_embeddings, dtype="float32").tolist()

  # label weights generated based on ratio of pos to neg class
  #weights = {0:1.55154639, 1:0.7377451}

  # split data and labels into test and train based on current fold
  vecs_test = sequences[i*fold_size:(i+1)*fold_size]
  vecs_test = np.asarray(vecs_test).astype("float32")
  labels_test = labels[i*fold_size:(i+1)*fold_size]
  vecs_train = sequences[:i*fold_size] + sequences[(i+1)*fold_size:]
  vecs_train = np.asarray(vecs_train).astype("float32")
  labels_train = labels[:i*fold_size] + labels[(i+1)*fold_size:]


  # reshape for proper lstm dimensionality
  labels_train = np.asarray(labels_train).reshape((-1,1))
  vecs_train = vecs_train.reshape((vecs_train.shape[0], vecs_train.shape[1],1))

  # Definining binary classifier
  # Start with LSTM layer of size 64 with dropout of .5 to reduce loss
  # End with dense layer with sigmoid activation for binary classification
  classifier = Sequential()
  classifier.add(layers.LSTM(64, dropout=.3))
  classifier.add(layers.Dense(1, activation="sigmoid"))

  # compile model with binary_crossentropy loss because we doing binary classification
  # set checkpoint to allow us to recover best model generated during training
  classifier.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy",])
  cp = ModelCheckpoint("1.hdf5", monitor="accuracy", verbose=0 ,save_best_only=True, mode='auto', save_freq=1,save_weights_only=False)

  # train model for 5 epochs with a validation split of .1, referencing previous callbacks, with preiously established class weights
  classifier.fit(vecs_train, labels_train, epochs=5, verbose=0, validation_split=.1, callbacks = [cp])

  # reshape testing data and labels
  labels_test = np.asarray(labels_test).reshape((-1,1))
  vecs_test = vecs_test.reshape(vecs_test.shape[0], vecs_test.shape[1], 1)
  vecs_test = np.asarray(vecs_test).astype("float32")

  # load best model
  classifier = load_model("1.hdf5")
  scores = classifier.evaluate(vecs_test, labels_test)
  loss.append(scores[0])
  accuracy.append(scores[1])
  acc = scores[1]
  

print("Mean accuracy: " +str(np.mean(accuracy)))
print("Standard Deviation: " +str(np.std(accuracy)))
print("Mean MSE Loss: " +str(np.mean(loss)))




Mean accuracy: 0.6799999892711639
Standard Deviation: 0.12927146361241207
Mean MSE Loss: 0.7477992653846741
