<a href="https://colab.research.google.com/github/spdrnl/aspect-extraction/blob/master/Unsupervised_aspect_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd
import re
import gzip
import json
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K
import gensim
from gensim.test.utils import datapath
from gensim import utils
from google.colab import drive
from os import path
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# General settings

In [19]:
# Google Drive is mounted so generated data and models can be saved there
drive.mount('/content/drive')

# --- Hyperparameters ---

# Dimensionality of the word vectors
vector_size = 200

# Sentences are cut off after this many words
max_length = 35

# Assumed number of aspects; this is a hyperparameter
n_aspects = 14

# --- File locations ---

# Downloaded raw data
file_name = "/tmp/reviews_Beauty.json.gz"

# Cleaned data
corpus_file = "/content/drive/My Drive/corpus.txt"

# Saved gensim model
model_file = '/content/drive/My Drive/w2v_model.gensim'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Downloading the data

In [None]:
! wget -P /tmp http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Beauty.json.gz

--2020-07-04 14:23:13--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Beauty.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 352748278 (336M) [application/x-gzip]
Saving to: ‘/tmp/reviews_Beauty.json.gz.1’


2020-07-04 14:23:44 (10.9 MB/s) - ‘/tmp/reviews_Beauty.json.gz.1’ saved [352748278/352748278]



## Show a sample of the data

In [5]:
# Print the first 11 raw lines of the gzipped review file.
# The counter is deliberately not called `id`: a top-level loop variable stays
# in the kernel namespace and would shadow the builtin id() for later cells.
with gzip.open(file_name) as f:
  for line_number, line in enumerate(f):
    print(line)
    if line_number >= 10:
      break

FileNotFoundError: ignored

# Cleaning the data

In [20]:
# Number of sentences written to the corpus file by the cleaning loop
n = 0
# Longest sentence (in words, after stop-word removal) seen before truncation
max_observed_length = 0

# Raw strings are used for all patterns so that `\.`, `\?` and `\s` reach the
# regex engine as written instead of being invalid Python string escapes.

# Sentences are split on .?!
split_pattern = re.compile(r'[\.\?!]')

# Remove all non alphanumerical values, except spaces
pattern = re.compile(r'[^a-zA-Z0-9\s]+')

# Consolidate all sequential white space to a single space
white_space_pattern = re.compile(r'\s+')

# Words dropped from every sentence during cleaning
stop_words = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself",
    "she", "her", "hers", "herself", "it", "its", "itself", "they", "them",
    "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
    "that", "these", "those", "am", "is", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a",
    "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under", "again", "further",
    "then", "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "only", "own",
    "same", "so", "than", "too", "very", "s", "t", "can",
    "will", "just", "don", "should", "now",
}

# Clean the data if not already done so
# The cleaned data will be output to disk, one sentence per line
if not path.exists(corpus_file):
  # The input is opened first: if the raw download is missing this raises
  # before an empty corpus file is created, which would otherwise make every
  # later run skip the cleaning step.
  with gzip.open(file_name) as f, open(corpus_file, mode='w') as out:
    for line in f:
      raw_review_text = json.loads(line)['reviewText']
      lower_review_text = raw_review_text.lower()
      for review_line in lower_review_text.splitlines():
        for sentence in split_pattern.split(review_line):
          sentence = pattern.sub('', sentence)
          sentence = white_space_pattern.sub(' ', sentence)
          words = [word for word in sentence.split() if word not in stop_words]
          max_observed_length = max(max_observed_length, len(words))
          # Truncate to max_length words; very short fragments are dropped
          sentence = " ".join(words[:max_length])
          if len(sentence) > 5:
            out.write("{}\n".format(sentence))
            n += 1

  print("Created {} sentences with a maximum length of {}".format(n, max_length))
  print("Longest sentence before truncation: {} words".format(max_observed_length))

## Sample the data

In [21]:
# Show the first lines of the cleaned corpus file
! head '$corpus_file'

love moisturizer would recommend someone dry skin fine lines wrinkles
using brand day night serum
received product deadline
tested baby kabuki quality material best
packaging cute
fibers not smell soft
love set
great buy price
dont wear makeup time love feels
nice moisturizer natural ingredients no parabens


# Create word vectors using Gensim from sentences

In [22]:
class BeautyCorpus(object):
    """An iterable that yields sentences (lists of str) from the corpus file.

    Each iteration re-opens the file, so the object can be traversed more
    than once.

    Args:
        corpus_path: optional path of the text file to read, one sentence per
            line. When omitted, the notebook-level `corpus_file` setting is
            used.
    """

    def __init__(self, corpus_path=None):
        self.corpus_path = corpus_path

    def __iter__(self):
        # Resolved at iteration time so a later change of `corpus_file` is honoured
        corpus_path = corpus_file if self.corpus_path is None else self.corpus_path
        # The context manager closes the file when iteration ends
        with open(corpus_path) as corpus:
            for line in corpus:
                yield utils.simple_preprocess(line)

In [23]:
from os import path

# Reuse the saved gensim model when there is one; otherwise train and save it
if path.exists(model_file):
  model = gensim.models.Word2Vec.load(model_file)
else:
  model = gensim.models.Word2Vec(
      sentences=BeautyCorpus(),
      min_count=10,
      size=vector_size,
      workers=1,
      window=10,
      negative=5,
  )
  model.save(model_file)

print("There are {} words in the word2ve model".format(len(model.wv.index2word)))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


There are 46047 words in the word2ve model


# Format input data for ABAE model

In [24]:
# Index 0 is taken by <PAD> so that the ABAE model can pad sentences with zeros
index2word = ['<PAD>', *model.wv.index2word]
# Reverse lookup: word -> position in index2word
word2index = dict(zip(index2word, range(len(index2word))))

In [25]:
# Turn every corpus line into a fixed-length row of word indices.
# Words missing from the vocabulary are skipped, lines without any known word
# are dropped, and short rows are right-padded with 0 (the <PAD> index).
encoded_sentences = []
with open(corpus_file) as corpus:
  for line in corpus:
    known_ids = [word2index[token] for token in line.split()[:max_length] if token in word2index]
    if not known_ids:
      continue
    padding = [0] * (max_length - len(known_ids))
    encoded_sentences.append(known_ids + padding)
encoded_sentences = np.array(encoded_sentences)

In [26]:
# Row 0 (the <PAD> entry) stays all zeros; every other row holds that word's vector
embeddings = np.zeros([len(index2word), vector_size])
for idx, word in enumerate(index2word[1:], start=1):
  embeddings[idx] = model.wv[word]

In [None]:
from sklearn.cluster import KMeans

# Cluster the word vectors of a sample of sentences; the cluster centres are
# used as the initial value of the aspect matrix (T_init).
sample_size = 100000
sample_words = encoded_sentences[:sample_size].flatten().astype(np.int32)
# Drops the padding index 0 together with indices 1-4.
# NOTE(review): presumably the intent is to also skip the first few vocabulary
# entries; confirm that the threshold should not simply be `> 0`.
sample_words = sample_words[sample_words > 4]
embedded_words = embeddings[sample_words, :]
# The number of clusters must equal n_aspects: T_init is used as the kernel of
# the 'rs' layer, whose input size is n_aspects.
kmeans = KMeans(n_clusters=n_aspects, random_state=0).fit(embedded_words)
T_init = kmeans.cluster_centers_
# Release the large intermediates
sample_words = None
embedded_words = None
T_init

# Create the Attention-based Aspect Extraction (ABAE) model

## Create the ABAE model

In [14]:
class AttentionBasedEncoder(keras.layers.Layer):
  """Attention-based sentence encoder of the ABAE model.

  For a batch of zero-padded word-index sequences the layer looks up the
  (non-trainable) word embeddings, averages them into y_s, scores every word
  against y_s through the learned matrix M, turns the scores into attention
  weights and returns the attention-weighted sum of the word embeddings z_s.

  Args:
    embeddings: array of shape (vocab_size, vector_size) used to initialise
      the embedding lookup. Index 0 is treated as padding.
    max_sentence_length: stored on the layer; not used in the computation.
    **kwargs: forwarded to keras.layers.Layer (e.g. `name`).
  """

  def __init__(self, embeddings, max_sentence_length, **kwargs):
    super(AttentionBasedEncoder, self).__init__(**kwargs)
    self.vocab_size = embeddings.shape[0]
    self.vector_size = embeddings.shape[1]
    self.max_sentence_length = max_sentence_length

    self.embedding = keras.layers.Embedding(self.vocab_size, 
                                            self.vector_size, 
                                            embeddings_initializer=keras.initializers.Constant(embeddings),
                                            trainable=False,
                                            mask_zero=True)
    
    # Attention matrix, initialised very close to zero
    self.M = tf.Variable(tf.random.uniform([self.vector_size,self.vector_size], minval=-.00001, maxval=.00001), name='M')

  def get_non_padding_mask(self, x):
    """Return a float mask (batch_size, max_sentence_length): 1 for words, 0 for padding."""
    return tf.cast(tf.math.greater(x, 0), tf.float32)

  def get_words_per_sentence(self, x):
    """Count the non-padding positions of every sentence; result is (batch_size,)."""
    # x is batch_size, max_sentence_length
    return tf.reduce_sum(self.get_non_padding_mask(x), axis=-1)

  def get_y_s(self, x, e):
    """Average word embedding per sentence; result is (batch_size, vector_size)."""
    words_per_sentence = self.get_words_per_sentence(x)
    # divide_no_nan yields 0 instead of NaN for a sentence made only of padding
    y_s = tf.math.divide_no_nan(tf.reduce_sum(e, axis=1),
                                tf.expand_dims(words_per_sentence, axis=-1))
    return y_s

  def get_d_i(self, e, y_s):
    """Attention score of every word; result is (batch_size, max_sentence_length, 1)."""
    # (batch_size, vector_size) x (vector_size, vector_size) -> (batch_size, vector_size, 1)
    projected = tf.expand_dims(tf.matmul(y_s, self.M), axis=-1)
    d_i = tf.matmul(e, projected)
    return d_i

  def get_a_i(self, d_i, mask=None):
    """Softmax of the scores over the sentence axis.

    Args:
      d_i: scores of shape (batch_size, max_sentence_length, 1).
      mask: optional tensor of the same shape, 1 for words and 0 for padding.
        Padding positions receive (numerically) zero weight so that the
        weights of the real words sum to 1.

    Returns:
      Attention weights with the shape of d_i.
    """
    if mask is not None:
      # A large negative score makes exp() vanish for padding positions
      d_i = d_i + (1.0 - mask) * -1e9
    return tf.nn.softmax(d_i, axis=1)

  def get_z_s(self, a, e):
    """Attention-weighted sum of word embeddings; result is (batch_size, vector_size)."""
    # z_s = sum_i a_i * e_i (each embedding is weighted once by its attention weight)
    z_s = tf.reduce_sum(a * e, axis=1)
    return z_s

  def call(self, x): 
    # mask is batch_size, max_sentence_length, 1
    mask = tf.expand_dims(self.get_non_padding_mask(x), axis=-1)

    # e is batch_size, max_sentence_length, vector_size; padding rows are
    # zeroed explicitly so the result does not depend on embedding row 0
    e = self.embedding(x) * mask
    
    # y_s is batch_size, vector_size
    y_s = self.get_y_s(x, e)
    
    # d_i is batch_size, max_sentence_length, 1
    d_i = self.get_d_i(e, y_s)

    # a_i is batch_size, max_sentence_length, 1
    a_i = self.get_a_i(d_i, mask)

    # z_s is batch_size, vector_size
    z_s = self.get_z_s(a_i, e)

    return z_s

In [15]:
def get_model():
  """Assemble the ABAE Keras model.

  The model takes a positive and a negative batch of encoded sentences and
  returns [pos_rs, pos_zs, neg_zs], the three tensors the triplet loss needs.
  It reads the notebook-level max_length, embeddings, n_aspects, vector_size
  and T_init.
  """
  # Two inputs of identical shape: the sentences and their negative samples
  pos_input = keras.layers.Input(shape=[max_length], name='pos_input')
  neg_input = keras.layers.Input(shape=[max_length], name='neg_input')

  # Layers; the same encoder instance is shared by both inputs
  encoder = AttentionBasedEncoder(embeddings, max_sentence_length=max_length)
  aspect_weights = keras.layers.Dense(n_aspects, activation='softmax', name='pt')
  reconstruction = keras.layers.Dense(vector_size,
                                      use_bias=False,
                                      kernel_initializer=keras.initializers.Constant(T_init),
                                      name='rs')

  # Wiring: only the positive sentence is reconstructed from its aspect weights
  pos_zs = encoder(pos_input)
  neg_zs = encoder(neg_input)
  pos_rs = reconstruction(aspect_weights(pos_zs))

  return tf.keras.Model(inputs=[pos_input, neg_input], outputs=[pos_rs, pos_zs, neg_zs])

## Train the ABAE model

In [16]:
class MyDatasetGenerator:
  """ A dataset can be created using this generator. The generator outputs 
  tuples of positive and negative samples. To correct for the number of
  negative samples, a negative factor is used to repeat the training data.
  This setup replicates the original loss function, trading off more compute time
  for simpler code.

  Args:
    encoded_sentences: 2-D array, one encoded sentence per row.
    negative_factor: how many times every sentence index is repeated.
    seed: optional seed for the shuffling; when None the global NumPy random
      state is used.
  """ 

  def __init__(self, encoded_sentences, negative_factor=20, seed=None):
    self.n = encoded_sentences.shape[0]
    self.encoded_sentences = encoded_sentences
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Index ranges come from this instance's own data (self.n), not from a
    # notebook-level counter that may be out of sync with the array.
    self.pos_sentence_idxs = np.repeat(np.arange(self.n), repeats=negative_factor)
    self.neg_sentence_idxs = np.repeat(np.arange(self.n), repeats=negative_factor)
    # Independent shuffles pair every positive with a random negative sentence
    rng.shuffle(self.pos_sentence_idxs)
    rng.shuffle(self.neg_sentence_idxs)

  def __call__(self):
    """Yield (positive_sentence, negative_sentence) row pairs."""
    for x, y in zip(self.pos_sentence_idxs, self.neg_sentence_idxs):
      yield (self.encoded_sentences[x, :], self.encoded_sentences[y, :])

In [17]:
# Remove the logs directory left by earlier runs, then open TensorBoard on it
! rm -rf logs
%tensorboard --logdir logs

import datetime

#@tf.function
def train(dataset, model, objective_fn, regularization_fn, optimizer, regularizer, num_training_samples, batch_size, train_summary_writer, train_los):
  """Run one pass over `dataset`, updating `model` after every batch.

  Args:
    dataset: iterable of (pos_samples, neg_samples) batches.
    model: called as model([pos_samples, neg_samples], training=True) and
      expected to return [rs_pos, zs_pos, zs_neg].
    objective_fn: callable (rs_pos, zs_pos, zs_neg) -> loss term J.
    regularization_fn: callable (model) -> regularization term U.
    optimizer: optimizer whose apply_gradients() is used.
    regularizer: weight of U in the total loss J + regularizer * U.
    num_training_samples: not used by this function.
    batch_size: not used by this function.
    train_summary_writer: summary writer that receives the 'loss' scalar.
    train_los: running-loss metric; updated with every batch loss and logged.
      The parameter keeps its original spelling so existing calls keep working.
  """
  for batch, (pos_samples, neg_samples) in enumerate(dataset):
    with tf.GradientTape() as tape:
      [rs_pos, zs_pos, zs_neg] = model([pos_samples, neg_samples], training=True)
      J = objective_fn(rs_pos, zs_pos, zs_neg)
      U = regularization_fn(model)
      loss = J + regularizer * U

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    # Update the metric that was passed in rather than a notebook-level global
    train_los(loss)
    with train_summary_writer.as_default():
      tf.summary.scalar('loss', train_los.result(), step=batch)

#@tf.function 
def regularization_fn(model):
    """Penalty on the aspect matrix: sum of squares of (T T^T - I).

    T is the kernel of the layer named "rs", one row per aspect. The penalty
    is zero when the rows are orthonormal.
    NOTE(review): rows of T are not normalised before the product; confirm
    whether normalising them first is intended.
    """
    # The 'rs' layer has no bias, so its only weight is the kernel
    T = model.get_layer("rs").weights[0]
    TT = tf.matmul(T, T, transpose_b=True)
    # tf.eye builds the identity matrix (tf.identity would just return TT,
    # making the penalty identically zero)
    identity_matrix = tf.eye(tf.shape(TT)[0], dtype=TT.dtype)
    U = tf.reduce_sum(tf.square(TT - identity_matrix))
    return U

#@tf.function
def objective_fn(rs_pos, zs_pos, zs_neg):
  """Hinge loss summed over the batch.

  For every sample the term is 1 - <rs_pos, zs_pos> + <rs_pos, zs_neg>;
  only strictly positive terms contribute to the sum.
  """
  pos_similarity = tf.reduce_sum(rs_pos * zs_pos, axis=-1)
  neg_similarity = tf.reduce_sum(rs_pos * zs_neg, axis=-1)
  margin = 1 - pos_similarity + neg_similarity
  # 1.0 where the margin is violated, 0.0 elsewhere
  is_violated = tf.cast(tf.math.greater(margin, 0), tf.float32)
  return tf.reduce_sum(margin * is_violated, axis=0)

batch_size = 64
# Sentence pairs with a negative factor of 5, repeated 15 times and batched
dataset = tf.data.Dataset.from_generator(MyDatasetGenerator(encoded_sentences, 5), output_types=(tf.int32, tf.int32)).repeat(15).batch(batch_size)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0000001)

# Weight of the regularization term in the total loss
regularizer = 1.0

train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

# Keep a reference to the network so the trained weights can be used after
# training. A separate name is used because `model` holds the word2vec model.
abae_model = get_model()

# The sample count is taken from the encoded data; the counter `n` is only
# filled in when the cleaning step actually ran in this session.
train(dataset, abae_model, objective_fn, regularization_fn, optimizer, regularizer, encoded_sentences.shape[0], batch_size, train_summary_writer, train_loss)

Reusing TensorBoard on port 6006 (pid 1726), started 0:22:28 ago. (Use '!kill 1726' to kill it.)

<IPython.core.display.Javascript object>

NameError: ignored

# Testing section

In [1]:
# NOTE(review): hard-coded process id taken from an earlier TensorBoard
# message; it is session-specific, so this fails once that process is gone.
!kill 1726

/bin/bash: line 0: kill: (1726) - No such process


In [None]:
@tf.function
def train(dataset, model, loss_fn, optimizer):
  """Run one pass over `dataset` of (x, y) batches, applying one optimizer step per batch.

  NOTE: this definition reuses the name `train` of the ABAE training function
  earlier in the notebook; whichever cell ran last is the one in effect.
  """
  for features, targets in dataset:
    with tf.GradientTape() as tape:
      # training=True only matters for layers that behave differently during
      # training and inference (e.g. Dropout).
      loss = loss_fn(targets, model(features, training=True))
    variables = model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))


@tf.function
def loss_fn(y, prediction):
  """Mean squared error between the targets `y` and `prediction`."""
  residual = tf.subtract(y, prediction)
  squared_error = tf.square(residual)
  return tf.reduce_mean(squared_error)

# Toy experiment: a one-input model computing a + input * b, trained with the
# `train` and `loss_fn` helpers defined in this cell.
a = tf.Variable(10.0, dtype=tf.float32)
b = tf.Variable(5.0, dtype=tf.float32)
# NOTE(review): `input` shadows the Python builtin of the same name.
input = tf.keras.layers.Input([1])
output = tf.add(a, tf.multiply(input, b))
# NOTE(review): this rebinds `model`, the name that holds the gensim word2vec
# model in the earlier cells.
model = tf.keras.Model(inputs=input, outputs=output)

# dx = np.arange(-100,100, dtype=np.float32)
# dy = (10.0 + 3.0 * dx + np.random.rand(200)).astype(np.float32)
# dataset = tf.data.Dataset.from_tensor_slices((dx.reshape([-1,1]), dy)).batch(25)
# NOTE(review): `DatasetGenerator` is not defined anywhere in the visible
# notebook; the (low, high) generator class further down is named
# `MyDatasetGenerator`. Confirm which name is intended.
dataset = tf.data.Dataset.from_generator(DatasetGenerator(-100,100), output_types=(tf.float32, tf.float32)).batch(25)

optimizer = tf.keras.optimizers.Adam(lr=0.0001)
train(dataset, model, loss_fn, optimizer)

In [None]:
# Display the current value of the variable `a`
a

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.0>

In [None]:
# Run the toy model on the single input value 5
model.predict([5])

array([[35.]], dtype=float32)

In [None]:
# Materialise every element of `dataset` as NumPy values (displays the whole dataset)
list(dataset.as_numpy_iterator())


In [None]:
class MyDatasetGenerator:
  """Generator factory for a noisy linear toy dataset.

  X holds the values low, low+1, ..., high-1 (one per row) and
  y = 10 + 3 * x + noise, with noise drawn uniformly from [0, 1).

  NOTE: this class reuses the name of the sentence-pair generator defined
  earlier in the notebook; whichever cell ran last is the one in effect.

  Args:
    low: first value of x (inclusive).
    high: end of the x range (exclusive).
  """

  def __init__(self, low, high):
    data = np.arange(low, high, dtype=np.float32)
    self.X = data.reshape([-1,1])
    # One noise value per sample, whatever the size of the range
    noise = np.random.rand(data.shape[0])
    self.y = (10.0 + 3.0 * data + noise).astype(np.float32)

  def __call__(self):
    """Yield (x, y) pairs; x has shape (1,), y is a scalar."""
    for x, y in zip(self.X, self.y):
      yield (x,y)
