In [54]:
import numpy as np
import pandas as pd
from scipy import optimize
from scipy.special import digamma


In [55]:
import time

In [86]:
def normalize_rows(input_matrix):
    """
    Scale every row of a 2d input_matrix so that its entries sum to 1.

    A new matrix is returned; input_matrix itself is left untouched.
    """
    totals = input_matrix.sum(axis=1)
    # Reshape the per-row totals into a column so that broadcasting divides
    # each row by its own total.
    return input_matrix / np.expand_dims(totals, axis=1)


def normalize_columns(input_matrix):
    """
    Normalizes the columns of a 2d input_matrix so they sum to 1

    A new matrix is returned; input_matrix itself is not modified.
    """
    col_sums = input_matrix.sum(axis=0)
    # Index with `[np.newaxis, :]` (note the comma) to get an explicit
    # (1, n_cols) row vector, so each column is divided by its own sum.
    new_matrix = input_matrix / col_sums[np.newaxis, :]
    return new_matrix

def normalize_vector(input_vector):
    """
    Scale input_vector so that its entries sum to 1
    """
    total = input_vector.sum()
    return input_vector / total

# Function Definitions

In [57]:
def build_corpus(df, N=0):
    """
    Return documents list, ratings list, and number of documents

    df: dataframe with a 'review_words' column (one document per row) and a
        'rating' column whose values can be converted with float()
    N:  if > 0, keep only the first N documents (here for testing purposes);
        otherwise every row is kept

    Ratings are returned as plain floats; no rescaling is applied.
    """
    # Column access avoids constructing a Series per row, as iterrows() does.
    documents = df['review_words'].tolist()
    ratings = [float(rating) for rating in df['rating']]

    # here for testing purposes
    if N > 0:
        documents = documents[:N]
        ratings = ratings[:N]

    # Count what was actually kept: N may be larger than the number of rows,
    # in which case reporting N would disagree with len(documents).
    number_of_documents = len(documents)
    return documents, ratings, number_of_documents

In [77]:
def build_vocabulary(documents, size_V=0):
    """
    Construct the sorted list of unique words in the whole corpus,
    for example: ["rain", "the", ...]

    documents: iterable of documents, each an iterable of words
    size_V:    if > 0, keep only the first size_V words of the sorted
               vocabulary; otherwise keep every word

    Returns (vocabulary, vocabulary_size).
    """
    words = set()
    for line in documents:
        words.update(line)
    vocabulary = sorted(words)

    if size_V > 0:
        vocabulary = vocabulary[:size_V]

    # Report the real length: size_V may exceed the number of unique words,
    # and a too-large size would create columns with no matching term.
    vocabulary_size = len(vocabulary)

    return vocabulary, vocabulary_size

In [68]:
def build_term_doc_matrix(documents, number_of_documents, vocabulary, vocabulary_size):
    """
    Construct the term-document matrix where each row represents a document,
    and each column represents a vocabulary term.

    term_doc_matrix[i][j] is the count of term j in document i

    documents:           list of documents, each an iterable of words
    number_of_documents: number of rows of the returned matrix
    vocabulary:          list of terms; position in the list is the column index
    vocabulary_size:     number of columns of the returned matrix

    Words that are not in vocabulary (e.g. when the vocabulary was truncated)
    are ignored rather than raising KeyError.

    Returns a float numpy array of shape (number_of_documents, vocabulary_size).
    """
    idx = dict(zip(vocabulary, range(vocabulary_size)))
    # Builtin float (== float64); the `np.float` alias was removed from NumPy.
    term_doc_matrix = np.zeros([number_of_documents, vocabulary_size], dtype=float)
    for i, document in enumerate(documents):
        for word in document:
            column = idx.get(word)
            if column is not None:
                term_doc_matrix[i, column] += 1

    return term_doc_matrix


# Inputs

In [82]:
# number of aspects (k)
number_of_aspects = 5

# N - number of documents
N = 5

# size_V - vocabulary size
size_V = 5

# doc_df - dataframe of docs
doc_df = pd.read_pickle('processed_amazon_reviews.pkl')
# Compare ratings as numbers: comparing the raw values against strings such as
# '3.0' is lexicographic and only happens to work for single-digit ratings.
high_ratings_df = doc_df[doc_df.rating.astype(float) > 3.0]
low_ratings_df = doc_df[doc_df.rating.astype(float) < 4.0]


In [61]:
# build document list and ratings list and get number of documents
# (high-rating subset only; no N is passed, so every row of that subset is kept)
documents, ratings, number_of_documents = build_corpus(high_ratings_df)

In [78]:
# build vocabulary and get vocabulary size
# (no size_V is passed, so the full sorted vocabulary is kept)
vocabulary, vocabulary_size = build_vocabulary(documents)

In [79]:
# build term doc matrix: one row per document, one column per vocabulary term
term_doc_matrix = build_term_doc_matrix(documents, number_of_documents, vocabulary, vocabulary_size)

In [80]:
# sanity check: shape is (number_of_documents, vocabulary_size)
term_doc_matrix.shape

(11533, 8628)

# Initialize Corpus Level Parameters

In [None]:
# Length (in words) of the longest document; sets the second dimension of z.
max_doc_length = max((len(document) for document in documents), default=0)

# epsilon - random (vocabulary_size x number_of_aspects) matrix whose columns each sum to 1
epsilon = normalize_columns(np.random.rand(vocabulary_size, number_of_aspects))
# s - one zero-initialized value per (document, aspect); presumably aspect ratings - TODO confirm
s = np.zeros([number_of_documents, number_of_aspects])
# alpha - random (number_of_documents x number_of_aspects) matrix whose rows each sum to 1
alpha = normalize_rows(np.random.rand(number_of_documents, number_of_aspects))

# z - random aspect index in [0, number_of_aspects) for every (document, word position) slot
z = np.random.randint(number_of_aspects, size=[number_of_documents, max_doc_length])


In [91]:
# gamma - distribution of aspects in whole corpus (random draw rescaled to sum to 1)
gamma = normalize_vector(np.random.random_sample(size=number_of_aspects))
# beta - one row per aspect, one column per vocabulary term, drawn uniformly from [-1, 1)
beta = np.random.uniform(low=-1, high=1, size=(number_of_aspects, vocabulary_size))

# Initialize Document Level Parameters