# ASUM

In [43]:
import re
import json
import time

import numpy as np
import pandas as pd

from itertools import chain
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from reviews.config import (
    processed_data_dir,
    asum_model_path,
    asum_input_dir,
    asum_output_dir,
    data_dir,
)
from reviews.utils import flat_sentence_tokens
from reviews.models import ASUM

from sklearn.metrics import f1_score

## Generate Input Data

In [44]:
# Normalization variant. Selects the processed reviews file to load, the word
# lists read from sentiwords.json, and tags the metrics file name.
NORM = "stemming"  # stemming, lemmatization
# Field variant. Selects the processed reviews file to load and tags the
# metrics file name.
FIELD = "text"  # text, text+summary

In [45]:
# load tokens
reviews_df = pd.read_json(processed_data_dir / f"reviews_{FIELD}_{NORM}.json.gz")

# Single source of truth for the sentence-length cut-off, shared by the
# statistic and the filter below so the two can never drift apart.
# NOTE(review): presumably mirrors ASUM's own sentence-length limit (its log
# reports "Too Long Sentences") — confirm against the ASUM implementation.
MAX_SENTENCE_LEN = 50

# Both statistics are computed BEFORE filtering:
#  - "too long": at least one sentence of the review exceeds the limit
#  - "empty":    the review has no tokens at all across its sentences
too_long = reviews_df["tokens"].apply(lambda x: any(len(sent) > MAX_SENTENCE_LEN for sent in x))
empty = reviews_df["tokens"].apply(lambda x: len(list(chain.from_iterable(x))) == 0)

print(f"Too Long Reviews: {too_long.sum() / len(reviews_df) * 100:.2f}%")
print(f"Empty Reviews: {empty.sum() / len(reviews_df) * 100:.2f}%")

# Drop only the over-long sentences; the review row itself is kept (possibly
# with fewer, or zero, sentences).
reviews_df["tokens"] = reviews_df["tokens"].apply(
    lambda x: [sent for sent in x if len(sent) <= MAX_SENTENCE_LEN]
)

Too Long Reviews: 0.50%
Empty Reviews: 0.05%


In [46]:
# Persist the filtered reviews as gzip-compressed JSON so the file content
# matches its ".json.gz" name and can be read back with pd.read_json, like the
# processed input above. (to_csv stored gzip-compressed CSV under this name.)
reviews_df.to_json("reviews_filtered.json.gz")

In [47]:
# vocabulary
tokens = flat_sentence_tokens(reviews_df["tokens"])

# Fix ONE explicit word order and derive both the word -> index mapping and
# WordList.txt from it, so that index i always refers to line i of the file.
# Iterating two separately built sets (even with equal contents) is not
# guaranteed to yield the same order, which could silently mismatch the
# indices written to BagOfSentences.txt. Sorting also makes the output
# reproducible across kernel restarts.
word_list = sorted(set(tokens))
vocabulary = {word: idx for idx, word in enumerate(word_list)}

# Unordered view of the vocabulary, kept for membership tests in later cells.
vocabs = set(vocabulary.keys())

with open(asum_input_dir / "WordList.txt", "w") as f:
    f.write("\n".join(word_list))

In [48]:
# docs
# BagOfSentences.txt layout, per document: one line with the number of
# sentences, followed by one line per sentence holding the space-separated
# vocabulary indices of its tokens.
#
# The lines are built before the file is opened, so a failed vocabulary lookup
# does not leave a truncated file behind. Comprehension variables are used for
# the inner loops so the notebook-level `tokens` list (flat token list from
# the vocabulary cell) is not overwritten by a loop variable.
lines = []
for doc in reviews_df["tokens"]:
    lines.append(str(len(doc)))
    lines.extend(
        " ".join(str(vocabulary[word]) for word in sentence)
        for sentence in doc
    )

with open(asum_input_dir / "BagOfSentences.txt", "w") as f:
    f.write("\n".join(lines))

In [49]:
# sentiment words
# Read the seed lexicon and pick the word lists matching the active
# normalization (NORM).
with open(data_dir / "sentiwords.json", "r") as f:
    senti_words = json.load(f)

normalized = senti_words[NORM]
pos_words = normalized["positive"]
neg_words = normalized["negative"]

# One seed file per sentiment: file 0 receives the positive words, file 1 the
# negative words; each file holds one word per line.
seed_files = {
    "SentiWords-0.txt": pos_words,
    "SentiWords-1.txt": neg_words,
}
for seed_name, seed_words in seed_files.items():
    with open(data_dir / "asum" / seed_name, "w") as f:
        f.write("\n".join(seed_words))

## Run

In [50]:
n_runs = 10
n_topics = 50
iterations = 1000

alpha = 0.1  # 50 / n_topics # 0.1
beta = [0.001, 0.1, 0]
gamma = [1, 1]

# --- loop-invariant setup -------------------------------------------------
# Prefix of the files ASUM writes for this hyper-parameter setting; it only
# depends on the settings above, so it is built once.
beta_tag = ",".join(f"{float(x)}" for x in beta)
gamma_tag = ",".join(f"{float(x):.1f}" for x in gamma)
filename_prefix = f"STO2-T{n_topics}-S2(2)-A{alpha}-B{beta_tag}-G{gamma_tag}-I{iterations}"

# Ground truth: 3-star reviews are excluded from evaluation; below 3 stars is
# "negative", above is "positive". The ratings do not change between runs.
rated = reviews_df["overall"] != 3
gt = reviews_df.loc[rated, "overall"]
y_true = gt.apply(lambda x: "negative" if x < 3 else "positive").astype("category")

metrics = []
for run in range(n_runs):
    # train the model
    model = ASUM(asum_model_path, asum_input_dir, asum_output_dir)
    start_time = time.time()
    model.estimate(alpha, beta, gamma, n_topics, iterations)
    print(f"Run {run}: {(time.time() - start_time)}s")

    # compute metrics
    # Each row of the Pi file is scored per sentiment column; the column with
    # the highest value decides the document label (column 0 -> positive).
    pi_df = pd.read_csv(asum_output_dir / f"{filename_prefix}-Pi.csv", header=None)
    doc_sentiment = pi_df.idxmax(axis=1).map(lambda x: "positive" if x == 0 else "negative")

    # Assign by POSITION, not by index label: row i of the Pi file belongs to
    # the i-th document written to BagOfSentences.txt, whatever index labels
    # reviews_df carries. A row-count mismatch raises here instead of being
    # silently aligned.
    reviews_df["sentiment"] = doc_sentiment.to_numpy()

    pred = reviews_df.loc[rated, "sentiment"]
    y_pred = pred.astype("category")

    metrics.append([
        run, n_topics,
        f1_score(y_true, y_pred, average="macro"),
        f1_score(y_true, y_pred, average="weighted"),
        f1_score(y_true, y_pred, average="micro"),
    ])

metrics_df = pd.DataFrame(metrics, columns=['run', 'n_topics', 'f1_macro', 'f1_weighted', 'f1_micro'])
metrics_df.to_csv(asum_output_dir / f'metrics_asum_{n_topics}_{FIELD}_{NORM}.csv')
metrics_df

Documents: 28983
Unique Words: 20362
Documents: 28983
Unique Words: 20362
Topics: 50
Sentiments: 2 (dictionary: 2)
Alpha: 0.1
Beta: 
0.001 0.1 0 
Gamma: 1 1 
Iterations: 1000
Threads: 3
Input Dir: /home/ubuntu/Desktop/amazon-reviews/data/asum
Dictionary Dir: /home/ubuntu/Desktop/amazon-reviews/data/asum
Output Dir: /home/ubuntu/Desktop/amazon-reviews/output/asum
Too Long Sentences: 0
Gibbs sampling started (Iterations: 1000, Threads: 3)
  - Iteration 0
glad/231/0/ perfect/2229/0/ fun/85/0/ worth/700/0/ happi/982/0/ superior/35/0/ great/8053/0/ enjoy/121/0/ posit/171/0/ comfort/50/0/ favorit/74/0/ attract/32/0/ awesom/684/0/ thank/706/0/ fantast/323/0/ nice/2088/0/ recommend/1952/0/ best/1320/0/ amaz/772/0/ good/5283/0/ satisfi/255/0/ fortun/65/0/ correct/295/0/ love/1176/0/ impress/352/0/ excel/1274/0/ 
terribl/0/126/ annoy/0/187/ not_worth/0/135/ worst/0/66/ inferior/0/15/ not_like/0/711/ negat/0/100/ problem/0/1747/ unfortun/0/201/ poor/0/179/ unaccept/0/28/ junk/0/98/ upset/0/17/ ha

Unnamed: 0,run,n_topics,f1_macro,f1_weighted,f1_micro
0,0,50,0.62843,0.844109,0.81509
1,1,50,0.621672,0.8402,0.809873
2,2,50,0.618471,0.83455,0.800934
3,3,50,0.616923,0.838309,0.807684
4,4,50,0.638262,0.849304,0.82184
5,5,50,0.623417,0.83629,0.802831
6,6,50,0.630242,0.842991,0.812755
7,7,50,0.636666,0.848965,0.821621
8,8,50,0.628552,0.840253,0.80845
9,9,50,0.622294,0.835045,0.800971
