# ASUM

In [None]:
import os
import re
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from reviews.config import processed_data_dir, asum_input_dir, asum_output_dir
from reviews.preprocess import preprocess
from reviews.models import asum

In [None]:
# Load the processed digital-camera reviews and discard rows that are
# incomplete or duplicated.
df = pd.read_json(processed_data_dir / "reviews_digital_cameras.json.gz", orient="records")
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Work on a small random subset. The size is capped at the number of rows that
# remain, because DataFrame.sample raises ValueError when asked for more rows
# than exist (sampling without replacement).
df = df.sample(min(100, len(df)))  # subset

df.reset_index(inplace=True, drop=True)
df.info()

## Preprocessing

In [None]:
# Run the project's preprocess function over every review text and store the
# result in a new "preprocess" column.
df["preprocess"] = df["text"].apply(preprocess)

In [None]:
# Flatten the corpus (documents -> sentences -> words) into one token list,
# then count how often each token occurs.
tokens = []
for doc in df["preprocess"]:
  for sent in doc:
    tokens.extend(sent)
c = Counter(tokens)

In [None]:
# Build a word cloud from the token frequencies; the final expression is the
# rendered image, which the notebook displays as the cell output.
w = WordCloud(width=800, height=400, background_color="white")
w.fit_words(c)
w.to_image()

In [None]:
# Map every distinct token to an integer id. The tokens are sorted first so the
# ids (and the files derived from them) are identical on every run: iterating a
# bare set of strings yields an order that can change between interpreter
# sessions because of string hash randomization.
vocabulary = {word: index for index, word in enumerate(sorted(set(tokens)))}
print(len(vocabulary))

## Generate data for ASUM

In [None]:
# Dump the vocabulary to WordList.txt, one word per line, in the dict's
# iteration order.
with open(asum_input_dir / "WordList.txt", "w") as f:
  f.writelines(f"{word}\n" for word in vocabulary)

In [None]:
# Write the corpus to BagOfSentences.txt: for each document, one line holding
# its sentence count, followed by one line per sentence listing the
# space-separated vocabulary ids of that sentence's words.
sentence_list = asum_input_dir / "BagOfSentences.txt"

# Mode "w" truncates any file left over from a previous run, so no separate
# exists/remove step is needed.
with open(sentence_list, "w") as f:
  for doc in df["preprocess"]:
    f.write(f"{len(doc)}\n")
    # The loop variable is deliberately not named `tokens`: that name holds the
    # flattened corpus token list in this notebook and must not be rebound, or
    # re-running the vocabulary cell would see only the last sentence.
    for sentence in doc:
      indexes = " ".join(str(vocabulary[t]) for t in sentence)
      f.write(f"{indexes}\n")

In [None]:
# Invoke the project's `asum` helper (imported from reviews.models).
# NOTE(review): the positional arguments are presumably alpha, beta, gamma and
# the number of topics -- confirm against the helper's signature.
# NOTE(review): iterations is "500" here, while the results file read later in
# this notebook has "I100" in its name -- verify that the two agree.
asum("0.1", "0.001/0.1/0", "1/1", "5", iterations="500")

## Print Values

In [None]:
# Load the ProbWords CSV from the ASUM output directory. This rebinds `df`,
# which until now held the review data.
# NOTE(review): the file name contains "I100" while the asum call in this
# notebook passes iterations="500" -- confirm this is the intended file.
prob_words_path = (
    asum_output_dir / "STO2-T5-S2(2)-A0.1-B0.001,0.1,0.0-G1.0,1.0-I100-ProbWords.csv"
)
df = pd.read_csv(prob_words_path)

def map_topic_sentiment(column_name):
  """Parse a column label of the form "<letter><n>-<letter><m>", e.g. "S0-T3".

  Args:
    column_name: Column label. It is split on "-"; the first part carries the
      sentiment index and the second part the topic index, each preceded by a
      one-letter prefix.

  Returns:
    A dict with the integer "sentiment" and "topic" indexes plus the original
    label under "colname", or None when the label contains "Unnamed" (the name
    pandas gives to header-less columns).
  """
  if "Unnamed" in column_name:
    return None

  values = column_name.split("-")
  # Take everything after the prefix letter so that multi-digit indexes
  # (e.g. "T12") are read in full instead of being cut to their first digit.
  return {
    "sentiment": int(values[0][1:]),
    "topic": int(values[1][1:]),
    "colname": column_name,
  }

# Parse every column label of the loaded table; labels containing "Unnamed"
# map to None.
columns = df.columns.map(map_topic_sentiment)

In [None]:
# Matches "<word> (<weight>)". Raw string so the backslashes reach the regex
# engine untouched; \S+ lets the word contain hyphens or apostrophes.
_WORD_WEIGHT_RE = re.compile(r"(\S+)\s+\((.*?)\)")


def get_word_weight(item):
  """Split a "word (weight)" string into the word and its numeric weight.

  Args:
    item: String of the form "<word> (<weight>)", e.g. "lens (0.0123)".

  Returns:
    A (word, weight) tuple, with weight converted to float.

  Raises:
    ValueError: If item does not have the expected form, or the text between
      the parentheses is not a valid float.
  """
  match = _WORD_WEIGHT_RE.match(item)
  if match is None:
    # Fail with a message naming the offending cell instead of the opaque
    # AttributeError that calling .groups() on None would produce.
    raise ValueError(f"cannot parse word/weight from {item!r}")
  word, weight = match.groups()
  return word, float(weight)

# Number of highest-ranked words shown in each chart.
n_top_words = 10

# Keep only the parsed topic/sentiment columns; unparsed ones are None.
topic_columns = [item for item in columns if item is not None]
if not topic_columns:
  raise ValueError("no topic/sentiment columns found in the loaded table")

# Size the grid from the data (one row per sentiment, one column per topic).
# The words-per-chart setting must not drive the layout: it only matches the
# topic count by coincidence.
n_sentiments = max(item["sentiment"] for item in topic_columns) + 1
n_topics = max(item["topic"] for item in topic_columns) + 1

# squeeze=False keeps `axes` two-dimensional even for a single row or column.
fig, axes = plt.subplots(
    n_sentiments, n_topics, figsize=(15, 6), sharex=True, squeeze=False
)

for item in topic_columns:
  # Place each chart by its own indexes rather than by column position, so the
  # layout does not depend on the order of the columns in the file.
  row, col = item["sentiment"], item["topic"]
  ax = axes[row, col]

  # Take the first n_top_words non-empty cells and parse "word (weight)".
  words = df[item["colname"]].dropna().head(n_top_words)
  weights = pd.DataFrame(
      [get_word_weight(cell) for cell in words], columns=["names", "weights"]
  )
  # Ascending sort puts the heaviest word at the top of the horizontal bars.
  weights.sort_values("weights", inplace=True)

  ax.barh(weights["names"], weights["weights"], height=0.5)

  # Topic labels go under the bottom row only.
  if row == n_sentiments - 1:
    ax.set_xlabel(f"Topic {col + 1}")

  # Sentiment labels go on the first chart of each row.
  if col == 0:
    ax.set_ylabel("Positive" if row == 0 else "Negative")

fig.tight_layout()