# JST (Joint Sentiment-Topic Model)

In [9]:
import json
import time
from collections import Counter
from itertools import chain

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pandarallel import pandarallel
from sklearn.metrics import f1_score
from wordcloud import WordCloud

from reviews.config import bin_dir, data_dir, out_dir, processed_data_dir
from reviews.models import JST
from reviews.utils import flat_sentence_tokens


# Initialise pandarallel with 8 workers and progress bars switched off.
pandarallel.initialize(progress_bar=False, nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Generate Input Data

In [3]:
# Normalisation variant: part of the input/metrics file names and the key
# used to pick the matching word lists out of sentiwords.json.
NORM = "stemming"
# Field identifier: part of the input and metrics file names.
FIELD = "text"

In [4]:
# Load the pre-processed reviews selected by FIELD and NORM.
reviews_path = processed_data_dir / f"reviews_{FIELD}_{NORM}.json.gz"
reviews_df = pd.read_json(reviews_path, orient="records")

In [None]:
# Blank out over-long reviews (more than MAX_TOKENS tokens across all sentences).
# Rows are emptied rather than dropped so that row positions keep lining up with
# the per-document model output that is later assigned back onto `reviews_df`.
MAX_TOKENS = 50

n_tokens = reviews_df["tokens"].apply(lambda sents: sum(len(s) for s in sents))
too_long = n_tokens > MAX_TOKENS

# Use a real empty list, not the string "[]": iterating that string downstream
# would yield the characters "[" and "]" as if they were tokens.
reviews_df["tokens"] = pd.Series(
    [[] if drop else sents for sents, drop in zip(reviews_df["tokens"], too_long)],
    index=reviews_df.index,
)

print(f"Too Long Reviews: {too_long.sum() / len(reviews_df) * 100:.2f}%")

In [5]:
# docs: one line per review, "d<row position> <space-joined tokens>"
sentences = [" ".join(chain.from_iterable(review)) for review in reviews_df["tokens"]]
docs = [f"d{position} {sentence}" for position, sentence in enumerate(sentences)]

docs_path = data_dir / "jst" / "docs.dat"
with open(docs_path, "w") as f:
    f.write("\n".join(docs))

In [6]:
# vocabulary: one "<token> <number of occurrences in `tokens`>" line per distinct token
tokens = flat_sentence_tokens(reviews_df["tokens"])
token_counts = Counter(tokens)
vocabs = [f"{token} {occurrences}" for token, occurrences in token_counts.items()]

wordmap_path = data_dir / "jst" / "wordmap.txt"
with open(wordmap_path, "w") as f:
    f.write("\n".join(vocabs))

In [7]:
# sentiment words: pick the word lists stored under the current NORM key
with open(data_dir / "sentiwords.json", "r") as f:
    senti_words = json.load(f)

normalized = senti_words[NORM]

# One line per word: positive words are tagged "1 0", negative words "0 1".
pos_words = [f"{word} 1 0" for word in normalized["positive"]]
neg_words = [f"{word} 0 1" for word in normalized["negative"]]
senti_lines = pos_words + neg_words

with open(data_dir / "jst" / "sentiwords.txt", "w") as f:
    f.write("\n".join(senti_lines))

## Run

In [None]:
# JST execution: train the model `n_runs` times with the same settings and score
# the document-level sentiment of every run against the star ratings.
n_topics = 10
n_runs = 10
iterations = 1000

# Passed to `JST.estimate` unchanged.
# NOTE(review): -1 presumably tells the binary to use its own defaults — confirm.
alpha = -1
beta = -1
gamma = -1

# The ground truth does not depend on the run, so it is built once:
# 3-star reviews are left out, ratings below 3 are "negative", the rest "positive".
rated = reviews_df["overall"] != 3
y_true = (
    reviews_df.loc[rated, "overall"]
    .apply(lambda stars: "negative" if stars < 3 else "positive")
    .astype("category")
)

metrics = []
for run in range(n_runs):
    # train the model
    model = JST(bin_dir, data_dir / "jst", out_dir / "jst")
    start_time = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
    model.estimate(alpha, beta, gamma, n_topics, iterations)
    print(f"Run {run}: {time.perf_counter() - start_time}s")

    # compute metrics
    # final.pi is read as 5 space-separated columns; only columns 2 and 3 are
    # kept and named S0 / S1.
    pi = pd.read_csv(out_dir / "jst" / "final.pi", sep=" ", header=None).drop(
        columns=[0, 1, 4]
    )
    pi.columns = ["S0", "S1"]

    # A document is "positive" when S0 is its largest column, "negative" otherwise.
    doc_sentiment = pi.idxmax(axis=1).map(
        lambda col: "positive" if col == "S0" else "negative"
    )
    reviews_df["sentiment"] = doc_sentiment

    y_pred = reviews_df.loc[rated, "sentiment"].astype("category")

    metrics.append([
        run, n_topics,
        f1_score(y_true, y_pred, average="macro"),
        f1_score(y_true, y_pred, average="weighted"),
        f1_score(y_true, y_pred, average="micro"),
    ])

metrics_df = pd.DataFrame(metrics, columns=['run', 'n_topics', 'f1_macro', 'f1_weighted', 'f1_micro'])
metrics_df.to_csv(out_dir / 'jst' / f'metrics_jst_{n_topics}_{FIELD}_{NORM}.csv')
metrics_df

## Prepare Data for Analysis

In [None]:
# columns name: "S<sentiment>-T<topic>", all topics of S0 first, then all topics of S1
columns_name = [f"S{s}-T{t}" for s in range(2) for t in range(n_topics)]

In [None]:
# prepare phi file for analysis
# The file is read as alternating lines: a label line, then a line of
# space-separated floats belonging to that label.
with open(out_dir / "jst" / "final.phi") as f:
    lines = f.readlines()

phi_by_label = {}
for label_line, values_line in zip(lines[0::2], lines[1::2]):
    pieces = (piece.strip() for piece in values_line.split(" "))
    phi_by_label[label_line.strip()] = [float(piece) for piece in pieces if piece != ""]

phi = pd.DataFrame(phi_by_label)
phi.columns = columns_name
phi

In [None]:
# prepare pi file for analysis: keep columns 2 and 3 only and name them S0 / S1
pi_path = out_dir / "jst" / "final.pi"
pi = pd.read_csv(pi_path, sep=" ", header=None).drop(columns=[0, 1, 4])
pi.columns = ["S0", "S1"]
pi

In [None]:
# Parse final.theta into one row per record.
# The file is read in groups of 3 lines: a header line (its content is ignored)
# followed by two lines of space-separated floats, which are concatenated in
# file order so that they line up with `columns_name`.
with open(out_dir / "jst" / "final.theta") as f:
    lines = f.readlines()

theta = []
current = []
for i, line in enumerate(lines):
    if i % 3 == 0:
        # Header line: a new record starts, so store the one collected so far.
        if current:
            theta.append(current)
            current = []
    else:
        current.extend(float(value) for value in line.split())

# The last record is not followed by a header line, so it has to be stored
# explicitly; without this the final document would be missing from `theta`.
if current:
    theta.append(current)

theta = pd.DataFrame(theta, columns=columns_name)
theta

## Analysis

In [None]:
# Label every document by its larger pi column (S0 -> positive, otherwise
# negative); documents with S0 exactly 0.5 are relabelled "neutral".
pi_df = pi
is_positive = pi_df.idxmax(axis=1).eq("S0")
doc_sentiment = is_positive.map({True: "positive", False: "negative"})
doc_sentiment = doc_sentiment.mask(pi_df["S0"].eq(0.5), "neutral")

reviews_df["sentiment"] = doc_sentiment

# Left: histogram of the labels. Right: the same counts as a pie chart.
fig, (ax_hist, ax_pie) = plt.subplots(1, 2, figsize=(10, 3))
sns.histplot(x=doc_sentiment, ax=ax_hist)

counts = doc_sentiment.value_counts()
counts.plot(kind="pie", ax=ax_pie, ylabel="sentiment", autopct="%.0f%%")

In [None]:
def get_word_indexes(path=None):
    """Read a word map file and return an id -> word lookup.

    The first line of the file is skipped (presumably a header such as the
    vocabulary size — confirm against the JST output format). Every following
    line is read as ``<word> <id>`` separated by whitespace; blank lines and
    lines with fewer than two fields are ignored.

    Args:
        path: File to read. Defaults to ``out_dir / "jst" / "wordmap.txt"``.

    Returns:
        dict mapping each id (kept as a string) to its word.
    """
    if path is None:
        path = out_dir / "jst" / "wordmap.txt"

    words = {}
    with open(path) as f:
        next(f, None)  # skip the first line
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                # e.g. a trailing empty line; nothing to record
                continue
            word, idx = fields[0], fields[1]
            words[idx] = word
    return words


# id (as string) -> word lookup read from the default wordmap.txt location.
words_map = get_word_indexes()

In [None]:
# Word cloud of the "S1-T0" column of phi: each row index of phi is translated
# to its word through `words_map` and weighted by the value in that column.
word_labels = [words_map[str(idx)] for idx in phi.index]
frequencies = dict(zip(word_labels, phi["S1-T0"].values))

wc = WordCloud(height=400, width=800)
wc.generate_from_frequencies(frequencies)
wc.to_image()

In [None]:
def map_topic_sentiment(column_name):
    """Split a ``"S<sentiment>-T<topic>"`` column label into its parts.

    Args:
        column_name: Column label such as ``"S1-T4"``.

    Returns:
        ``None`` when the label contains ``"Unnamed"``; otherwise a dict with
        the integer ``"sentiment"`` and ``"topic"`` ids and the original label
        under ``"colname"``.
    """
    if "Unnamed" in column_name:
        return None

    parts = column_name.split("-")
    return {
        # Drop the one-letter prefix and parse everything after it, so ids
        # with more than one digit are read in full for both parts.
        "sentiment": int(parts[0][1:]),
        "topic": int(parts[1][1:]),
        "colname": column_name,
    }


# For every row of theta, list the columns whose value is at least 0.1 and
# attach their parsed (sentiment, topic) descriptions to the reviews.
above_threshold = theta >= 0.1
mask = above_threshold.apply(
    lambda row: list(above_threshold.columns[row]), axis=1
)

reviews_df["topics"] = mask.apply(
    lambda colnames: [map_topic_sentiment(colname) for colname in colnames]
)

In [None]:
# Count in how many reviews each topic appears; a topic is counted once per
# review even when it shows up under both sentiments.
count = Counter()
for review_topics in reviews_df["topics"].dropna().values:
    count.update({"T" + str(entry["topic"]) for entry in review_topics})

topics_count = pd.DataFrame(count.items(), columns=["topic", "count"])
topics_count["topic"] = topics_count["topic"].astype("category")

# Topics from most to least frequent.
order = topics_count.sort_values(by="count", ascending=False).topic

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    x=topics_count["count"], y=topics_count["topic"], order=order, orient="h", ax=ax
)
ax.set_title("Topics")
fig.tight_layout()

In [None]:
# Share of positive vs negative occurrences per topic, as a stacked bar chart.
pos_count = Counter()
neg_count = Counter()

for review_topics in reviews_df["topics"].dropna().values:
    # Sentiment id 0 is treated as positive and 1 as negative.
    pos_count.update({"T" + str(st["topic"]) for st in review_topics if st["sentiment"] == 0})
    neg_count.update({"T" + str(st["topic"]) for st in review_topics if st["sentiment"] == 1})

pos_df = pd.DataFrame(pos_count.items(), columns=["topic", "pos"])
neg_df = pd.DataFrame(neg_count.items(), columns=["topic", "neg"])

# Outer join so that a topic seen with only one polarity is kept (with a zero
# count for the other one) instead of being dropped from the chart.
st_counts = (
    pd.merge(pos_df, neg_df, on="topic", how="outer")
    .fillna(0)
    .set_index("topic")
    .astype(float)
)

total = st_counts["pos"] + st_counts["neg"]
st_counts["pos"] = st_counts["pos"] / total * 100
st_counts["neg"] = st_counts["neg"] / total * 100

# Use the topic ranking computed for the "Topics" chart, reversed because barh
# draws the first row at the bottom. Rows are selected by label rather than by
# position, so missing topics or multi-digit topic ids cannot misalign them.
st_counts = st_counts.reindex(list(order)[::-1])

fig, ax = plt.subplots(figsize=(10, 7))
st_counts.plot(
    # Colours follow the column order: "pos" in green, "neg" in red.
    kind="barh", stacked=True, color=["green", "red"], ax=ax, title="Topics Sentiment"
)
fig.tight_layout()

In [None]:
def _rating_to_label(stars):
    """Map a star rating to a label: below 3 negative, exactly 3 neutral, otherwise positive."""
    if stars < 3:
        return "negative"
    if stars == 3:
        return "neutral"
    return "positive"


reviews_df["true"] = reviews_df["overall"].apply(_rating_to_label).astype("category")

In [None]:
# Contingency table: rating-derived labels (rows) against model labels (columns).
predicted = reviews_df["sentiment"].astype("category")
pd.crosstab(index=reviews_df["true"], columns=predicted)

In [None]:
# Weighted F1 over all reviews; unlike the per-run metrics, 3-star reviews are
# not filtered out here.
f1_score(
    y_true=reviews_df["true"],
    y_pred=reviews_df["sentiment"],
    average="weighted",
)