In [None]:
import numpy as np

import pandas as pd
from collections import Counter
from wordcloud import WordCloud
from pandarallel import pandarallel

from reviews.config import processed_data_dir, asum_input_dir
from reviews.preprocess import preprocess

# Start the pandarallel worker pool (4 workers, progress bars enabled) so that
# `parallel_apply` is available on pandas objects in the cells below.
pandarallel.initialize(
    nb_workers=4,
    progress_bar=True,
)

In [None]:
# Load the product reviews (records-oriented JSON; the .gz extension lets
# pandas infer the compression) and print a summary of the resulting frame.
reviews_df = pd.read_json(
    path_or_buf=processed_data_dir / "products_reviews.json.gz",
    orient="records",
)
reviews_df.info()

In [None]:
# Run the project `preprocess` helper (lemmatization enabled) over every review
# text in parallel, store the result in a "tokens" column and persist the frame.
# NOTE(review): the frame is written with pandas' default orient, whereas the
# input file is read with orient="records" -- confirm downstream readers agree.
reviews_df["text"] = reviews_df["text"].astype("string")
reviews_df["tokens"] = reviews_df["text"].parallel_apply(
    lambda review_text: preprocess(review_text, lemmatization=True)
)
reviews_df.to_json(processed_data_dir / "preprocessed_reviews.json.gz")

In [None]:
# Flatten the nested structure (review -> sentences -> words) into a single
# token list and count how often each token occurs.
tokens = [
    token
    for document in reviews_df["tokens"]
    for sentence in document
    for token in sentence
]
c = Counter(tokens)

# The number of distinct tokens is the number of keys in the counter.
print("Tokens:", len(c))

In [None]:
# Render a word cloud from the token frequencies; the image returned by the
# last expression is what the cell displays.
wc = WordCloud(
    width=800,
    height=400,
    background_color="white",
    prefer_horizontal=0.6,
    collocations=False,
)
wc.generate_from_frequencies(c)
wc.to_image()

In [None]:
# Map every distinct token to an integer id.
# The ids are assigned over the *sorted* distinct tokens: iterating a bare set
# of strings follows hash order, which changes between interpreter processes
# (string hash randomization), so the saved mapping would otherwise differ on
# every kernel restart.
vocabulary = {token: index for index, token in enumerate(sorted(set(tokens)))}
print("Vocabulary size:", len(vocabulary))

# save vocabulary for asum input
# np.save appends ".npy" to the file name and stores the dict as a pickled
# object array, so reading it back needs
# np.load(..., allow_pickle=True).item().
np.save(asum_input_dir / "vocabulary", vocabulary)

# Look for bad results

In [None]:
# Show the distinct tokens longer than 10 characters.
{token for token in tokens if len(token) > 10}

In [None]:
# Tokens to look up in the reviews by the inspection loop that iterates over
# this list. Left empty here -- presumably filled in by hand with suspicious
# tokens before running that loop (TODO confirm).
strange_tokens = []

In [None]:
# For each flagged token, print the raw text and the token lists of every
# review in which that token occurs, followed by a blank separator.
for flagged in strange_tokens:
    for review_text, review_tokens in zip(reviews_df["text"], reviews_df["tokens"]):
        if any(flagged in sentence for sentence in review_tokens):
            print(review_text)
            print(review_tokens)
            print("\n")