In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np

# Loading the dataset

User Reviews:

In [None]:
# Load the raw user-review configuration for Grocery & Gourmet Food and show one record.
# NOTE(review): trust_remote_code=True lets the dataset repository's loading script run
# locally — confirm that this source is trusted.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Grocery_and_Gourmet_Food",
    trust_remote_code=True,
)
print(dataset['full'][0])

In [None]:
# Convert the 'full' split of the review dataset into a pandas DataFrame.
# NOTE(review): `dataset` is reassigned to the item-metadata dataset in a later cell,
# so re-running this cell after that one would put metadata rows into df_reviews.
df_reviews = dataset['full'].to_pandas()

In [None]:
# Preview the first five review rows.
df_reviews.head(5)

In [None]:
# (rows, columns) of the review frame.
df_reviews.shape

Item Metadata:

In [None]:
# Load the raw item-metadata configuration for the same category and show one record.
# This rebinds `dataset`, which previously held the review dataset.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Grocery_and_Gourmet_Food",
    trust_remote_code=True,
)
print(dataset['full'][0])

In [None]:
# Convert the 'full' split of the item-metadata dataset into a pandas DataFrame.
df_meta = dataset['full'].to_pandas()

In [None]:
# Preview the first five metadata rows.
df_meta.head(5)

In [None]:
# (rows, columns) of the metadata frame.
df_meta.shape

# Drop unnecessary columns

In [None]:
# Drop the 'images' column from the reviews.
# errors='ignore' keeps this cell re-runnable: df_reviews is reassigned here, so
# without it a second run raises KeyError because 'images' is already gone.
df_reviews = df_reviews.drop(columns=['images'], errors='ignore')
df_reviews

In [None]:
# Drop the 'images' column from the item metadata.
# errors='ignore' keeps this cell re-runnable: df_meta is reassigned here, so
# without it a second run raises KeyError because 'images' is already gone.
df_meta = df_meta.drop(columns=['images'], errors='ignore')
df_meta

In [None]:
# Drop the 'videos' column from the item metadata.
# errors='ignore' keeps this cell re-runnable: df_meta is reassigned here, so
# without it a second run raises KeyError because 'videos' is already gone.
df_meta = df_meta.drop(columns=['videos'], errors='ignore')
df_meta

# Find items using categories & parent_asin

In [None]:
# Store list/ndarray category values as tuples (other values are left untouched),
# presumably so that drop_duplicates() below can hash them.
df_meta['categories'] = df_meta['categories'].map(
    lambda cats: tuple(cats) if isinstance(cats, (list, np.ndarray)) else cats
)
# Distinct (categories, parent_asin) combinations.
df_meta[['categories', 'parent_asin']].drop_duplicates()

In [None]:
# Items whose title contains 'low carb' (case-insensitive); a missing title never matches.
df_carb = df_meta.loc[
    df_meta['title'].str.contains('low carb', case=False, na=False)
]
# Distinct parent ASINs of those items, and how many there are.
parent_asin_list_carb = df_carb['parent_asin'].drop_duplicates().to_list()
len(parent_asin_list_carb)

In [None]:
# Preview the first five 'low carb' items.
df_carb.head(5)

In [None]:
# Items whose title contains 'snack' (case-insensitive); a missing title never matches.
df_snacks = df_meta.loc[
    df_meta['title'].str.contains('snack', case=False, na=False)
]
# Distinct parent ASINs of those items, and how many there are.
parent_asin_list_snacks = df_snacks['parent_asin'].drop_duplicates().to_list()
len(parent_asin_list_snacks)

In [None]:
# Preview the first five 'snack' items.
df_snacks.head(5)

In [None]:
# Concatenate both keyword hit lists; an ASIN present in both appears twice,
# which does not change the isin() membership test.
parent_asin_list = [*parent_asin_list_carb, *parent_asin_list_snacks]
# Keep only the reviews that belong to one of the selected items.
df_healthy = df_reviews.loc[df_reviews['parent_asin'].isin(parent_asin_list)]
df_healthy.head()

In [None]:
# Give the metadata key its own name so both key columns survive the join.
# NOTE(review): this rebinds df_meta itself, so earlier cells that read
# df_meta['parent_asin'] will fail if they are re-run after this one.
df_meta = df_meta.rename(columns={'parent_asin': 'parent_asin_meta'})
# Left join: every selected review is kept, with its item's metadata attached when found.
joined_df = df_healthy.merge(
    df_meta,
    how="left",
    left_on="parent_asin",
    right_on="parent_asin_meta",
)
joined_df.head(10)

# Remove stop words and tokenize text

In [None]:
import nltk
import spacy
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
import string

In [None]:
# Download the NLTK 'stopwords' resource, then read the English stop-word list
# that preprocess_text filters against.
nltk.download('stopwords')
stop_words = stopwords.words('english')

In [None]:
# Load the spaCy pipeline named 'en_core_web_sm'.
# NOTE(review): `nlp` is not referenced by any cell visible here — confirm it is used later.
nlp = spacy.load('en_core_web_sm')

In [None]:
def preprocess_text(text):
    """Tokenize one review and drop English stop words.

    Args:
        text: Raw review text (`str` or `bytes`). Any other value, such as
            `None` or a float NaN standing in for a missing review, is treated
            as an empty review.

    Returns:
        list[str]: Tokens from gensim's `simple_preprocess` (called with
        `deacc=True`) that are neither in `stop_words` nor a substring of
        `string.punctuation`. Empty when `text` is not text.
    """
    # A missing review would make simple_preprocess raise and abort the whole
    # .apply(); return no tokens for that row instead.
    if not isinstance(text, (str, bytes)):
        return []

    tokens = simple_preprocess(text, deacc=True)

    # Membership tests against a set are O(1); scanning the module-level
    # stop-word list for every token is linear in the list length.
    stop_set = set(stop_words)

    # Remove stopwords and punctuation
    return [
        word for word in tokens
        if word not in stop_set and word not in string.punctuation
    ]

In [None]:
# Tokenize every review body; each row of 'cleaned_text' holds that review's token list.
joined_df['cleaned_text'] = joined_df['text'].map(preprocess_text)
joined_df.head(n=10)

In [None]:
# Collect the 'cleaned_text' column into a plain Python list, one entry per review.
processed_reviews = joined_df['cleaned_text'].tolist()

# Train the LDA model

In [None]:
from tqdm import tqdm
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
import logging

In [None]:
# Configure root logging at INFO level with "time : level : message" lines
# (presumably so gensim's training log output is shown).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
# Validate that every row of 'cleaned_text' is a token list before building the corpus.
# Checking all rows (not only the first) catches a bad entry further down, and an
# empty frame now yields an empty list instead of an IndexError from .iloc[0].
if joined_df['cleaned_text'].map(lambda tokens: isinstance(tokens, list)).all():
    processed_reviews = joined_df['cleaned_text'].tolist()
else:
    raise ValueError("Ensure 'processed_reviews' contains lists of tokens")

In [None]:
# Build the gensim Dictionary (token <-> integer id mapping) from the tokenized reviews.
dictionary = corpora.Dictionary(processed_reviews)

In [None]:
# Bag-of-words representation of every tokenized review, in the original order.
corpus = list(map(dictionary.doc2bow, processed_reviews))
# Number of documents in the corpus.
total_docs = len(corpus)

In [None]:
def custom_lda_training(corpus, dictionary, num_topics, passes, chunksize):
    """Train an LDA topic model chunk by chunk while reporting progress with tqdm.

    The model is created without a corpus, so construction itself does no
    training. It is then fed `corpus` in slices of `chunksize` documents,
    `passes` times over. Handing the corpus to the constructor as well would
    run one extra full training pass that the progress bar never sees.

    Args:
        corpus: Sized, sliceable sequence of bag-of-words documents, e.g. the
            list built with `dictionary.doc2bow`.
        dictionary: Mapping passed to the model as `id2word`.
        num_topics: Number of topics to fit.
        passes: Number of full sweeps over `corpus`.
        chunksize: Number of documents per `update` call; must be positive.

    Returns:
        The trained `LdaMulticore` model.

    Raises:
        ValueError: If `chunksize` is not positive.
    """
    if chunksize <= 0:
        raise ValueError("chunksize must be a positive integer")

    # Size the progress bar from the corpus that was actually passed in,
    # not from a notebook-level global.
    num_docs = len(corpus)

    # Initialize the LDA Multicore model (no corpus: training happens in the loop below)
    lda_model = LdaMulticore(
        id2word=dictionary,
        num_topics=num_topics,
        random_state=100,
        chunksize=chunksize,
        passes=1,  # each update() call sweeps its chunk once
        workers=1,
        alpha='symmetric',
        per_word_topics=True,
        batch=True
    )

    with tqdm(total=num_docs * passes, desc="LDA Training Progress") as pbar:
        # Manually iterate over the passes and update the progress bar
        for _ in range(passes):
            for start in range(0, num_docs, chunksize):
                chunk = corpus[start:start + chunksize]
                lda_model.update(chunk)
                pbar.update(len(chunk))

    return lda_model

In [None]:
# Fit a 20-topic model: 5 sweeps over the corpus in chunks of 3000 documents.
lda_model = custom_lda_training(
    corpus,
    dictionary,
    num_topics=20,
    passes=5,
    chunksize=3000,
)