In [1]:
# Tunable settings for the per-business topic models trained further down.
# Number of topics requested from each LdaModel (passed as `num_topics`).
num_topics = 5
# Number of top-weighted words shown per topic (passed to `print_topics`).
num_words = 5
# Forwarded to LdaModel as its `passes` argument.
passes = 10

In [2]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric, remove_stopwords
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import gensim

# Load the merged dataset into a DataFrame; the code below reads its
# 'business_id', 'text' and 'name' columns.
# NOTE(review): this is an absolute, machine-specific path - the notebook will
# not run elsewhere without editing it.
merged_csv_path = r"C:\Users\phili\Downloads\merged_data.csv\merged_data.csv"
data = pd.read_csv(merged_csv_path)

# Custom preprocessing function
def custom_preprocess(doc):
    """Normalise one document and break it into tokens.

    The text is lower-cased and then run through gensim's
    ``strip_punctuation``, ``strip_numeric`` and ``remove_stopwords`` filters,
    in that order. The result is split on whitespace and single-character
    tokens are discarded.

    Args:
        doc: The document text as a ``str``.

    Returns:
        A list of the remaining tokens, each at least two characters long.
    """
    text = doc.lower()

    # Run the gensim string filters one after another; order matters.
    for text_filter in (strip_punctuation, strip_numeric, remove_stopwords):
        text = text_filter(text)

    # Whitespace tokenisation, keeping only tokens of two or more characters.
    return [word for word in text.split() if len(word) >= 2]

# Fixed seed handed to every LdaModel so that the topics printed below are
# reproducible across kernel restarts (LDA training is otherwise randomised).
lda_random_state = 42

# Grouping data by business ID: each group holds all rows of one business
grouped_data = data.groupby('business_id')

# Dictionary to store the trained LDA model of each business, keyed by business ID
lda_models = {}

for business_id, group in grouped_data:
    # Tokenise every document in this business's text column
    # (astype(str) guards against non-string cells such as NaN).
    processed_docs = [custom_preprocess(doc) for doc in group['text'].astype(str)]

    # Build the vocabulary for this business and prune it with gensim's
    # filter_extremes (no_below=15, keep_n=100000; other arguments left at
    # their defaults).
    dictionary = Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=15, keep_n=100000)

    # Nothing to train on: skip before spending time on the bag-of-words
    # conversion. The corpus has exactly one entry per processed document, so
    # testing processed_docs is equivalent to testing the corpus length.
    if len(processed_docs) == 0 or len(dictionary) == 0:
        print(f"Skipping business ID {business_id} due to empty corpus or dictionary.")
        continue

    # Bag-of-words representation of each document over the pruned vocabulary
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    # Train the LDA model for this business
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=lda_random_state,
    )

    # Keep the model so it can be inspected after the loop
    lda_models[business_id] = lda_model

    # The group already contains exactly the rows of this business, so its
    # first 'name' value is the business name - no need to re-filter `data`.
    business_name = group['name'].iloc[0]

    # Print business name and topics
    print(f"Business ID: {business_id}, Name: {business_name}")
    for topic in lda_model.print_topics(num_words=num_words):
        print(topic)

Business ID: -2-ih3mE8KPyeKVIzpBfPQ, Name: SkyGarten
(0, '0.082*"views" + 0.064*"great" + 0.061*"food" + 0.053*"drinks" + 0.048*"drink"')
(1, '0.084*"cover" + 0.066*"drinks" + 0.059*"place" + 0.057*"definitely" + 0.049*"nice"')
(2, '0.220*"wait" + 0.079*"line" + 0.055*"place" + 0.055*"inside" + 0.039*"friends"')
(3, '0.093*"hot" + 0.084*"chocolate" + 0.077*"bar" + 0.062*"place" + 0.051*"city"')
(4, '0.080*"outside" + 0.052*"beer" + 0.047*"people" + 0.046*"bar" + 0.041*"place"')
Business ID: -BjXcRcvDqOQ23eUKx6hjg, Name: Brew
(0, '0.069*"ultimo" + 0.059*"love" + 0.059*"place" + 0.049*"cup" + 0.042*"ve"')
(1, '0.060*"place" + 0.038*"good" + 0.036*"ve" + 0.034*"like" + 0.033*"excellent"')
(2, '0.063*"selection" + 0.061*"shop" + 0.057*"delicious" + 0.045*"bit" + 0.043*"beers"')
(3, '0.059*"ultimo" + 0.059*"good" + 0.054*"brew" + 0.051*"great" + 0.046*"selection"')
(4, '0.119*"great" + 0.107*"place" + 0.065*"nice" + 0.048*"like" + 0.047*"new"')
Business ID: -jNOHaFwWsBpaf9l5gvRwQ, Name: Bla