In [1]:
num_topics = 5
num_words = 5
passes = 10
seed = 100

In [7]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import gensim

# Load the data
data = pd.read_csv(r"C:\Users\Philip\Downloads\merged_data.csv\merged_data.csv")

# Custom filter to remove punctuation and numeric characters
custom_filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]

# Apply preprocessing to each document in the text column and keep track of the business ID
processed_docs = []
business_ids = []
id_to_name = {}
review_counts = {}
for doc, business_id, business_name in zip(data['text'], data['business_id'], data['name']):
    processed_docs.append(preprocess_string(doc, custom_filters))
    business_ids.append(business_id)
    id_to_name[business_id] = business_name  # Populate the mapping dictionary

# Create a dictionary representation of the documents
dictionary = Dictionary(processed_docs)

# Filter out extremes to remove too rare or too common words
dictionary.filter_extremes(no_below=15)

# Convert document into the bag-of-words (BoW) format
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Training the LDA model
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=seed)

# Get topic distribution for each document
doc_topics = [lda_model.get_document_topics(bow) for bow in corpus]

# Creating a DataFrame to hold the topic distribution for each business ID
topic_df = pd.DataFrame(0, index=set(business_ids), columns=['Topic_' + str(i) for i in range(num_topics)])

# Aggregate the topic distributions for each business ID
for business_id, doc_topic in zip(business_ids, doc_topics):
    for topic, prop in doc_topic:
        topic_df.at[business_id, 'Topic_' + str(topic)] += prop

# Count the number of reviews for each business ID
for business_id in business_ids:
    review_counts[business_id] = review_counts.get(business_id, 0) + 1

# Normalize the topic distributions for each business ID by the number of reviews
for business_id in topic_df.index:
    topic_df.loc[business_id] /= review_counts[business_id]

# Replace the index of the DataFrame with business names using the mapping dictionary
# topic_df = topic_df.rename(index=id_to_name)

print(topic_df)

                         Topic_0   Topic_1   Topic_2   Topic_3   Topic_4
2-gqxtBtrAfg9gHpKqIkrA  0.077922  0.684999  0.080436  0.107302  0.042711
YeIQRLKVhPsDV8AJdgd-BQ  0.227858  0.284146  0.142546  0.184955  0.154841
mQOSVc6m0hNvJNp2t6mhtA  0.101435  0.206536  0.112693  0.368639  0.204119
cAMQGuopIJ8AgyeXciFNcw  0.054055  0.552558  0.097435  0.270760  0.017171
CQgCsfUUZduNz0fNyeAsyA  0.393863  0.233773  0.183534  0.171507  0.011570
...                          ...       ...       ...       ...       ...
gcSwkcnKHmgZpP82GsieUw  0.117964  0.392383  0.110075  0.310947  0.061610
Pj-A29ajjCUb5ttM3Ff7yA  0.067975  0.443702  0.092441  0.336918  0.050889
lXkjfmdIKGB-WwEuuEEAmw  0.315552  0.106416  0.134411  0.277039  0.160257
zCmdpK9TYREr3sO1QO6BCw  0.034950  0.457502  0.415035  0.075591  0.009062
TnPYPIDubOiO6ntG7f0r5w  0.060559  0.657472  0.217227  0.033596  0.022334

[182 rows x 5 columns]


In [8]:
topics = lda_model.print_topics(num_words=10)
topics

[(0,
  '0.022*"that" + 0.021*"t" + 0.016*"not" + 0.013*"they" + 0.013*"my" + 0.012*"this" + 0.010*"at" + 0.010*"have" + 0.009*"be" + 0.009*"you"'),
 (1,
  '0.017*"you" + 0.017*"beer" + 0.016*"that" + 0.016*"they" + 0.014*"on" + 0.013*"s" + 0.010*"have" + 0.010*"their" + 0.009*"this" + 0.009*"beers"'),
 (2,
  '0.087*"we" + 0.039*"were" + 0.033*"our" + 0.023*"had" + 0.016*"us" + 0.011*"on" + 0.011*"my" + 0.010*"back" + 0.010*"very" + 0.010*"at"'),
 (3,
  '0.028*"great" + 0.022*"food" + 0.022*"place" + 0.021*"beer" + 0.018*"good" + 0.017*"have" + 0.016*"are" + 0.016*"this" + 0.016*"they" + 0.015*"you"'),
 (4,
  '0.016*"had" + 0.014*"my" + 0.014*"good" + 0.014*"were" + 0.013*"burger" + 0.011*"on" + 0.011*"cheese" + 0.010*"fries" + 0.010*"that" + 0.009*"which"')]

In [9]:
topic_names = {
    0: "Customer Views",
    1: "Beer Selection",
    2: "Group Experience",
    3: "Ambience",
    4: "Burger"
}
# Example of renaming columns in a DataFrame
topic_df.rename(columns=lambda x: topic_names[int(x.split('_')[1])], inplace=True)
topic_df.to_csv(r"C:\Users\Philip\Downloads\LDA_output.csv")

In [10]:
topic_df

Unnamed: 0,Customer Views,Beer Selection,Group Experience,Ambience,Burger
2-gqxtBtrAfg9gHpKqIkrA,0.077922,0.684999,0.080436,0.107302,0.042711
YeIQRLKVhPsDV8AJdgd-BQ,0.227858,0.284146,0.142546,0.184955,0.154841
mQOSVc6m0hNvJNp2t6mhtA,0.101435,0.206536,0.112693,0.368639,0.204119
cAMQGuopIJ8AgyeXciFNcw,0.054055,0.552558,0.097435,0.270760,0.017171
CQgCsfUUZduNz0fNyeAsyA,0.393863,0.233773,0.183534,0.171507,0.011570
...,...,...,...,...,...
gcSwkcnKHmgZpP82GsieUw,0.117964,0.392383,0.110075,0.310947,0.061610
Pj-A29ajjCUb5ttM3Ff7yA,0.067975,0.443702,0.092441,0.336918,0.050889
lXkjfmdIKGB-WwEuuEEAmw,0.315552,0.106416,0.134411,0.277039,0.160257
zCmdpK9TYREr3sO1QO6BCw,0.034950,0.457502,0.415035,0.075591,0.009062
