In [1]:
# Tunable settings for the LDA topic-modelling run in the cells below.
seed = 100        # random_state passed to the LDA model so results are repeatable
num_topics = 5    # number of latent topics to fit; also sets the Topic_<i> columns
passes = 10       # intended number of training passes over the corpus
num_words = 5     # intended number of top words to list per topic

In [2]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import gensim

# Load the merged review data. `data` is kept exactly as read from disk.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where the file exists at this exact location.
data = pd.read_csv(r"C:\Users\Philip\Downloads\merged_data.csv\merged_data.csv")

# Rows without review text or a business ID can neither be tokenised nor be
# attributed to a business, so skip them instead of crashing during preprocessing.
reviews = data.dropna(subset=['text', 'business_id'])

# Filters applied to every review, in order: lower-case, strip punctuation, strip digits.
# NOTE(review): no stop-word filter is applied, so very common function words
# remain in the vocabulary unless filter_extremes below removes them.
custom_filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]

# Tokenise each review; keep the business ID of every document in the same order
# so topic weights can be attributed back to the business later.
processed_docs = [preprocess_string(doc, custom_filters) for doc in reviews['text']]
business_ids = reviews['business_id'].tolist()
# Business ID -> business name. If an ID occurs with several names, the last one wins.
id_to_name = dict(zip(reviews['business_id'], reviews['name']))

# Create a dictionary (token <-> integer id) representation of the documents
dictionary = Dictionary(processed_docs)

# Drop tokens that occur in fewer than 15 documents; the remaining
# filter_extremes thresholds are left at their gensim defaults.
dictionary.filter_extremes(no_below=15)

# Convert each document into the bag-of-words (BoW) format
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Train the LDA model. `passes` comes from the configuration cell rather than a
# hard-coded literal, so changing the setting there actually takes effect.
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes, random_state=seed)

# Topic distribution of each document: a list of (topic_id, proportion) pairs
doc_topics = [lda_model.get_document_topics(bow) for bow in corpus]

# One row per business ID, one column per topic.
# - Initialised with 0.0 (float) because float proportions are accumulated into it;
#   an integer-typed frame would need an implicit dtype upcast on the first addition.
# - dict.fromkeys de-duplicates while keeping first-appearance order, so the row
#   order is the same on every run (a set gives an arbitrary order).
topic_columns = ['Topic_' + str(i) for i in range(num_topics)]
unique_business_ids = list(dict.fromkeys(business_ids))
topic_df = pd.DataFrame(0.0, index=unique_business_ids, columns=topic_columns)

# Sum the topic proportions of all documents belonging to the same business.
# The result is a per-business total (it grows with the number of reviews),
# not an average.
for business_id, doc_topic in zip(business_ids, doc_topics):
    for topic, prop in doc_topic:
        topic_df.at[business_id, topic_columns[topic]] += prop

# Replace the business-ID index with business names.
# NOTE(review): businesses that share a name end up as duplicate index labels.
topic_df = topic_df.rename(index=id_to_name)

print(topic_df)

                                   Topic_0     Topic_1     Topic_2  \
Brewery Tours of Indianapolis     3.942684    5.468915    4.984259   
New Day Craft                    18.079266  185.542788   32.268484   
Indiana on Tap Tasting Society    1.312353   10.701468    0.923355   
Bier Brewery and Tap Room        13.526507   89.301747    8.429210   
Yards Brewing Company            79.472253  328.775005  138.298567   
...                                    ...         ...         ...   
Fishtown Hops                     6.261376    2.903440    3.286236   
Independence Beer Garden        105.849334  128.591223   72.690978   
Chilly Water Brewing Company     34.897681   70.008065   33.460026   
Triton Brewing Company           20.538092   67.763702   15.095107   
Chase's Hop Shop                 15.552614   22.461831    6.793820   

                                   Topic_3    Topic_4  
Brewery Tours of Indianapolis     0.496302   0.011978  
New Day Craft                    61.000022  13.

In [3]:
# List every topic as a (topic_id, "weight*word + ...") pair, showing the
# 10 highest-weighted words of each topic.
# NOTE(review): the configuration cell defines num_words = 5, but this call
# requests 10 words per topic; confirm which value is intended.
topics = lda_model.print_topics(num_words=10)
topics

[(0,
  '0.022*"that" + 0.021*"t" + 0.016*"not" + 0.013*"they" + 0.013*"my" + 0.012*"this" + 0.010*"at" + 0.010*"have" + 0.009*"be" + 0.009*"you"'),
 (1,
  '0.017*"you" + 0.017*"beer" + 0.016*"that" + 0.016*"they" + 0.014*"on" + 0.013*"s" + 0.010*"have" + 0.010*"their" + 0.009*"this" + 0.009*"beers"'),
 (2,
  '0.087*"we" + 0.039*"were" + 0.033*"our" + 0.023*"had" + 0.016*"us" + 0.011*"on" + 0.011*"my" + 0.010*"back" + 0.010*"very" + 0.010*"at"'),
 (3,
  '0.028*"great" + 0.022*"food" + 0.022*"place" + 0.021*"beer" + 0.018*"good" + 0.017*"have" + 0.016*"are" + 0.016*"this" + 0.016*"they" + 0.015*"you"'),
 (4,
  '0.016*"had" + 0.014*"my" + 0.014*"good" + 0.014*"were" + 0.013*"burger" + 0.011*"on" + 0.011*"cheese" + 0.010*"fries" + 0.010*"that" + 0.009*"which"')]

In [4]:
# Hand-assigned, human-readable labels for the LDA topics, keyed by topic index.
topic_names = {
    0: "Customer Views",
    1: "Beer Selection",
    2: "Group Experience",
    3: "Ambience",
    4: "Burger"
}

# Explicit "Topic_<i>" -> label mapping. Renaming through a dict only touches
# columns that are present as keys, so re-running this cell after the columns
# have already been renamed is a harmless no-op instead of an error from
# trying to parse an already-renamed column name.
column_labels = {'Topic_' + str(idx): label for idx, label in topic_names.items()}

# Rebind instead of mutating in place so the rename is safe to repeat.
topic_df = topic_df.rename(columns=column_labels)

# Persist the labelled per-business topic totals.
# NOTE(review): absolute, machine-specific output path.
topic_df.to_csv(r"C:\Users\Philip\Downloads\LDA_output.csv")

In [5]:
# Display the per-business topic totals with the descriptive column labels.
topic_df

Unnamed: 0,Customer Views,Beer Selection,Group Experience,Ambience,Burger
Brewery Tours of Indianapolis,3.942684,5.468915,4.984259,0.496302,0.011978
New Day Craft,18.079266,185.542788,32.268484,61.000022,13.882406
Indiana on Tap Tasting Society,1.312353,10.701468,0.923355,0.900521,0.076640
Bier Brewery and Tap Room,13.526507,89.301747,8.429210,30.020590,2.700975
Yards Brewing Company,79.472253,328.775005,138.298567,148.929966,71.172095
...,...,...,...,...,...
Fishtown Hops,6.261376,2.903440,3.286236,5.476594,0.928502
Independence Beer Garden,105.849334,128.591223,72.690978,172.076231,36.326294
Chilly Water Brewing Company,34.897681,70.008065,33.460026,58.920379,35.037186
Triton Brewing Company,20.538092,67.763702,15.095107,34.092501,16.389577
