In [1]:
# Hyperparameters shared by every per-business LDA model trained in this notebook.
num_topics = 5  # number of topics passed to each LdaModel
num_words = 5  # number of top words shown per topic when topics are printed
passes = 10  # number of training passes passed to each LdaModel

In [3]:
import pandas as pd
import io
import requests
import zipfile
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric, remove_stopwords
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import gensim

# URL of the zipped review dataset
zip_file_url = "https://github.com/pushpit21/Brew628/raw/eda_basic/merged_data.csv.zip"

# Download the archive. A timeout keeps the notebook from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error (404, 500, ...)
# into an immediate, readable exception instead of a confusing BadZipFile later.
response = requests.get(zip_file_url, timeout=60)
response.raise_for_status()

# Open the ZIP archive directly from memory (nothing is written to disk)
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    # Names of the archive members, kept around for inspection
    file_names = z.namelist()
    # Read the 'merged_data.csv' member straight into a DataFrame
    with z.open('merged_data.csv') as csvfile:
        data = pd.read_csv(csvfile)

# Custom preprocessing function
def custom_preprocess(doc):
    """Turn one raw document string into a list of cleaned tokens.

    The text is lower-cased, then passed through gensim's
    ``strip_punctuation``, ``strip_numeric`` and ``remove_stopwords``
    filters (in that order). The result is split on whitespace and
    single-character tokens are discarded.
    """
    cleaned = remove_stopwords(strip_numeric(strip_punctuation(doc.lower())))
    return [word for word in cleaned.split() if len(word) > 1]

# Group the reviews by business so that one LDA model is trained per business
grouped_data = data.groupby('business_id')

# Maps business_id -> trained LdaModel (only for businesses that were not skipped)
lda_models = {}

for business_id, group in grouped_data:
    # Tokenise every review of this business; astype(str) guards against
    # non-string cells (e.g. NaN) reaching custom_preprocess
    processed_docs = [custom_preprocess(doc) for doc in group['text'].astype(str)]

    # Build the vocabulary and the bag-of-words corpus for this business.
    # no_below=15 drops tokens that occur in fewer than 15 reviews, so
    # businesses with few reviews can end up with an empty vocabulary.
    dictionary = Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=15, keep_n=100000)
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    # Nothing to train on: skip this business
    if not corpus or len(dictionary) == 0:
        print(f"Skipping business ID {business_id} due to empty corpus or dictionary.")
        continue

    # Train the LDA model for this business.
    # NOTE(review): no random_state is passed, so topics presumably differ
    # from run to run -- confirm whether reproducibility is needed.
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)

    # Keep the model so it can be reused after the loop
    lda_models[business_id] = lda_model

    # The group already holds exactly the rows of this business, so take the
    # name from it instead of re-filtering the whole DataFrame on every iteration
    business_name = group['name'].iloc[0]

    # Print business name and topics
    print(f"Business ID: {business_id}, Name: {business_name}")
    topics = lda_model.print_topics(num_words=num_words)
    for topic in topics:
        print(topic)

Business ID: -2-ih3mE8KPyeKVIzpBfPQ, Name: SkyGarten
(0, '0.059*"bar" + 0.049*"city" + 0.049*"outside" + 0.048*"beer" + 0.044*"check"')
(1, '0.094*"hot" + 0.080*"chocolate" + 0.058*"drink" + 0.050*"drinks" + 0.047*"views"')
(2, '0.099*"place" + 0.072*"cover" + 0.062*"views" + 0.050*"nice" + 0.050*"great"')
(3, '0.066*"skygarten" + 0.063*"wait" + 0.061*"people" + 0.059*"bar" + 0.054*"outside"')
(4, '0.088*"beer" + 0.056*"food" + 0.048*"good" + 0.046*"outside" + 0.044*"city"')
Business ID: -BjXcRcvDqOQ23eUKx6hjg, Name: Brew
(0, '0.104*"love" + 0.073*"ultimo" + 0.060*"favorite" + 0.053*"nice" + 0.038*"place"')
(1, '0.061*"place" + 0.060*"like" + 0.051*"great" + 0.044*"cup" + 0.043*"good"')
(2, '0.073*"ultimo" + 0.069*"excellent" + 0.040*"drink" + 0.039*"friendly" + 0.038*"neighborhood"')
(3, '0.075*"great" + 0.058*"brew" + 0.054*"good" + 0.045*"place" + 0.041*"ultimo"')
(4, '0.065*"place" + 0.047*"best" + 0.045*"shop" + 0.041*"selection" + 0.040*"ve"')
Business ID: -jNOHaFwWsBpaf9l5gvRwQ,

Business ID: 5lZNaU_zyoVA9tX-yKwA2A, Name: Parks On Tap
(0, '0.528*"great" + 0.472*"parks"')
(1, '0.537*"great" + 0.463*"parks"')
(2, '0.545*"parks" + 0.455*"great"')
(3, '0.988*"great" + 0.012*"parks"')
(4, '0.954*"parks" + 0.046*"great"')
Skipping business ID 5y1UZxEVE8QJob1XMRKp8A due to empty corpus or dictionary.
Skipping business ID 609Lr-Hvo3sr9amdiimOJA due to empty corpus or dictionary.
Business ID: 68YNg7QVIglBZYfZ8ZFwSg, Name: Garfield Brewery
(0, '0.291*"beers" + 0.203*"good" + 0.170*"nice" + 0.100*"friendly" + 0.084*"staff"')
(1, '0.267*"park" + 0.180*"friendly" + 0.131*"staff" + 0.128*"beers" + 0.075*"ale"')
(2, '0.435*"place" + 0.175*"beers" + 0.115*"ale" + 0.079*"garfield" + 0.062*"park"')
(3, '0.303*"staff" + 0.205*"friendly" + 0.168*"good" + 0.109*"ale" + 0.103*"nice"')
(4, '0.311*"garfield" + 0.185*"park" + 0.139*"beers" + 0.120*"patio" + 0.087*"nice"')
Skipping business ID 6GOdPucdzalKnlytAyOmDw due to empty corpus or dictionary.
Business ID: 6IwF2rpoaiR_W6DzOvuceA,

Business ID: Cmfs_i-FtaR6O1C07OEcOg, Name: Dock Street Brewery South
(0, '0.110*"menu" + 0.084*"place" + 0.083*"beers" + 0.078*"love" + 0.052*"cheese"')
(1, '0.154*"dock" + 0.148*"street" + 0.068*"south" + 0.065*"philly" + 0.053*"pizza"')
(2, '0.252*"place" + 0.150*"pizza" + 0.084*"seating" + 0.077*"definitely" + 0.073*"bar"')
(3, '0.091*"time" + 0.090*"bar" + 0.085*"coffee" + 0.070*"nice" + 0.070*"went"')
(4, '0.225*"good" + 0.103*"people" + 0.079*"like" + 0.059*"location" + 0.057*"tables"')
Business ID: DZRkJUI9iv1mDY_JaCY3sg, Name: Scotty's Brewhouse
(0, '0.078*"great" + 0.029*"ve" + 0.027*"fried" + 0.027*"staff" + 0.026*"beer"')
(1, '0.075*"good" + 0.056*"scotty" + 0.046*"service" + 0.035*"menu" + 0.030*"great"')
(2, '0.060*"burger" + 0.029*"scotty" + 0.029*"good" + 0.028*"like" + 0.027*"server"')
(3, '0.043*"location" + 0.032*"good" + 0.032*"chicken" + 0.031*"wrap" + 0.029*"fries"')
(4, '0.058*"wings" + 0.044*"scotty" + 0.034*"place" + 0.032*"bar" + 0.031*"great"')
Business ID: EL

Business ID: LG4uWRyTsQHr2fC9lv8TPA, Name: BrewDog Indianapolis
(0, '0.127*"great" + 0.087*"wings" + 0.082*"service" + 0.072*"cauliflower" + 0.046*"amazing"')
(1, '0.136*"food" + 0.087*"time" + 0.086*"brewdog" + 0.075*"great" + 0.064*"wings"')
(2, '0.097*"place" + 0.068*"nice" + 0.067*"good" + 0.065*"beers" + 0.063*"great"')
(3, '0.061*"beers" + 0.049*"good" + 0.046*"menu" + 0.042*"place" + 0.039*"wings"')
(4, '0.093*"food" + 0.068*"fries" + 0.061*"brewdog" + 0.052*"place" + 0.051*"great"')
Business ID: LY1WmE1DwtkGd37eG3SoSQ, Name: Thr3e Wise Men Brewing Company
(0, '0.028*"table" + 0.023*"wise" + 0.021*"men" + 0.019*"restaurant" + 0.018*"thre"')
(1, '0.036*"service" + 0.034*"kids" + 0.022*"cheese" + 0.021*"great" + 0.021*"like"')
(2, '0.038*"great" + 0.027*"place" + 0.024*"time" + 0.018*"bar" + 0.016*"service"')
(3, '0.034*"place" + 0.018*"like" + 0.018*"wings" + 0.017*"wise" + 0.017*"great"')
(4, '0.034*"great" + 0.020*"ipa" + 0.019*"place" + 0.018*"like" + 0.018*"beers"')
Skipping 

Business ID: VSVtG0BfSCRIZfo03VgHCQ, Name: William Street Common
(0, '0.096*"coffee" + 0.091*"eggs" + 0.088*"service" + 0.080*"donuts" + 0.069*"like"')
(1, '0.175*"like" + 0.133*"try" + 0.106*"went" + 0.091*"great" + 0.079*"time"')
(2, '0.129*"bar" + 0.085*"pretty" + 0.073*"people" + 0.071*"tipping" + 0.069*"like"')
(3, '0.110*"coffee" + 0.077*"donuts" + 0.075*"great" + 0.074*"went" + 0.062*"drink"')
(4, '0.136*"tip" + 0.119*"service" + 0.072*"nice" + 0.056*"table" + 0.055*"charge"')
Skipping business ID VmkntDoCpazOVmNIlXx0ug due to empty corpus or dictionary.
Skipping business ID VqbS5tyfB0t1VIbjqtudWQ due to empty corpus or dictionary.
Business ID: W9vWGD3b2qYbC6lHUK5IUg, Name: Redemption Alewerks
(0, '0.053*"time" + 0.037*"chicken" + 0.036*"ve" + 0.027*"tacos" + 0.026*"like"')
(1, '0.048*"place" + 0.042*"redemption" + 0.032*"bar" + 0.030*"like" + 0.029*"ordered"')
(2, '0.087*"great" + 0.052*"service" + 0.051*"burger" + 0.026*"took" + 0.024*"place"')
(3, '0.052*"beers" + 0.042*"menu

Business ID: cM3C8AT3IhOIM4ccBofuSg, Name: Blind Owl Brewery
(0, '0.045*"great" + 0.042*"place" + 0.038*"beer" + 0.023*"outside" + 0.021*"outdoor"')
(1, '0.034*"minutes" + 0.022*"time" + 0.021*"server" + 0.020*"table" + 0.020*"came"')
(2, '0.041*"good" + 0.029*"beer" + 0.022*"service" + 0.020*"menu" + 0.019*"like"')
(3, '0.029*"great" + 0.028*"service" + 0.024*"place" + 0.021*"ordered" + 0.017*"server"')
(4, '0.037*"owl" + 0.037*"blind" + 0.029*"beer" + 0.022*"area" + 0.018*"good"')
Business ID: cwn_MBPFUJtme68WURSgKA, Name: Bainbridge Street Barrel House
(0, '0.025*"great" + 0.025*"burger" + 0.019*"fries" + 0.019*"service" + 0.017*"place"')
(1, '0.028*"service" + 0.024*"great" + 0.021*"bar" + 0.019*"place" + 0.016*"time"')
(2, '0.046*"place" + 0.035*"great" + 0.032*"bar" + 0.020*"selection" + 0.019*"list"')
(3, '0.031*"like" + 0.027*"drinks" + 0.026*"time" + 0.022*"ordered" + 0.021*"beers"')
(4, '0.027*"brunch" + 0.020*"place" + 0.019*"time" + 0.019*"came" + 0.016*"ordered"')
Skipping

Business ID: ltaDKBb6_PMAOAXthRFKLA, Name: Alma Mater
(0, '1.000*"good"')
(1, '1.000*"good"')
(2, '1.000*"good"')
(3, '1.000*"good"')
(4, '1.000*"good"')
Skipping business ID mLDc_u44EiqJCSR3L4nQZw due to empty corpus or dictionary.
Business ID: mQOSVc6m0hNvJNp2t6mhtA, Name: Mayfair Taproom
(0, '0.731*"fries" + 0.141*"good" + 0.091*"beer" + 0.025*"place" + 0.012*"selection"')
(1, '0.200*"good" + 0.200*"place" + 0.200*"beer" + 0.200*"selection" + 0.200*"fries"')
(2, '0.392*"beer" + 0.311*"selection" + 0.290*"good" + 0.004*"place" + 0.003*"fries"')
(3, '0.201*"good" + 0.200*"place" + 0.200*"beer" + 0.200*"selection" + 0.200*"fries"')
(4, '0.586*"place" + 0.245*"good" + 0.111*"beer" + 0.043*"selection" + 0.016*"fries"')
Business ID: nk2a2nI1jfln9zLpxEvTnw, Name: Crime & Punishment Brewing
(0, '0.084*"bar" + 0.057*"place" + 0.054*"beers" + 0.047*"got" + 0.047*"good"')
(1, '0.076*"beers" + 0.056*"food" + 0.053*"like" + 0.048*"crime" + 0.047*"brewery"')
(2, '0.168*"good" + 0.073*"food" + 0.0

Business ID: yuIhKvOjgDbYCkgw-j55sA, Name: Goose Island Brewhouse
(0, '0.061*"time" + 0.058*"good" + 0.051*"burger" + 0.038*"hour" + 0.038*"going"')
(1, '0.055*"happy" + 0.050*"fries" + 0.047*"love" + 0.044*"cauliflower" + 0.043*"fried"')
(2, '0.057*"beers" + 0.057*"goose" + 0.054*"island" + 0.052*"nice" + 0.047*"place"')
(3, '0.096*"like" + 0.073*"drink" + 0.053*"ve" + 0.045*"people" + 0.039*"table"')
(4, '0.093*"service" + 0.067*"good" + 0.035*"got" + 0.034*"server" + 0.034*"came"')
Skipping business ID zCaEDKsY7f3J2ZRb3aNhfA due to empty corpus or dictionary.
Business ID: zCmdpK9TYREr3sO1QO6BCw, Name: Philly Brew Tours by City Brew Tours
(0, '0.140*"beers" + 0.105*"time" + 0.103*"brewery" + 0.078*"breweries" + 0.073*"knowledgeable"')
(1, '0.107*"food" + 0.106*"dock" + 0.103*"philly" + 0.096*"brewing" + 0.074*"city"')
(2, '0.197*"fun" + 0.133*"recommend" + 0.115*"breweries" + 0.114*"went" + 0.088*"awesome"')
(3, '0.118*"fun" + 0.102*"group" + 0.090*"brewery" + 0.088*"city" + 0.081*"b