In [1]:
# !pip install bertopic
from bertopic import BERTopic

# !pip install bertopic[visualization]
import pandas as pd

# NLTK Stop words
import nltk

nltk.download("stopwords")
from nltk.corpus import stopwords

# !pip install spacy
import spacy

# python -m spacy download en

import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from pprint import pprint

[nltk_data] Downloading package stopwords to /Users/blam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the Yelp review dump into a DataFrame; the file is newline-delimited
# JSON (one review object per line), hence lines=True
reviews = pd.read_json(
    path_or_buf="/Users/blam/Documents/Datascience capstone/yelp_academic_dataset_review.json",
    lines=True,
)
reviews

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
...,...,...,...,...,...,...,...,...,...
6990275,H0RIamZu0B0Ei0P4aeh3sQ,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5,1,2,1,Latest addition to services from ICCU is Apple...,2014-12-17 21:45:20
6990276,shTPgbgdwTHSuU67mGCmZQ,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,5,2,1,2,"This spot offers a great, affordable east week...",2021-03-31 16:55:10
6990277,YNfNhgZlaaCO5Q_YJR4rEw,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4,1,0,0,This Home Depot won me over when I needed to g...,2019-12-30 03:56:30
6990278,i-I4ZOhoX70Nw5H0FwrQUA,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,5,1,0,0,For when I'm feeling like ignoring my calorie-...,2022-01-19 18:59:27


In [3]:
# Keep only the free-text column; the later preprocessing cells read from it
reviews_text = reviews.loc[:, "text"]
reviews_text

0          If you decide to eat here, just be aware it is...
1          I've taken a lot of spin classes over the year...
2          Family diner. Had the buffet. Eclectic assortm...
3          Wow!  Yummy, different,  delicious.   Our favo...
4          Cute interior and owner (?) gave us tour of up...
                                 ...                        
6990275    Latest addition to services from ICCU is Apple...
6990276    This spot offers a great, affordable east week...
6990277    This Home Depot won me over when I needed to g...
6990278    For when I'm feeling like ignoring my calorie-...
6990279    Located in the 'Walking District' in Nashville...
Name: text, Length: 6990280, dtype: object

In [4]:
# Stop-word list used by remove_stopwords(): NLTK's English list plus a few
# extra words appended by hand
stop_words = stopwords.words("english") + [
    "from",
    "to",
    "how",
    "they",
    "very",
    "many",
]


# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize every document and drop the tokens listed in ``stop_words``.

    Args:
        texts: Iterable of documents. Each item is passed through ``str()``
            before tokenization, so non-string items are accepted.

    Returns:
        A list with one entry per document, each entry being the list of
        tokens produced by ``simple_preprocess`` that are not in the
        module-level ``stop_words`` collection.
    """
    # Snapshot the stop words into a set once per call: testing membership
    # against the original list is a linear scan for every single token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]


def make_bigrams(texts):
    """Run each tokenized document through the module-level ``bigram_mod``.

    Returns a list holding ``bigram_mod[doc]`` for every ``doc`` in ``texts``,
    in the same order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged


def make_trigrams(texts):
    """Run each tokenized document through ``bigram_mod`` and then ``trigram_mod``.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for every ``doc``
    in ``texts``, in the same order.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged


def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and processed by the
    module-level spaCy pipeline ``nlp``.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Collection of coarse POS tags (compared against
            ``token.pos_``) to keep. See https://spacy.io/api/annotation for
            the tag inventory.

    Returns:
        A list with one entry per document: the ``lemma_`` of every token
        whose ``pos_`` is in ``allowed_postags``.
    """
    # Immutable default above avoids the shared-mutable-default pitfall; the
    # frozenset gives constant-time tag lookups.
    keep = frozenset(allowed_postags)
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the texts as a stream in batches and yields the
    # resulting docs in input order, instead of one nlp(...) call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep]
        for doc in nlp.pipe(joined)
    ]

In [8]:
# Build the bigram and trigram models
# Phrases expects an iterable of token lists. Feeding it the raw review
# strings would make it iterate every review character by character, so the
# reviews are tokenized first.
data_words = [simple_preprocess(str(doc)) for doc in reviews_text]
bigram = gensim.models.Phrases(
    data_words, min_count=5, threshold=100
)  # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [9]:
# Load spaCy's small English pipeline with the parser and NER components
# disabled; lemmatization() below reads this `nlp` object
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# Tokenize the reviews and strip the stop words
data_words_nostops = remove_stopwords(reviews_text)
# Apply the bigram phrase model to the filtered tokens
data_words_bigrams = make_bigrams(data_words_nostops)
# Lemmatize, retaining nouns, adjectives, verbs and adverbs only
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=["NOUN", "ADJ", "VERB", "ADV"],
)
# Peek at the first processed document
print(data_lemmatized[:1])

In [None]:
# Create Dictionary
# `corpora` is never imported in this notebook, so it is reached through the
# already-imported `gensim` package to avoid a NameError.
id2word = gensim.corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
# Term Document Frequency: one bag-of-words (token id, count) list per document
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1])

In [None]:
# Build LDA model
# The model is built before it is inspected: the topic printout below reads
# `lda_model`, so it cannot run ahead of this statement on a fresh kernel.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=6,
    random_state=100,  # fixed seed so topic assignments are repeatable
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

# Print the keywords of the 6 topics
pprint(lda_model.print_topics())
# Apply the trained model to the whole corpus
doc_lda = lda_model[corpus]

In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself; a higher (less negative) bound means a better fit.
per_word_bound = lda_model.log_perplexity(corpus)
print("\nPer-word likelihood bound: ", per_word_bound)
# Perplexity is derived from the bound as 2^(-bound); lower is better.
print("\nPerplexity: ", 2 ** (-per_word_bound))

# Compute Coherence Score. Higher the topic coherence, the topic is more human interpretable.
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence="c_v"
)
coherence_lda = coherence_model_lda.get_coherence()
print("\nCoherence Score: ", coherence_lda)

In [None]:
# matplotlib inline
!pip install pyLDAvis==2.1.2
import pyLDAvis
import pyLDAvis.gensim

vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

In [None]:
# create model
model = BERTopic(verbose=True, nr_topics=5)

# convert to list
docs = reviews_text.to_list()

topics, probabilities = model.fit_transform(docs)

# Select top topics
print(model.get_topic_freq().head(11))
# With nr_topics=5 there is no topic with id 6, so get_topic(6) would not
# return any words. Inspect topic 0 instead.
# NOTE(review): assumes id 0 is the largest non-outlier topic - confirm
# against the frequency table printed above.
print(model.get_topic(0))
model.visualize_topics()

In [None]:
# Per-topic bar charts: up to 8 topics, up to 20 words each, 500 px tall
model.visualize_barchart(
    top_n_topics=8,
    n_words=20,
    height=500,
)

In [None]:
# Hierarchical view of the fitted topics, rendered 1000 px tall
model.visualize_hierarchy(
    height=1000,
)