In [1]:
import os

import pandas as pd
from gensim import corpora
from gensim.matutils import Sparse2Corpus
from gensim.models.ldamodel import LdaModel
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Location of the cleaned British Airways review CSV. The default is the path
# this notebook was written against; set the BA_REVIEWS_CSV environment
# variable to point at the file on another machine without editing this cell.
file_path = os.environ.get(
    "BA_REVIEWS_CSV",
    "D://My Folder//Forage//British Airways//BA_cleaned_reviews.csv",
)
df = pd.read_csv(file_path)

In [4]:
# Collect the review text as a plain Python list, leaving out rows whose
# 'cleaned_reviews' value is missing (NaN).
texts = df.loc[df['cleaned_reviews'].notna(), 'cleaned_reviews'].tolist()

In [5]:
# Bag-of-words representation of the reviews, built with CountVectorizer.

vectorizer = CountVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    min_df=2,              # ignore terms that occur in fewer than 2 reviews
    max_df=0.95,           # ignore terms that occur in more than 95% of reviews
)
# Learn the vocabulary and produce the document-term count matrix in one step.
dtm = vectorizer.fit_transform(texts)

In [6]:
# Create a dictionary and corpus for Gensim

# dtm has one row per document and one column per term, so tell Gensim that
# documents are NOT stored in columns.
corpus = Sparse2Corpus(dtm, documents_columns=False)

# Bind every Gensim token id to its dtm column explicitly instead of relying on
# Dictionary([feature_names]) assigning ids in the same sorted order that
# scikit-learn uses for its columns. from_corpus also fills in the document
# frequency statistics from the real corpus rather than from a single
# pseudo-document containing the whole vocabulary.
feature_names = vectorizer.get_feature_names_out()
dictionary = corpora.Dictionary.from_corpus(
    corpus,
    id2word={column: str(term) for column, term in enumerate(feature_names)},
)

# The LDA topic-term matrix is indexed by dtm column, so the sizes must agree.
assert len(dictionary) == dtm.shape[1], "dictionary / DTM vocabulary size mismatch"

In [7]:
# Fit the LDA topic model on the bag-of-words corpus.

num_topics = 5  # Specify the number of topics
lda_settings = {
    "corpus": corpus,
    "id2word": dictionary,     # maps term ids back to words for readable topics
    "num_topics": num_topics,
    "passes": 10,              # number of training passes over the corpus
    "random_state": 42,        # fixed seed so the fitted topics are repeatable
}
lda_model = LdaModel(**lda_settings)

In [8]:
# Display the top words for each topic

# Request every topic explicitly: print_topics() defaults to num_topics=20, so
# a model fitted with more topics would otherwise be shown only in part.
topics = lda_model.print_topics(num_topics=lda_model.num_topics, num_words=10)
for topic_id, topic_terms in topics:
    # Gensim topic ids are 0-based; present them 1-based for readers.
    print(f"Topic {topic_id + 1}: {topic_terms}")

Topic 1: 0.018*"flight" + 0.017*"service" + 0.012*"airways" + 0.012*"british" + 0.012*"ba" + 0.010*"trip" + 0.010*"customer" + 0.007*"heathrow" + 0.006*"luggage" + 0.006*"london"
Topic 2: 0.023*"ba" + 0.020*"class" + 0.018*"business" + 0.015*"flight" + 0.015*"seat" + 0.013*"trip" + 0.011*"good" + 0.010*"service" + 0.010*"seats" + 0.009*"food"
Topic 3: 0.027*"flight" + 0.019*"ba" + 0.014*"trip" + 0.012*"service" + 0.011*"crew" + 0.010*"time" + 0.010*"food" + 0.010*"good" + 0.009*"cabin" + 0.008*"staff"
Topic 4: 0.020*"flight" + 0.014*"trip" + 0.012*"food" + 0.011*"ba" + 0.011*"crew" + 0.009*"cabin" + 0.008*"time" + 0.008*"service" + 0.007*"seat" + 0.007*"good"
Topic 5: 0.041*"flight" + 0.019*"ba" + 0.011*"trip" + 0.010*"hours" + 0.010*"british" + 0.009*"told" + 0.009*"service" + 0.009*"airways" + 0.009*"customer" + 0.009*"cancelled"


In [9]:
# Bring in pyLDAvis and its Gensim adapter for the topic visualisation.
# These imports need the pyLDAvis package in the kernel's environment and
# raise ModuleNotFoundError when it is not installed.

import pyLDAvis
from pyLDAvis import gensim_models as gensimvis

ModuleNotFoundError: No module named 'pyLDAvis'

In [10]:
pip install pyLDAvis

Collecting pyLDAvis
  Obtaining dependency information for pyLDAvis from https://files.pythonhosted.org/packages/6b/5a/66364c6799f2362bfb9b7100bc1ce6ffcdfe7f17e8d2e85a591bfe427643/pyLDAvis-3.4.1-py3-none-any.whl.metadata
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Obtaining dependency information for funcy from https://files.pythonhosted.org/packages/d5/08/c2409cb01d5368dcfedcbaffa7d044cc8957d57a9d0855244a5eb4709d30/funcy-2.0-py2.py3-none-any.whl.metadata
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting FuzzyTM>=0.4.0 (from gensim->pyLDAvis)
  Obtaining dependency information for FuzzyTM>=0.4.0 from https://files.pythonhosted.org/packages/2d/30/074bac7a25866a2807c1005c7852c0139ac22ba837871fc01f16df29b9dc/FuzzyTM-2.0.9-py3-none-any.whl.metadata
  Downloading FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim->pyLDAvis)
  Obtaining dependency information for pyfume

In [13]:
# Import pyLDAvis and its Gensim adapter (used to build the topic visualisation).

import pyLDAvis
from pyLDAvis import gensim_models as gensimvis

In [14]:
# Switch pyLDAvis to notebook rendering before producing any output.
pyLDAvis.enable_notebook()

# Turn the fitted model, its corpus and its dictionary into the data structure
# pyLDAvis renders, then show it as this cell's output.
lda_visualization = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(lda_visualization)