In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.2-cp38-cp38-win_amd64.whl (24.0 MB)
Collecting scipy>=1.7.0
  Downloading scipy-1.10.1-cp38-cp38-win_amd64.whl (42.2 MB)
Collecting smart-open>=1.8.1
  Downloading smart_open-6.4.0-py3-none-any.whl (57 kB)
Installing collected packages: smart-open, scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.6.2
    Uninstalling scipy-1.6.2:
      Successfully uninstalled scipy-1.6.2
Successfully installed gensim-4.3.2 scipy-1.10.1 smart-open-6.4.0


In [5]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import gensim

# Preprocessing the text data
data = pd.read_csv(r'C:\Users\phili\Downloads\merged_data.csv')

# Custom filter to remove punctuation and numeric characters
custom_filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]

# Apply preprocessing to each document in the text column
processed_docs = [preprocess_string(doc, custom_filters) for doc in data['text']]

# Create a dictionary representation of the documents
dictionary = Dictionary(processed_docs)

# Filter out extremes to remove too rare or too common words
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Convert document into the bag-of-words (BoW) format
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Training the LDA model
# Using 10 topics as an example
lda_model = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10)

# Extract the words and their weights for each topic
topics = lda_model.print_topics(num_words=10)
topics


[(0,
  '0.033*"good" + 0.029*"food" + 0.024*"not" + 0.021*"t" + 0.020*"beer" + 0.016*"pizza" + 0.014*"that" + 0.014*"s" + 0.013*"really" + 0.013*"like"'),
 (1,
  '0.062*"we" + 0.027*"our" + 0.025*"were" + 0.018*"us" + 0.016*"had" + 0.013*"at" + 0.013*"she" + 0.012*"my" + 0.012*"t" + 0.011*"that"'),
 (2,
  '0.037*"ve" + 0.035*"been" + 0.034*"my" + 0.029*"have" + 0.023*"here" + 0.021*"this" + 0.020*"time" + 0.019*"always" + 0.018*"best" + 0.018*"are"'),
 (3,
  '0.053*"brunch" + 0.018*"coffee" + 0.014*"vegan" + 0.014*"eggs" + 0.013*"delicious" + 0.012*"so" + 0.011*"hummus" + 0.011*"dessert" + 0.011*"you" + 0.009*"sunday"'),
 (4,
  '0.052*"great" + 0.028*"place" + 0.027*"food" + 0.024*"good" + 0.024*"beer" + 0.021*"very" + 0.019*"this" + 0.019*"we" + 0.016*"back" + 0.016*"were"'),
 (5,
  '0.029*"that" + 0.023*"you" + 0.019*"t" + 0.017*"this" + 0.016*"they" + 0.015*"s" + 0.012*"not" + 0.011*"my" + 0.010*"be" + 0.010*"me"'),
 (6,
  '0.021*"were" + 0.020*"had" + 0.019*"burger" + 0.017*"cheese

In [10]:
num_topics = 10  # or however many topics we want
num_words = 10   # the number of words to retrieve for most and least impactful

# Function to get the most and least impactful words for each topic
def get_topic_words(lda_model, num_topics, num_words):
    topic_words = {}

    for topic_id in range(num_topics):
        # Most impactful words
        top_words = lda_model.show_topic(topic_id, topn=num_words)
        # Check if the word IDs or word strings are returned
        if top_words and isinstance(top_words[0][0], int):
            top_words = [(dictionary[word_id], weight) for word_id, weight in top_words]
        else:
            top_words = [(word, weight) for word, weight in top_words]

        # Least impactful words
        bottom_words = lda_model.show_topic(topic_id, topn=-num_words)
        # Check if the word IDs or word strings are returned and if bottom_words is not empty
        if bottom_words and isinstance(bottom_words[0][0], int):
            bottom_words = [(dictionary[word_id], weight) for word_id, weight in bottom_words]
        elif bottom_words:
            bottom_words = [(word, weight) for word, weight in bottom_words]
        else:
            bottom_words = []

        topic_words[topic_id] = {'Most Impactful': top_words, 'Least Impactful': bottom_words}

    return topic_words

# Retrieve the words
topic_words = get_topic_words(lda_model, num_topics, num_words)

# Print the results
for topic, words in topic_words.items():
    print(f"Topic {topic}:")
    print("  Most Impactful:", words['Most Impactful'])
    print("  Least Impactful:", words['Least Impactful'])
    print("\n")

Topic 0:
  Most Impactful: [('good', 0.033091985), ('food', 0.029058008), ('not', 0.023974163), ('t', 0.021299744), ('beer', 0.019742806), ('pizza', 0.015662752), ('that', 0.014423424), ('s', 0.014103242), ('really', 0.013232016), ('like', 0.013023522)]
  Least Impactful: []


Topic 1:
  Most Impactful: [('we', 0.06158882), ('our', 0.026781134), ('were', 0.025471117), ('us', 0.01790044), ('had', 0.016279588), ('at', 0.013036208), ('she', 0.01268147), ('my', 0.0119477855), ('t', 0.011570202), ('that', 0.011219331)]
  Least Impactful: []


Topic 2:
  Most Impactful: [('ve', 0.036889236), ('been', 0.03452868), ('my', 0.034296367), ('have', 0.02897691), ('here', 0.022974644), ('this', 0.021006577), ('time', 0.01982171), ('always', 0.019248096), ('best', 0.01757086), ('are', 0.01752876)]
  Least Impactful: []


Topic 3:
  Most Impactful: [('brunch', 0.05289516), ('coffee', 0.01767959), ('vegan', 0.014429281), ('eggs', 0.013971929), ('delicious', 0.012632333), ('so', 0.011589699), ('hummus',

In [11]:
for topic_id in range(num_topics):
    # Get all words and weights for a topic
    all_words = lda_model.show_topic(topic_id, topn=len(dictionary))

    # Select a threshold or simply pick words from the lower end of the list
    # For example, picking the last 'num_words' words in the list
    least_impactful_words = all_words[-num_words:]

    print(f"Topic {topic_id}: Least impactful words:", least_impactful_words)


Topic 0: Least impactful words: [('oushe', 4.866634e-07), ('atar', 4.8666305e-07), ('meshwi', 4.866627e-07), ('fatar', 4.8666266e-07), ('disrespectful', 4.866624e-07), ('labneh', 4.8666203e-07), ('tia', 4.86662e-07), ('chapman', 4.8666146e-07), ('cashews', 4.8666107e-07), ('guides', 4.8666055e-07)]
Topic 1: Least impactful words: [('kouign', 2.4866264e-07), ('labne', 2.4866262e-07), ('herbal', 2.486626e-07), ('croissant', 2.486625e-07), ('turmeric', 2.4866247e-07), ('shish', 2.4866222e-07), ('taouk', 2.4866208e-07), ('hanger', 2.4866202e-07), ('richness', 2.4866185e-07), ('manoushe', 2.486617e-07)]
Topic 2: Least impactful words: [('tehina', 8.9983564e-07), ('comp', 8.998354e-07), ('kebab', 8.998353e-07), ('gravlax', 8.998352e-07), ('atar', 8.9983416e-07), ('chickpea', 8.9983416e-07), ('scared', 8.998331e-07), ('labne', 8.998326e-07), ('maibock', 8.998323e-07), ('ipad', 8.998277e-07)]
Topic 3: Least impactful words: [('koelschip', 2.3026198e-06), ('christian', 2.3026198e-06), ('racist'