## Topic modeling using small policy dataset

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np

In [2]:
# Build a small dataframe of free-text policy opinions (one response per row)
text_train = pd.Series(
    [
        "I would like more funding for pollution mitigation",
        "Environmental regulation and reducing pollution is important to me",
        "Minimum wage and raising living standards",
        "Wages are so low and they need to go up whether by minimum wage increases or collective bargaining",
        "Climate change and environmental degredation are my main focus",
        "Investing in renewable fuels and environmental regulation",
        "Minimum wage and climate change, environmental policy",
        "environment, climate change global warming, solar power",
        "Increase federal minimum wage to a livable level to ensure nobody is in poverty",
    ],
    name="policy_views",
).to_frame()

In [None]:
text_train

In [None]:
# Steps:
    # 1: Instantiate vectorizer with parameters: vec
    # 2: Vectorize column of text with fit_transform: X
    # 3: Instantiate LDA model with parameters: lda
    # 4: Fit LDA model to vectorized text: doc_topics

In [None]:
# 1: create vectorizer
# Bag-of-words counter; stop_words="english" drops scikit-learn's built-in English stop-word list
vec = CountVectorizer(stop_words="english")

In [None]:
# 2: create dtm
# Learn the vocabulary from the policy_views column and build the document-term matrix X
X = vec.fit_transform(text_train["policy_views"])

In [None]:
# 3: create lda
# Two topics. random_state pins the otherwise random initialization so the topic
# numbering (which the topic-name cells further down depend on) is the same on every run.
lda = LatentDirichletAllocation(n_components=2, random_state=42)

In [None]:
# 4 fit lda
# fit_transform fits the model on X and returns the per-document topic weights in one call
doc_topics = lda.fit_transform(X)

In [None]:
# 5: report the size of the fitted topic-word matrix (rows = topics, columns = vocabulary words)
n_topics_fit, n_vocab_words = lda.components_.shape
print(f"There are {n_topics_fit} topics and {n_vocab_words} words")

### Extract top words from each topic

In [None]:
## Get feature names (vocabulary)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
voc = np.asarray(
    vec.get_feature_names_out()
    if hasattr(vec, "get_feature_names_out")
    else vec.get_feature_names()
)

In [None]:
# Number of top words to keep per topic
n_words = 5


def imp_words(x):
    """Return the n_words vocabulary entries with the largest weights in x, highest first."""
    # argsort is ascending, so the reversed slice walks back from the largest weight
    top_positions = np.argsort(x)[:-n_words-1:-1]
    return [voc[each] for each in top_positions]

In [None]:
# Highest-weighted words for every topic row of the fitted LDA model
words_in_topic = list(map(imp_words, lda.components_))

In [None]:
# Examine words
words_in_topic

In [None]:
# Collapse each topic's word list into one space-separated string
main_topics = list(map(' '.join, words_in_topic))

In [None]:
# One row per topic ("Topic_0", "Topic_1", ...) holding that topic's top words
main_topics_df = pd.DataFrame(
    data={"top_words": main_topics},
    index=[f"Topic_{each}" for each in range(lda.n_components)],
)

In [None]:
# Examine top words
main_topics_df

In [None]:
# Examine the first topic (row label "Topic_0")
# .iloc makes the positional lookup explicit; a bare [0] on this string-labelled index
# relies on pandas' deprecated integer-position fallback.
print(f"Top words in topic 1: {main_topics_df['top_words'].iloc[0]}")

In [None]:
# Print the top words of every topic, numbered from 0 in row order
for topic_no, top_words in enumerate(main_topics_df["top_words"]):
    print(f"Top words in topic {topic_no}: {top_words}\n")

### Extract top words from topics (as a function)

In [None]:
def top_words_f(n_words):
    """Build a one-row dataframe of the top words in each LDA topic.

    Reads the notebook-level ``lda`` (fitted model) and ``voc`` (vocabulary array).

    Parameters
    ----------
    n_words : int
        Number of highest-weighted words to keep per topic.

    Returns
    -------
    pandas.DataFrame
        A single row with one column per topic ("Topic_0", "Topic_1", ...); each
        cell holds that topic's top words joined by spaces, highest weight first.
    """
    # argsort is ascending, so the reversed slice takes the n_words largest weights
    main_topics = [
        ' '.join(voc[each] for each in np.argsort(weights)[:-n_words-1:-1])
        for weights in lda.components_
    ]
    # A list becomes a single column; transpose so each topic is its own column
    df = pd.DataFrame(main_topics).T
    df.columns = ["Topic_" + str(each) for each in range(lda.n_components)]
    return df

In [None]:
# Call function and specify number of top words
# NOTE: this rebinds main_topics_df to the one-row, topic-per-column frame returned by top_words_f
main_topics_df = top_words_f(3)

In [None]:
# Print top words in the first topic
# top_words_f returns one column per topic ("Topic_0", ...) and no "top_words" column,
# so select the topic column and take its single row.
print(f"Top words in topic 1: {main_topics_df['Topic_0'].iloc[0]}")

In [None]:
# Print top words in each topic
# The frame from top_words_f has no "top_words" column; its single row holds one
# top-words string per topic, so iterate over that row instead.
for i, each in enumerate(main_topics_df.iloc[0]):
    print(f"Top words in topic {i}: {each}\n")

### Create a document topic matrix from results

In [None]:
# Column labels (one per topic) and row labels (one per document) for the document-topic table
cols = [f"Topic_{topic_no}" for topic_no in range(lda.n_components)]
docs = [f"Document_{doc_no}" for doc_no in range(X.shape[0])]

In [None]:
docs

In [None]:
# Our input weight vector is doc_topics
# This is a 2d array that is the proportion of the words in each document generated from that topic
doc_topics

In [None]:
# Document-topic table: topic weights rounded to 2 decimals, documents as rows, topics as columns
df_topics = pd.DataFrame(data=np.round(doc_topics, decimals=2), index=docs, columns=cols)

In [None]:
# Column position of the largest value in each row, i.e. the dominant topic per document
imp_topic = df_topics.values.argmax(axis=1)

In [None]:
text_train

In [None]:
df_topics

In [None]:
# Store each document's dominant topic number alongside its topic weights
df_topics["top_topic"] = imp_topic

In [None]:
df_topics

In [None]:
# Assign name based on domain
# NOTE(review): which topic number matches which theme is decided by the LDA fit, so
# confirm this mapping against the top words per topic. A later cell reassigns
# topic_name with the opposite mapping (0 -> "environment"), overwriting these labels.
df_topics["topic_name"] = np.where(df_topics["top_topic"] == 1,
                                   "env",
                                   "econ")

In [None]:
df_topics

In [None]:
# How dominant is the topic in each document. 
# Fundamentally: the proportion of the words in each document generated from that topic
df_topics

In [None]:
text_train

In [None]:
# Assign name based on domain
# NOTE(review): this overwrites the topic_name column written by an earlier cell, which
# mapped topic 1 -> "env". Here topic 0 is labelled "environment" and every other topic
# "economic"; confirm the mapping against the top words per topic for the current fit.
df_topics["topic_name"] = np.where(df_topics["top_topic"] == 0,
                                   "environment",
                                   "economic")

In [None]:
df_topics

### PyLDAvis

In [2]:
# pyLDAvis 3.4 renamed the scikit-learn adapter from pyLDAvis.sklearn to
# pyLDAvis.lda_model; try the new name first and fall back for older installs.
import pyLDAvis  # keeps the top-level name bound for pyLDAvis.display(...)
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

# Requires the fitted `lda`, the document-term matrix `X` and the vectorizer `vec`
# from the cells above. sort_topics=False is passed so topics are not re-sorted.
lda_viz = pyldavis_sklearn.prepare(lda_model=lda,
                                   dtm=X,
                                   vectorizer=vec,
                                   sort_topics=False)

NameError: name 'lda' is not defined

In [None]:
pyLDAvis.display(lda_viz)

# pyLDAvis
# Left panel: 
    # global view of topic model
    # circle centers are placed according to the distance between topics, projected onto two dimensions
    # area of circles is the overall prevalence of the topic in the whole topic model
    # examine how prevalent each topic is
    # examine how topics relate to each other
# Right panel:
    # Bars represent individual terms that are most useful for interpreting selected topic on left
    # Blue bar represents corpus wide frequencies
    # Red bar represents topic-specific frequencies
    # examine the meaning of each topic

# Gensim

In [3]:
from nltk.tokenize import word_tokenize
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [4]:
# Remove punctuation
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave every punctuation mark in place. The raw string keeps \w and \s as
# regex escapes instead of (invalid) Python string escapes.
text_train["policy_views"] = text_train['policy_views'].str.replace(r"[^\w\s]", '', regex=True)

In [10]:
# Enforce uniform lower casing
# Overwrites the policy_views column in place with its lower-cased version
text_train["policy_views"] = text_train['policy_views'].str.lower()

In [11]:
# Tokenize text column
# Applies nltk's word_tokenize to each policy_views string and stores the result
# in a new "tokenized" column
text_train["tokenized"] = text_train["policy_views"].apply(word_tokenize)

In [12]:
# Remove stopwords
from nltk.corpus import stopwords
sw = stopwords.words("english")
# Look tokens up in a set: constant-time membership instead of scanning the list per token
sw_lookup = frozenset(sw)
text_train["tokenized"] = text_train['tokenized'].apply(
    lambda tokens: [each for each in tokens if each not in sw_lookup]
)

In [13]:
# Create gensim dictionary with tokens and ids
# Built from the stop-word-filtered token lists; the token -> id mapping is exposed as dictionary.token2id
dictionary = Dictionary(text_train["tokenized"])

In [14]:
# Dictionary of all tokens and ids
dictionary.token2id

{'funding': 0,
 'like': 1,
 'mitigation': 2,
 'pollution': 3,
 'would': 4,
 'environmental': 5,
 'important': 6,
 'reducing': 7,
 'regulation': 8,
 'living': 9,
 'minimum': 10,
 'raising': 11,
 'standards': 12,
 'wage': 13,
 'bargaining': 14,
 'collective': 15,
 'go': 16,
 'increases': 17,
 'low': 18,
 'need': 19,
 'wages': 20,
 'whether': 21,
 'change': 22,
 'climate': 23,
 'degredation': 24,
 'focus': 25,
 'main': 26,
 'fuels': 27,
 'investing': 28,
 'renewable': 29,
 'policy': 30,
 'chamge': 31,
 'environment': 32,
 'global': 33,
 'power': 34,
 'solar': 35,
 'warming': 36,
 'ensure': 37,
 'federal': 38,
 'increase': 39,
 'level': 40,
 'livable': 41,
 'nobody': 42,
 'poverty': 43}

In [15]:
# Bag-of-words corpus: every tokenized document is converted with dictionary.doc2bow
# into (token_id, frequency) pairs
gensim_corpus = list(map(dictionary.doc2bow, text_train["tokenized"]))

In [16]:
# gensim_corpus holds one list per document; each list contains (token_id, token_freq_in_doc) tuples

In [38]:
# Fit model
# random_state makes the stochastic fit reproducible across kernel restarts.
# NOTE(review): num_topics=10 exceeds the 9 documents in text_train — consider a
# smaller value if the resulting topics look near-uniform.
ldamodel = LdaModel(corpus=gensim_corpus,
                    num_topics=10,
                    id2word=dictionary,
                    random_state=42)

In [39]:
# Show topics from the gensim model: each topic id is paired with its highest-weighted words
ldamodel.show_topics()

[(0,
  '0.023*"environmental" + 0.023*"wage" + 0.023*"climate" + 0.023*"minimum" + 0.023*"regulation" + 0.023*"pollution" + 0.023*"investing" + 0.023*"living" + 0.023*"change" + 0.023*"funding"'),
 (1,
  '0.117*"pollution" + 0.117*"would" + 0.117*"mitigation" + 0.117*"funding" + 0.117*"like" + 0.011*"environmental" + 0.011*"wage" + 0.011*"minimum" + 0.011*"climate" + 0.011*"regulation"'),
 (2,
  '0.023*"wage" + 0.023*"environmental" + 0.023*"climate" + 0.023*"minimum" + 0.023*"pollution" + 0.023*"regulation" + 0.023*"change" + 0.023*"fuels" + 0.023*"living" + 0.023*"investing"'),
 (3,
  '0.023*"wage" + 0.023*"environmental" + 0.023*"pollution" + 0.023*"minimum" + 0.023*"climate" + 0.023*"change" + 0.023*"regulation" + 0.023*"investing" + 0.023*"raising" + 0.023*"funding"'),
 (4,
  '0.067*"climate" + 0.067*"power" + 0.067*"environment" + 0.067*"chamge" + 0.067*"environmental" + 0.067*"renewable" + 0.067*"global" + 0.067*"regulation" + 0.067*"solar" + 0.067*"warming"'),
 (5,
  '0.128*"en

In [40]:
import pyLDAvis
import pyLDAvis.gensim_models as gensim_pyvis
pyLDAvis.enable_notebook()

In [41]:
# Prepare the pyLDAvis visualization data from the fitted gensim model,
# its bag-of-words corpus and the token dictionary
vis = gensim_pyvis.prepare(ldamodel,
                       gensim_corpus,
                       dictionary)

In [45]:
vis

In [42]:
from gensim.models import CoherenceModel

In [43]:
# Score the fitted model with the "c_v" coherence measure, using the tokenized
# texts and the bag-of-words corpus
coherence_model = CoherenceModel(
    model=ldamodel,
    texts=text_train["tokenized"],
    corpus=gensim_corpus,
    coherence="c_v",
)
coh = coherence_model.get_coherence()

In [44]:
coh

0.5079802147422494

  from imp import reload
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/dev

In [None]:
### Ways to evaluate model
# Look at it
# Perplexity
# Coherence: 
    # measure of the degree of semantic similarity between words in topics
    # more similar words are in topic -> higher coherence score
    # gensim four stages: segmentation -> probability estimation -> confirmation measure -> aggregation