# BERT topic modeling 

The code is adapted from https://maartengr.github.io/BERTopic/index.html#quick-start 

## Import packages

In [1]:
from bertopic import BERTopic

# for word + document embedding
from bertopic.backend import WordDocEmbedder
import gensim.downloader as api
from sentence_transformers import SentenceTransformer

import pandas as pd
import os
from sklearn.datasets import fetch_20newsgroups # for test data

# Display the result of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [8]:
# print the working directory the kernel started in
os.getcwd()

# Move to the project root so the relative paths used later
# (e.g. './outputs/filtered_df.csv') resolve. Set the SLSV_PROJECT_DIR
# environment variable to run this notebook on another machine; the
# original author's path remains the default.
project_dir = os.environ.get('SLSV_PROJECT_DIR', '/Users/jaeyeonkim/scraping_slsv')
os.chdir(project_dir)

'/Users/jaeyeonkim/scraping_slsv/outputs'

In [9]:
# confirm the working directory is now the project root set by os.chdir above
os.getcwd()

'/Users/jaeyeonkim/scraping_slsv'

In [40]:
# Load the filtered documents; the path is relative to the project root
# that the working directory was changed to above.
df = pd.read_csv('./outputs/filtered_df.csv')

In [41]:
# inspect the loaded frame: one row per document with its `text` and source file `name`
df

Unnamed: 0,text,name
0,voter education voter turnout voter registrat...,2019-2020-Nazareth-College-Action-Plan.pdf
1,gw votes 2020 action plan honey w nashman cen...,2020-Action-Plan-FINAL.pdf
2,tricia debertolis alfred university 1 saxon dr...,Action-Plan-Alfred.pdf
3,allegany college all in one campus plan to in...,Action-Plan-Allegany-College.pdf
4,all in campus democracy challenge 2016 action...,Action-Plan-Anne-Arundel-Community-College.pdf
...,...,...
1221,voter education engagement master plan 429 n c...,Wofford-College-Action-Plan-2018.pdf
1222,voter education engagement action plan 2022 ex...,Wofford-College-Action-Plan-2022-May.pdf
1223,worcester state university has aimed to increa...,Worcester-State-University-Action-Plan-2018.pdf
1224,worcester state university 2020 action plan th...,Worcester-State-University-Action-Plan-2020.pdf


## Feature engineering

In [42]:
# English stop-word list from NLTK
import nltk
from nltk.corpus import stopwords

# make sure the stop-word corpus is available locally
# (prints the download log and evaluates to True)
nltk.download('stopwords')

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jaeyeonkim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [48]:
# Lowercase each token first and only then test it against the stop-word list,
# so capitalized stop words such as "The" are removed as well.
stop_word_set = set(stop_words)  # set lookup instead of scanning the list for every token


def remove_stop_words(text):
    """Return `text` lowercased, with stop words dropped and tokens joined by single spaces."""
    lowered_tokens = (token.lower() for token in text.split())
    return ' '.join(token for token in lowered_tokens if token not in stop_word_set)


df['text'] = df['text'].apply(remove_stop_words)

## BERT topic modeling

### Silent agenda 

In [None]:
# Shorthand that passes the model name straight to BERTopic (kept for reference):
#topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

# Document embedding model. SentenceTransformer is already imported in the
# imports cell at the top of the notebook, so it is not re-imported here.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

topic_model = BERTopic(embedding_model=sentence_model)

# --- Disabled alternative: combine word and document embeddings ---
# Word embedding model
#ft = api.load('fasttext-wiki-news-subwords-300')

# Document embedding model
#embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Create a model that uses both language models and pass it through BERTopic
#word_doc_embedder = WordDocEmbedder(embedding_model=embedding_model, word_embedding_model=ft)

#topic_model = BERTopic(embedding_model=word_doc_embedder)

In [14]:
# Fit on the distinct texts only. BERTopic takes a list of strings, so convert
# the array returned by unique(); `docs` is kept so that `topics[i]` can be
# matched back to `docs[i]`.
docs = df['text'].unique().tolist()
topics, probs = topic_model.fit_transform(docs)

# One row per topic with its document count and top words; topic -1 is
# BERTopic's outlier bucket.
# NOTE(review): the labels previously written here (health spending; tax,
# income, jobs; immigration, trade; education, job creation) do not match the
# saved table below, whose topic names are mostly stop words. The execution
# counts suggest this cell last ran before the stop-word cell -- re-run the
# notebook top to bottom and relabel the topics from the fresh output.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,424,-1_and_the_to_of
1,0,219,0_the_and_of_to
2,1,121,1_to_the_and_of
3,2,55,2_our_campus_the_following
4,3,48,3_and_the_of_to
5,4,35,4_to_the_and_of
6,5,30,5_the_and_to_of
7,6,30,6_the_to_and_texas
8,7,29,7_the_to_and_of
9,8,27,8_asu_the_to_and


In [None]:
topic_model.get_topic(0) # labeled "health care" previously; no output is saved for this cell -- TODO confirm the label after re-fitting

In [86]:
topic_model.get_topic(1) # tax, income, jobs -- NOTE(review): matches the saved output below but not the topic table above (1_to_the_and_of); re-run to confirm

[('tax', 0.18777503804760035),
 ('flat', 0.06509608083706696),
 ('income', 0.0422358435166987),
 ('fair', 0.03976813842409393),
 ('option', 0.03976813842409393),
 ('payer', 0.03820079628689287),
 ('cuts', 0.03690491058008278),
 ('jobs', 0.03484706739969161),
 ('options', 0.030855414571679684),
 ('taxes', 0.03019622287354521)]

In [87]:
topic_model.get_topic(2) # immigration, trade -- NOTE(review): matches the saved output below but not the topic table above (2_our_campus_the_following); re-run to confirm

[('immigration', 0.2714462824557912),
 ('illegal', 0.12668919139449786),
 ('overseas', 0.08223374908152511),
 ('trade', 0.07277239210629413),
 ('effect', 0.06725180219713811),
 ('spending', 0.06209583315436263),
 ('state', 0.059448475650289455),
 ('federal', 0.05515291849049353),
 ('economy', 0.05398318381800181),
 ('government', 0.053435065933553905)]

In [88]:
topic_model.get_topic(3) # education, job creation -- NOTE(review): matches the saved output below but not the topic table above (3_and_the_of_to); re-run to confirm

[('education', 0.27811338413014397),
 ('jobs', 0.16768513635941823),
 ('job', 0.15503310190708683),
 ('really', 0.14602596271260546),
 ('creation', 0.12034115778220543),
 ('waste', 0.1118396762896731),
 ('economy', 0.08296362986766594),
 ('cuts', 0.07769454858964796),
 ('bureacratic', 0.07473712806978168),
 ('neet', 0.07473712806978168)]