# BERT topic modeling 

The code is adapted from https://maartengr.github.io/BERTopic/index.html#quick-start 

## Import packages

In [1]:
from bertopic import BERTopic

# for word + document embedding
from bertopic.backend import WordDocEmbedder
import gensim.downloader as api
from sentence_transformers import SentenceTransformer

import pandas as pd
import os
from sklearn.datasets import fetch_20newsgroups # for test data

# Display the result of every top-level expression in a cell,
# not only the last one (IPython's default).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Show the directory the kernel started in (the recorded run started in the
# project's `code/` sub-directory).
os.getcwd()

# Move to the project root so that relative paths such as './outputs/...'
# resolve. The root is taken to be the parent of `code/` rather than a
# machine-specific absolute path, so the notebook runs from any checkout.
# Guarding on the directory name keeps the cell idempotent: re-running it once
# the kernel is already at the root does not climb another level.
if os.path.basename(os.getcwd()) == 'code':
    os.chdir(os.pardir)

'/Users/jaeyeonkim/scraping_slsv/code'

In [3]:
# Confirm that the working directory is now the project root
os.getcwd()

'/Users/jaeyeonkim/scraping_slsv'

In [4]:
# Load the filtered data set; the path is relative to the current working directory
filtered_csv_path = './outputs/filtered_df.csv'
df = pd.read_csv(filtered_csv_path)

In [5]:
# Inspect the loaded frame (the recorded output shows `text` and `name` columns)
df

Unnamed: 0,text,name
0,voter education voter turnout voter registrat...,2019-2020-Nazareth-College-Action-Plan.pdf
1,gw votes 2020 action plan honey w nashman cen...,2020-Action-Plan-FINAL.pdf
2,tricia debertolis alfred university 1 saxon dr...,Action-Plan-Alfred.pdf
3,allegany college all in one campus plan to in...,Action-Plan-Allegany-College.pdf
4,all in campus democracy challenge 2016 action...,Action-Plan-Anne-Arundel-Community-College.pdf
...,...,...
1221,voter education engagement master plan 429 n c...,Wofford-College-Action-Plan-2018.pdf
1222,voter education engagement action plan 2022 ex...,Wofford-College-Action-Plan-2022-May.pdf
1223,worcester state university has aimed to increa...,Worcester-State-University-Action-Plan-2018.pdf
1224,worcester state university 2020 action plan th...,Worcester-State-University-Action-Plan-2020.pdf


# Feature engineering

In [8]:
# English stop-word list from NLTK
import nltk
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available locally before reading it
nltk.download('stopwords')

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jaeyeonkim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Lower-case every token and drop English stop words.
# Each token is lower-cased BEFORE the stop-word test, so capitalised stop
# words ("The", "And", ...) are removed as well; testing the raw token would
# let them through. A set gives constant-time membership checks instead of
# scanning the whole stop-word list for every token.
stop_word_set = set(stop_words)
df['text'] = df['text'].apply(
    lambda text: ' '.join(
        token
        for token in (raw.lower() for raw in text.split())
        if token not in stop_word_set
    )
)

## BERT topic modeling

### Silent agenda 

In [10]:
# Build the topic model on top of an explicitly constructed
# sentence-transformers document-embedding model.
embedding_model_name = "all-MiniLM-L6-v2"
sentence_model = SentenceTransformer(embedding_model_name)
topic_model = BERTopic(embedding_model=sentence_model)

# Alternatives kept for reference (not executed):
#
# 1) pass the model name directly to BERTopic
#topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")
#
# 2) combine a word-embedding model with a document-embedding model
#ft = api.load('fasttext-wiki-news-subwords-300')
#embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
#word_doc_embedder = WordDocEmbedder(embedding_model=embedding_model, word_embedding_model=ft)
#topic_model = BERTopic(embedding_model=word_doc_embedder)

In [11]:
# BERTopic documents its input as a list of strings, so convert the
# de-duplicated texts explicitly instead of passing a NumPy array.
# The list is kept in a named variable because `topics` and `probs` line up
# with `unique_docs` (one entry per distinct text), not with the rows of `df`.
unique_docs = df['text'].unique().tolist()
topics, probs = topic_model.fit_transform(unique_docs)

# Overview table: one row per topic with its document count and generated name.
# In the recorded run the topics centre on student voter registration and
# civic engagement; per the BERTopic documentation, topic -1 collects outliers.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,501,-1_students_voter_engagement_campus
1,0,136,0_engagement_civic_students_student
2,1,89,1_voter_students_student_registration
3,2,56,2_students_voter_student_community
4,3,50,3_harvard_campus_following_voter
5,4,36,4_florida_miami_students_campus
6,5,30,5_student_voter_campus_engagement
7,6,30,6_texas_campus_tlu_students
8,7,28,7_illinois_students_voting_voter
9,8,27,8_campus_students_voter_student


In [15]:
# Top terms of topic 0 as (term, weight) pairs
topic_model.get_topic(0) 

[('engagement', 0.023723420328570992),
 ('civic', 0.022454705504838687),
 ('students', 0.020479969864622135),
 ('student', 0.020359275025236063),
 ('campus', 0.020141726480267786),
 ('community', 0.01827453483656546),
 ('learning', 0.016265281496764185),
 ('plan', 0.016264935368142476),
 ('voter', 0.01482686698272112),
 ('voting', 0.013124441908351244)]

In [16]:
# Top terms of topic 1 as (term, weight) pairs
topic_model.get_topic(1) 

[('voter', 0.025269491419155687),
 ('students', 0.024301591504023484),
 ('student', 0.022090483276637634),
 ('registration', 0.02070658842154926),
 ('campus', 0.01915155544067472),
 ('voting', 0.018930858230033504),
 ('hopkins', 0.018062680203357342),
 ('vote', 0.017801795109683955),
 ('election', 0.017535592037128095),
 ('day', 0.015292258139481206)]

In [17]:
# Top terms of topic 2 as (term, weight) pairs
topic_model.get_topic(2) 

[('students', 0.021822333540753313),
 ('voter', 0.017803113145325185),
 ('student', 0.016476533777543472),
 ('community', 0.015886729472725604),
 ('campus', 0.015827061937832586),
 ('registration', 0.01519985692138873),
 ('engagement', 0.014774617823583516),
 ('civic', 0.013029571669811339),
 ('faculty', 0.01286715166409284),
 ('saint', 0.011959575733202542)]