# Topic Modeling with BERT

DS 5001 Text as Data

**Purpose:** Demonstrate BERTopic, a newer transformer-based topic modeling algorithm.

**NB:** This notebook is best viewed in Jupyter Notebook (not Lab).

# Set Up

## Config

In [55]:
import configparser

# Machine-specific paths are kept in an env.ini three directories above this notebook.
ENV_INI = '../../../env.ini'

config = configparser.ConfigParser()
# ConfigParser.read() returns the list of files it parsed and silently skips
# any it cannot open, so fail here with a clear message rather than with a
# KeyError on the first option lookup below.
if not config.read(ENV_INI):
    raise FileNotFoundError(f"Could not read config file: {ENV_INI}")

# Paths read from the [DEFAULT] section.
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [56]:
# pandas is imported here because this cell sits above the notebook's import
# cell; without it a fresh-kernel Run All fails with NameError on `pd`.
import pandas as pd

# Show full cell contents (no truncation) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

## Imports

In [2]:
import pandas as pd
import numpy as np
import plotly_express as px

In [4]:
# Corpus-level settings shared by the cells below.
data_prefix = "austen-melville"    # file-name prefix of the CSV tables that get loaded
colors = 'YlGnBu'                  # presumably a colour-scale name for plots; not used in the cells shown here
OHCO = ['book_id', 'chap_id']      # columns used as the index of the DOCS table

# Get the Data

In [6]:
# Document table, indexed by the OHCO columns.
DOCS = (
    pd.read_csv(f"{output_dir}/{data_prefix}-LDA_DOCS-chaps.csv")
    .set_index(OHCO)
)

# Library label table, indexed by book_id.
LIB = (
    pd.read_csv(f'{output_dir}/{data_prefix}-LIB_LABELS.csv')
    .set_index('book_id')
)

Create a short label for each author for display purposes.

In [7]:
# Keep the text before the first ', ' of `author` and lower-case it
# (presumably the surname, if names are stored as "Last, First" — confirm).
LIB['author_key'] = (
    LIB['author']
    .str.split(', ')
    .str.get(0)
    .str.lower()
)

In [8]:
# Plain Python list of the document strings, in DOCS row order; this is what
# gets passed to the topic model below.
docs = DOCS['doc_str'].to_list()

# Generate Model

In BERTopic, **the number of topics is determined dynamically by the algorithm**, which leverages HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) for clustering the document embeddings. HDBSCAN does not require specifying the number of clusters beforehand, as it determines the number of clusters based on the data's density.

In [10]:
from bertopic import BERTopic # pip install bertopic

In [69]:
# BERTopic?

In [70]:
# Fit a BERTopic model on the document strings in `docs`. fit_transform
# returns two values, unpacked here as `topics` and `probs`.
# calculate_probabilities=True presumably makes `probs` hold a score per
# topic for every document — confirm against the BERTopic docs.
# NOTE(review): no random seed is set in the cells visible here, so topic ids
# and labels may change between runs — TODO confirm and seed if the outputs
# below need to be reproducible.
topic_model = BERTopic(calculate_probabilities=True)
topics, probs = topic_model.fit_transform(docs)

In [71]:
# Display the fitted model's topic id -> label mapping.
# NOTE(review): the -1 key is presumably the HDBSCAN noise/outlier group — confirm.
topic_model.topic_labels_

{-1: '-1_man_him_time_it',
 0: '0_it_her_you_time',
 1: '1_deck_men_ship_man',
 2: '2_lord_things_men_king',
 3: '3_mother_her_me_letter',
 4: '4_sea_ship_boat_deck',
 5: '5_whale_whales_head_ship',
 6: '6_man_sir_it_you',
 7: '7_thee_thou_me_brother',
 8: '8_acquaintance_consideration_optimists_considering',
 9: '9_groves_item_sea_aforesaid',
 10: '10_things_me_yillah_him',
 11: '11_valley_natives_islanders_savages',
 12: '12_fruit_trees_natives_valley',
 13: '13_cosmopolitan_nature_stranger_man',
 14: '14_mother_soul_world_him',
 15: '15_sea_waves_life_boat'}

# View Results

In [72]:
# Build the model's topic visualization figure and render it in the notebook.
fig = topic_model.visualize_topics()
fig.show()

In [92]:
# topic_model.get_document_info(docs)

# View Topic Hierarchy

In [73]:
# Compute the topic hierarchy from the fitted model and the same documents;
# `htopics` is the input to get_topic_tree in a later cell.
htopics = topic_model.hierarchical_topics(docs)

100%|██████████| 15/15 [00:00<00:00, 299.50it/s]


In [81]:
# htopics

In [74]:
# get_topic_tree apparently returns one multi-line string; print() is used so
# the tree is laid out line by line instead of shown as an escaped repr.
print(topic_model.get_topic_tree(htopics))

.
├─ship_sea_man_men_deck
│    ├─ship_sea_deck_man_whale
│    │    ├─■──sea_waves_life_boat_thee ── Topic: 15
│    │    └─ship_sea_deck_whale_man
│    │         ├─ship_deck_men_sea_man
│    │         │    ├─■──deck_men_ship_man_sailors ── Topic: 1
│    │         │    └─■──sea_ship_boat_deck_water ── Topic: 4
│    │         └─■──whale_whales_head_ship_sea ── Topic: 5
│    └─valley_natives_fruit_islanders_trees
│         ├─valley_natives_fruit_islanders_trees
│         │    ├─■──valley_natives_islanders_savages_me ── Topic: 11
│         │    └─■──fruit_trees_natives_valley_tree ── Topic: 12
│         └─■──groves_item_sea_aforesaid_reef ── Topic: 9
└─it_her_time_you_him
     ├─it_her_time_you_him
     │    ├─it_her_you_time_nothing
     │    │    ├─■──mother_her_me_letter_churchhill ── Topic: 3
     │    │    └─■──it_her_you_time_nothing ── Topic: 0
     │    └─man_me_lord_it_him
     │         ├─man_me_lord_thee_it
     │         │    ├─■──things_me_yillah_him_mood ── Topic: 10
     │   

# Convert to Pandas

In [75]:
# Build the topic table in a single chain:
#   topic_label     - label string produced by the model
#   topic_id        - integer parsed from the text before the first '_' of the label
#   topic_words_raw - whatever get_topic returns for that id
# The table ends up indexed by topic_id.
TOPIC = (
    pd.DataFrame({'topic_label': topic_model.generate_topic_labels()})
    .assign(topic_id=lambda df: df.topic_label.str.split('_').str[0].astype('int'))
    .assign(topic_words_raw=lambda df: df.topic_id.apply(lambda tid: topic_model.get_topic(tid)))
    .reset_index(drop=True)
    .set_index('topic_id')
)

In [76]:
TOPIC

Unnamed: 0_level_0,topic_label,topic_words_raw
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,-1_man_him_time,"[(man, 0.01973360803979289), (him, 0.015395175872687671), (time, 0.015364692538398806), (it, 0.015023033691593295), (sea, 0.011944988647972735), (hand, 0.011818152387231782), (me, 0.011691678644885488), (way, 0.0116002575153611), (ship, 0.011052106559148072), (men, 0.01104514199736814)]"
0,0_it_her_you,"[(it, 0.028637244722335715), (her, 0.027552479775427515), (you, 0.02122045911725592), (time, 0.020863097536541406), (nothing, 0.02078083518332522), (him, 0.018068129402880596), (sister, 0.017668849396590396), (room, 0.01710146267618076), (thing, 0.01611818894413681), (father, 0.015094395482543134)]"
1,1_deck_men_ship,"[(deck, 0.02552861014771629), (men, 0.024693307268738217), (ship, 0.02380960465719284), (man, 0.023635932355295167), (sailors, 0.020832294635079372), (captain, 0.01727851311510616), (sea, 0.017055684888859352), (war, 0.01606058615322548), (time, 0.015704810726358206), (officers, 0.015623316159623606)]"
2,2_lord_things_men,"[(lord, 0.045204943717034735), (things, 0.017431867562038295), (men, 0.01688520171542865), (king, 0.016620162812115417), (man, 0.015063468527286331), (kings, 0.014927700528965292), (thou, 0.013788719461871777), (then, 0.013727197017760205), (me, 0.01329783953100278), (gods, 0.012904067738193964)]"
3,3_mother_her_me,"[(mother, 0.04052293498192691), (her, 0.040189482282329284), (me, 0.027734586800715357), (letter, 0.027251426433374696), (churchhill, 0.026006628468539), (time, 0.022179067060158215), (woman, 0.021631378667829566), (power, 0.021625709370519067), (affection, 0.021580214126017613), (you, 0.021190083427409887)]"
4,4_sea_ship_boat,"[(sea, 0.03531823513276597), (ship, 0.027645137976566272), (boat, 0.02649668497879557), (deck, 0.021539771155040105), (water, 0.01945913172669459), (sail, 0.017793433686959684), (whale, 0.017155560677356404), (boats, 0.016540926075534305), (time, 0.01577038632684984), (ships, 0.01558963730973681)]"
5,5_whale_whales_head,"[(whale, 0.08363655601051667), (whales, 0.04461760454968852), (head, 0.024001112170095047), (ship, 0.021981366504254304), (sea, 0.02048302836527201), (fish, 0.018030944908688628), (it, 0.017462334697505296), (harpooneer, 0.017246291904652516), (man, 0.016193817683144707), (boat, 0.016189044310065195)]"
6,6_man_sir_it,"[(man, 0.029006007800782833), (sir, 0.019544096288099423), (it, 0.0182112138506371), (you, 0.016578370016613383), (me, 0.014978869943523399), (boy, 0.014391711704453128), (way, 0.014208353798093022), (him, 0.014070223412656862), (sort, 0.012445157880500563), (friend, 0.012350686785961984)]"
7,7_thee_thou_me,"[(thee, 0.06333050960408727), (thou, 0.055340999879675785), (me, 0.04971852162279694), (brother, 0.03280424728105849), (house, 0.02945073809737321), (him, 0.026256611199961474), (it, 0.02415322502792511), (heart, 0.022716002630526047), (mother, 0.02235322131829211), (guitar, 0.02201583052651755)]"
8,8_acquaintance_consideration_optimists,"[(acquaintance, 0.29387996742937317), (consideration, 0.23753398191389227), (optimists, 0.23636142925280082), (considering, 0.21081409923796532), (evinces, 0.2041247149201603), (story, 0.20221301407041736), (gentleman, 0.1930742709243953), (investment, 0.18527035434876107), (business, 0.178826376637808), (boon, 0.17672219998729308)]"


In [77]:
# Join each topic's words into one comma-separated string.
# Every `topic_words_raw` value is a list of (word, weight) pairs, so the
# words are read straight from the pairs. This avoids expanding the lists
# into a wide frame and stacking/unstacking it, and it keeps a literal 'nan'
# out of the string if topics ever hold different numbers of pairs.
TOPIC['topic_words'] = TOPIC.topic_words_raw\
    .apply(lambda pairs: ', '.join(str(word) for word, _ in pairs))

In [78]:
# Total weight per topic: the sum of the second element of each
# (word, weight) pair in `topic_words_raw`, taken directly from the pairs
# rather than through a wide-frame stack/unstack round trip.
TOPIC['topic_weights'] = TOPIC.topic_words_raw\
    .apply(lambda pairs: sum(weight for _, weight in pairs))

In [79]:
# Show the words and summed weight of each topic, lowest weight first.
TOPIC.loc[:, ['topic_words', 'topic_weights']].sort_values(by='topic_weights')

Unnamed: 0_level_0,topic_words,topic_weights
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,"man, him, time, it, sea, hand, me, way, ship, men",0.134669
6,"man, sir, it, you, me, boy, way, him, sort, friend",0.165785
2,"lord, things, men, king, man, kings, thou, then, me, gods",0.179851
1,"deck, men, ship, man, sailors, captain, sea, war, time, officers",0.200223
0,"it, her, you, time, nothing, him, sister, room, thing, father",0.203105
4,"sea, ship, boat, deck, water, sail, whale, boats, time, ships",0.213309
9,"groves, item, sea, aforesaid, reef, isles, wine, round, air, shore",0.215569
11,"valley, natives, islanders, savages, me, house, pi, time, chiefs, island",0.251932
12,"fruit, trees, natives, valley, tree, bread, ground, roots, side, companion",0.257676
10,"things, me, yillah, him, mood, man, eyes, light, thoughts, vision",0.268977
