**Install BERTopic**


In [None]:
# Print the version of the `python` executable found on the shell PATH, so the
# environment this notebook ran in is recorded alongside its outputs.
!python --version

Python 3.7.13


In [1]:
!pip install bertopic
#!pip install -U sentence-transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.11.0-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 4.9 MB/s 
[?25hCollecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 5.9 MB/s 
[?25hCollecting hdbscan>=0.8.28
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 45.7 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 7.5 MB/s 
Collecting pyyaml<6.0
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 65.7 MB/s 
Collecti

**Get the cleaned data (all tweets)**

In [4]:
import pandas as pd

data = pd.read_csv('allData')  # load your data here!

# Build exactly one class label per row of `data`.
# The label list is later handed to BERTopic next to `docs`, so the two must
# stay the same length and in the same order. Rows whose `ijoy` value is
# neither 1 nor 0 (e.g. a missing value) are labelled "unknown" instead of
# being silently dropped, which would shift every following label onto the
# wrong tweet.
targets = [
    "joy" if flag == 1 else "no joy" if flag == 0 else "unknown"
    for flag in data['ijoy']
]

classes = targets

#Example
#from sklearn.datasets import fetch_20newsgroups
#data = fetch_20newsgroups(subset='train',  remove=('headers', 'footers', 'quotes'))

# Force every tweet to `str` (a missing value becomes the text "nan") so the
# embedding model only ever receives strings.
tweets = [str(tweet) for tweet in data['clean_tweet']]
docs = tweets


**Create vocabulary**

In [5]:
# Create a dictionary of terms with valence, arousal, and dominance rankings
# for the vectorizer vocabulary.
#
# Results:
#   terms   -- maps each term to its row index
#   va      -- valence score of the term at that index
#   ar      -- arousal score of the term at that index
#   do      -- dominance score of the term at that index
#   counter -- number of rows successfully loaded

file_name = 'NRC-VAD-Lexicon.txt'

terms = {}
va = []
ar = []
do = []
counter = 0

# Read the tab-separated lexicon file: term, valence, arousal, dominance.
with open(file_name, 'r', encoding='utf-8') as f:
    for row in f:
        row_ = row.rstrip("\n").split("\t")
        try:
            # Parse the whole row BEFORE touching any shared state. A header,
            # blank, or malformed row must not leave a half-written entry in
            # `terms` whose index is then reused by the next valid row.
            term = row_[0]
            valence = float(row_[1])
            arousal = float(row_[2])
            dominance = float(row_[3])
        except (IndexError, ValueError):
            # Too few columns or a non-numeric score: report the row and skip it.
            print(row_)
            continue

        terms[term] = counter
        va.append(valence)
        ar.append(arousal)
        do.append(dominance)
        counter += 1

**Build the model here**

In [6]:
# This can take some time

from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

# UMAP is stochastic: without a fixed seed the reduced embeddings, and with
# them the clusters and topics, change on every run. Fixing the seed makes the
# tables and figures below reproducible (at the cost of UMAP running
# single-threaded).
RANDOM_SEED = 42

# NOTE(review): the `terms` vocabulary built in the "Create vocabulary" cell is
# not passed to this vectorizer; confirm whether `vocabulary=terms` was intended.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english") #You can use the same vectorizer from before!
umap_model = UMAP(n_neighbors=15, n_components=15, min_dist=0.0, metric='cosine', random_state=RANDOM_SEED)
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
sentence_model = SentenceTransformer("all-mpnet-base-v2")

topic_model = BERTopic(language="english",
                       top_n_words=15,
                       nr_topics=14,
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       vectorizer_model=vectorizer_model,
                       embedding_model=sentence_model
                       )

# Fit on the tweets; `topics` holds one topic id per document in `docs`.
topics, probabilities = topic_model.fit_transform(docs)

# Bare last expression so the notebook renders the topic overview table.
topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name
0,-1,14549,-1_joy_love_amp_like
1,0,521,0_music_joy_song_album
2,1,371,1_trump_realdonaldtrump_vote_biden
3,2,354,2_chicago_illinois_chicago illinois_il
4,3,334,3_bears_game_season_football
5,4,246,4_cubs_sox_baseball_whitesox
6,5,243,5_jesus_lord_lord jesus_peace
7,6,234,6_birthday_happy_happy birthday_year
8,7,232,7_joy_joy joy_lol_oh
9,8,206,8_raw_food_purim_chef


In [7]:
# Show the keywords of the most frequent real topic.
# Topic -1 is BERTopic's outlier bucket. It is usually, but not necessarily,
# the first row of the frequency table, so select the first non-outlier topic
# explicitly instead of relying on it sitting at position 1.
topic_freq = topic_model.get_topic_freq()
top_topic = topic_freq.loc[topic_freq.Topic != -1, "Topic"].iloc[0]
topic_model.get_topic(top_topic)

[('music', 0.03215979149398472),
 ('joy', 0.02964276356181866),
 ('song', 0.02869434941123512),
 ('album', 0.019766258494840042),
 ('division', 0.018840798814200806),
 ('joy division', 0.01883625606309483),
 ('love', 0.015474680465815632),
 ('listen', 0.013841369352293008),
 ('like', 0.013134604006934619),
 ('songs', 0.01267963059535514),
 ('new', 0.012605483905723285),
 ('amp', 0.011772764554101105),
 ('time', 0.011172514098758916),
 ('listening', 0.009471239560345995),
 ('podcast', 0.008734658279672684)]

**Visualize Topics**

In [8]:
# Bare last expression: the notebook renders whatever `visualize_topics()`
# returns for the fitted `topic_model` as this cell's output.
topic_model.visualize_topics()

**Visualize Topics using barchart**

In [9]:
# Bar charts of the top 10 words per topic. `top_n_topics=14` matches the
# `nr_topics=14` the model was built with, so every topic is requested.
topic_model.visualize_barchart(n_words=10, top_n_topics=14, height=500)

**Visualize Topic Similarity**

In [10]:
# Topic similarity heatmap (per the section heading), rendered as the cell output.
topic_model.visualize_heatmap()

**Visualize Topics per Class**

In [11]:
# Compute a per-class topic representation from the documents, their assigned
# topic ids, and their class labels ("joy" / "no joy").
# NOTE(review): assumes `docs`, `topics`, and `classes` are parallel sequences
# (one entry per document, same order) — TODO confirm `len(classes) == len(docs)`.
topics_per_class = topic_model.topics_per_class(docs, topics, classes=classes)
# Bare last expression so the resulting figure is rendered as the cell output.
topic_model.visualize_topics_per_class(topics_per_class)