# BERTopic

BERTopic is a topic modeling technique that uses transformers and a custom class-based TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.

In [2]:
%%capture
!pip install bertopic
!pip install stanza

In [3]:
import nltk
import stanza
# Fetch the NLTK resources used further down: the 'punkt' tokenizer data, the
# stop-word corpus (read via nltk.corpus.stopwords) and the WordNet data that
# backs WordNetLemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Fetch the default English stanza models required by stanza.Pipeline('en', ...).
stanza.download('en')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources.


# CONSTANTS

In [4]:
# Line budget for the TSV reader: reading stops once more than REVIEW_COUNT
# lines (the header line included) have been consumed.
REVIEW_COUNT = 5000
# Passed to reduce_topics(nr_topics=...) and used as the number of topic ids
# (0 .. TOPIC_COUNT - 1) visited when the labelled frame is built.
TOPIC_COUNT = 30

Get the Data ready

# Create the Amazon review dataset for topic modelling

In [5]:
from google.colab import drive
# Mount Google Drive at /drive; the review TSV is read from /drive/MyDrive/... below.
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [6]:
stop_words = nltk.corpus.stopwords.words('english')
# Set copy for O(1) membership tests; `stop_words` itself stays a list.
stop_word_set = set(stop_words)
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

file_name = '/drive/MyDrive/Colab Notebooks/absa/amazon_review/amazon_reviews_us_Wireless_v1_00.tsv'
REVIEW_BODY_COLUMN = 13  # position of the review text among the tab-separated fields

docs = []           # lower-cased review text with stop words removed (model input)
original_docs = []  # lower-cased review text, index-aligned with docs

with open(file_name, encoding='utf-8', mode='r') as f:
    # i is the 1-based number of the line being handled; line 1 is the header.
    # Iterating the file ends cleanly at EOF, so a file shorter than
    # REVIEW_COUNT lines no longer fails on an empty final read.
    for i, line in enumerate(f, start=1):
        if i > REVIEW_COUNT:
            break
        if i == 1:
            continue  # skip the header line
        parts = line.split('\t')
        if len(parts) <= REVIEW_BODY_COLUMN:
            # Blank or truncated row: there is no review column to read.
            continue
        review = parts[REVIEW_BODY_COLUMN].lower()
        # Every kept word is followed by one space, so a non-empty result
        # ends with a trailing space (same text the model was fitted on before).
        new_line = ''.join(
            word + ' ' for word in review.split(' ') if word not in stop_word_set
        )
        docs.append(new_line)
        original_docs.append(review)


# Training the BERTopic model on the dataset above
BERTopic will give different topic information on different runs because of the stochastic nature of UMAP (Uniform Manifold Approximation and Projection). UMAP is a dimensionality reduction technique similar to t-SNE.

In [7]:
from bertopic import BERTopic

# Fit BERTopic on the stop-word-filtered reviews. `topics` and `probs` are the
# values returned by this initial fit.
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(docs)
# Further reduce topics
topic_model.reduce_topics(docs, nr_topics=TOPIC_COUNT)
# Recompute the topic representations with unigrams and bigrams.
# NOTE(review): `topics` was captured before reduce_topics() ran, so this call
# appears to hand the pre-reduction assignment back to the model. Confirm
# against the installed BERTopic version whether `topics` should be omitted
# (or replaced by topic_model.topics_) here; doing so changes which topic ids
# exist, so the range(TOPIC_COUNT) loop further down must be checked with it.
topic_model.update_topics(docs, topics, n_gram_range=(1, 2))


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2023-05-15 18:37:14,866 - BERTopic - Transformed documents to Embeddings
2023-05-15 18:37:55,611 - BERTopic - Reduced dimensionality
2023-05-15 18:37:59,245 - BERTopic - Clustered reduced embeddings
2023-05-15 18:38:03,295 - BERTopic - Reduced number of topics from 82 to 30


NOTE: Use language="multilingual" to select a model that supports 50+ languages.

# Extract topics

In [8]:
# Plan for the labelling cells that follow:
#   Step 0: collect the document indices that belong to each topic.
#   Step 1: for each document, store the tokens of the original review text in a data frame column.
#   Step 2: for each topic, look up its list of terms/words.
#   Step 3: build a tag vector holding 1 where a token matches a term, else 0.
#   Step 4: write the review tokens and the tag vectors to a CSV file.

T = topic_model.get_document_info(docs)
# Map every topic id to the index of the rows (documents) assigned to it.
docs_per_topics = (
    T.groupby(["Topic"])
    .apply(lambda topic_rows: topic_rows.index)
    .to_dict()
)


In [9]:
def get_vector_tag(tokens,terms):
  """Flag which review tokens belong to a topic's vocabulary.

  Args:
    tokens: iterable of token strings taken from one review.
    terms: iterable of topic terms; a term is either a single word or a
      whitespace-separated n-gram (the model is updated with
      n_gram_range=(1, 2)).

  Returns:
    A list with one entry per token, in order: 1 when the token equals one of
    the words of any term, otherwise 0. An empty `terms` gives all zeros and
    an empty `tokens` gives an empty list.
  """
  # Compare against whole words only: a substring test (`token in term`) would
  # also tag short tokens such as 'he' or 'to' because they occur inside
  # 'headphones' or 'bluetooth'.
  term_words = {word for term in terms for word in term.split()}
  return [1 if token in term_words else 0 for token in tokens]

In [10]:
def parse_only_terms(terms):
  """Return the first element of every entry in `terms`, preserving order.

  Called on the entries returned by topic_model.get_topic() to keep only the
  leading item of each entry (the term) and drop whatever accompanies it.
  """
  return [entry[0] for entry in terms]


In [51]:
def find_relevent_dependencies_terms(sentence_dict,relation):
  """Collect the text of the words whose dependency relation is of interest.

  Args:
    sentence_dict: list of per-word dicts for one parsed sentence (the caller
      passes stanza's Sentence.to_dict()); each dict carries a 'text' key and
      normally a 'deprel' key.
    relation: not used by the selection; kept so existing callers keep working.

  Returns:
    The 'text' values, in sentence order, of every word whose 'deprel' is one
    of the relations listed below.
  """
  kept_relations = (
      'amod', 'root', 'nsubj', 'det', 'ccomp', 'obj',
      'aux', 'advmod', 'compound', 'conj', 'advcl', 'mark',
  )
  # .get(): entries without dependency info (e.g. multi-word-token range
  # entries, which carry no 'deprel') are skipped instead of raising KeyError.
  return [word['text'] for word in sentence_dict
          if word.get('deprel') in kept_relations]

In [30]:
def _get_dependency_pipeline():
  """Return the shared stanza pipeline, building it on first use only."""
  pipeline = getattr(_get_dependency_pipeline, 'pipeline', None)
  if pipeline is None:
    pipeline = stanza.Pipeline('en',processors = 'tokenize,mwt,pos,lemma,depparse',download_method=None, verbose=False)
    # Cache on the function object so later calls skip the pipeline construction.
    _get_dependency_pipeline.pipeline = pipeline
  return pipeline


def extract_phrases_dependency_parsing(sentence, terms):
  """Extract dependency-filtered word lists for sentences that mention a term.

  Args:
    sentence: raw text to parse; it may contain several sentences.
    terms: iterable of term strings compared for equality with each word's text.

  Returns:
    A list of word lists. Each time a word of a parsed sentence equals one of
    `terms`, the list produced by find_relevent_dependencies_terms() for that
    whole sentence is appended, so a sentence with several matching words
    contributes the same word list several times.
  """
  extracted_text = []
  # Reuse one pipeline across calls instead of constructing it for every text.
  nlp = _get_dependency_pipeline()
  result = nlp(sentence)
  for parsed_sentence in result.sentences:
    sentence_dict = parsed_sentence.to_dict()
    for word in sentence_dict:
      for term in terms:
        if word['text'] == term:
          # .get(): the relation is only forwarded, and some entries may carry none.
          text = find_relevent_dependencies_terms(sentence_dict, word.get('deprel'))
          extracted_text.append(text)
  return extracted_text

In [13]:
# Inspect the document indices grouped under topic 0.
document_indices = docs_per_topics.get(0)
print(document_indices)

Int64Index([   0,    3,   11,   12,   20,   22,   40,   61,   62,   70,
            ...
            4947, 4948, 4950, 4955, 4964, 4967, 4969, 4978, 4980, 4998],
           dtype='int64', length=896)


In [14]:
import pandas as pd
df_review = pd.DataFrame()
# Starts as a copy of the raw reviews; entries are replaced by token lists below.
review_tokens_data = original_docs.copy()
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')

label_columns = ['Tokens','Tag_vectors','Terms','Extraction','Topic#','Review#']
# Rows are collected in a list and turned into a frame once at the end;
# appending with df.loc[len(df)] rebuilds the frame on every insert.
label_rows = []
for topic_no in range(TOPIC_COUNT):
  document_indices = docs_per_topics.get(topic_no)
  terms = topic_model.get_topic(topic_no)
  # Skip topic ids that have no documents or no terms instead of failing on them.
  if document_indices is None or not terms:
    continue
  # The term list depends on the topic only, so build it once per topic.
  only_terms = parse_only_terms(terms)
  for i in document_indices:
    if (i < REVIEW_COUNT):
      review_tokens_data[i] = [token.strip() for token in wtk.tokenize(original_docs[i])]
      tag_vectors = get_vector_tag(review_tokens_data[i],only_terms)
      # Dependency-based extraction is switched off in this loop; the call it
      # would make is: extract_phrases_dependency_parsing(original_docs[i],only_terms)
      aspect_text = []
      label_rows.append({
          'Tokens': review_tokens_data[i],
          'Tag_vectors': tag_vectors,
          'Terms': terms,
          'Extraction': aspect_text,
          'Topic#': topic_no,
          'Review#': i,
      })

df = pd.DataFrame(label_rows, columns=label_columns)

In [15]:
# Write the labelled frame to a CSV file in the same Drive folder as the source TSV.
tagged_file_name = '/drive/MyDrive/Colab Notebooks/absa/amazon_review/amazon_reviews_us_Wireless_v1_00_labelled.csv'
df.to_csv(tagged_file_name)


In [52]:
# Try the dependency-based extraction on a single review with a hand-written term list.
terms = ['sound','ear','headphones','bluetooth','speaker','music','headset','hear','volume']
sentence = original_docs[51]
print(sentence)
text = extract_phrases_dependency_parsing(sentence,terms)
text

they sounded okay, they had plenty of the skipping that is all too well known with bluetooth headphones, very difficult for me to get the right fit with these. i wanted to like these headphones, but the blue buds x were just a hands down winner, the blue buds just have too many better features to go with freedoms instead. durability wise, only time will tell. but these freedoms weren't as good sounding to me, the ear hooks were not as comfortable as i was expecting, i also prefer the blue buds due to their smaller size. the price is higher, i bit the bullet and bought them and i am so much happier with them than i was with the cheaper alternatives. i use the comply foam tips and they improve any set of headphones dramatically in my opinion.


[['they',
  'sounded',
  'they',
  'plenty',
  'the',
  'all',
  'too',
  'well',
  'bluetooth',
  'very',
  'for',
  'me',
  'to',
  'the',
  'right',
  'fit'],
 ['they',
  'sounded',
  'they',
  'plenty',
  'the',
  'all',
  'too',
  'well',
  'bluetooth',
  'very',
  'for',
  'me',
  'to',
  'the',
  'right',
  'fit'],
 ['i',
  'wanted',
  'to',
  'these',
  'headphones',
  'the',
  'blue',
  'buds',
  'x',
  'just',
  'a',
  'hands',
  'winner',
  'the',
  'blue',
  'buds',
  'just',
  'too',
  'many',
  'better',
  'features',
  'to',
  'instead'],
 ['these',
  'freedoms',
  "n't",
  'as',
  'good',
  'sounding',
  'the',
  'ear',
  'hooks',
  'not',
  'as',
  'as',
  'i',
  'was',
  'expecting',
  'i',
  'also',
  'the',
  'blue',
  'buds',
  'smaller'],
 ['i',
  'use',
  'the',
  'comply',
  'foam',
  'tips',
  'they',
  'improve',
  'any',
  'set',
  'dramatically']]

In [26]:
# Display another raw (lower-cased) review.
sentence = original_docs[118]
sentence

"we have tried all of the cords in the pack.  they seem to work a couple of times and then they don't charge any longer. really disappointed!"

BERTopic will give different topic information on different runs because of the stochastic nature of UMAP (Uniform Manifold Approximation and Projection).
UMAP is a dimensionality reduction technique similar to t-SNE.