In [1]:
# Mount your google drive in google colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Insert the directory
import sys
sys.path.insert(0,'/content/drive/My Drive/CMPUT 701 - Shraddha/Experiments - Code')

In [None]:
!pip install setuptools~=67.6.0
!pip install spacy~=3.5.0
!pip install numpy~=1.21.5
!pip install gensim~=4.1.2
!pip install networkx~=2.8.4
!pip install tomotopy
!pip install bertopic
!pip install igraph

In [3]:
import numpy as np
import network_creation
from gensim.models.phrases import Phraser, ENGLISH_CONNECTOR_WORDS
import preprocessing
import community_utils
import tomotopy as tp
import networkx as nx
import igraph as ig
from gensim.models.coherencemodel import CoherenceModel
from diversity_metrics import *
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel




In [4]:
with open("/content/drive/My Drive/CMPUT 701 - Shraddha/Experiments - Code/text_datasets/europarl_fr_train.txt", "r", encoding="utf-8") as f:
    bbc_train = f.read().split("\n")
with open("/content/drive/My Drive/CMPUT 701 - Shraddha/Experiments - Code/text_datasets/europarl_fr_test.txt", "r", encoding="utf-8") as f:
    bbc_test = f.read().split("\n")

In [47]:
import pickle
with open("/content/drive/My Drive/CMPUT 701 - Shraddha/Experiments - Code/ep_es_master_object.obj", "rb") as f:
      master_object = pickle.load(f)

In [5]:
# reference - https://github.com/MaartenGr/BERTopic/issues/90
"""
MaartenGr commented on Apr 15, 2021
Good catch, I did not test for higher n-grams in the example. I made two changes:

Used the build_analyzer() instead of build_tokenizer() which allows for n-gram tokenization
Preprocessing is now based on a collection of documents per topic, since the CountVectorizer was trained on that data
Tested it with several ranges of n-grams and it seems to work now.
"""

topic_model = BERTopic(language="multilingual", verbose=True, top_n_words=10)
topics, _ = topic_model.fit_transform(bbc_train)

Downloading (…)0fe39/.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)83e900fe39/README.md:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading (…)e900fe39/config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading (…)900fe39/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/594 [00:00<?, ?it/s]

2023-03-19 21:55:04,724 - BERTopic - Transformed documents to Embeddings
2023-03-19 21:55:40,499 - BERTopic - Reduced dimensionality
2023-03-19 21:55:43,798 - BERTopic - Clustered reduced embeddings


In [6]:
print("Num of topic: ", len(set(topics)))

Num of topic:  218


In [7]:
topic_words_list = []

for i in range(len(set(topics))-2):
  single_topic = []
  topic = topic_model.get_topic(i)  # select the most frequent topic
  for t in topic:
    single_topic.append(t[0])
  topic_words_list.append(single_topic)

print(topic_words_list)

[['europe', 'européenne', 'union', 'ue', 'citoyens', 'plus', 'une', 'doit', 'européens', 'et'], ['européen', 'parlement', 'européenne', 'commission', 'conseil', 'union', 'le', 'européens', 'la', 'du'], ['président', 'présidente', 'monsieur', 'madame', 'débat', 'chers', 'messieurs', 'je', 'voudrais', 'mesdames'], ['femmes', 'hommes', 'égalité', 'participation', 'sexes', 'femme', 'quotas', 'entre', 'elles', 'les'], ['véhicules', 'voitures', 'recyclage', 'automobile', 'automobiles', 'constructeurs', 'voiture', 'industrie', 'déchets', 'usage'], ['amendements', 'amendement', 'oral', 'proposition', 'déposé', '22', 'commission', 'groupe', '13', '45'], ['alimentaire', 'animaux', 'aliments', 'alimentation', 'additifs', 'génétiquement', 'modifiés', 'sécurité', 'ogm', 'autorité'], ['autriche', 'autrichien', 'parti', 'gouvernement', 'autrichienne', 'autrichiens', 'coalition', 'haider', 'fpö', 'italien'], ['kosovo', 'serbes', 'albanais', 'milosevic', 'ethnique', 'kosovars', 'au', 'otan', 'situation

In [8]:
import pandas as pd                     

# Preprocess Documents
documents = pd.DataFrame({"Document": bbc_train,
                          "ID": range(len(bbc_train)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

In [9]:
# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

In [10]:
# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

In [11]:
print("puw:", proportion_unique_words(topic_words, topk=10))
print("jd:", pairwise_jaccard_diversity(topic_words, topk=10))
print("irbo p=0.5:", irbo(topic_words, weight=0.5, topk=10))
print("irbo p=0.9:", irbo(topic_words, weight=0.9, topk=10))


for coherence in ["c_v", "c_npmi", "u_mass"]:
  for topn in [5, 10, 20]:
    cm = CoherenceModel(topics=topic_words,
                        texts=tokens,
                        dictionary=dictionary,
                        topn=topn,
                        coherence=coherence)
    score = cm.get_coherence()
    print(coherence, ":", score, "(topn=", topn, ")")


puw: 0.7534562211981567
jd: 0.997758161586168
irbo p=0.5: 0.9982551100303386
irbo p=0.9: 0.996773229300527
c_v : 0.7207397847940442 (topn= 5 )
c_v : 0.6172792949489032 (topn= 10 )
c_v : 0.6172792949489032 (topn= 20 )
c_npmi : 0.09519691197925455 (topn= 5 )
c_npmi : -0.03539229360803869 (topn= 10 )
c_npmi : -0.03539229360803869 (topn= 20 )
u_mass : -1.169444899849656 (topn= 5 )
u_mass : -1.213997014555042 (topn= 10 )
u_mass : -1.213997014555042 (topn= 20 )


In [None]:
#----------------------------------------
# Next section is just trial and error for hierarchical BerTopic

In [None]:
topic_model.get_topics()

In [None]:
 # Evaluate metric(s)
topic_list = []
word_score_list = []

# Iterate over topics to create nested list of topics
for i in topic_model.get_topic_info()['Topic']:
  single_topic_list = []
  single_word_score_list = []
  for elem in topic_model.get_topic(i):
    single_topic_list.append(elem[0])
    single_word_score_list.append(elem[1])

  topic_list.append(single_topic_list)
  word_score_list.append(single_word_score_list)

In [None]:
print(topic_list)
print(len(topic_list))
print(word_score_list)
print(len(word_score_list))

In [None]:
hierarchical_topics = topic_model.hierarchical_topics(bbc_train)


In [None]:
hierarchical_topics

In [None]:
hierarchical_topics.Topics.values

In [None]:
topic_model.get_topic_tree(hierarchical_topics)

In [None]:
pip install corextopic
