In [1]:
import sys
sys.path.append('topicx/')

from baselines.cetopictm import CETopicTM
from utils import prepare_dataset
import baseline_utils
from sklearn.feature_extraction.text import CountVectorizer
from octis.dataset.dataset import Dataset
import random 
import pickle

In [2]:
# --- Experiment configuration -------------------------------------------------
# Corpus to evaluate on; selects the sub-folder of resources_octis/ that is loaded.
dataset_name = '20news'

# Topic counts to sweep, and how many models (each with a fresh random seed)
# are trained for every topic count.
list_number_of_topics = [25, 50]
number_topic_models = 2

# Sentence-embedding model; its name is appended to the local SBERT folder
# when the topic model is built.
sbert_model_name = 'all-mpnet-base-v2'

# Word2vec vectors handed to baseline_utils.evaluate as `embeddings_path`.
word2vec_path = (
    '/mnt/datasets/VIST/data/commonsense_ctm_data/'
    'GoogleNews-vectors-negative300.bin.gz'
)


In [3]:
# Corpora that have a preprocessed OCTIS folder under resources_octis/.
supported_datasets = ('20news', 'dbpedia', 'google_news')

# Fail fast on a typo in `dataset_name`. Without this check an unrecognised
# name would skip loading entirely and leave `dataset` with no data, which
# only shows up later as a confusing error further down the notebook.
if dataset_name not in supported_datasets:
    raise ValueError(
        'Unknown dataset_name {!r}; expected one of {}'.format(
            dataset_name, ', '.join(supported_datasets)
        )
    )

# The folder name matches the dataset name, so one call covers every corpus.
dataset = Dataset()
dataset.load_custom_dataset_from_folder("resources_octis/" + dataset_name)

In [4]:
# Turn every document of the corpus into a single space-separated string;
# these strings are what the evaluation step receives as reference texts.
texts = list(map(' '.join, dataset.get_corpus()))

In [5]:

# Output pickle path; it encodes the full experiment configuration so that
# runs with different settings do not overwrite each other.
key_name = (
    f'results/topicx_{dataset_name}'
    f'_NTM_{number_topic_models}'
    f'_ntopics_{list_number_of_topics}'
    f'_embedding_{sbert_model_name}.pkl'
)
print('key_name', key_name)

key_name results/topicx_20news_NTM_2_ntopics_[25, 50]_embedding_all-mpnet-base-v2.pkl


In [6]:

import os
from datetime import datetime


def _save_results(results, path):
    """Pickle `results` to `path`, closing the file even if dumping fails."""
    with open(path, 'wb') as output:
        pickle.dump(results, output)


# Create the output folder up front so the first checkpoint cannot fail
# after a full training run just because the directory is missing.
results_dir = os.path.dirname(key_name)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

start_time = datetime.now()

# One dict of scores per trained model, in training order.
final_results = []
for current_number_of_topics in list_number_of_topics:
    for run_index in range(number_topic_models):
        # A fresh seed per run; it is printed so a run can be reproduced by hand.
        random_seed_number = random.randint(0, 1000)
        print('Random seed number: {}'.format(random_seed_number))

        # dim_size=50: the paper states that sentence embeddings are reduced
        #   to 50 dimensions with UMAP (the library default is 5).
        # word_select_method='tfidf_idfi': best word selection method
        #   according to the paper.
        # embedding: local copy of the SBERT model; all-mpnet-base-v2 is the
        #   default in Raymond's evaluation.
        #   TODO: consider whether a different embedding should be used here.
        tm = CETopicTM(dataset=dataset,
                       topic_model='cetopic',
                       num_topics=current_number_of_topics,
                       dim_size=50,
                       word_select_method='tfidf_idfi',
                       embedding='/mnt/datasets/SBERT/' + sbert_model_name,
                       seed=random_seed_number)

        tm.train()

        # Scoring goes through baseline_utils.evaluate below rather than the
        # model's own tm.evaluate().
        topics = tm.get_topics()
        print(f'Topics: {topics}')

        # `topics` maps a topic id to a list of (keyword, score) pairs;
        # the evaluation only needs the keywords of each topic.
        formatted_topics_list = [
            [keyword for keyword, _score in keyword_scores]
            for keyword_scores in topics.values()
        ]

        npmi_score, we_score, irbo_score, td_score = baseline_utils.evaluate(
            formatted_topics_list, texts, embeddings_path=word2vec_path
        )
        # NOTE(review): a recorded run of this notebook produced NaN for
        # npmi_score and we_score; pandas' mean() skips NaN by default, so
        # such runs silently drop out of averages -- confirm the cause.
        dict_results = {'npmi_score': npmi_score, 'we_score': we_score, 'irbo_score': irbo_score, 'td_score': td_score}
        print(dict_results)
        final_results.append(dict_results)

        # Checkpoint after every model so partial results survive a crash.
        _save_results(final_results, key_name)
        end_time = datetime.now()
        print('Duration: {}'.format(end_time - start_time))

# Final write; also guarantees the file exists when no model was trained.
_save_results(final_results, key_name)




Random seed number: 279
Initialize CETopicTM with num_topics=25, embedding=/mnt/datasets/SBERT/all-mpnet-base-v2
Topics: {0: [('morality', 0.007924123353806092), ('article', 0.007509111950931161), ('people', 0.0072568644277435776), ('objective', 0.006795872524744639), ('think', 0.0064857723446847615), ('dont', 0.005579055479196847), ('just', 0.00529419214581915), ('evidence', 0.004833451631825757), ('say', 0.00440508098814201), ('moral', 0.004306914360134347)], 1: [('radio', 0.007896098725012873), ('tape', 0.007873014203870756), ('battery', 0.0069355483369465416), ('audio', 0.006115110444749674), ('use', 0.005595446324983719), ('cd', 0.005483803179234133), ('sound', 0.005455724644273408), ('circuit', 0.005251386679415793), ('used', 0.00514182991266607), ('like', 0.004965471854358432)], 2: [('israel', 0.026925451825609337), ('jews', 0.02405723074740101), ('israeli', 0.02248659210858313), ('jewish', 0.02048334260402525), ('arab', 0.016779225955958797), ('arabs', 0.01579789717218034), ('m

03/13/2023 21:12:33 - INFO - gensim.corpora.dictionary -   adding document #0 to Dictionary<0 unique tokens: []>
03/13/2023 21:12:33 - INFO - gensim.corpora.dictionary -   adding document #10000 to Dictionary<2000 unique tokens: ['actually', 'beat', 'better', 'bit', 'couple']...>
03/13/2023 21:12:33 - INFO - gensim.corpora.dictionary -   built Dictionary<2000 unique tokens: ['actually', 'beat', 'better', 'bit', 'couple']...> from 18173 documents (total 909509 corpus positions)
03/13/2023 21:12:33 - INFO - gensim.utils -   Dictionary lifecycle event {'msg': "built Dictionary<2000 unique tokens: ['actually', 'beat', 'better', 'bit', 'couple']...> from 18173 documents (total 909509 corpus positions)", 'datetime': '2023-03-13T21:12:33.571359', 'gensim': '4.2.0', 'python': '3.8.10 (default, Jun  2 2021, 10:49:15) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-84-generic-x86_64-with-glibc2.29', 'event': 'created'}
03/13/2023 21:12:33 - INFO - gensim.topic_coherence.probability_estimation -   using

{'npmi_score': 0.15405963128928893, 'we_score': 0.23126341, 'irbo_score': 0.9879595474550714, 'td_score': 0.844}
Duration: 0:03:17.585467
Random seed number: 697
Initialize CETopicTM with num_topics=25, embedding=/mnt/datasets/SBERT/all-mpnet-base-v2
Topics: {0: [('car', 0.02485005365230378), ('cars', 0.012828079260050727), ('ford', 0.010355058261026116), ('engine', 0.010013260313330646), ('oil', 0.007196102123983612), ('turbo', 0.006845764846790278), ('radar', 0.006765386307769), ('miles', 0.006360902056207117), ('dealer', 0.006344312943602546), ('just', 0.00590358781574913)], 1: [('god', 0.015511583406137433), ('jesus', 0.01305898941500947), ('bible', 0.009274604105684491), ('christian', 0.008873507174276806), ('church', 0.008726402506360057), ('christians', 0.008193279114772472), ('christ', 0.007599127440225455), ('homosexual', 0.007551023394144562), ('homosexuality', 0.006436759521264659), ('people', 0.006428295014394069)], 2: [('motif', 0.012558992131972353), ('mouse', 0.012395511

03/13/2023 21:15:35 - INFO - gensim.corpora.dictionary -   adding document #0 to Dictionary<0 unique tokens: []>
03/13/2023 21:15:35 - INFO - gensim.corpora.dictionary -   adding document #10000 to Dictionary<2000 unique tokens: ['actually', 'beat', 'better', 'bit', 'couple']...>
03/13/2023 21:15:35 - INFO - gensim.corpora.dictionary -   built Dictionary<2000 unique tokens: ['actually', 'beat', 'better', 'bit', 'couple']...> from 18173 documents (total 909509 corpus positions)
03/13/2023 21:15:35 - INFO - gensim.utils -   Dictionary lifecycle event {'msg': "built Dictionary<2000 unique tokens: ['actually', 'beat', 'better', 'bit', 'couple']...> from 18173 documents (total 909509 corpus positions)", 'datetime': '2023-03-13T21:15:35.998783', 'gensim': '4.2.0', 'python': '3.8.10 (default, Jun  2 2021, 10:49:15) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-84-generic-x86_64-with-glibc2.29', 'event': 'created'}
03/13/2023 21:15:36 - INFO - gensim.topic_coherence.probability_estimation -   using

{'npmi_score': nan, 'we_score': nan, 'irbo_score': 0.9857340679244285, 'td_score': 0.804}
Duration: 0:06:19.479838
Random seed number: 763
Initialize CETopicTM with num_topics=50, embedding=/mnt/datasets/SBERT/all-mpnet-base-v2
Topics: {0: [('3d', 0.020730406147559624), ('gif', 0.02038720965631712), ('format', 0.014042042407949599), ('graphics', 0.01381713110067996), ('image', 0.0109958181612322), ('program', 0.009332280911726984), ('jpeg', 0.008913333109830548), ('formats', 0.008159629344114096), ('convert', 0.008063052407915916), ('files', 0.007937262158956014)], 1: [('feet', 0.012044894635581118), ('thanks', 0.00872635954656463), ('insurance', 0.00847205906465639), ('face', 0.007730355976120141), ('moment', 0.007690522202122535), ('hands', 0.007534646192751484), ('eye', 0.007409637905844352), ('just', 0.007342477450946603), ('difference', 0.0072148660974439525), ('hi', 0.007056645843660408)], 2: [('israel', 0.029286360263018407), ('israeli', 0.0238553321131165), ('jews', 0.023114909

03/13/2023 21:18:37 - INFO - gensim.corpora.dictionary -   adding document #0 to Dictionary<0 unique tokens: []>
03/13/2023 21:18:37 - INFO - gensim.corpora.dictionary -   adding document #10000 to Dictionary<2000 unique tokens: ['actually', 'beat', 'better', 'bit', 'couple']...>
03/13/2023 21:18:38 - INFO - gensim.corpora.dictionary -   built Dictionary<2000 unique tokens: ['actually', 'beat', 'better', 'bit', 'couple']...> from 18173 documents (total 909509 corpus positions)
03/13/2023 21:18:38 - INFO - gensim.utils -   Dictionary lifecycle event {'msg': "built Dictionary<2000 unique tokens: ['actually', 'beat', 'better', 'bit', 'couple']...> from 18173 documents (total 909509 corpus positions)", 'datetime': '2023-03-13T21:18:38.027401', 'gensim': '4.2.0', 'python': '3.8.10 (default, Jun  2 2021, 10:49:15) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-84-generic-x86_64-with-glibc2.29', 'event': 'created'}
03/13/2023 21:18:38 - INFO - gensim.topic_coherence.probability_estimation -   using

{'npmi_score': 0.14406007614015526, 'we_score': 0.22462593, 'irbo_score': 0.9896593758747638, 'td_score': 0.75}
Duration: 0:09:24.166272
Random seed number: 218
Initialize CETopicTM with num_topics=50, embedding=/mnt/datasets/SBERT/all-mpnet-base-v2


03/13/2023 21:21:42 - INFO - gensim.corpora.dictionary -   adding document #0 to Dictionary<0 unique tokens: []>


Topics: {0: [('orbit', 0.016519368436004035), ('space', 0.01585004520428548), ('shuttle', 0.013614426054165771), ('spacecraft', 0.012991155567141869), ('moon', 0.012522152656216576), ('nasa', 0.012492672681444807), ('lunar', 0.011485244971344583), ('solar', 0.009550690314702464), ('launch', 0.00945790584560128), ('mission', 0.008166505617858932)], 1: [('modem', 0.06519996341369302), ('fax', 0.023263520928746402), ('serial', 0.021782197870994058), ('port', 0.02157875399221169), ('ports', 0.0128566363521248), ('pc', 0.008297261631428887), ('software', 0.008115507254968716), ('mac', 0.006933904673644644), ('connector', 0.006779840390491774), ('external', 0.006708765849828871)], 2: [('atheists', 0.040916410420761995), ('atheist', 0.03524060051169942), ('atheism', 0.032693391510826744), ('god', 0.023805940564354193), ('religion', 0.009415685242574785), ('belief', 0.009258702505998622), ('believe', 0.008873425382078316), ('faith', 0.007979456936350757), ('christianity', 0.007764159882212314)

03/13/2023 21:21:42 - INFO - gensim.corpora.dictionary -   adding document #10000 to Dictionary<2000 unique tokens: ['actually', 'beat', 'better', 'bit', 'couple']...>
03/13/2023 21:21:42 - INFO - gensim.corpora.dictionary -   built Dictionary<2000 unique tokens: ['actually', 'beat', 'better', 'bit', 'couple']...> from 18173 documents (total 909509 corpus positions)
03/13/2023 21:21:42 - INFO - gensim.utils -   Dictionary lifecycle event {'msg': "built Dictionary<2000 unique tokens: ['actually', 'beat', 'better', 'bit', 'couple']...> from 18173 documents (total 909509 corpus positions)", 'datetime': '2023-03-13T21:21:42.828383', 'gensim': '4.2.0', 'python': '3.8.10 (default, Jun  2 2021, 10:49:15) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-84-generic-x86_64-with-glibc2.29', 'event': 'created'}
03/13/2023 21:21:42 - INFO - gensim.topic_coherence.probability_estimation -   using ParallelWordOccurrenceAccumulator<processes=7, batch_size=64> to estimate probabilities from sliding windows
03/

{'npmi_score': 0.15184204075800584, 'we_score': 0.2233993, 'irbo_score': 0.9883366881035277, 'td_score': 0.74}
Duration: 0:12:27.638308


In [7]:
# How to read the results back.
# The triple-quoted string below is an inert snippet (it is not executed);
# paste its contents into a cell to use it. It unpickles the list of per-run
# score dicts stored at `key_name`, shows it as a DataFrame and averages
# each score column.
# Only unpickle files you produced yourself: pickle.load can run arbitrary code.

''' 

with open(key_name, 'rb') as f:
    results_file = pickle.load(f)

import pandas as pd
pd.DataFrame(results_file)
pd.DataFrame(results_file).mean()

'''