In [1]:
import random

import numpy as np
import pandas as pd
import torch
from contextualized_topic_models.models.ctm import ZeroShotTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
from nltk.corpus import stopwords

In [2]:
import os
import warnings

# Suppress every warning category for the rest of the session. The explicit
# DeprecationWarning entry is already covered by the blanket filter (it is a
# subclass of Warning) and is kept only so the installed filters are unchanged.
warnings.filterwarnings(action="ignore")
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# NOTE(review): presumably read by the Hugging Face tokenizers backend to turn
# off its internal parallelism — confirm against the library in use.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [3]:
# English stop-word list from NLTK, extended with "said" as one extra entry.
stop_words = [*stopwords.words("english"), "said"]

# Load the BBC articles; the path is relative to the notebook's working directory.
bbc_df = pd.read_csv("../data/bbc-text.csv")

In [4]:
# The "text" column of the loaded frame is the corpus handed to the preprocessor.
documents = bbc_df["text"]

preprocessor = WhiteSpacePreprocessingStopwords(
    documents,
    stopwords_list=stop_words,
)

# preprocess() yields four values. NOTE(review): presumably the cleaned texts,
# the matching original texts, the vocabulary and the indices of the retained
# documents — confirm against the library documentation.
(
    preprocessed_documents,
    unpreprocessed_documents,
    vocab,
    indices,
) = preprocessor.preprocess()

In [5]:
# Data-preparation helper configured with the multilingual sentence-encoder name.
tp = TopicModelDataPreparation("distiluse-base-multilingual-cased")

# The preprocessed texts feed the bag-of-words side and the unpreprocessed texts
# feed the contextual side, as the keyword names indicate.
training_dataset = tp.fit(
    text_for_bow=preprocessed_documents,
    text_for_contextual=unpreprocessed_documents,
)

W0125 22:43:11.488000 14712 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

In [6]:
# Seed every random number generator the model may draw from before it is built
# and trained. Without this, a fresh Restart & Run All produces different topics
# than the ones recorded in the outputs below.
# NOTE(review): on GPU some kernels may still be non-deterministic — confirm if
# bit-exact reproducibility is required.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# bow_size follows the vocabulary held by the data-preparation object.
# contextual_size=512 is assumed to match the embedding width of the encoder
# chosen when `tp` was created — TODO confirm if the encoder is changed.
# n_components is the number of topics; num_epochs the number of training passes.
ctm = ZeroShotTM(bow_size=len(tp.vocab), contextual_size=512, n_components=5, num_epochs=100)
ctm.fit(training_dataset)

Epoch: [100/100]	 Seen Samples: [217600/222500]	Train Loss: 1077.365846521714	Time: 0:00:21.579533: : 100it [36:06, 21.66s/it]
100%|██████████| 35/35 [00:21<00:00,  1.60it/s]


In [7]:
# Display the learned topics; the recorded output maps each topic index to a list of words.
ctm.get_topics()

defaultdict(list,
            {0: ['side',
              'matches',
              'defeat',
              'final',
              'back',
              'improved',
              'semi',
              'captain',
              'row',
              'transfer'],
             1: ['bn',
              'us',
              'economy',
              'market',
              'oil',
              'bank',
              'economic',
              'sales',
              'year',
              'china'],
             2: ['mr',
              'would',
              'blair',
              'labour',
              'brown',
              'election',
              'party',
              'government',
              'tax',
              'secretary'],
             3: ['technology',
              'people',
              'computer',
              'mobile',
              'content',
              'music',
              'digital',
              'net',
              'video',
              'microsoft'],
             4: ['aw

In [8]:
# A Spanish-language news snippet used as an unseen test document; the training
# corpus loaded earlier is the BBC text file, so this exercises the model on
# text in a different language.
spanish_news_piece = """IBM anuncia el comienzo de la “era de la utilidad cuántica” y anticipa un superordenador en 2033. 
La compañía asegura haber alcanzado un sistema de computación que no se puede simular con procedimientos clásicos."""
# Only the raw text is passed (as a one-element list); no separate bag-of-words text is supplied.
testing_dataset = tp.transform([spanish_news_piece])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# Infer the topic mixture of the test document; the recorded output is a float32
# array with one row for the single document and one value per topic (5 here).
ctm.get_doc_topic_distribution(testing_dataset)

100%|██████████| 1/1 [00:03<00:00,  3.97s/it]


array([[0.08576978, 0.09846543, 0.17105791, 0.55924577, 0.08546114]],
      dtype=float32)