In [6]:
import json

with open('0-texts-ctm.json', 'r') as f:
    docs = json.load(f)

In [None]:
from collections import Counter

with open('0-nouns_adj.json', 'r') as f:
    documents = json.load(f)

docs_flat = [item for doc in documents for item in doc] # flatten
c = Counter(docs_flat)
most_common = [key for key, _ in c.most_common()[:25]] # 25 most common words
least_common = [key for key, value in c.most_common() if value == 1] # occur only once

stoplist = most_common + least_common + ['нью', 'по']

docs = [[word for word in doc if len(word) > 2 and word not in stoplist] \
        for doc in documents]

In [10]:
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

sp = WhiteSpacePreprocessingStopwords(docs, 'russian')
preprocessed_documents, unpreprocessed_documents, vocab = sp.preprocess()

In [13]:
# Zero-Shot
from contextualized_topic_models.models.ctm import ZeroShotTM

# Initialize a contextualized model
qt_z = TopicModelDataPreparation('DeepPavlov/rubert-base-cased') # BERT model

# Create the training set
training_dataset_z = qt_z.fit(text_for_contextual=unpreprocessed_documents, \
                              text_for_bow=preprocessed_documents)

# Train the model
num_topics = 10
ctm_zero = ZeroShotTM(bow_size=len(qt_z.vocab), contextual_size=768, \
                      n_components=num_topics, num_epochs=50)

Some weights of the model checkpoint at C:\Users\satan/.cache\torch\sentence_transformers\DeepPavlov_rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

In [14]:
# Run the model
ctm_zero.fit(training_dataset_z)

Epoch: [50/50]	 Seen Samples: [91200/91200]	Train Loss: 5303.0183747944075	Time: 0:00:16.225645: : 50it [13:43, 16.48s/it]
Sampling: [20/20]: : 20it [05:16, 15.83s/it]


In [15]:
ctm_zero.get_topics(15)

defaultdict(list,
            {0: ['возраст',
              'источник',
              'млн',
              'авторов',
              'содержание',
              'современного',
              'прошлом',
              'возраста',
              'углерода',
              'видов',
              'считают',
              'морские',
              'океана',
              'привело',
              'показало'],
             1: ['кто',
              'литературе',
              'русские',
              'власти',
              'xviii',
              'подробно',
              'половине',
              'своем',
              'нами',
              'языке',
              'живет',
              'настолько',
              'начал',
              'нас',
              'автор'],
             2: ['сам',
              'литературе',
              'xviii',
              'такое',
              'русские',
              'язык',
              'общества',
              'начал',
              'власти',
              'под

In [None]:
ctm_zero.get_wordcloud(topic_id=4, n_words=15)

In [12]:
# Combined TM
from contextualized_topic_models.models.ctm import CombinedTM

# Initialize a contextualized model
qt_c = TopicModelDataPreparation("Tatyana/rubert-base-cased-sentiment-new") # BERT model

# Create the training set
training_dataset_c = qt_c.fit(text_for_contextual=unpreprocessed_documents, \
                              text_for_bow=preprocessed_documents)

# Train the model
num_topics = 20
ctm_comb = CombinedTM(bow_size=len(qt_c.vocab), contextual_size=768, \
                      n_components=num_topics, num_epochs=50)

Some weights of the model checkpoint at C:\Users\satan/.cache\torch\sentence_transformers\Tatyana_rubert-base-cased-sentiment-new were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [13]:
# Run the model
ctm_comb.fit(training_dataset_c)

Epoch: [50/50]	 Seen Samples: [69100/69100]	Train Loss: 3137.0402496382053	Time: 0:00:16.800953: : 50it [14:06, 16.93s/it]
Sampling: [20/20]: : 20it [05:33, 16.66s/it]


In [None]:
from pprint import pprint

pprint(ctm_comb.get_topics(15))