# Importing some useful libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reading Dataset from HuggingFace

In [2]:
from datasets import load_dataset

In [3]:
# Load the 'SetFit/20_newsgroups' dataset through the Hugging Face `datasets` library.
dataset = load_dataset('SetFit/20_newsgroups')

Using custom data configuration SetFit--20_newsgroups-f9362e018b6adf67
Found cached dataset json (/home/sysadm/.cache/huggingface/datasets/SetFit___json/SetFit--20_newsgroups-f9362e018b6adf67/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 11314
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 7532
    })
})

In [5]:
type(dataset)

datasets.dataset_dict.DatasetDict

# Sample Data

In [6]:
dataset['train'][0]

{'text': 'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 'label': 7,
 'label_text': 'rec.autos'}

# Checking number of instances of train and test

In [7]:
len(dataset['train'])

11314

In [8]:
len(dataset['test'])

7532

# Creating Train and Test Data

In [9]:
# Collect the raw training texts as a plain Python list.
# Reading the 'text' column in one go avoids re-indexing dataset['train'] and
# decoding a complete row (text, label, label_text) on every loop iteration.
documents_train = list(dataset['train']['text'])

In [10]:
dataset['train'][0]['text']

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [11]:
documents_train[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [12]:
# Collect the raw test texts as a plain Python list.
# Reading the 'text' column in one go avoids re-indexing dataset['test'] and
# decoding a complete row (text, label, label_text) on every loop iteration.
documents_test = list(dataset['test']['text'])

In [13]:
documents_test[0]

'I am a little confused on all of the models of the 88-89 bonnevilles.\nI have heard of the LE SE LSE SSE SSEI. Could someone tell me the\ndifferences are far as features or performance. I am also curious to\nknow what the book value is for prefereably the 89 model. And how much\nless than book value can you usually get them for. In other words how\nmuch are they in demand this time of year. I have heard that the mid-spring\nearly summer is the best time to buy.'

In [14]:
len(documents_train)

11314

In [15]:
len(documents_test)

7532

# Importing what we need

In [16]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
import nltk

# Preprocessing

In [17]:
from nltk.corpus import stopwords as stop_words

In [18]:
# Fetch the NLTK 'stopwords' corpus so the English stop-word list can be read below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/sysadm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# English stop words from the NLTK corpus reader (imported as `stop_words`);
# this list is handed to the preprocessor as `stopwords_list`.
stopwords = list(stop_words.words("english"))

# Building the Model

In [20]:
# Preprocessor over the raw training documents, with the NLTK English stop
# words supplied as `stopwords_list`. Only the training split is preprocessed here.
sp = WhiteSpacePreprocessingStopwords(documents_train, stopwords_list=stopwords)

In [21]:
# Run the preprocessing and unpack its four results.
# `preprocessed_documents` and `unpreprocessed_corpus` feed the data preparation below;
# `vocab` and `retained_indices` are not referenced again in the visible cells.
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()



# Next, we try to identify the conditions that lead to posterior collapse

# Using only "all-mpnet-base-v2" as the pre-trained language model (PLM)

In [22]:
# Data-preparation helper configured with the "all-mpnet-base-v2" model name
# (the PLM mentioned in the heading above).
# NOTE(review): the models below hard-code contextual_size=768 — confirm this
# matches the embedding size of this model.
tpa = TopicModelDataPreparation("all-mpnet-base-v2")

In [23]:
# Build the training dataset shared by every model in this notebook:
# the unpreprocessed text is passed as the contextual input and the
# preprocessed text as the bag-of-words input.
training_dataset1=tpa.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)



Batches:   0%|          | 0/55 [00:00<?, ?it/s]



## Batch Size 32 and learning rate 10^-7

In [24]:
# Sweep A: batch size 32, learning rate 1e-7.
# Every model in the sweep shares these settings; only the number of topics varies,
# so the shared hyperparameters are written once instead of five times.
ctma_shared = dict(bow_size=len(tpa.vocab), contextual_size=768, model_type='prodLDA',
                   dropout=0.2, batch_size=32, lr=1e-7, solver='adam', num_epochs=100)

# One CombinedTM per topic count, built in order:
# ctma1..ctma5 <-> 10, 20, 30, 50, 100 topics.
ctma1, ctma2, ctma3, ctma4, ctma5 = (
    CombinedTM(n_components=n_topics, **ctma_shared)
    for n_topics in (10, 20, 30, 50, 100)
)

In [25]:
# Train the five sweep-A models one after another on the same prepared dataset.
for ctm_model in (ctma1, ctma2, ctma3, ctma4, ctma5):
    ctm_model.fit(training_dataset1)

Epoch: [100/100]	 Seen Samples: [1094400/1095100]	Train Loss: 566.0066612199035	Time: 0:00:11.750636: : 100it [18:58, 11.38s/it]
Sampling: [20/20]: : 20it [01:30,  4.50s/it]
Epoch: [100/100]	 Seen Samples: [1094400/1095100]	Train Loss: 584.5583935118558	Time: 0:00:11.208517: : 100it [20:22, 12.22s/it]
Sampling: [20/20]: : 20it [01:30,  4.55s/it]
Epoch: [100/100]	 Seen Samples: [1094400/1095100]	Train Loss: 583.3190147890682	Time: 0:00:13.486770: : 100it [20:10, 12.11s/it]
Sampling: [20/20]: : 20it [01:38,  4.91s/it]
Epoch: [100/100]	 Seen Samples: [1094400/1095100]	Train Loss: 612.6710064090483	Time: 0:00:11.910460: : 100it [19:45, 11.85s/it]
Sampling: [20/20]: : 20it [01:45,  5.26s/it]
Epoch: [100/100]	 Seen Samples: [1094400/1095100]	Train Loss: 652.338473403663	Time: 0:00:12.320324: : 100it [20:35, 12.36s/it]
Sampling: [20/20]: : 20it [01:36,  4.83s/it]


In [26]:
# Save the preprocessed corpus to disk, one document per line, so that the
# evaluation cells can reload it as plain text.
with open(r'preprocessed_documents.txt', 'w') as corpus_out:
    corpus_out.writelines("%s\n" % doc for doc in preprocessed_documents)
    print('Done')

Done


In [27]:
from contextualized_topic_models.evaluation.measures import CoherenceNPMI
from contextualized_topic_models.evaluation.measures import TopicDiversity

In [28]:
# Reload the saved corpus and split every line on whitespace; the resulting
# list of token lists is what CoherenceNPMI receives as `texts`.
with open('preprocessed_documents.txt', "r") as corpus_in:
    saved_lines = corpus_in.read().splitlines()
texts = [line.split() for line in saved_lines]  # load text for NPMI

In [29]:
# NPMI coherence of the top-10 words per topic for each sweep-A model.
# Each metric object is built and scored before moving on to the next model.
coherence_a = []
for ctm_model in (ctma1, ctma2, ctma3, ctma4, ctma5):
    npmi_metric = CoherenceNPMI(texts=texts, topics=ctm_model.get_topic_lists(10))
    coherence_a.append((npmi_metric, npmi_metric.score(topk=10)))

# Keep the per-model names used by the following cells.
(npmia1, a1), (npmia2, a2), (npmia3, a3), (npmia4, a4), (npmia5, a5) = coherence_a

In [30]:
# Report the coherence score next to the topic count it belongs to.
for n_topics, npmi_score in zip((10, 20, 30, 50, 100), (a1, a2, a3, a4, a5)):
    print(f"for {n_topics} topics, the coherenceNPMI is {npmi_score}")

for 10 topics, the coherenceNPMI is -0.26875313220350555
for 20 topics, the coherenceNPMI is -0.24743953369429547
for 30 topics, the coherenceNPMI is -0.2738088090824495
for 50 topics, the coherenceNPMI is -0.24863943501846283
for 100 topics, the coherenceNPMI is -0.2557335394061254


In [31]:
# Topic diversity over the top-10 words per topic for each sweep-A model.
diversity_a = []
for ctm_model in (ctma1, ctma2, ctma3, ctma4, ctma5):
    diversity_metric = TopicDiversity(topics=ctm_model.get_topic_lists(10))
    diversity_a.append((diversity_metric, diversity_metric.score(topk=10)))

# Keep the per-model names used by the following cells.
(tda1, ta1), (tda2, ta2), (tda3, ta3), (tda4, ta4), (tda5, ta5) = diversity_a

In [32]:
# Report the diversity score next to the topic count it belongs to.
for n_topics, diversity in zip((10, 20, 30, 50, 100), (ta1, ta2, ta3, ta4, ta5)):
    print(f"for {n_topics} topics, the TopicDiversity is {diversity}")

for 10 topics, the TopicDiversity is 0.95
for 20 topics, the TopicDiversity is 0.93
for 30 topics, the TopicDiversity is 0.9133333333333333
for 50 topics, the TopicDiversity is 0.896
for 100 topics, the TopicDiversity is 0.804


In [33]:
# Show the top-5 words of every topic for each sweep-A model,
# with a "////" line separating consecutive models.
for position, ctm_model in enumerate((ctma1, ctma2, ctma3, ctma4, ctma5)):
    if position > 0:
        print("////")
    print(ctm_model.get_topic_lists(k=5))

[['attempt', 'field', 'fight', 'rom', 'many'], ['drivers', 'woman', 'forces', 'meeting', 'interface'], ['robert', 'mailing', 'natural', 'says', 'woman'], ['gets', 'issue', 'fight', 'hey', 'ah'], ['playing', 'uk', 'numbers', 'kept', 'organizations'], ['place', 'zone', 'replies', 'sometimes', 'company'], ['individual', 'hands', 'minnesota', 'mc', 'break'], ['nobody', 'events', 'player', 'feature', 'converter'], ['ftp', 'places', 'begin', 'quality', 'differences'], ['variety', 'distribution', 'hardware', 'mhz', 'prices']]
////
[['supply', 'memory', 'sp', 'air', 'obtained'], ['going', 'transfer', 'eof', 'table', 'impossible'], ['armenian', 'seriously', 'terms', 'willing', 'matter'], ['moved', 'hello', 'assuming', 'tl', 'ray'], ['colors', 'merely', 'atheist', 'oname', 'lead'], ['week', 'bring', 'air', 'appreciated', 'washington'], ['stick', 'best', 'cpu', 'check', 'europe'], ['along', 'eternal', 'pu', 'later', 'tom'], ['saying', 'militia', 'criminal', 'ed', 'max'], ['eye', 'season', 'bible'

## Batch Size 32 and learning rate 15

In [34]:
# Sweep B: batch size 32, learning rate 15.
# Every model in the sweep shares these settings; only the number of topics varies,
# so the shared hyperparameters are written once instead of five times.
ctmb_shared = dict(bow_size=len(tpa.vocab), contextual_size=768, model_type='prodLDA',
                   dropout=0.2, batch_size=32, lr=15, solver='adam', num_epochs=100)

# One CombinedTM per topic count, built in order:
# ctmb1..ctmb5 <-> 10, 20, 30, 50, 100 topics.
ctmb1, ctmb2, ctmb3, ctmb4, ctmb5 = (
    CombinedTM(n_components=n_topics, **ctmb_shared)
    for n_topics in (10, 20, 30, 50, 100)
)

In [35]:
# Train the five sweep-B models one after another on the same prepared dataset.
# NOTE(review): the output recorded for this cell reports `Train Loss: nan` for
# all five runs — confirm how the scores computed from these models should be read.
for ctm_model in (ctmb1, ctmb2, ctmb3, ctmb4, ctmb5):
    ctm_model.fit(training_dataset1)

Epoch: [100/100]	 Seen Samples: [1094400/1095100]	Train Loss: nan	Time: 0:00:11.503402: : 100it [18:33, 11.13s/it]            
Sampling: [20/20]: : 20it [01:20,  4.02s/it]
Epoch: [100/100]	 Seen Samples: [1094400/1095100]	Train Loss: nan	Time: 0:00:10.919300: : 100it [18:54, 11.34s/it]            
Sampling: [20/20]: : 20it [01:31,  4.57s/it]
Epoch: [100/100]	 Seen Samples: [1094400/1095100]	Train Loss: nan	Time: 0:00:12.381671: : 100it [19:12, 11.52s/it]            
Sampling: [20/20]: : 20it [01:35,  4.76s/it]
Epoch: [100/100]	 Seen Samples: [1094400/1095100]	Train Loss: nan	Time: 0:00:12.031894: : 100it [18:53, 11.34s/it]            
Sampling: [20/20]: : 20it [01:42,  5.11s/it]
Epoch: [100/100]	 Seen Samples: [1094400/1095100]	Train Loss: nan	Time: 0:00:13.074626: : 100it [20:46, 12.47s/it]
Sampling: [20/20]: : 20it [01:47,  5.38s/it]


In [36]:
# NPMI coherence of the top-10 words per topic for each sweep-B model.
# Each metric object is built and scored before moving on to the next model.
coherence_b = []
for ctm_model in (ctmb1, ctmb2, ctmb3, ctmb4, ctmb5):
    npmi_metric = CoherenceNPMI(texts=texts, topics=ctm_model.get_topic_lists(10))
    coherence_b.append((npmi_metric, npmi_metric.score(topk=10)))

# Keep the per-model names used by the following cells.
(npmib1, b1), (npmib2, b2), (npmib3, b3), (npmib4, b4), (npmib5, b5) = coherence_b

In [37]:
# Report the coherence score next to the topic count it belongs to.
for n_topics, npmi_score in zip((10, 20, 30, 50, 100), (b1, b2, b3, b4, b5)):
    print(f"for {n_topics} topics, the coherenceNPMI is {npmi_score}")

for 10 topics, the coherenceNPMI is 0.13694446572443153
for 20 topics, the coherenceNPMI is -0.08294739701591167
for 30 topics, the coherenceNPMI is 0.10902113721593902
for 50 topics, the coherenceNPMI is 0.0264843100851418
for 100 topics, the coherenceNPMI is 0.062197701543839135


In [38]:
# Topic diversity over the top-10 words per topic for each sweep-B model.
diversity_b = []
for ctm_model in (ctmb1, ctmb2, ctmb3, ctmb4, ctmb5):
    diversity_metric = TopicDiversity(topics=ctm_model.get_topic_lists(10))
    diversity_b.append((diversity_metric, diversity_metric.score(topk=10)))

# Keep the per-model names used by the following cells.
(tdb1, tb1), (tdb2, tb2), (tdb3, tb3), (tdb4, tb4), (tdb5, tb5) = diversity_b

In [39]:
# Report the diversity score next to the topic count it belongs to.
for n_topics, diversity in zip((10, 20, 30, 50, 100), (tb1, tb2, tb3, tb4, tb5)):
    print(f"for {n_topics} topics, the TopicDiversity is {diversity}")

for 10 topics, the TopicDiversity is 0.9
for 20 topics, the TopicDiversity is 0.585
for 30 topics, the TopicDiversity is 0.7933333333333333
for 50 topics, the TopicDiversity is 0.5
for 100 topics, the TopicDiversity is 0.404


In [40]:
# Show the top-5 words of every topic for each sweep-B model,
# with a "////" line separating consecutive models.
for position, ctm_model in enumerate((ctmb1, ctmb2, ctmb3, ctmb4, ctmb5)):
    if position > 0:
        print("////")
    print(ctm_model.get_topic_lists(k=5))

[['team', 'year', 'game', 'nd', 'games'], ['mb', 'drive', 'fine', 'cable', 'card'], ['ax', 'mv', 'mw', 'cx', 'db'], ['md', 'ww', 'qs', 'sk', 'uw'], ['clipper', 'use', 'chip', 'would', 'encryption'], ['people', 'would', 'israeli', 'israel', 'president'], ['god', 'jesus', 'church', 'christian', 'believe'], ['file', 'edu', 'com', 'window', 'program'], ['giz', 'qax', 'bhj', 'max', 'ey'], ['people', 'would', 'gun', 'car', 'said']]
////
[['shared', 'modified', 'sending', 'distributed', 'variety'], ['one', 'people', 'would', 'use', 'key'], ['merely', 'implementation', 'modified', 'applied', 'caught'], ['length', 'implementation', 'variety', 'ok', 'obtained'], ['drive', 'ax', 'would', 'think', 'god'], ['thanks', 'modified', 'responses', 'please', 'hello'], ['thanks', 'anyone', 'mb', 'please', 'hi'], ['thanks', 'accurate', 'applied', 'chicago', 'rangers'], ['people', 'god', 'would', 'church', 'bible'], ['gun', 'us', 'government', 'law', 'israel'], ['thanks', 'variety', 'packages', 'modified', '

## Batch size 1024 and learning rate 1e-7

In [42]:
# Sweep C: batch size 1024, learning rate 1e-7.
# Every model in the sweep shares these settings; only the number of topics varies,
# so the shared hyperparameters are written once instead of five times.
ctmc_shared = dict(bow_size=len(tpa.vocab), contextual_size=768, model_type='prodLDA',
                   dropout=0.2, batch_size=1024, lr=1e-7, solver='adam', num_epochs=100)

# One CombinedTM per topic count, built in order:
# ctmc1..ctmc5 <-> 10, 20, 30, 50, 100 topics.
ctmc1, ctmc2, ctmc3, ctmc4, ctmc5 = (
    CombinedTM(n_components=n_topics, **ctmc_shared)
    for n_topics in (10, 20, 30, 50, 100)
)

In [43]:
# Train the five sweep-C models one after another on the same prepared dataset.
for ctm_model in (ctmc1, ctmc2, ctmc3, ctmc4, ctmc5):
    ctm_model.fit(training_dataset1)

Epoch: [100/100]	 Seen Samples: [1024000/1095100]	Train Loss: 358410554.73821104	Time: 0:00:02.918322: : 100it [04:55,  2.95s/it]
Sampling: [20/20]: : 20it [00:37,  1.87s/it]
Epoch: [100/100]	 Seen Samples: [1024000/1095100]	Train Loss: 10729534703.688074	Time: 0:00:02.552770: : 100it [04:48,  2.88s/it]
Sampling: [20/20]: : 20it [00:37,  1.90s/it]
Epoch: [100/100]	 Seen Samples: [1024000/1095100]	Train Loss: 22782043896.925888	Time: 0:00:02.999342: : 100it [04:27,  2.67s/it]
Sampling: [20/20]: : 20it [00:38,  1.93s/it]
Epoch: [100/100]	 Seen Samples: [1024000/1095100]	Train Loss: 32449287987.266285	Time: 0:00:02.751385: : 100it [04:49,  2.89s/it]
Sampling: [20/20]: : 20it [00:39,  1.95s/it]
Epoch: [100/100]	 Seen Samples: [1024000/1095100]	Train Loss: 39046484944.59589	Time: 0:00:02.786825: : 100it [05:03,  3.03s/it]
Sampling: [20/20]: : 20it [00:42,  2.12s/it]


In [44]:
# NPMI coherence of the top-10 words per topic for each sweep-C model.
# Each metric object is built and scored before moving on to the next model.
coherence_c = []
for ctm_model in (ctmc1, ctmc2, ctmc3, ctmc4, ctmc5):
    npmi_metric = CoherenceNPMI(texts=texts, topics=ctm_model.get_topic_lists(10))
    coherence_c.append((npmi_metric, npmi_metric.score(topk=10)))

# Keep the per-model names used by the following cells.
(npmic1, c1), (npmic2, c2), (npmic3, c3), (npmic4, c4), (npmic5, c5) = coherence_c

In [45]:
# Report the coherence score next to the topic count it belongs to.
for n_topics, npmi_score in zip((10, 20, 30, 50, 100), (c1, c2, c3, c4, c5)):
    print(f"for {n_topics} topics, the coherenceNPMI is {npmi_score}")

for 10 topics, the coherenceNPMI is -0.23494672797729726
for 20 topics, the coherenceNPMI is -0.2590431524098522
for 30 topics, the coherenceNPMI is -0.24916408423010594
for 50 topics, the coherenceNPMI is -0.2569681246550537
for 100 topics, the coherenceNPMI is -0.2621705790517895


In [46]:
# Topic diversity over the top-10 words per topic for each sweep-C model.
diversity_c = []
for ctm_model in (ctmc1, ctmc2, ctmc3, ctmc4, ctmc5):
    diversity_metric = TopicDiversity(topics=ctm_model.get_topic_lists(10))
    diversity_c.append((diversity_metric, diversity_metric.score(topk=10)))

# Keep the per-model names used by the following cells.
(tdc1, tc1), (tdc2, tc2), (tdc3, tc3), (tdc4, tc4), (tdc5, tc5) = diversity_c

In [47]:
# Report the diversity score next to the topic count it belongs to.
for n_topics, diversity in zip((10, 20, 30, 50, 100), (tc1, tc2, tc3, tc4, tc5)):
    print(f"for {n_topics} topics, the TopicDiversity is {diversity}")

for 10 topics, the TopicDiversity is 0.98
for 20 topics, the TopicDiversity is 0.935
for 30 topics, the TopicDiversity is 0.94
for 50 topics, the TopicDiversity is 0.906
for 100 topics, the TopicDiversity is 0.792


In [48]:
# Show the top-5 words of every topic for each sweep-C model,
# with a "////" line separating consecutive models.
for position, ctm_model in enumerate((ctmc1, ctmc2, ctmc3, ctmc4, ctmc5)):
    if position > 0:
        print("////")
    print(ctm_model.get_topic_lists(k=5))

[['later', 'sk', 'double', 'final', 'new'], ['apartment', 'enter', 'process', 'end', 'genocide'], ['model', 'alive', 'port', 'alternative', 'written'], ['cost', 'trade', 'treatment', 'start', 'japanese'], ['decide', 'sale', 'sgi', 'features', 'early'], ['encrypted', 'george', 'normally', 'facts', 'mm'], ['dx', 'reasonable', 'pgp', 'status', 'across'], ['clients', 'bs', 'normal', 'defense', 'coming'], ['stated', 'id', 'field', 'realize', 'start'], ['mode', 'concept', 'differences', 'message', 'quickly']]
////
[['pointed', 'change', 'half', 'felt', 'pitt'], ['input', 'live', 'experience', 'com', 'eq'], ['support', 'rule', 'wrote', 'kind', 'meant'], ['pretty', 'simms', 'kept', 'drivers', 'letter'], ['intended', 'ide', 'rom', 'game', 'internal'], ['apr', 'alive', 'compatible', 'article', 'purpose'], ['yd', 'science', 'certainly', 'sin', 'kn'], ['ripem', 'mentioned', 'group', 'everybody', 'software'], ['mentioned', 'heard', 'rl', 'deleted', 'went'], ['citizens', 'library', 'standards', 'wro

## Batch size 1024 and learning rate 15

In [49]:
# Sweep D: batch size 1024, learning rate 15.
# Every model in the sweep shares these settings; only the number of topics varies,
# so the shared hyperparameters are written once instead of five times.
ctmd_shared = dict(bow_size=len(tpa.vocab), contextual_size=768, model_type='prodLDA',
                   dropout=0.2, batch_size=1024, lr=15, solver='adam', num_epochs=100)

# One CombinedTM per topic count, built in order:
# ctmd1..ctmd5 <-> 10, 20, 30, 50, 100 topics.
ctmd1, ctmd2, ctmd3, ctmd4, ctmd5 = (
    CombinedTM(n_components=n_topics, **ctmd_shared)
    for n_topics in (10, 20, 30, 50, 100)
)

In [50]:
# Train the five sweep-D models one after another on the same prepared dataset.
for ctm_model in (ctmd1, ctmd2, ctmd3, ctmd4, ctmd5):
    ctm_model.fit(training_dataset1)

Epoch: [100/100]	 Seen Samples: [1024000/1095100]	Train Loss: 539.4370819091797	Time: 0:00:02.948394: : 100it [04:46,  2.86s/it]
Sampling: [20/20]: : 20it [00:38,  1.93s/it]
Epoch: [100/100]	 Seen Samples: [1024000/1095100]	Train Loss: 557.857666015625	Time: 0:00:02.272464: : 100it [04:41,  2.82s/it]
Sampling: [20/20]: : 20it [00:34,  1.71s/it]
Epoch: [100/100]	 Seen Samples: [1024000/1095100]	Train Loss: 530.77490234375	Time: 0:00:02.985612: : 100it [04:55,  2.96s/it] 
Sampling: [20/20]: : 20it [00:41,  2.06s/it]
Epoch: [100/100]	 Seen Samples: [1024000/1095100]	Train Loss: 593.3708618164062	Time: 0:00:02.974825: : 100it [04:57,  2.98s/it]
Sampling: [20/20]: : 20it [00:41,  2.07s/it]
Epoch: [100/100]	 Seen Samples: [1024000/1095100]	Train Loss: 822.5695251464844	Time: 0:00:02.920138: : 100it [05:08,  3.09s/it]
Sampling: [20/20]: : 20it [00:46,  2.34s/it]


In [51]:
# NPMI coherence of the top-10 words per topic for each sweep-D model.
# Each metric object is built and scored before moving on to the next model.
coherence_d = []
for ctm_model in (ctmd1, ctmd2, ctmd3, ctmd4, ctmd5):
    npmi_metric = CoherenceNPMI(texts=texts, topics=ctm_model.get_topic_lists(10))
    coherence_d.append((npmi_metric, npmi_metric.score(topk=10)))

# Keep the per-model names used by the following cells.
(npmid1, d1), (npmid2, d2), (npmid3, d3), (npmid4, d4), (npmid5, d5) = coherence_d

In [52]:
# Report the coherence score next to the topic count it belongs to.
for n_topics, npmi_score in zip((10, 20, 30, 50, 100), (d1, d2, d3, d4, d5)):
    print(f"for {n_topics} topics, the coherenceNPMI is {npmi_score}")

for 10 topics, the coherenceNPMI is -0.12208717555784236
for 20 topics, the coherenceNPMI is -0.17089890763678858
for 30 topics, the coherenceNPMI is -0.11367217633630475
for 50 topics, the coherenceNPMI is -0.087623773175991
for 100 topics, the coherenceNPMI is -0.14701510828828013


In [53]:
# Topic diversity over the top-10 words per topic for each sweep-D model.
diversity_d = []
for ctm_model in (ctmd1, ctmd2, ctmd3, ctmd4, ctmd5):
    diversity_metric = TopicDiversity(topics=ctm_model.get_topic_lists(10))
    diversity_d.append((diversity_metric, diversity_metric.score(topk=10)))

# Keep the per-model names used by the following cells.
(tdd1, td1), (tdd2, td2), (tdd3, td3), (tdd4, td4), (tdd5, td5) = diversity_d

In [54]:
# Report the diversity score next to the topic count it belongs to.
for n_topics, diversity in zip((10, 20, 30, 50, 100), (td1, td2, td3, td4, td5)):
    print(f"for {n_topics} topics, the TopicDiversity is {diversity}")

for 10 topics, the TopicDiversity is 0.75
for 20 topics, the TopicDiversity is 0.73
for 30 topics, the TopicDiversity is 0.7066666666666667
for 50 topics, the TopicDiversity is 0.598
for 100 topics, the TopicDiversity is 0.497


In [55]:
# Show the top-5 words of every topic for each sweep-D model,
# with a "////" line separating consecutive models.
for position, ctm_model in enumerate((ctmd1, ctmd2, ctmd3, ctmd4, ctmd5)):
    if position > 0:
        print("////")
    print(ctm_model.get_topic_lists(k=5))

[['ice', 'mary', 'rangers', 'year', 'minnesota'], ['detroit', 'york', 'conference', 'wings', 'minor'], ['player', 'subject', 'committee', 'van', 'minnesota'], ['ax', 'max', 'pl', 'ei', 'ey'], ['play', 'year', 'boston', 'det', 'cs'], ['tor', 'buf', 'mary', 'wings', 'chi'], ['pittsburgh', 'cal', 'expansion', 'green', 'mon'], ['expansion', 'obtained', 'la', 'wings', 'flames'], ['final', 'york', 'hall', 'season', 'address'], ['detroit', 'season', 'cup', 'players', 'mary']]
////
[['card', 'drives', 'disk', 'problem', 'twice'], ['mind', 'ie', 'space', 'far', 'traffic'], ['max', 'love', 'bh', 'resources', 'somewhere'], ['living', 'paul', 'black', 'pittsburgh', 'somewhere'], ['max', 'including', 'soviet', 'modem', 'technology'], ['whether', 'death', 'fire', 'robert', 'innocent'], ['technology', 'historical', 'stephanopoulos', 'jobs', 'comments'], ['push', 'living', 'last', 'carry', 'japanese'], ['lack', 'card', 'death', 'nl', 'drives'], ['order', 'source', 'tar', 'version', 'sources'], ['reaso

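### Of these four combinations, the last one (high learning rate and large batch size) gives us topics with low coherence (the NPMI is negative for every number of topics) and much lower topic diversity than the two low-learning-rate runs (roughly 0.50–0.75 versus 0.79–0.98). Note that the low-learning-rate runs also have negative NPMI, so the drop in diversity is the clearer signal. We can therefore conclude that we have, to some extent, achieved posterior collapse: in posterior collapse the latent structure is not learned during training, so the model gives us topics that are more or less similar, i.e. not diverse.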

## Posterior Collapse Condition: 
1. High Learning Rate
2. Large Batch Size