# Imports

In [1]:
from words import interesting_, interesting_word_list # Word list if necessary
from cade import TTEC_wrapper, load, save, TTEC, TimeSlice
from concurrent.futures import ProcessPoolExecutor
import logging
import time

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


# Train TTEC
## Train using sequential approach
For this method, you need a list of timestamps and a text file of preprocessed documents.
TTEC then separates the documents by year and automatically creates the time slices.

In [2]:
# Sequential approach: a single TTEC_wrapper call with train_compass_now and
# train_slices_now enabled. Training presumably happens inside the constructor
# (the timer below brackets only that call) -- TODO confirm against cade.

# Basic logging: INFO-and-above records go to log_sequential.txt.
logging.basicConfig(
    filename='log_sequential.txt',
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# List of timestamps, loaded from pickle format.
# NOTE(review): if cade.load unpickles this file, only use it on trusted data.
stamps = load('data/years.pkl')

# perf_counter() is monotonic, so the reported duration cannot be skewed by
# system clock adjustments (NTP sync, DST) during the long training run.
start = time.perf_counter()
wrapper = TTEC_wrapper(
    corpus_file='data/all_preprocessed.txt',
    time_stamps=stamps,
    size=100,
    n_topics=50,
    train_compass_now=True,
    train_slices_now=True,
    min_count=50,
    hdbscan_selection="nested",
    similarity_method="vote",
)
print("Time taken:", time.perf_counter() - start, "seconds")

Time taken: 1323.1047928333282 seconds


In [3]:
# Call reduce_topics with successively smaller targets and print the
# coherence and diversity scores reported by the wrapper after each call.
for n_topics in (50, 40, 30, 20, 10):
    wrapper.reduce_topics(n_topics)
    # Coherence is evaluated before diversity, same order as the printed line.
    coherence = wrapper.test_coherence()
    diversity = wrapper.test_diversity()
    print(f"{n_topics}: coh {coherence} div {diversity}")

50: coh -0.06024703269064913 div 0.9361531122242683
40: coh -0.06480162497217568 div 0.9465820926364688
30: coh -0.06651853685715607 div 0.9616254041698884
20: coh -0.08775501453140756 div 0.9780729566814474
10: coh -0.1531354895126527 div 0.9938194810769568


## Train using parallel approach
For this method, you need a text file with all the preprocessed documents and a series of text files, one per time slice.
TTEC trains the compass first and then trains the time slices in parallel.

In [4]:
def executor_function(year):
    """Train the slice stored in ``data/{year}.txt`` using the global ``ttec``.

    Args:
        year: Identifier interpolated into the slice file name.

    Returns:
        A ``(trained_slice, year)`` tuple: whatever ``ttec.train_slice``
        returns, paired with the input ``year`` so the caller can tell which
        slice each result belongs to.
    """
    slice_path = f'data/{year}.txt'
    # Topic creation is switched off for the per-slice training call.
    trained_slice = ttec.train_slice(slice_path, create_topics=False)
    return trained_slice, year

# List of timestamps, loaded from pickle format.
stamps = load('data/years.pkl')

# perf_counter() is monotonic, so the reported duration cannot be skewed by
# system clock adjustments while the compass and slices are training.
start = time.perf_counter()

# Train the compass once on the full corpus.
ttec = TTEC(hdbscan_selection="nested", n_topics=50, log=True, log_name='log_parallel.log')
ttec.train_compass('data/all_preprocessed.txt')

# Build the wrapper with both training flags off, then attach the compass
# trained above instead of letting the wrapper train its own.
wrapper = TTEC_wrapper(
    corpus_file='data/all_preprocessed.txt',
    time_stamps=stamps,
    size=100,
    n_topics=50,
    train_compass_now=False,
    train_slices_now=False,
    min_count=50,
    hdbscan_selection="nested",
    similarity_method="vote",
)
wrapper.compass = ttec

# Train one slice per worker task; range(7, 16) maps to data/7.txt .. data/15.txt.
# NOTE(review): executor_function reads the module-level `ttec`, so workers
# presumably rely on inheriting it (fork start method) -- confirm on platforms
# that spawn worker processes.
with ProcessPoolExecutor(max_workers=6) as executor:
    for trained_slice, year in executor.map(executor_function, range(7, 16)):
        wrapper.slices[year] = trained_slice
wrapper.remake_slice_embeddings()
print("Time taken:", time.perf_counter() - start, "seconds")

Time taken: 621.3934881687164 seconds


In [5]:
# Call reduce_topics with successively smaller targets and print the
# coherence and diversity scores reported by the wrapper after each call.
for n_topics in (50, 40, 30, 20, 10):
    wrapper.reduce_topics(n_topics)
    # Coherence is evaluated before diversity, same order as the printed line.
    coherence = wrapper.test_coherence()
    diversity = wrapper.test_diversity()
    print(f"{n_topics}: coh {coherence} div {diversity}")

50: coh -0.07374714343547341 div 0.9324240117722551
40: coh -0.06848750635673428 div 0.94896520145362
30: coh -0.06771502821288779 div 0.9591100529133316
20: coh -0.09309892628740647 div 0.9793894074129125
10: coh -0.1278014950072993 div 0.9941041924164288
