# Neural Topic Modeling

##  English version

First of all, we can check how much GPU memory is available to us.

In [None]:
import GPUtil
# List every GPU reported by GPUtil together with its free/total memory (MB)
# and its memory utilisation expressed as a percentage.
GPUs = GPUtil.getGPUs()
gpu_report = 'GPU {:d} ... Mem Free: {:.0f}MB / {:.0f}MB | Utilization {:3.0f}%'
for gpu_index, device in enumerate(GPUs):
    utilisation_percent = device.memoryUtil*100
    print(gpu_report.format(gpu_index, device.memoryFree, device.memoryTotal, utilisation_percent))

During the first run, we clone the repository kindly made public by the authors of the paper.

In [None]:
# !git clone https://github.com/ahoho/kd-topic-models.git

Now, let's create the environments for the teacher and the student.

In [None]:
!conda env create -f /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/teacher/teacher.yml

In [None]:
!conda env create -f /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/scholar/scholar.yml

### Preprocessing 20NG dataset
We perform the same preprocessing steps as the authors of the paper.

In [None]:
%cd /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng
!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/1_convert_prodlda_to_txt_py27.py

In [None]:
! python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/2_convert_txt_to_scholar_format_py3.py

In [None]:
!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/3_replicate_and_align_raw_data.py

In [None]:
!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/4_create_dev_sets.py

In [None]:
!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/5_create_aligned_dev_set.py

In [None]:
!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/6_create_raw_text_file.py /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/replicated

### LDA

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer 
from gensim.corpora.dictionary import Dictionary 
nltk.download("stopwords") 
nltk.download('punkt')
nltk.download('wordnet')

def preprocess_documents(dataset, dictionary = None, language = 'english'):
    """Tokenize, filter and lemmatize the documents, then map them to bag-of-words vectors.

    Parameters
    ----------
    - dataset: List of Strings (each String is a document of the dataset)
    - dictionary: Dictionary, default None (for training, leave it to None so that the dictionary is
                                            extracted from the corpus. For test data, pass the dictionary
                                            computed on the training set.)
    - language: String, default "english" (language of the dataset, either "english" or "italian")
    Output
    ------
    - dictionary: Dictionary (dictionary containing all the words of the corpus, each associated to an int key.
                              If the input dictionary is not None, the input dictionary is returned.)
    - corpus: List of Lists of Tuples (each document is a list of (word id, count in that document) tuples)
    - lemmatized_words: List of Lists of Strings (the lemmatized tokens of each document)
    Raises
    ------
    - ValueError: if language is neither "english" nor "italian"
    """
    # Imported here because no earlier cell of the notebook imports `re`.
    import re

    # Fail early with a clear message instead of hitting an unbound local further down.
    if language not in ("english", "italian"):
        raise ValueError("Unsupported language: {!r} (expected 'english' or 'italian')".format(language))

    ### Tokenizer
    # Keep purely alphabetic tokens only and lower-case them.
    tokenized = [
        [word.lower() for word in word_tokenize(doc) if word.isalpha()]
        for doc in dataset
    ]

    ### Stopwords
    # A set gives constant-time membership tests for both languages.
    stop_words = set(stopwords.words(language))
    if language == "italian":
        # Extra stop words specific to the Italian Webhose corpus (agency tags, month abbreviations).
        stop_words.update(['nflash','credits','ansa', 'gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago', 'set', 'sett', 'ott', 'nov', 'dic'])

    # Matches words in which the same letter occurs at least four times in a row (e.g. "aaaa").
    repeated_letter = re.compile(r'.*([a-zA-Z])\1{3,}')
    filtered_list = [
        [word for word in doc if word.casefold() not in stop_words and repeated_letter.match(word) is None]
        for doc in tokenized
    ]

    ### Lemmatizing
    if language == "english":
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [[lemmatizer.lemmatize(word) for word in doc] for doc in filtered_list]
    else:
        lemmatized_words = italian_lemmatizer(filtered_list)

    # Only build a new dictionary when the caller did not supply one.
    if dictionary is None:
        dictionary = Dictionary(lemmatized_words)

    corpus = [dictionary.doc2bow(text) for text in lemmatized_words]

    return dictionary, corpus, lemmatized_words

In [None]:
from gensim.models import LdaModel
import numpy as np

def apply_lda_and_save_topics_file(out_file_path, num_topics, train_corpus, train_dict):
    """Train an LDA model and save the top words of every topic in a txt file.

    Parameters
    ----------
    - out_file_path: String (path of the txt file to write, one topic per line; parent directories are created if missing)
    - num_topics: int (number of topics to be computed by LDA)
    - train_corpus: List of Lists of Tuples (bag-of-words corpus: every document is a list of (word id, count) tuples)
    - train_dict: Dictionary (dictionary containing all the words of the corpus, each associated to an int key)
    Output
    ------
    - topics: List of Strings (each String holds the space-separated top words of one topic.
                               The length of the list is num_topics)
    """
    # Imported here because this cell does not import pathlib.
    from pathlib import Path

    # Create the output directory up front so that a missing folder is not
    # discovered only after the (long) training below has finished.
    Path(out_file_path).parent.mkdir(parents=True, exist_ok=True)

    lda = LdaModel(train_corpus, num_topics=num_topics, id2word=train_dict, dtype=np.float64, passes = 100)

    # show_topics(formatted=False) yields (topic id, [(word, weight), ...]) pairs.
    topics = [
        ' '.join(str(word) for word, _ in top_words)
        for _, top_words in lda.show_topics(num_topics = num_topics, formatted = False)
    ]

    # Written as UTF-8 because read_text() reads the file back as UTF-8; the
    # lines on disk are exactly the strings that are returned.
    with open(out_file_path, "w", encoding="utf-8") as f:
        for topic in topics:
            f.write(topic + "\n")
    return topics

Obtain the 20NG dataset from sklearn and preprocess it with the above functions.

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the standard train and test splits of the 20 Newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# The dictionary is built on the training split only (no dictionary is passed in).
train_dict, train_corpus, train_text = preprocess_documents(newsgroups_train.data)
# The training dictionary is reused for the test split, so `test_dict` is the same
# object as `train_dict` (preprocess_documents returns the dictionary it was given).
test_dict, test_corpus, test_text = preprocess_documents(newsgroups_test.data, train_dict)

In [None]:
k = 50 # number of topics
# The topics file is written under a folder named after the number of topics (LDA50_ENG for k = 50).
path_lda_topics = "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/outputs/LDA{}_ENG/topics.txt".format(k)
# Train LDA on the 20NG training corpus and write the top words of each topic to path_lda_topics.
apply_lda_and_save_topics_file(path_lda_topics, k, train_corpus, train_dict)

#### Internal NPMI

In [None]:
import codecs
from scipy import sparse

def read_text(input_filename):
    """Return all the lines of a UTF-8 text file, each one keeping its line terminator."""
    reader = codecs.open(input_filename, 'r', encoding='utf-8')
    try:
        return reader.readlines()
    finally:
        reader.close()
def read_json(input_filename):
    """Load and return the content of a UTF-8 encoded JSON file."""
    # Imported here because this cell does not import json.
    import json

    with open(input_filename, 'r', encoding='utf-8') as input_file:
        # json.load() no longer accepts an `encoding` argument (removed in Python 3.9);
        # the decoding is already done by open().
        return json.load(input_file)
def load_sparse(input_filename):
    """Load a sparse matrix saved as an .npz archive with 'row', 'col', 'data' and 'shape' entries.

    The matrix is returned in CSC format.
    """
    # The context manager closes the underlying archive once the arrays are read.
    with np.load(input_filename) as npy:
        shape = tuple(int(dim) for dim in npy['shape'])
        coo_matrix = sparse.coo_matrix((npy['data'], (npy['row'], npy['col'])), shape=shape)
    return coo_matrix.tocsc()

def write_list_to_text(lines, output_filename, add_newlines=True, add_final_newline=False):
    """Write a list of strings to a UTF-8 text file.

    Parameters
    ----------
    - lines: List of Strings (the pieces of text to write, in order)
    - output_filename: String (path of the file to create or overwrite)
    - add_newlines: bool, default True (put a newline between consecutive items, but not after the last one)
    - add_final_newline: bool, default False (end the file with one additional newline)
    """
    separator = '\n' if add_newlines else ''
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write(separator.join(lines))
        if add_final_newline:
            output_file.write('\n')

def load_and_compute_npmi(topics_file, ref_vocab_file, ref_counts_file, n_vals, cols_to_skip=0, output_file=None):
    """Load the reference vocabulary and document-word counts from disk, then run compute_npmi.

    Parameters
    ----------
    - topics_file: String (path of the txt file with one topic per line)
    - ref_vocab_file: String (path of the JSON file holding the reference vocabulary)
    - ref_counts_file: String (path of the .npz file holding the reference count matrix)
    - n_vals, cols_to_skip, output_file: forwarded unchanged to compute_npmi
    """
    print("Loading reference counts")
    vocabulary = read_json(ref_vocab_file)
    doc_word_counts = load_sparse(ref_counts_file).tocsc()
    compute_npmi(
        topics_file,
        vocabulary,
        doc_word_counts,
        n_vals,
        cols_to_skip,
        output_file,
    )


def compute_npmi(topics_file, ref_vocab, ref_counts, n_vals, cols_to_skip=0, output_file=None):
    """Compute the mean NPMI of the topics in a file at one or more word cut-offs.

    Parameters
    ----------
    - topics_file: String (path of the txt file with one topic per line)
    - ref_vocab: List of Strings (reference vocabulary; position i names column i of ref_counts)
    - ref_counts: sparse matrix (reference document-by-word count matrix)
    - n_vals: int or iterable of ints (an int N evaluates every cut-off from 1 to N inclusive;
                                       an iterable is used as the explicit list of cut-offs.
                                       A cut-off of 1 contains no word pair.)
    - cols_to_skip: int, default 0 (number of leading columns to ignore on every topic line)
    - output_file: String, default None (if given, one "<cut-off> <mean NPMI>" line per cut-off is written there)
    Output
    ------
    - mean_vals: List of floats (mean NPMI for each evaluated cut-off, in order)
    """
    print("Loading topics")
    topics = read_text(topics_file)

    # range(n_vals) would evaluate cut-offs 0 .. n_vals-1: cut-off 0 has no words
    # at all and the requested cut-off itself would never be computed.
    if isinstance(n_vals, (int, np.integer)):
        cutoffs = list(range(1, n_vals + 1))
    else:
        cutoffs = list(n_vals)

    mean_vals = [
        compute_npmi_at_n(topics, ref_vocab, ref_counts, n, cols_to_skip=cols_to_skip)
        for n in cutoffs
    ]

    if output_file is not None:
        lines = [str(n) + ' ' + str(v) for n, v in zip(cutoffs, mean_vals)]
        write_list_to_text(lines, output_file)

    return mean_vals


def compute_npmi_at_n(
    topics, ref_vocab, ref_counts, n=10, cols_to_skip=0, silent=False, return_mean=True
):
    """Compute the NPMI coherence of each topic using its first n words.

    Parameters
    ----------
    - topics: List of Strings (each String is a topic: whitespace-separated words)
    - ref_vocab: List of Strings (reference vocabulary; position i names column i of ref_counts)
    - ref_counts: sparse matrix (reference document-by-word count matrix)
    - n: int, default 10 (number of leading words of every topic to consider)
    - cols_to_skip: int, default 0 (number of leading columns to ignore on every topic line)
    - silent: bool, default False (if False, print the score of every topic and the overall mean)
    - return_mean: bool, default True (return the mean over topics instead of one score per topic)
    Output
    ------
    - float (mean over topics) if return_mean is True, otherwise a numpy array with one value per topic.
      A pair involving a word missing from ref_vocab, or a pair that never co-occurs, scores 0.0.
      A topic with fewer than two words has no pairs and scores 0.0.
    """
    vocab_index = {word: index for index, word in enumerate(ref_vocab)}
    n_docs, _ = ref_counts.shape

    # Densifying a column is the expensive step, so every column is converted
    # once and reused for all the pairs (and topics) it takes part in.
    presence_cache = {}

    def _presence(index):
        # Boolean vector: True for the documents containing the word at least once.
        if index not in presence_cache:
            presence_cache[index] = np.asarray(ref_counts[:, index].todense() > 0).ravel()
        return presence_cache[index]

    npmi_means = []
    for topic in topics:
        words = topic.strip().split()[cols_to_skip:]
        top_words = words[:n]
        npmi_vals = []
        for word_i, word1 in enumerate(top_words):
            index1 = vocab_index.get(word1)
            for word2 in top_words[word_i + 1:]:
                index2 = vocab_index.get(word2)
                if index1 is None or index2 is None:
                    npmi = 0.0
                else:
                    col1 = _presence(index1)
                    col2 = _presence(index2)
                    c12 = np.count_nonzero(col1 & col2)
                    if c12 == 0:
                        npmi = 0.0
                    else:
                        c1 = np.count_nonzero(col1)
                        c2 = np.count_nonzero(col2)
                        npmi = (np.log10(n_docs) + np.log10(c12) - np.log10(c1) - np.log10(c2)) / (np.log10(n_docs) - np.log10(c12))
                npmi_vals.append(npmi)
        # np.mean([]) would be NaN (with a warning) and would turn the overall mean into NaN.
        topic_mean = np.mean(npmi_vals) if npmi_vals else 0.0
        if not silent:
            print(str(topic_mean) + ': ' + ' '.join(top_words))
        npmi_means.append(topic_mean)
    if not silent:
        print(np.mean(npmi_means))
    if return_mean:
        return np.mean(npmi_means)
    else:
        return np.array(npmi_means)

In [None]:
# NOTE(review): these paths live under a different cluster/user mount (s2838321 / S283832,
# project_mldl/kd-topic-models/outputs) than the one the LDA cell above writes its topics to
# (s2876181 / S287618, ProjectMLDL/outputs) — confirm that topics_file is the file produced there.
topics_file = "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/outputs/LDA50_ENG/topics.txt"
# The reference vocabulary comes from the training split, the reference counts from the test split.
ref_vocab_file = "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/20ng/replicated/train.vocab.json"
ref_counts_file = "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/20ng/replicated/test.npz"
# The output file name is relative, so it is created in the current working directory.
load_and_compute_npmi(topics_file, ref_vocab_file, ref_counts_file, 10, output_file="internal_npmi_LDA_k50_ENG.txt")

### Scholar

In [None]:
# NOTE(review): every `!` line runs in its own subshell, so this activation does not carry
# over to the `!python` command below — confirm that the notebook kernel itself runs in the
# `scholar` environment.
!source activate scholar

!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/scholar/run_scholar.py \
    /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/replicated/dev \
    --dev-metric npmi \
    -k 200 \
    --epochs 500 \
    --patience 450 \
    --batch-size 200 \
    --background-embeddings \
    --device 0 \
    --dev-prefix dev \
    -l 0.002 \
    --alpha 1.0 \
    --eta-bn-anneal-step-const 0.25 \
    --use-doc-layer \
    -o ./outputs/20ng/scholar_k200_defaultParameters/

### Scholar + BAT

#### Teacher

In [None]:
# !source activate transformers28

!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/teacher/bert_reconstruction.py \
--input-dir /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/replicated/dev \
--output-dir /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/replicated/dev/logits \
--do-train \
--evaluate-during-training \
--save-steps 112 \
--logging-steps 112 \
--num-train-epochs 6 \
--seed 42 \
--num-workers 4 \
--batch-size 10 \
--gradient-accumulation-steps 8

Extract the logits from the trained teacher model:

In [None]:
#!source activate transformers28

!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/teacher/bert_reconstruction.py \
    --output-dir /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/replicated/dev/logits \
    --seed 42 \
    --num-workers 4 \
    --get-reps \
    --checkpoint-folder-pattern "checkpoint-672" \
    --save-doc-logits \
    --no-dev

#### Student

In [None]:
# NOTE(review): every `!` line runs in its own subshell, so this activation does not carry
# over to the `!python` command below — confirm that the notebook kernel itself runs in the
# `scholar` environment.
!source activate scholar

!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/scholar/run_scholar.py \
    /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/replicated/dev \
    --dev-metric npmi \
    -k 50 \
    --epochs 500 \
    --patience 500 \
    --batch-size 200 \
    --background-embeddings \
    --device 0 \
    --dev-prefix dev \
    -l 0.002 \
    --alpha 1.0 \
    --eta-bn-anneal-step-const 0.25 \
    --doc-reps-dir /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/20ng/replicated/dev/logits/checkpoint-672/doc_logits \
    --use-doc-layer \
    --no-bow-reconstruction-loss \
    --doc-reconstruction-weight 0.75 \
    --doc-reconstruction-temp 2.0 \
    --doc-reconstruction-logit-clipping 10.0 \
    -o ./outputs/20ng/k50_optimalParameters

### Palmetto

In [None]:
from palmettopy.palmetto import Palmetto
import requests
import numpy as np
import re

def evaluate_with_palmetto(path_to_topics, n_words_topic, coherence_measure="npmi", output_file = None):
    """Evaluate the topics with the public Palmetto web service.

    Parameters
    ----------
    - path_to_topics: String (path to the txt file containing the topics, one per line, words separated by whitespace)
    - n_words_topic: int (number of leading words of each topic sent to the service)
    - coherence_measure: String, default "npmi" (coherence measure to be used for the evaluation. The available measures are: ca, cp, cv, npmi, uci, umass)
    - output_file: String, default None (txt file where to save the raw answer of the service for each topic and the average score over all the topics)
    Output
    ------
    - avg_npmi: float (average of the scores that could be parsed; NaN if no topic received a valid score)
    """
    with open(path_to_topics,"r") as f:
        lines = f.readlines()

    coherence_values = []
    g = open(output_file,"w") if output_file is not None else None
    # try/finally guarantees that the report file is closed even if a request fails.
    try:
        for topic in lines:
            # split() without arguments ignores leading spaces and the trailing newline,
            # so exactly n_words_topic words are kept whatever the line layout is.
            words_string = " ".join(topic.split()[:n_words_topic])
            if not words_string:
                continue  # blank line: nothing to evaluate
            print(words_string)
            r =requests.get("https://palmetto.demos.dice-research.org/service/{}?words={}".format(coherence_measure,words_string))
            print(r.text)

            # float() also accepts integers and scientific notation (e.g. "1.2E-4"),
            # which a plain "digits.digits" pattern would silently reject.
            try:
                score = float(r.text) if r.ok else None
            except ValueError:
                score = None

            if score is not None and np.isfinite(score):
                coherence_values.append(score)
            else :
                print(str(r.status_code) + " - " + r.text + "[" + words_string + "]")

            if g is not None:
                g.write(words_string + " --> NPMI: " + r.text + "\n")

        # Avoid the numpy warning raised by the mean of an empty list.
        avg_npmi = np.mean(coherence_values) if coherence_values else float("nan")
        if g is not None:
            g.write("\n\nAVERAGE NPMI: " + str(avg_npmi))
    finally:
        if g is not None:
            g.close()
    return avg_npmi

In [None]:
evaluate_with_palmetto("/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/outputs/20ng/k50_defaultParameters/topics.txt",10,output_file= "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/outputs/20ng/k50_defaultParameters/topics_npmi.txt")

## Italian version

First of all, we set up the Italian lemmatizer, which will be used to preprocess the documents of the Italian corpus.

In [None]:
!pip install treetaggerwrapper

In [None]:
%%bash
mkdir treetagger
cd treetagger
# Download the tagger package for your system (PC-Linux, Mac OS-X, ARM64, ARMHF, ARM-Android, PPC64le-Linux).
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/tree-tagger-linux-3.2.4.tar.gz
tar -xzvf tree-tagger-linux-3.2.4.tar.gz
# Download the tagging scripts into the same directory.
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz
gunzip tagger-scripts.tar.gz
# Download the installation script install-tagger.sh.
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/install-tagger.sh
# Download the parameter files for the languages you want to process.
# list of all files (parameter files) https://cis.lmu.de/~schmid/tools/TreeTagger/#parfiles
wget https://cis.lmu.de/~schmid/tools/TreeTagger/data/italian.par.gz
sh install-tagger.sh
cd ..
#sudo pip install treetaggerwrapper

### Preprocessing Webhose dataset

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
from pathlib import Path
from scipy import sparse
import json
import treetaggerwrapper

def italian_lemmatizer (tokenized_corpus):
    """Lemmatize an Italian corpus with TreeTagger.

    Parameters
    ----------
    - tokenized_corpus: List of Lists of Strings (each document is a list of tokens)
    Output
    ------
    - lemmatized: List of Lists of Strings (for each document, the third tab-separated field of every tag line returned by the tagger)
    """
    # The tagger works on plain text, so every document is first joined back into one string.
    documents_as_text = [
        ' '.join(str(token) for token in doc).strip()
        for doc in tokenized_corpus
    ]
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='it', TAGDIR='treetagger/')
    return [
        [tagged.split('\t')[2] for tagged in tagger.tag_text(text)]
        for text in documents_as_text
    ]
  
def toks_to_onehot(doc, vocab):
    """Count how many times each vocabulary word occurs in a document.

    Parameters
    ----------
    - doc: List of Strings (the tokens of the document)
    - vocab: dict (maps each word to its integer id)
    Output
    ------
    - numpy array with at least len(vocab) entries; entry i is the number of occurrences of the word with id i.
      Tokens that are not in vocab are ignored.
    """
    token_ids = []
    for word in doc:
        if word in vocab:
            token_ids.append(vocab[word])
    return np.bincount(token_ids, minlength=len(vocab))
    
def save_sparse(sparse_matrix, output_filename):
    """Save a scipy sparse matrix as an .npz archive holding its COO components.

    The archive contains the arrays 'row', 'col', 'data' and 'shape'.
    """
    assert sparse.issparse(sparse_matrix)
    # Convert to COO only when the matrix is not already in that format.
    coo = sparse_matrix if sparse.isspmatrix_coo(sparse_matrix) else sparse_matrix.tocoo()
    np.savez(output_filename, row=coo.row, col=coo.col, data=coo.data, shape=coo.shape)

def save_json(obj, fpath):
    """Serialize obj as JSON into fpath, keeping non-ASCII characters unescaped."""
    # ensure_ascii=False writes accented characters as they are, so the file must be
    # opened explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(fpath, 'w', encoding='utf-8') as o:
        json.dump(obj, o, ensure_ascii=False)

def save_jsonlist(dicts, fpath):
    """Write one JSON document per line (JSON Lines) to a UTF-8 text file."""
    with open(fpath, 'w', encoding='utf-8') as sink:
        for record in dicts:
            sink.write(json.dumps(record))
            sink.write('\n')

In [None]:
from pathlib import Path
from tqdm import tqdm
import json
import pandas as pd
import pickle
def preprocess_dataset_webhose(corpus_path):

    """Preprocess the Italian Webhose dataset and save it in the format expected by the model scripts.

    The function splits the documents into train and test sets, builds a filtered
    vocabulary on the training set, writes the count matrices, vocabulary, tokens,
    raw texts and ids under ./replicated (after changing the working directory),
    and pickles the training bag-of-words corpus.

    Parameters
    ----------
    - corpus_path: String, the path of the .txt file containing the corpus, where each document is a JSON object with a "text" field.
    Output
    ------
    - train_dict: Dictionary, the filtered dictionary created on the training dataset.
    - train_corpus: List of Lists of Tuples, the training dataset as bag-of-words, in an appropriate format for the LDA model.
                    Documents that end up empty are NOT removed from this list (only the saved files are filtered)."""
    

    docs = []
    with open(corpus_path,"r") as file_reader:
        corpus = file_reader.read()
        # The file holds JSON objects written back to back: commas are inserted between
        # them and the whole text is wrapped in [] so it can be parsed as one JSON array.
        # NOTE(review): a literal '}{' inside a document text would be rewritten too — confirm it cannot occur.
        json_lists = json.loads("[{}]".format(corpus.replace('}{', '},{')))
    for doc in tqdm(json_lists):
        docs.append(doc["text"])
    

    
    # Train-test split (90% / 10%, fixed seed for reproducibility)
    TEST_SIZE = 0.1
    raw_train, raw_test = train_test_split(docs, test_size = TEST_SIZE, random_state = 42, shuffle = True)

    # First pass: preprocess the training set only, to build the full (unfiltered) dictionary
    train_dict, train_corpus, raw_tokens_train = preprocess_documents(raw_train, language = "italian")
    
    # Swap key and value of the dictionary (word -> id)
    vocab_dict = dict([(value, key) for key, value in train_dict.items()])
    

    # Compute the document-by-word count matrix of the training dataset only
    raw_counts_train = np.array([toks_to_onehot(doc, vocab_dict) for doc in raw_tokens_train]) 

    # Our addition: the counts are turned into 0/1 presence flags, whose column sums give the
    #               number of documents containing each word. The mask keeps only the words
    #               whose document frequency lies strictly between the two bounds.
    mapping_bool = lambda x: int(bool(x))
    func = np.vectorize(mapping_bool)
    b = func(raw_counts_train)
    # Keeps words found in more than 15 and fewer than 6890 documents.
    # NOTE(review): 6890 is an absolute count, presumably 65% of the training set size — confirm if the corpus changes.
    mask = (np.sum(b,axis = 0) > 15) & (np.sum(b,axis = 0) < 6890) # at least 15 documents and less than 65% 
    
    # New dictionary without the previously removed words
    # (words are listed in id order so that they line up with the columns of the mask)
    vocab_dict_keys = [k for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])]
    new_words = np.array(vocab_dict_keys)[mask]
    train_dict = Dictionary([list(new_words)]) 
    vocab_dict = dict([(value, key) for key, value in train_dict.items()])

    # Second pass: preprocess training and test sets with the filtered dictionary
    train_dict, train_corpus, raw_tokens_train = preprocess_documents(raw_train, train_dict, language = "italian")
    test_dict, _, raw_tokens_test = preprocess_documents(raw_test, train_dict, language = "italian")

    # NOTE: building a new Dictionary() reassigns the word ids, so the count matrices
    #       have to be recomputed with the new word -> id mapping
    raw_counts_train = np.array([toks_to_onehot(doc, vocab_dict) for doc in raw_tokens_train])
    raw_counts_test = np.array([toks_to_onehot(doc, vocab_dict) for doc in raw_tokens_test])

    ## Flag the documents that still contain at least one vocabulary word
    nonzero_train = raw_counts_train.sum(1) > 0
    nonzero_test = raw_counts_test.sum(1) > 0
        
    ## Keep only non empty documents
    raw_ids_train = [idx for idx, keep in enumerate(nonzero_train) if keep] # positions (in raw_train) of the non empty documents
    raw_ids_test = [idx for idx, keep in enumerate(nonzero_test) if keep] # list of integers

    raw_tokens_train = [' '.join(raw_tokens_train[idx]) for idx in raw_ids_train] # lemmatized text of the non empty documents
    raw_tokens_test = [' '.join(raw_tokens_test[idx]) for idx in raw_ids_test] # list of strings

    raw_counts_train = raw_counts_train[nonzero_train] # word counts of the non empty documents
    raw_counts_test = raw_counts_test[nonzero_test] # one row of word counts per kept document

    raw_data_train = [{'id': idx, 'text': raw_train[idx]} for idx in raw_ids_train] # list of dictionaries, one for each non empty document
    raw_data_test = [{'id': idx, 'text': raw_test[idx]} for idx in raw_ids_test]

    ## save data
    # IPython magic: changes the working directory of the notebook, so all the relative
    # paths below are resolved inside the webhose data folder. Only valid inside IPython/Jupyter.
    %cd /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose
    Path("replicated").mkdir(exist_ok=True)
    save_sparse(sparse.coo_matrix(raw_counts_train), "./replicated/train.npz")
    save_sparse(sparse.coo_matrix(raw_counts_test), "./replicated/test.npz")
    # Vocabulary words in the iteration order of vocab_dict.
    keys_dict = [
        k for k, v in vocab_dict.items()
    ]   
    save_json(keys_dict, "./replicated/train.vocab.json")

    save_json(raw_tokens_train, "./replicated/train.tokens.json")
    save_json(raw_tokens_test, "./replicated/test.tokens.json")

    save_jsonlist(raw_data_train, "./replicated/train.jsonlist")
    save_jsonlist(raw_data_test, "./replicated/test.jsonlist")

    save_json([d['id'] for d in raw_data_train], "./replicated/train.ids.json")
    save_json([d['id'] for d in raw_data_test], "./replicated/test.ids.json")
    # Despite the .txt extension, this file is a binary pickle of the training bag-of-words corpus.
    with open("./webhose_corpus.txt", "wb") as internal_filename:
        pickle.dump(train_corpus, internal_filename)
    return train_dict, train_corpus

We can now perform the preprocessing with the defined functions.

In [None]:
train_dict_webhose, train_corpus_webhose = preprocess_dataset_webhose("/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/webhose_corpus_clean_new.txt")

In [None]:
!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/4_create_dev_sets.py

In [None]:
!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/6_create_raw_text_file.py /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated
%cd /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl

### LDA

In [None]:
apply_lda_and_save_topics_file("/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/outputs/LDA/Ita_LDA_topics.txt", 50, train_corpus_webhose, train_dict_webhose)

#### Internal NPMI

In [None]:
topics_file = "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/outputs/LDA/Ita_LDA_topics.txt"
ref_vocab_file = "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated/train.vocab.json"
ref_counts_file = "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated/test.npz"
load_and_compute_npmi(topics_file, ref_vocab_file, ref_counts_file, 10, output_file="internal_npmi_LDA_k50.txt")

### Scholar

We can test the Scholar model without the application of knowledge distillation.

In [None]:
%cd /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl

In [None]:
# NOTE(review): every `!` line runs in its own subshell, so this activation does not carry
# over to the `!python` command below — confirm that the notebook kernel itself runs in the
# `scholar` environment.
!source activate scholar

!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/scholar/run_scholar.py \
    /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated/dev \
    --dev-metric npmi \
    -k 50 \
    --epochs 500 \
    --patience 250 \
    --batch-size 200 \
    --background-embeddings \
    --device 0 \
    --dev-prefix dev \
    -l 0.001 \
    --alpha 0.5 \
    --eta-bn-anneal-step-const 0.25 \
    --use-doc-layer \
    -o ./outputs/webhose/scholar_k50_defaultParameters_lr001/

### Scholar + BAT

#### Teacher

In [None]:
# !source activate transformers28

!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/teacher/bert_reconstruction.py \
--input-dir /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated/dev  \
--output-dir /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated/dev/logits \
--bert-model dbmdz/bert-base-italian-xxl-uncased \
--do-train \
--evaluate-during-training \
--save-steps 165 \
--logging-steps 165 \
--num-train-epochs 8 \
--seed 42 \
--num-workers 4 \
--batch-size 8 \
--gradient-accumulation-steps 8

We can now extract the logits.

In [None]:
#!source activate transformers28

!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/teacher/bert_reconstruction.py \
    --output-dir /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated/dev/logits \
    --seed 42 \
    --num-workers 4 \
    --get-reps \
    --checkpoint-folder-pattern "checkpoint-1155" \
    --save-doc-logits \
    --no-dev

#### Student

In [None]:
# NOTE(review): every `!` line runs in its own subshell, so this activation does not carry
# over to the `!python` command below — confirm that the notebook kernel itself runs in the
# `scholar` environment.
# NOTE(review): the command below reads the teacher logits from `checkpoint-1320/doc_logits`,
# while the logit-extraction cell uses `--checkpoint-folder-pattern "checkpoint-1155"` —
# confirm which checkpoint is intended.
!source activate scholar

!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/scholar/run_scholar.py \
    /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated/dev \
    --dev-metric npmi \
    -k 50 \
    --epochs 500 \
    --patience 500 \
    --batch-size 200 \
    --background-embeddings \
    --device 0 \
    --dev-prefix dev \
    -l 0.001 \
    --alpha 0.5 \
    --eta-bn-anneal-step-const 0.25 \
    --doc-reps-dir /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated/dev/logits/checkpoint-1320/doc_logits \
    --use-doc-layer \
    --no-bow-reconstruction-loss \
    --doc-reconstruction-weight 0.5 \
    --doc-reconstruction-temp 1.0 \
    --doc-reconstruction-logit-clipping 10.0 \
    -o ./outputs/webhose/k50_defaultParameters_new_7_epochs_lr001/

### Articles from Corriere
To give the chosen articles to the neural topic model we have to perform the same preprocessing steps as before, extract the logits from the teacher model and pass them to the student model.

In [None]:
def save_and_preprocess_articles(corpus_path,train_dict):

    """Preprocess the test articles and save results.
        Parameters
        ----------
        - corpus_path: String, the path of the .txt file containig the corpus.
        - train_dict: Dictionary, the dictionary created during training"""

    corpus = open(
        corpus_path, encoding="utf-8"
    ).read()

    corpus = corpus.split("</text>")

    docs = []
    for doc in corpus:
        try:
            if(doc!='' and doc!='\n'):
                t = doc.split("</text")[0]
                if (doc!=''):
                    docs.append(t.split("\">")[1])
        except IndexError:
            print("doc: " + doc)
            print("previous doc: " + docs[-1])
    
    test_dict, _, raw_tokens_test = preprocess_documents(docs, train_dict, language = "italian")
    
    ## Swap key and value of the dictionary
    vocab_dict = dict([(value, key) for key, value in train_dict.items()])

    ## Compute count matrix
    raw_counts_test = np.array([toks_to_onehot(doc, vocab_dict) for doc in raw_tokens_test])

    ## Filter out the zero-counts
    nonzero_test = raw_counts_test.sum(1) > 0
        
    ## Keep only non empty documents
    raw_ids_test = [idx for idx, keep in enumerate(nonzero_test) if keep]

    raw_tokens_test = [' '.join(raw_tokens_test[idx]) for idx in raw_ids_test]

    raw_counts_test = raw_counts_test[nonzero_test] # list of lists of 0/1

    raw_data_test = [{'id': idx, 'text': docs[idx]} for idx in raw_ids_test]

    ## save data
    %cd /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/corriere_dataset
    Path("replicated").mkdir(exist_ok=True)
    save_sparse(sparse.coo_matrix(raw_counts_test), "./replicated/test.npz")
    keys_dict = [
        k for k, v in vocab_dict.items()
    ]   
    save_json(keys_dict, "./replicated/train.vocab.json")

    save_json(raw_tokens_test, "./replicated/test.tokens.json")

    save_jsonlist(raw_data_test, "./replicated/test.jsonlist")

    save_json([d['id'] for d in raw_data_test], "./replicated/test.ids.json")


We can apply the defined functions and scripts, similar to the ones applied on the Webhose corpus, to obtain the preprocessed version of the dataset.

In [None]:
%cd /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl

article_path = "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/corriere_dataset/articles.txt" 
save_and_preprocess_articles(article_path,train_dict_webhose)

In [None]:
%cd /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/corriere_dataset

In [None]:
!cp ./replicated/test.ids.json ./replicated/test.jsonlist ./replicated/test.npz ./replicated/test.tokens.json ./replicated/dev

In [None]:
#!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2876181/code/Users/S287618/ProjectMLDL/kd-topic-models/data/articles/4_create_dev_sets.py

In [None]:
!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/corriere_dataset/6_create_raw_text_file.py /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/corriere_dataset/replicated

Now that we have the preprocessed dataset, we can use the trained teacher model to extract the logits.

In [None]:
#!source activate transformers28

!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/teacher/bert_reconstruction.py \
    --input-dir /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated/dev \
    --output-dir /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/corriere_dataset/replicated/dev/logits \
    --test-dir /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/corriere_dataset/replicated/dev \
    --seed 42 \
    --num-workers 4 \
    --checkpoint-folder-pattern "checkpoint-1155" \
    --save-doc-logits \
    --do-eval

Now, we use the logits to apply the student model to the articles.

In [None]:
%cd /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl

In [None]:
# NOTE(review): every `!` line runs in its own subshell, so this activation does not carry
# over to the `!python` command below — confirm that the notebook kernel itself runs in the
# `scholar` environment.
!source activate scholar

!python /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/scholar/run_scholar.py \
    /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/corriere_dataset/replicated/dev \
    --batch-size 200 \
    --background-embeddings \
    --device 0 \
    --test-prefix test \
    --doc-reps-dir /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/corriere_dataset/replicated/dev/logits/checkpoint-1155/doc_logits \
    -o ./outputs/corriere_dataset/k50_defaultParameters_new_7_epochs_lr001 \
    --do-test


Now we can extract the most relevant topics for each article in order to evaluate the results qualitatively.

In [None]:
def read_topic(path_topics, ind):
    """Return the first nine space-separated tokens of one line of a topics file.

    Parameters
    ----------
    path_topics : path of the text file, one topic per line.
    ind : zero-based index of the line (topic) to read.

    Returns
    -------
    List of at most nine tokens from line ``ind``, split on single spaces
    (a trailing newline stays attached to the last token when the line has
    nine tokens or fewer), or ``None`` when the file has no such line.
    """
    with open(path_topics, "r") as handle:
        for position, row in enumerate(handle):
            if position != ind:
                continue
            return row.split(" ")[:9]
    return None

In [None]:
import numpy as np

# Document-topic proportions produced by the student on the corriere test set,
# and the topics file written by the same run.
data = np.load("/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/outputs/corriere_dataset/k50_defaultParameters_new_7_epochs_lr001/theta.test.npz")
fields = data.files
path_topics = "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/outputs/corriere_dataset/k50_defaultParameters_new_7_epochs_lr001/topics.txt"
for item in fields:
    if item == "theta":
        # An NpzFile reads the array from the archive on every key access,
        # so load it once instead of once per lookup inside the loops.
        doc_topics = data[item]
        # Show the three most probable topics of the first three documents.
        for i in range(3):
            print("Document {} top topics:".format(i+1))
            # Indices of the three largest proportions, highest first.
            ind = doc_topics[i].argsort()[-3:][::-1]
            for el in ind:
                value = doc_topics[i][el]
                topic = read_topic(path_topics, el)
                print(value, ": topic ", el, topic)

### Search engine

In [None]:
import numpy as np
import os
import json
from scipy.spatial import distance
# NOTE(review): this upgrade runs after scipy has already been imported above,
# so the running kernel presumably keeps using the previously installed
# version until it is restarted — confirm, and consider installing first.
!pip install --upgrade scipy

def load_jsonlist(fpath):
    """Read a UTF-8 file holding one JSON value per line.

    Returns a list with the decoded value of every line, in file order.
    """
    records = []
    with open(fpath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            records.append(json.loads(raw_line))
    return records

def search_engine(topic_dist, language, num_related_docs, metric):
    """Search the training set for documents related to a topic distribution.

    Documents are ranked either by how similar their topic distribution is to
    the one provided, or by how strongly they express its most probable topic.

    Parameters
    ----------
    - topic_dist: sequence of Floats (list or NumPy array), a distribution over topics.
    - language: "italian" or "english" (case-insensitive), the data set in which to search.
    - num_related_docs: Integer, number of documents to be retrieved.
    - metric: String, "topic", retrieves the document(s) in which the probability of the
                       most relevant topic of the input distribution is highest;
                       "js", retrieves the document(s) whose topic distribution is closest
                       to the input one.

    Output
    ------
    - results: List of Dictionaries, keys of each document:
        "document_id": the id of the retrieved document
        "value": Float, if metric = "js" the Jensen-Shannon distance (as returned by
                 scipy.spatial.distance.jensenshannon) between the two distributions;
                 if metric = "topic" the proportion of the most probable topic in the
                 retrieved document
        "text": String, the text of the retrieved document

    Raises
    ------
    - ValueError: if `language` or `metric` is not one of the supported values.
    """
    corpora = {
        "english": (
            "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/outputs/20ng_replicated",
            "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/20ng/replicated/dev",
        ),
        "italian": (
            "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/outputs/webhose/k50_defaultParameters_new_7_epochs_lr001",
            "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated/dev",
        ),
    }

    # Validate the arguments before touching the disk so that a typo fails
    # immediately with a clear message instead of an UnboundLocalError later.
    language_key = str(language).lower()
    if language_key not in corpora:
        raise ValueError(
            "Unsupported language {!r}; expected one of {}".format(language, sorted(corpora)))
    if metric not in ("js", "topic"):
        raise ValueError(
            "Unsupported metric {!r}; expected 'js' or 'topic'".format(metric))
    theta_and_topics_path, train_jsonlist_path = corpora[language_key]

    # Accept plain lists as well as arrays (argmax below needs an ndarray).
    topic_dist = np.asarray(topic_dist)

    train_jsonlist = load_jsonlist(os.path.join(train_jsonlist_path, "train.jsonlist"))
    theta = np.load(os.path.join(theta_and_topics_path, "theta.train.npz"))['theta']
    topic_id = topic_dist.argmax()

    print("Most relevant topic: ", topic_id)

    if metric == "js":
        # jensenshannon reduces along axis 0, so lay the distributions out
        # as columns: one column per training document.
        topic_matrix = np.tile(topic_dist, (theta.shape[0], 1))
        distances = distance.jensenshannon(theta.T, topic_matrix.T)
        # Smallest distance first.
        most_related_document_ids = distances.argsort()[:num_related_docs]
        most_related_document = distances[most_related_document_ids]
    else:  # metric == "topic"
        topic_column = theta[:, topic_id]
        # Largest proportion of the query's dominant topic first.
        most_related_document_ids = topic_column.argsort()[::-1][:num_related_docs]
        most_related_document = topic_column[most_related_document_ids]

    results = [
        {
            'document_id': train_jsonlist[doc_index]['id'],
            'value': value,
            'text': train_jsonlist[doc_index]['text'],
        }
        for doc_index, value in zip(most_related_document_ids, most_related_document)
    ]

    return results

In [None]:
# Use the topic distribution of the first corriere test document as the query
# and retrieve the 3 documents of the "italian" training corpus in which the
# query's dominant topic has the highest proportion.
theta_and_topics_path =  "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/outputs/corriere_dataset/k50_defaultParameters_new_7_epochs_lr001"
theta1 = np.load(os.path.join(theta_and_topics_path,"theta.test.npz"))['theta']
topic_dist = theta1[0]
search_engine(topic_dist, "italian", 3, "topic") 

### Topics alignment (Scholar / Scholar + BAT)

In [None]:
def get_npmi_vals_and_topic_words(ref_vocab, ref_counts, topics, n=10, cols_to_skip=0):
    """Compute the mean pairwise NPMI of the top words of each topic.

    Parameters
    ----------
    ref_vocab : sequence of str
        Vocabulary; position i names column i of ``ref_counts``.
    ref_counts : sparse matrix (documents x vocabulary)
        Reference counts; only whether a count is > 0 is used.
    topics : iterable of str
        One whitespace-separated line of words per topic.
    n : int
        Number of leading words of each topic to score.
    cols_to_skip : int
        Number of leading whitespace-separated fields to drop from each line.

    Returns
    -------
    (npmi_values, top_words_strings) : two lists with one entry per topic:
    the mean NPMI over all pairs of the top ``n`` words, rounded to four
    decimals, and those words joined by single spaces. Pairs involving an
    out-of-vocabulary word, or words that never co-occur, contribute 0.0.
    """
    vocab_index = dict(zip(ref_vocab, range(len(ref_vocab))))
    n_docs, _ = ref_counts.shape
    npmi_values, top_words_strings = [], []
    for topic in topics:
        words = topic.strip().split()[cols_to_skip:]
        top_words = words[:n]

        # Extract each word's document-presence column once per topic instead
        # of re-slicing and densifying it for every pair it takes part in.
        presence = []
        for word in top_words:
            index = vocab_index.get(word)
            if index is None:
                presence.append(None)  # out-of-vocabulary word
            else:
                column = np.array((ref_counts[:, index] > 0).todense(), dtype=int)
                presence.append((column, column.sum()))

        npmi_vals = []
        for word_i, first in enumerate(presence):
            for second in presence[word_i + 1:]:
                if first is None or second is None:
                    npmi = 0.0
                else:
                    col1, c1 = first
                    col2, c2 = second
                    # Number of documents containing both words.
                    c12 = np.sum(col1 * col2)
                    if c12 == 0:
                        npmi = 0.0
                    else:
                        # NOTE(review): when both words occur in every document
                        # (c12 == n_docs) the denominator is zero and the
                        # result is NaN; left unchanged here.
                        npmi = (np.log10(n_docs) + np.log10(c12) - np.log10(c1) - np.log10(c2)) / (np.log10(n_docs) - np.log10(c12))
                npmi_vals.append(npmi)
        npmi_values.append(round(np.mean(npmi_vals), 4))
        top_words_strings.append(' '.join(top_words))
    return npmi_values, top_words_strings

In [None]:
def get_npmi_topics(datapath, modelpath, n=10):
    """Score the topics of a trained model with NPMI.

    The vocabulary is read from ``datapath + 'train.vocab.json'`` and the
    reference counts from ``datapath + 'dev.npz'`` (so ``datapath`` must end
    with a path separator); the topics come from ``modelpath + '/topics.txt'``.

    Returns a one-element list holding a list of ``(npmi, top_words)`` pairs,
    one pair per topic, scored over the top ``n`` words.
    """
    vocabulary = fh.read_json(datapath + 'train.vocab.json')
    reference_counts = fh.load_sparse(datapath + 'dev.npz').tocsc()
    topic_lines = fh.read_text(modelpath + '/topics.txt')

    scores, word_strings = get_npmi_vals_and_topic_words(
        vocabulary, reference_counts, topic_lines, n
    )
    return [list(zip(scores, word_strings))]

In [None]:
%cd /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/scholar/

In [None]:
import numpy as np
import torch
import scipy
from scipy import stats
import os
import pandas as pd
import numpy as np
import json
from run_scholar import print_top_words
import scipy
import torch
import pickle
import file_handling as fh

def jsd(p, q, base=np.e):
    '''
        Jensen-Shannon divergence between two 1-D NumPy arrays of scores, see
        https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence

        Both inputs are first turned into probability vectors with a softmax;
        `base` is the logarithm base passed to `scipy.stats.entropy`.
    '''

    def _as_probabilities(scores):
        # softmax over the only axis, converted back to a NumPy array
        return np.array(torch.softmax(torch.from_numpy(scores), dim=0))

    p_probs = _as_probabilities(p)
    q_probs = _as_probabilities(q)
    mixture = (p_probs + q_probs)/2

    # JSD is the average of the two KL divergences to the mixture
    kl_p_to_mixture = scipy.stats.entropy(p_probs, mixture, base=base)
    kl_q_to_mixture = scipy.stats.entropy(q_probs, mixture, base=base)
    return kl_p_to_mixture/2. + kl_q_to_mixture/2.

def js_divergence(beta1, beta2):
    """Pairwise Jensen-Shannon divergences between the rows of two matrices.

    Both matrices must have the same 2-D shape. Entry ``[i][j]`` of the
    returned square matrix is ``jsd(beta1[i], beta2[j])`` rounded to four
    decimals.
    """
    assert beta1.shape==beta2.shape
    n_rows, _ = beta1.shape
    pairwise_scores = np.zeros((n_rows, n_rows))
    for row, left in enumerate(beta1):
        for col in range(n_rows):
            pairwise_scores[row][col] = round(jsd(left, beta2[col]), 4)
    return pairwise_scores

def get_topic_matched_pairs(beta1, beta2):
    """Greedily match every topic of one model to a distinct topic of another.

    The pair with the smallest Jensen-Shannon divergence is selected first;
    its row and column are then excluded and the procedure repeats until every
    topic has been matched, so the pairs come out ordered from most to least
    similar.

    Parameters
    ----------
    beta1, beta2 : 2-D arrays of identical shape (one row per topic).

    Returns
    -------
    (topic_match_tuples, topic_match_scores) : the list of ``(i, j)`` index
    pairs (row of ``beta1``, row of ``beta2``) and the list of their
    divergences, in selection order. One pair is returned per topic.
    """
    assert beta1.shape==beta2.shape
    js_div_scores = js_divergence(beta1, beta2)
    # Derive the number of pairs from the data rather than assuming 50 topics:
    # with fewer topics a fixed bound keeps re-selecting masked cells, with
    # more topics it silently drops matches.
    n_topics, n_cols = js_div_scores.shape
    topic_match_tuples = []
    topic_match_scores = []
    for _ in range(n_topics):
        z = np.argmin(js_div_scores)
        # argmin indexes the flattened matrix; convert back to (row, column).
        i, j = divmod(z, n_cols)
        topic_match_tuples.append((i,j))
        topic_match_scores.append(js_div_scores[i, j])
        # Exclude both topics from further matching.
        js_div_scores[i, :] = np.inf
        js_div_scores[:, j] = np.inf
    return topic_match_tuples, topic_match_scores

In [None]:
# Load the 'beta' arrays saved by the baseline run (scholar_k50_...) and by
# the knowledge-distilled run (k50_..._new_7_epochs...) on webhose, then match
# their topics pairwise by Jensen-Shannon divergence.
beta_baseline = np.load("/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/outputs/webhose/scholar_k50_defaultParameters_lr001/beta.npz")['beta']
beta_kd = np.load("/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/outputs/webhose/k50_defaultParameters_new_7_epochs_lr001/beta.npz")['beta']
topic_pairs_jsdiv_baseline_kd, scores = get_topic_matched_pairs(beta_baseline, beta_kd)

In [None]:
%cd /mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl

In [None]:
def print_compare_baseline_kd_matched_topics(baseline_npmi_topics, kd_npmi_topics, topic_pairs_jsdiv_baseline_kd, scores, top_matched_pairs=10):
    """Print matched SCHOLAR / SCHOLAR+BAT topics and count the NPMI wins.

    Parameters
    ----------
    baseline_npmi_topics, kd_npmi_topics : list whose first element is a list
        of ``(npmi, top_words)`` tuples, one per topic.
    topic_pairs_jsdiv_baseline_kd : list of ``(baseline_index, kd_index)`` pairs.
    scores : JS divergence of each pair, aligned with the pairs.
    top_matched_pairs : number of leading pairs to compare.

    Returns
    -------
    (df, baseline_wins, kd_wins) : a DataFrame with one row per compared pair
    (columns 'Pair #', 'SCHOLAR vs SCHOLAR+BAT', 'JS Divergence') and the
    number of pairs in which each model has the higher NPMI. Ties are counted
    as wins for SCHOLAR+BAT.
    """
    kd_wins, baseline_wins = 0, 0
    topic_pairs_jsdiv_baseline_kd = topic_pairs_jsdiv_baseline_kd[:top_matched_pairs]
    b_k, js = [], []
    # zip stops at the shorter sequence, so `scores` is truncated implicitly.
    for (baseline_index, kd_index), score in zip(topic_pairs_jsdiv_baseline_kd, scores):
        baseline_topic = baseline_npmi_topics[0][baseline_index]
        kd_topic = kd_npmi_topics[0][kd_index]
        summary = 'SCHOLAR: ' + str(baseline_topic) + '\nSCHOLAR+BAT: ' + str(kd_topic)
        print(summary)
        b_k.append(summary)
        print('JS Div. Value = ' + str(score))
        js.append(score)
        # Element 0 of each topic tuple is its NPMI value.
        if baseline_topic[0] > kd_topic[0]:
            baseline_wins += 1
        else:
            kd_wins += 1
        print('---')
    # Number the rows from the pairs actually compared; a fixed 1..44 index
    # only fits when exactly 44 pairs are requested.
    df = pd.DataFrame(
        {
            'Pair #': list(range(1, len(b_k) + 1)),
            'SCHOLAR vs SCHOLAR+BAT': b_k,
            'JS Divergence': js,
        },
        columns=['Pair #', 'SCHOLAR vs SCHOLAR+BAT', 'JS Divergence'],
    )
    return df, baseline_wins, kd_wins

In [None]:
# Score the topics of both webhose runs (baseline and knowledge-distilled)
# against the same preprocessed data directory.
baseline_npmi_topics = get_npmi_topics("/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated/dev/", "/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/outputs/webhose/scholar_k50_defaultParameters_lr001")
kd_npmi_topics = get_npmi_topics(datapath="/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/kd-topic-models/data/webhose/replicated/dev/", modelpath="/mnt/batch/tasks/shared/LS_root/mounts/clusters/s2838321/code/Users/S283832/project_mldl/outputs/webhose/k50_defaultParameters_new_7_epochs_lr001")

In [None]:
# Compare the first 44 matched topic pairs and count which model has the
# higher NPMI in each.
df, baseline_wins, kd_wins = print_compare_baseline_kd_matched_topics(baseline_npmi_topics,kd_npmi_topics,topic_pairs_jsdiv_baseline_kd,scores,44)

In [None]:
# Display the win counts: SCHOLAR+BAT first, then the SCHOLAR baseline.
kd_wins,baseline_wins

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
# Plot the JS divergence of the matched topic pairs in matching order.
# The x axis is sized from `scores` so the plot does not depend on the
# number of topics being exactly 50.
x = list(range(1, len(scores) + 1))
plt.plot(x,scores)
plt.ylabel("JS divergence")
plt.xlabel("Topic pairs ordered by similarity")
plt.savefig('ITA_k50_js_divergence.eps', format='eps')

## Visualization

Due to mismatched library versions, it is recommended to run this part locally.

In [None]:
import json
import numpy as np
import os
from scipy import sparse
from scipy.special import softmax

def load_model(filename, docs_path):
    """Assemble the keyword arguments expected by ``pyLDAvis.prepare``.

    Parameters
    ----------
    filename : directory containing the model outputs ``beta.npz`` (key
        'beta') and ``theta.train.npz`` (key 'theta').
    docs_path : directory containing ``train.vocab.json``, ``train.npz``
        (keys 'data', 'row', 'col', 'shape' of a sparse document-term matrix)
        and ``train.tokens.json`` (a list of tokenized documents).

    Returns
    -------
    dict with keys 'topic_term_dists' (row-wise softmax of beta),
    'doc_topic_dists' (theta), 'doc_lengths' (number of tokens per document),
    'vocab' and 'term_frequency' (column sums of the document-term matrix).
    """
    # Close every NpzFile as soon as its arrays have been read.
    with np.load(os.path.join(filename, "beta.npz")) as beta_file:
        # Normalize each topic's scores into a distribution over terms.
        beta = softmax(beta_file['beta'], axis=1)
    with np.load(os.path.join(filename, "theta.train.npz")) as theta_file:
        theta = theta_file['theta']
    with open(os.path.join(docs_path, "train.vocab.json"), 'r', encoding='utf-8') as j:
        vocab = json.load(j)
    with np.load(os.path.join(docs_path, "train.npz")) as train:
        doc_term = sparse.coo_matrix((train['data'], (train['row'], train['col'])), shape=train['shape'])
    # Sum the columns on the sparse matrix directly; densifying the whole
    # document-term matrix first only to add it up wastes memory.
    counts = np.asarray(doc_term.sum(axis=0)).ravel()
    with open(os.path.join(docs_path, "train.tokens.json"), 'r', encoding='utf-8') as j:
        lengths = [len(doc) for doc in json.load(j)]
    data = {'topic_term_dists': beta,
            'doc_topic_dists': theta,
            'doc_lengths': lengths,
            'vocab': vocab,
            'term_frequency': counts}
    return data

# Both the model outputs and the preprocessed corpus files are read from the
# same local directory.
docs_path = './20ng/k50'
filename = './20ng/k50'
model_data = load_model(filename, docs_path)

# Print the shapes of the two distributions as a quick sanity check.
print('Topic-Term shape: %s' % str(np.array(model_data['topic_term_dists']).shape))
print('Doc-Topic shape: %s' % str(np.array(model_data['doc_topic_dists']).shape))

In [None]:
# Install pyLDAvis into the current environment before importing it.
!pip install pyLDAvis

import pyLDAvis

# The keys of model_data are passed to prepare() as keyword arguments.
vis_data = pyLDAvis.prepare(**model_data)

In [None]:
# Write the prepared visualization to an HTML file in the working directory.
pyLDAvis.save_html(vis_data, 'scholarbatENG.html')

In [None]:
# Render the prepared visualization inline in the notebook.
%matplotlib inline
pyLDAvis.display(vis_data)