In [None]:
# Generation of Dummy Documents

In [None]:
import numpy as np
import pandas as pd
import re
import warnings

from gensim.models import  KeyedVectors
from spherecluster import  VonMisesFisherMixture, sample_vMF

warnings.filterwarnings(action='ignore')

In [None]:
filename = "word_vectors.kv"
# mmap='r' asks gensim to memory-map the stored arrays read-only.
model = KeyedVectors.load(filename, mmap='r')

# np.array(...) makes an in-memory copy of the embedding matrix (one row per word).
word_embedding = np.array(model.wv.vectors)
vocab = list(model.wv.vocab)

# Scale every word vector to unit length so that dot products are cosine
# similarities. NOTE: despite its name, `linfnorm` holds the L2 (Euclidean)
# norm of each row (ord=2), not the L-infinity norm.
linfnorm = np.linalg.norm(word_embedding, axis=1, ord=2)
word_embedding = word_embedding / linfnorm[:,None]

# One column per vocabulary word, one row per embedding dimension.
words_df = pd.DataFrame(word_embedding.T, columns=vocab)

# Each non-empty line of the keyword file is "<class label>: <kw1>, <kw2>, ...".
# The context manager closes the file handle as soon as it has been read.
with open('class keywords.txt', encoding='utf-8') as keywords_file:
    class_keywords_str = keywords_file.read()

# Blank lines (e.g. a trailing newline at the end of the file) are skipped;
# they used to crash the parser with an IndexError.
class_keywords = {
    label: keywords.split(', ')
    for label, keywords in (
        line.split(': ', 1)
        for line in class_keywords_str.splitlines()
        if line.strip()
    )
}

# NOTE(review): the keyword lists parsed above are replaced here; only the class
# labels are kept, and each label is expanded to its 100 nearest neighbours in
# the embedding space. Every label must therefore be a vocabulary word.
class_keywords = {topic: [i[0] for i in model.wv.most_similar (topic, topn = 100)] for topic in class_keywords.keys()}

# Map every class label to the list of unit-norm vectors of its keywords.
class_keywords_supplied = {class_label: [np.array(words_df[word]) for word in words] 
                           for class_label, words in class_keywords.items()}

In [None]:
# Fit a single-component von Mises-Fisher mixture to the keyword vectors of
# every class and keep its first cluster centre and first concentration.
topic_vMFs = {}

for class_label, keyword_vectors in class_keywords_supplied.items():
    single_vmf = VonMisesFisherMixture(n_clusters=1, n_jobs=10, max_iter= 20)
    # Stack the per-keyword vectors into one (n_keywords, dim) matrix.
    single_vmf.fit(np.vstack(keyword_vectors))
    topic_vMFs[class_label] = (
        single_vmf.cluster_centers_[0],
        single_vmf.concentrations_[0],
    )

In [None]:
# The .npy file holds a pickled dict wrapped in a 0-d object array, which
# .item() unwraps. NumPy >= 1.16.3 refuses to unpickle object arrays unless
# allow_pickle=True is passed explicitly.
# SECURITY: unpickling can execute arbitrary code - only load a file you trust.
word_counts = np.load('word_counts.npy', allow_pickle=True).item()

# Sum of all counts; used as the denominator for relative frequencies.
total_length = sum(word_counts.values())

In [None]:
# Background distribution: each count divided by the total count, stored as a
# one-row DataFrame (index 0). Selecting `words_df.columns` puts the columns in
# the same order as the columns of `words_df`.
word_distributions = pd.DataFrame.from_records(
    {word: count/total_length for word, count in word_counts.items()},
    index=[0],
)[words_df.columns]

In [None]:
# Generation of a pseudo-document word distribution for a given topic

def generateWordDistribution(alpha, word_distributions, top_n_keywords, words_df, topic):
    """Build the word distribution of one pseudo document for ``topic``.

    A direction is sampled from the topic's von Mises-Fisher distribution
    (looked up in the module-level ``topic_vMFs``). The ``top_n_keywords``
    words most similar to it form the keyword distribution, which is mixed
    with the background word distribution.

    Input: alpha - weight of the background distribution; the keyword
                   distribution gets weight 1 - alpha
           word_distributions - one-row DataFrame of background word
                   frequencies, columns in the same order as ``words_df``
           top_n_keywords - number of most similar words kept (must be >= 1)
           words_df - DataFrame with one column per word holding its embedding
           topic - key into ``topic_vMFs``
    Output: 1-D array with one probability per word, summing to 1
    Raises: ValueError if ``top_n_keywords`` < 1 or the background
            frequencies do not have a positive sum
    """
    # 0 would zero every similarity (0/0 -> NaN); a negative value would slice
    # from the wrong end of the ranking.
    if top_n_keywords < 1:
        raise ValueError("top_n_keywords must be at least 1")

    mu, kappa = topic_vMFs[topic]
    di = sample_vMF(mu, kappa, num_samples = 1)

    # exp(dot product) of the sampled direction with every word vector.
    di_similarities = np.exp(np.dot(di, words_df.values).ravel())
    ranked_index = np.argsort(di_similarities)[::-1]

    # Keep only the top-n most similar words; everything else gets zero mass.
    di_similarities[ranked_index[top_n_keywords:]] = 0
    keywords_distributions = di_similarities/np.sum(di_similarities)

    # Renormalise the background: if it was restricted to a vocabulary subset
    # it no longer sums to 1, and np.random.choice rejects such a `p`.
    background_words = np.asarray(word_distributions.values, dtype=float).ravel()
    background_total = background_words.sum()
    if not background_total > 0:
        raise ValueError("word_distributions must have a positive total mass")
    background_words = background_words / background_total

    return alpha * background_words + (1 - alpha) * keywords_distributions

In [None]:
def generatePseudoLabels(alpha, word_distributions, topic):
    """ Generates the pseudo label vector for a topic
        Input: alpha - total mass spread uniformly over all entries
               word_distributions - one-row DataFrame; its number of values sets
                                    the label length and its column labels are
                                    searched for the topic
               topic - column label that receives the extra 1 - alpha mass
        Output: a vector similar to one-hot: alpha / n everywhere, plus
                1 - alpha at the position of the topic column
        Raises: ValueError if topic is not one of the column labels
    """
    n_entries = word_distributions.values.size

    # Uniform smoothing mass shared by every entry.
    label_vector = np.full(n_entries, alpha, dtype=float) / n_entries

    # Put the remaining mass on the entry that corresponds to the topic.
    topic_position = list(word_distributions.columns).index(topic)
    label_vector[topic_position] += 1 - alpha
    return label_vector

In [None]:
def generateLabelledPseudoDocuments(alpha, doc_length, num_docs, top_n_keywords=20):
    """ Generates labelled pseudo documents for every topic
        Topics are the keys of the module-level ``class_keywords_supplied``;
        ``word_distributions`` and ``words_df`` are also read from module scope.

        Input: alpha - balancing parameter between background words and keywords
               doc_length - number of words in each pseudo document
               num_docs - number of documents generated per topic
               top_n_keywords - number of keywords kept when building each
                                document's word distribution (default 20)
        Output: a dict mapping each topic to a tuple (pseudo docs, pseudo labels);
                pseudo docs is a list of arrays of sampled word indices and
                pseudo labels is a list of label vectors, one per document
    """
    topic_docs = {}
    for topic in class_keywords_supplied.keys():
        # The label depends only on alpha and the topic, so compute it once per
        # topic instead of once per document.
        topic_label = generatePseudoLabels(alpha, word_distributions, topic)

        pseudo_docs = []
        pseudo_labels = []
        for _ in range(num_docs):
            # Each document gets its own freshly sampled word distribution.
            document_distribution = generateWordDistribution(
                alpha, word_distributions, top_n_keywords, words_df, topic)
            pseudo_docs.append(np.random.choice(len(document_distribution), size=doc_length, p=document_distribution))
            # Copy so that every document owns an independent label array.
            pseudo_labels.append(topic_label.copy())

        topic_docs[topic] = (pseudo_docs, pseudo_labels)

    return topic_docs

In [None]:
# Build 1000 pseudo documents of 1000 words for every topic; the result maps
# each topic to a (documents, labels) tuple.
pseudo_docs = generateLabelledPseudoDocuments(
    alpha=0.3,
    doc_length=1000,
    num_docs=1000,
)