# A word-cluster based topic model for document representation



In [None]:
class TopicClusterGadget():
    """
        Vectorize documents as weighted word-topic clusters and prepare pairs
        of such vectors for a document-distance computation.

        A document is tokenized, its in-vocabulary words are clustered in
        embedding space, and the document is encoded as the flat vector

            [k, centroid_1, ..., centroid_k, weight_1, ..., weight_k]

        where k is the number of clusters actually emitted and weight_i is the
        share of the document's token occurrences that fall in cluster i.

        Supported distance measures: wmd_euclidean, wmd_cosine, soft_cosine.
        Supported clustering methods: kmeans, spectral.
    """

    # Class-level default so that instances created without __init__ (for
    # example ones unpickled from before the attribute existed) still decode.
    embedding_dim = 300

    # distance_measure value -> name of the DistanceTool method implementing it.
    _DISTANCE_METHODS = {
        "wmd_euclidean": "wmd_distance_euclidean",
        "wmd_cosine": "wmd_distance_cosine",
        "soft_cosine": "softcosine_distance",
    }

    def __init__(self, tokenizer, cls, distance_measure, word2vec, embedding_dim=300):
        """
            tokenizer:        callable mapping a text to an iterable of tokens.
            cls:              clustering method, "kmeans" or "spectral".
            distance_measure: "wmd_euclidean", "wmd_cosine" or "soft_cosine".
            word2vec:         mapping from token to embedding vector; must
                              support `in` and item lookup.
            embedding_dim:    length of one embedding vector (default 300);
                              used by decode() to slice centroids.
        """
        self.cls = cls
        self.tokenizer = tokenizer
        self.word2vec = word2vec
        self.distance_measure = distance_measure
        self.embedding_dim = embedding_dim
        self.topics = []
        # As before, an unrecognised measure leaves `get_distance` unset rather
        # than failing construction; DistanceTool is only instantiated when
        # a known measure is requested.
        method_name = self._DISTANCE_METHODS.get(distance_measure)
        if method_name is not None:
            self.get_distance = getattr(DistanceTool(), method_name)

    def tokenize(self, text):
        """Split `text` into tokens with the configured tokenizer."""
        return self.tokenizer(text)

    @staticmethod
    def _pad_stack(vectors, width):
        """Right-pad every 1-D vector with zeros to `width` and stack them as rows."""
        if not vectors:
            return np.empty((0, width))
        return np.asarray([np.pad(v, (0, width - len(v)), 'constant', constant_values=0)
                           for v in vectors])

    def vectorize_all(self, train_X, test_X):
        """
            Encode every document of both splits and zero-pad all encodings to
            a common length (the longest encoding across both splits).

            Returns (train_vecs, test_vecs), each a 2-D array with one row per
            document. An empty split yields an array with zero rows.
        """
        train_vecs = [self.encode(t) for t in train_X]
        test_vecs = [self.encode(t) for t in test_X]
        # Built-in max keeps the width an int even when one split is empty;
        # np.concatenate with an empty list would promote the lengths to float
        # and np.pad rejects non-integral pad widths.
        max_len = max(map(len, train_vecs + test_vecs), default=0)
        return self._pad_stack(train_vecs, max_len), self._pad_stack(test_vecs, max_len)

    def encode(self, text):
        """
            Encode one document as [k, flattened centroids, weights].

            Only tokens present in `word2vec` are used. The unique words are
            grouped into ceil(cbrt(#unique words)) clusters; clusters that end
            up without members are dropped, and k counts the remaining ones so
            that the header always matches the centroids and weights that
            follow it.

            A document with no in-vocabulary tokens is encoded as [0.], which
            decode() reads as "zero clusters" (the same thing a fully
            zero-padded row decodes to).

            Raises ValueError if `self.cls` is not "kmeans" or "spectral".
        """
        tokens = [t for t in self.tokenize(text) if t in self.word2vec]
        if not tokens:
            return np.array([0.0])

        word_counter = Counter(tokens)
        unique_words = list(word_counter.keys())
        word_counts = np.array(list(word_counter.values()))
        vecs = np.array([self.word2vec[w] for w in unique_words])
        num_clusters = math.ceil(np.cbrt(len(unique_words)))

        if self.cls == "kmeans":
            # `precompute_distances` was removed from scikit-learn's KMeans, so
            # it is no longer passed. n_init is pinned to the historical
            # default of 10 so results do not depend on the installed version.
            model = KMeans(n_clusters=num_clusters, random_state=3425, n_init=10)
            model.fit(vecs)
        elif self.cls == "spectral":
            model = SpectralClustering(num_clusters, affinity='precomputed', random_state=3425,
                                       assign_labels="discretize", n_init=1)
            # Shift cosine similarity from [-1, 1] to [0, 2] so the precomputed
            # affinity matrix is non-negative.
            model.fit(cosine_similarity(vecs) + 1)
        else:
            raise ValueError(
                "Unsupported clustering method %r; expected 'kmeans' or 'spectral'" % (self.cls,))

        labels = model.labels_
        # Sorted labels of the clusters that actually received words.
        used_labels = np.unique(labels)

        if self.cls == "kmeans":
            # Keep only the centroids of populated clusters so centroids and
            # weights stay aligned one-to-one.
            cluster_centers = model.cluster_centers_[used_labels]
        else:
            # Spectral clustering exposes no centroids: use the mean embedding
            # of each cluster's unique words.
            cluster_centers = np.array([vecs[labels == c].mean(axis=0) for c in used_labels])

        # Cluster mass = number of token occurrences (not unique words) in it.
        sizes = np.array([word_counts[labels == c].sum() for c in used_labels], dtype=float)
        weights = sizes / sizes.sum()
        return np.concatenate([[len(used_labels)], cluster_centers.flatten(), weights])

    def decode(self, a, b):
        """
            Unpack two (possibly zero-padded) encodings into a joint
            representation over the union of their clusters.

            Returns (d1, d2, allTopics): allTopics stacks the centroids of `a`
            followed by those of `b`; d1 holds the weights of `a` followed by
            zeros for the clusters of `b`, and d2 holds zeros for the clusters
            of `a` followed by the weights of `b`.
        """
        dim = self.embedding_dim
        num_clusters_a, num_clusters_b = int(a[0]), int(b[0])
        # Centroids occupy positions 1 .. 1 + dim*k; the k weights follow.
        cut_a, cut_b = 1 + dim * num_clusters_a, 1 + dim * num_clusters_b
        topics1, topics2 = a[1:cut_a].reshape(-1, dim), b[1:cut_b].reshape(-1, dim)
        weights1, weights2 = a[cut_a:cut_a + num_clusters_a], b[cut_b:cut_b + num_clusters_b]
        allTopics = np.concatenate([topics1, topics2])
        d1 = np.concatenate([weights1, np.zeros(len(topics2))])
        d2 = np.concatenate([np.zeros(len(topics1)), weights2])
        return d1, d2, allTopics
