In [None]:
from collections import namedtuple
from glob import glob
from itertools import groupby
from operator import itemgetter
from pathlib import Path
import pickle

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models import LdaModel as LDA

from matplotlib import pyplot as plt

import numpy as np

from pandas import DataFrame, read_csv, concat

import pyLDAvis as ldavis
import pyLDAvis.gensim

from scipy.spatial.distance import cdist, pdist, squareform

from sklearn.model_selection import train_test_split

# Render pyLDAvis visualisations inline in the notebook output.
ldavis.enable_notebook()
# Select matplotlib's interactive "notebook" backend for the figures below.
%matplotlib notebook
# Show floating point results with four decimal places in cell output.
%precision 4

# Folder the normalised CSV inputs are read from.
out = Path('..') / 'output'

# One LDA configuration: how many iterations to run and how many topics to fit.
Candidate = namedtuple('Candidate', 'iterations num_topics')

def get_i_t(filename):
    """Recover the model settings encoded in a saved-model file name.

    The name is expected to look like ``x000-i050_t020_d-ISA.lda``: three
    dash-separated parts, the middle one holding ``i<iterations>``,
    ``t<num_topics>`` and one further underscore-separated field.

    Returns a tuple ``(iterations, num_topics, document_type)`` where the
    first two items are ints and the last is the text before the first dot
    of the final dash-separated part.

    Raises ValueError when the name does not have exactly three dash parts,
    the middle part does not have exactly three underscore parts, or the
    numeric fields are not integers.
    """
    _prefix, params, tail = filename.split('-')
    iterations_tag, topics_tag, _suffix = params.split('_')
    # Drop the one-letter marker ('i' / 't') in front of each number.
    iterations = int(iterations_tag[1:])
    num_topics = int(topics_tag[1:])
    document_type = tail.split('.')[0]
    return iterations, num_topics, document_type

def get_texts(df):
    """Return the `target` column of `df` with every entry split on whitespace.

    `target` is a notebook-level variable that is looked up when the function
    is called, so it only has to exist by then.
    """
    return df[target].str.split()


def tobows(df, d):
    """Convert each token list of `df` into a bag-of-words via `d.doc2bow`."""
    token_lists = get_texts(df)
    return token_lists.apply(d.doc2bow)

# Report categories; one normalised CSV is loaded for each of them.
report_types = ('ISA', 'PFR', 'DPFR')

# Fraction of the documents held out by train_test_split.
test_size = 0.2
# Passed to Dictionary.filter_extremes as `no_below`.
min_occurances = 2

In [None]:
# One DataFrame per report type, read from `norm_<type>.csv`; rows that
# contain any missing value are discarded.
documents_by_type = {
    report_type: read_csv(out / f'norm_{report_type}.csv').dropna()
    for report_type in report_types
}

# Column whose whitespace-separated text is tokenised for the topic models.
target = 'GLOMUNSTEM'

# Settings per document type, read back from a pickle in the working directory.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with Path('election_by_type.pkl').open('rb') as fd:
    election_by_type = pickle.load(fd)

In [None]:
# Fit 100 LDA models per document type, each on a different random 80/20
# split, and save every model so the topics can be compared afterwards.
dictionary_by_type = dict()

# The model-loading cell further down globs 'models/x*-<type>.lda', so the
# models are written into that folder instead of the working directory.
models_dir = Path('models')
models_dir.mkdir(parents=True, exist_ok=True)

for document_type in election_by_type:
    # Settings for this type; `num_topics` and `iterations` are read from it.
    c = election_by_type[document_type]
    documents = documents_by_type[document_type]

    # Build and prune the vocabulary once per type. Pruning does not depend
    # on the split, so repeating it inside the loop was wasted work.
    dictionary = Dictionary(documents[target].str.split())
    dictionary.filter_extremes(no_below=min_occurances)
    dictionary_by_type[document_type] = dictionary

    print(f'{document_type: <4}', end=' ')
    for test_num in range(100):

        # Seeding with the run number keeps the 100 splits different from
        # each other while making every one of them reproducible.
        train_docs, test_docs = train_test_split(
            documents, test_size=test_size, random_state=test_num
        )
        train_by_type = {document_type: train_docs}
        test_by_type = {document_type: test_docs}

        corpus = tobows(train_docs, dictionary)

        # Without `id2word` gensim infers the vocabulary size from the ids
        # present in this particular training split, so models trained on
        # different splits could end up with topic matrices of different
        # widths. Passing the dictionary fixes the width to len(dictionary).
        lda = LDA(corpus=corpus,
                  id2word=dictionary,
                  num_topics=c.num_topics,
                  iterations=c.iterations,
                  random_state=test_num,
                 )
        print('*', end='')

        savename = f'x{test_num:03}-i{c.iterations:03}_t{c.num_topics:03}_d-{document_type}'
        lda.save(str(models_dir / f'{savename}.lda'))

    print()


In [None]:
# Load every saved ISA model. glob() returns paths in arbitrary order, so
# they are sorted to keep the row order of `points` stable between runs.
model_paths = sorted(glob('models/x*-ISA.lda'))
if not model_paths:
    raise FileNotFoundError("no saved models match 'models/x*-ISA.lda'")

test_models = map(LDA.load, model_paths)
test_topics = [model.get_topics() for model in test_models]

# Topic matrices can only be stacked when they share a vocabulary width.
# Keep the most common width rather than a number hard-coded for one run.
widths = np.array([topics.shape[1] for topics in test_topics])
unique_widths, width_counts = np.unique(widths, return_counts=True)
num_terms = unique_widths[width_counts.argmax()]

# One row per topic of every retained model.
points = np.concatenate(
    [topics for topics in test_topics if topics.shape[1] == num_terms]
)
points.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of `points`, i.e. between
# every pair of topics taken from the loaded models.
mat = cosine_similarity(points)

In [None]:
# Turn the cosine similarities into a cosine distance matrix for the
# seriation step. It is derived from `points` rather than from `mat` itself,
# so running this cell again cannot apply `1 - mat` a second time.
mat = 1 - cosine_similarity(points)

# compute_serial_matrix() passes `mat` to squareform(), which rejects a
# matrix that is not exactly symmetric with an exactly zero diagonal.
# Floating point round-off can break both, so they are enforced here.
mat = (mat + mat.T) / 2
np.fill_diagonal(mat, 0)
# Round-off can also leave tiny negative distances; clamp them to zero.
mat = np.clip(mat, 0, None)

_ = plt.imshow(mat)

In [None]:
from fastcluster import linkage


def seriation(Z,N,cur_index):
    '''
        input:
            - Z is a hierarchical tree (dendrogram) in linkage-matrix form:
              row i describes cluster N+i, columns 0 and 1 hold the ids of
              its two children
            - N is the number of points given to the clustering process
            - cur_index is the id of the node whose leaves are wanted
              (2*N-2 for the root)
        output:
            - list with the leaf ids below cur_index, left to right

        seriation computes the order implied by a hierarchical tree (dendrogram)

        The tree is walked with an explicit stack instead of recursion, so a
        very unbalanced dendrogram (e.g. a chain over thousands of points)
        cannot exceed Python's recursion limit.
    '''
    order = []
    pending = [cur_index]
    while pending:
        node = pending.pop()
        if node < N:
            # Ids below N are original observations, i.e. leaves.
            order.append(node)
        else:
            row = node - N
            # Push the right child first so the left one is expanded first,
            # which reproduces the left-to-right order of the recursion.
            pending.append(int(Z[row, 1]))
            pending.append(int(Z[row, 0]))
    return order
    
def compute_serial_matrix(dist_mat,method="ward"):
    '''
        input:
            - dist_mat is a square distance matrix
            - method is the linkage method handed to fastcluster,
              e.g. "ward", "single", "average" or "complete"
        output:
            - seriated_dist: dist_mat with rows and columns re-ordered
              according to the seriation, i.e. the leaf order of the
              hierarchical tree
            - res_order: that leaf order
            - res_linkage: the hierarchical tree (dendrogram)

        compute_serial_matrix sorts a distance matrix so that points which
        are close in the dendrogram end up next to each other.
    '''
    n_points = len(dist_mat)

    # Cluster on the condensed (upper-triangle) form of the matrix.
    condensed = squareform(dist_mat)
    res_linkage = linkage(condensed, method=method, preserve_input=True)

    # The root of a tree over n points has id 2n - 2.
    root = 2 * n_points - 2
    res_order = seriation(res_linkage, n_points, root)

    # Fill the upper triangle from the re-ordered input, then mirror it;
    # the diagonal keeps the zeros it was initialised with.
    rows, cols = np.triu_indices(n_points, k=1)
    source_rows = [res_order[r] for r in rows]
    source_cols = [res_order[c] for c in cols]

    seriated_dist = np.zeros((n_points, n_points))
    seriated_dist[rows, cols] = dist_mat[source_rows, source_cols]
    seriated_dist[cols, rows] = seriated_dist[rows, cols]

    return seriated_dist, res_order, res_linkage

# Reorder the matrix along the dendrogram built with the default "ward"
# linkage; the leaf order and the linkage tree are kept as well.
# NOTE(review): compute_serial_matrix hands its argument to squareform(), so
# `mat` has to be a square distance matrix at this point - confirm.
ordered_dist_mat, res_order, res_linkage = compute_serial_matrix(mat)


In [None]:
from sklearn.cluster import SpectralClustering
# Group the topics into 20 clusters. Spectral clustering has random
# components, so the seed is pinned to get the same labels on every run.
sc = SpectralClustering(n_clusters=20, random_state=0)
sc.fit_predict(points)

In [None]:
from sklearn.cluster import KMeans
# Group the topics into 20 clusters. k-means starts from random centroids,
# so the seed is pinned to keep `clusters` identical between runs.
km = KMeans(n_clusters=20, random_state=0)
km.fit(points)
# Cluster index assigned to each row of `points`.
clusters = km.labels_

In [None]:
# Number of topics in each k-means cluster, one line per cluster. The count
# is taken from the fitted model instead of a hard-coded range, which did not
# match the number of clusters and printed extra zero rows.
print(*np.bincount(clusters, minlength=km.n_clusters), sep='\n')