In [None]:
from collections import namedtuple
from glob import glob
from itertools import groupby
from operator import itemgetter
from pathlib import Path
import pickle

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models import LdaModel as LDA

from matplotlib import pyplot as plt

import numpy as np

from pandas import DataFrame, read_csv, concat

import pyLDAvis as ldavis
import pyLDAvis.gensim

from sklearn.model_selection import train_test_split

# Let pyLDAvis render its visualisations inside the notebook.
ldavis.enable_notebook()
# IPython magics (keep them on their own lines: a trailing comment would be
# parsed as part of the magic's arguments).
# Use matplotlib's interactive notebook backend and show floats with 4 decimals.
%matplotlib notebook
%precision 4

# Directory that holds the normalised CSV inputs and every saved LDA artefact.
out = Path('..') / 'output' / 'lda'

# One hyper-parameter choice for a report type; the fields may hold single
# values (a final pick) or ranges (a grid to explore).
Candidate = namedtuple('Candidate', 'iterations num_topics')

def get_i_t(filename):
    """Recover the grid-search settings encoded in a saved model's file name.

    File names follow ``<prefix>-i<iterations>_t<num_topics>_d-<document_type>.<ext>``,
    for example ``wide-i051_t021_d-ISA.lda``.

    Parameters
    ----------
    filename : str or path-like
        Bare file name or full path. Only the last path component is parsed,
        so hyphens in parent directories do not confuse the split.

    Returns
    -------
    tuple
        ``(iterations, num_topics, document_type)`` as ``(int, int, str)``.

    Raises
    ------
    ValueError
        If the file name does not follow the pattern above.
    """
    # Parse the base name only: splitting the whole path on '-' breaks as soon
    # as any directory on the way contains a hyphen.
    _, content, document_type = Path(filename).name.split('-')
    i, t, _ = content.split('_')
    # Drop the one-letter 'i' / 't' markers and everything after the first dot.
    return int(i[1:]), int(t[1:]), document_type.split('.')[0]

def get_texts(df, column=None):
    """Tokenise one text column of ``df`` by splitting on whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one document per row.
    column : str, optional
        Column to tokenise. When omitted, the notebook-level ``target`` is
        used; it is looked up at call time, so it only has to exist before the
        first call, not before this definition.

    Returns
    -------
    pandas.Series
        One list of tokens per row.
    """
    if column is None:
        column = target
    return df[column].str.split()


def tobows(df, d, column=None):
    """Turn every document of ``df`` into a bag of words using ``d.doc2bow``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one document per row.
    d : object with a ``doc2bow(tokens)`` method
        In this notebook, a gensim ``Dictionary``.
    column : str, optional
        Column to read; defaults to the notebook-level ``target``.

    Returns
    -------
    pandas.Series
        One ``doc2bow`` result per row.
    """
    return get_texts(df, column).apply(d.doc2bow)

# Report families analysed here; one `norm_<type>.csv` file is read for each.
report_types = ('ISA', 'PFR', 'DPFR')

# Fraction of each report set handed to train_test_split as the hold-out part.
test_size = 0.2
# Forwarded to Dictionary.filter_extremes as `no_below`.
min_occurances = 2


In [None]:
# Column whose text get_texts() tokenises.
target = 'GLOMUNSTEM'

# One frame per report type, read from `out`; dropna() removes every row that
# has a missing value in any column.
documents_by_type = dict(
    (report_type, read_csv(out / f'norm_{report_type}.csv').dropna())
    for report_type in report_types
)

In [None]:
# Sanity check: (rows, columns) of the ISA frame after dropna().
documents_by_type['ISA'].shape

In [None]:
#raise 'think about it'

In [None]:
# Split every report type into training and hold-out rows, then build the
# vocabulary from the training rows only so hold-out text cannot leak into it.
dictionary_by_type, train_by_type, test_by_type = dict(), dict(), dict()
for document_type, documents in documents_by_type.items():
    # Fixed seed: re-running this cell reproduces the same split instead of
    # silently replacing the pickled one with a different random partition.
    train_rows, holdout_rows = train_test_split(
        documents, test_size=test_size, random_state=0
    )

    vocabulary = Dictionary(train_rows[target].str.split())
    vocabulary.filter_extremes(no_below=min_occurances)

    train_by_type[document_type] = train_rows
    test_by_type[document_type] = holdout_rows
    dictionary_by_type[document_type] = vocabulary

# Persist the split and the dictionaries; later cells reload this file rather
# than relying on the variables above still being in the kernel.
with open('dict_train_test.pkl', 'wb') as fd:
    pickle.dump([dictionary_by_type, train_by_type, test_by_type], fd)

In [None]:
# Reload the cached split so this cell can run on its own.
# pickle executes code while loading: only open files written by this notebook.
with open('dict_train_test.pkl', 'rb') as fd:
    dictionary_by_type, train_by_type, test_by_type = pickle.load(fd)

# Coarse ("wide") grid: train one LDA model per (iterations, num_topics) pair
# and per report type, and save each under a name that get_i_t() can parse.
for document_type in train_by_type:
    corpus = tobows(train_by_type[document_type], dictionary_by_type[document_type])

    for iterations in range(1, 400, 50):
        print(document_type, iterations, end=' - ')
        for num_topics in range(1, 120, 20):
            # NOTE(review): no id2word is passed, so the model only knows
            # integer token ids; words are recovered through the dictionary.
            lda = LDA(
                corpus=corpus,
                num_topics=num_topics,
                iterations=iterations,
                # Fixed seed so the grid can be regenerated identically.
                random_state=0,
            )
            print(num_topics, end=':')

            savename = f'wide-i{iterations:03}_t{num_topics:03}_d-{document_type}'
            lda.save(str(out / f'{savename}.lda'))
        print()


In [None]:
# Scratch check of how a path below `out` renders as a string.
str(out / 'cat')

In [None]:
with open('dict_train_test.pkl', 'rb') as fd:
    dictionary_by_type, train_by_type, test_by_type = pickle.load(fd)

for filename in glob(str(out / 'wide-*.lda')):
    iterations, num_topics, document_type = get_i_t(filename)

    # holdout not used intentionally
    corpus = tobows(train_by_type[document_type], dictionary_by_type[document_type])

    lda = LDA.load(filename)
    cm = CoherenceModel(
        model=lda,
        corpus=corpus,
        dictionary=dictionary_by_type[document_type],
        coherence='u_mass'
    )

    cm.save(filename.replace('lda', 'cm'))
    print('*', end='')


In [None]:
with open('dict_train_test.pkl', 'rb') as fd:
    dictionary_by_type, train_by_type, test_by_type = pickle.load(fd)

acc = []
for filename in glob(str(out / 'wide-*.lda')):
    iterations, num_topics, document_type = get_i_t(filename)
    train_corpus = tobows(train_by_type[document_type], dictionary_by_type[document_type])
    test_corpus = tobows(test_by_type[document_type], dictionary_by_type[document_type])

    lda = LDA.load(filename)
    cm = CoherenceModel.load(filename.replace('lda', 'cm'))

    try:
        c = cm.get_coherence()
    except:
        c = float('nan')
    row = [num_topics, iterations,
           lda.log_perplexity(train_corpus), lda.log_perplexity(test_corpus),
           c, document_type]

    acc.append(row)    
    print('*', end='')

In [None]:
# Tabulate the wide-grid rows collected in `acc`, largest train_perplexity
# value first, then show the first ten rows of every report type.
wide_columns = [
    'num_topics', 'iterations',
    'train_perplexity', 'test_perplexity',
    'coherence', 'document_type',
]
wdiagnosis = DataFrame(acc, columns=wide_columns)
wdiagnosis = wdiagnosis.sort_values(['train_perplexity'], ascending=False)

for document_type in wdiagnosis.document_type.unique():
    of_this_type = wdiagnosis.document_type == document_type
    display(document_type, wdiagnosis[of_this_type].head(n=10))

In [None]:
# Presumably a deliberate stop so that "Run All" halts before the narrow grid.
# NOTE(review): a str is not an exception, so Python 3 reports a TypeError
# here rather than a "pause" error; execution still stops either way.
raise "pause"

In [None]:
# Reload the cached split so this cell can run on its own.
with open('dict_train_test.pkl', 'rb') as fd:
    dictionary_by_type, train_by_type, test_by_type = pickle.load(fd)

# Finer ("narrow") grid to explore for each report type.
try_by_type = {
    'ISA' : Candidate(iterations=range(50, 275, 25), num_topics=range(35, 70, 5)),
    'DPFR' : Candidate(iterations=range(100, 325, 25), num_topics=range(15, 50, 5)),
    'PFR' : Candidate(iterations=range(225, 375, 25), num_topics=range(35, 70, 5)),
}

# Iterate over the grid definition itself. Looping over `documents_by_type`
# made this cell depend on an earlier cell's in-memory state even though
# everything it needs is reloaded from the pickle above.
for document_type, candidate in try_by_type.items():
    corpus = tobows(train_by_type[document_type], dictionary_by_type[document_type])

    for iterations in candidate.iterations:
        print(document_type, f'{iterations:03}', end=' - ')
        for num_topics in candidate.num_topics:
            lda = LDA(
                corpus=corpus,
                num_topics=num_topics,
                iterations=iterations,
                # Fixed seed so the grid can be regenerated identically.
                random_state=0,
            )
            print(num_topics, end=':')

            savename = f'narrow-i{iterations:03}_t{num_topics:03}_d-{document_type}'
            lda.save(str(out / f'{savename}.lda'))
        print()


In [None]:
with open('dict_train_test.pkl', 'rb') as fd:
    dictionary_by_type, train_by_type, test_by_type = pickle.load(fd)

for filename in glob(str(out / f'narrow-*.lda')):
    iterations, num_topics, document_type = get_i_t(filename)

    corpus = tobows(train_by_type[document_type], dictionary_by_type[document_type])

    lda = LDA.load(filename)
    cm = CoherenceModel(
        model=lda,
        corpus=corpus,
        dictionary=dictionary_by_type[document_type],
        coherence='u_mass'
    )

    cm.save(filename.replace('lda', 'cm'))
    print('*', end='')


In [None]:
with open('dict_train_test.pkl', 'rb') as fd:
    dictionary_by_type, train_by_type, test_by_type = pickle.load(fd)

acc = []
for filename in glob(str(out / f'narrow-*.lda')):
    iterations, num_topics, document_type = get_i_t(filename)

    corpus = tobows(train_by_type[document_type], dictionary_by_type[document_type])
    holdout = tobows(test_by_type[document_type], dictionary_by_type[document_type])

    lda = LDA.load(filename)
    cm = CoherenceModel.load(filename.replace('lda', 'cm'))

    try:
        c = cm.get_coherence()
    except:
        c = float('nan')
    row = [num_topics, iterations, 
           lda.log_perplexity(corpus), lda.log_perplexity(holdout), 
           c, document_type]

    acc.append(row)
    print('*', end='')


In [None]:
# Tabulate the narrow-grid rows collected in `acc`, lowest coherence first.
# NOTE(review): ascending order puts the smallest u_mass value on top; confirm
# that this is the end of the scale meant to be inspected.
narrow_columns = [
    'num_topics', 'iterations', 'train_perplexity', 'test_perplexity',
    'coherence', 'document_type',
]
ndiagnosis = DataFrame(acc, columns=narrow_columns)
ndiagnosis = ndiagnosis.sort_values(['coherence'], ascending=True)

# Per report type: the explored num_topics span, then the first ten rows.
for document_type in ndiagnosis.document_type.unique():
    info = ndiagnosis[ndiagnosis.document_type == document_type]
    fewest_topics = min(info.num_topics)
    most_topics = max(info.num_topics)
    display(document_type, fewest_topics, most_topics, info.head(n=10))

In [None]:
# Presumably a deliberate stop so that "Run All" halts before the final pick.
# NOTE(review): a str is not an exception, so Python 3 reports a TypeError
# here rather than a "pause" error; execution still stops either way.
raise "pause"

In [None]:
# Restore the cached dictionaries and the train / hold-out frames.
# pickle executes code while loading: only open files written by this notebook.
with open('dict_train_test.pkl', 'rb') as fd:
    cached_split = pickle.load(fd)
dictionary_by_type, train_by_type, test_by_type = cached_split


In [None]:
# Final (iterations, num_topics) pick per report type. Each pair has to match
# a model saved by the narrow grid, because the model is later loaded by the
# file name built from these values.
election_by_type = dict(
    ISA=Candidate(iterations=150, num_topics=60),
    PFR=Candidate(iterations=250, num_topics=55),
    DPFR=Candidate(iterations=250, num_topics=45),
)

# Persist the pick so later cells can reload it.
with open('election_by_type.pkl', 'wb') as fd:
    pickle.dump(election_by_type, fd)

In [None]:
# Presumably a deliberate stop so that "Run All" halts before the models load.
# NOTE(review): a str is not an exception, so Python 3 reports a TypeError
# here rather than a "pause" error; execution still stops either way.
raise "pause"

In [None]:
# Restore the cached split and the elected hyper-parameters from disk.
with open('dict_train_test.pkl', 'rb') as fd:
    dictionary_by_type, train_by_type, test_by_type = pickle.load(fd)

with open('election_by_type.pkl', 'rb') as fd:
    election_by_type = pickle.load(fd)

# Load, for every report type, the narrow-grid model matching its elected pair.
model_by_type = {}
for document_type, elected in election_by_type.items():
    stem = f'narrow-i{elected.iterations:03}_t{elected.num_topics:03}_d-{document_type}'
    model_path = out / f'{stem}.lda'
    model_by_type[document_type] = LDA.load(str(model_path))

In [None]:
#documents_by_type['ISA'].sort_values('Project_Code')

In [None]:
from scipy.stats import logistic  # NOTE(review): not referenced in this cell


# Report type to inspect, plus two display switches.
document_type = 'ISA'
show_raw_topics = False  # True: print the (topic id, weight) pairs instead of words
min_word_score = .002    # words scoring at or below this value are not printed

tump = documents_by_type[document_type].sort_values('Project_Code')  # documents
dump = dictionary_by_type[document_type]                             # dictionary
mump = model_by_type[document_type]                                  # elected LDA model

# For every project: merge its documents into one bag of words, ask the model
# for the topic mix, and print the words that contribute most to that mix.
for project, documents in tump.groupby('Project_Code'):
    gs = documents[target]
    # Join with a space. The default separator is empty, which fused the last
    # token of each document with the first token of the next one.
    bow = dump.doc2bow(gs.str.cat(sep=' ').split())
    topics = sorted(mump.get_document_topics(bow), key=itemgetter(1), reverse=True)

    if show_raw_topics:
        print(
            f'{project} : {gs.size} documents',
            *topics,
            sep='\n . '
        )
        continue

    num_contrib = len(topics)

    # Scale each word returned by show_topic() by its topic's weight in this
    # project and add up the contributions of words shared by several topics.
    result = dict()
    for topic_id, topic_weight in topics:
        for word, word_weight in mump.show_topic(topic_id):
            result[word] = result.get(word, 0) + word_weight * topic_weight

    ranked = sorted(result.items(), key=itemgetter(1), reverse=True)
    print(
        f'{project} : {gs.size} documents, {num_contrib} topics',
        # show_topic() keys are converted to int and looked up in the
        # dictionary; presumably they are token ids rendered as strings
        # because the models were trained without id2word. TODO confirm.
        *(dump[int(key)] for key, value in ranked if value > min_word_score),
        sep='\n . ')


In [None]:
# Quick look at which report types have an elected model.
election_by_type.keys()

In [None]:
def _prepare_for(document_type):
    """Build the pyLDAvis payload for one report type's elected model.

    The corpus is made from every document of that type (the frame loaded
    before the train / hold-out split), encoded with the type's dictionary.
    """
    vocabulary = dictionary_by_type[document_type]
    full_corpus = tobows(documents_by_type[document_type], vocabulary)
    return ldavis.gensim.prepare(
        model_by_type[document_type],
        corpus=full_corpus,
        dictionary=vocabulary,
    )


prepared_data = {
    document_type: _prepare_for(document_type)
    for document_type in election_by_type
}

In [None]:
# Show the prepared pyLDAvis output of every elected model, one after another.
for report_type in election_by_type.keys():
    panel = prepared_data[report_type]
    display(panel)