In [None]:
from collections import namedtuple, defaultdict, Counter
from glob import glob
from itertools import groupby
from operator import itemgetter
from os import remove
from pathlib import Path
import pickle

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models import LdaModel as LDA
from gensim.models import AuthorTopicModel as ATM

from matplotlib import pyplot as plt

import numpy as np

from pandas import DataFrame, read_csv, concat

import pyLDAvis as ldavis
import pyLDAvis.gensim

import seaborn as sns

from sklearn.model_selection import train_test_split

import scipy
from scipy.spatial.distance import cdist
from scipy.sparse import coo_matrix as sparse_matrix

# Switch pyLDAvis into notebook mode so prepared visualisations can be shown with display().
ldavis.enable_notebook()
# Use matplotlib's interactive notebook backend for every figure below.
%matplotlib notebook
#%precision 4


# One hyper-parameter choice (training iterations, topic count) for a model.
Candidate = namedtuple('Candidate', ['iterations', 'num_topics'])



def get_i_t(filename):
    """Parse a saved sweep artefact name into its grid coordinates.

    The basename is expected to look like
    ``<prefix>-i<iterations>_t<num_topics>_d-<document_type>.<ext>``,
    for example ``wide-i051_t004_d-ISA.atm``.

    Only the final path component is parsed, so hyphens, underscores or dots
    in the directory part of ``filename`` cannot break the parse.

    Parameters
    ----------
    filename : str or path-like
        Path (or bare name) of a saved model / coherence file.

    Returns
    -------
    tuple
        ``(iterations, num_topics, document_type)`` as ``(int, int, str)``.

    Raises
    ------
    ValueError
        If the basename does not have the expected shape.
    """
    # Splitting the full path on '-' would count hyphens in directory names.
    basename = Path(filename).name
    _, content, document_type = basename.split('-')
    i, t, _ = content.split('_')
    # Drop the one-letter 'i' / 't' markers, and everything after the first dot.
    return int(i[1:]), int(t[1:]), document_type.split('.')[0]

def get_texts(df):
    """Return the ``target`` text column of ``df`` split on whitespace."""
    return df[target].str.split()


def tobows(df, d):
    """Pair each document's ``Anomaly_ID`` with its bag-of-words.

    ``d`` must provide ``doc2bow`` (the notebook passes a gensim Dictionary).
    The result is a two-column frame: ``Anomaly_ID`` and the ``target``
    column holding the ``doc2bow`` output for every row.
    """
    anomaly_ids = df['Anomaly_ID']
    bags_of_words = get_texts(df).apply(d.doc2bow)
    return concat([anomaly_ids, bags_of_words], axis=1)



# Inverse square root of two: the normalising factor of the Hellinger distance.
isr2 = 2.0 ** -.5


def hellinger(x, y):
    """Hellinger distance between two non-negative vectors ``x`` and ``y``.

    Computed as ``isr2 * ||sqrt(x) - sqrt(y)||_2``; the inputs are not
    normalised here.
    """
    root_difference = np.sqrt(x) - np.sqrt(y)
    squared_norm = (root_difference ** 2).sum()
    return isr2 * np.sqrt(squared_norm)



# Report types this notebook is concerned with.
report_types = 'ISA', 'PFR', 'DPFR'

# Fraction of each document set held out as the test split.
TEST_SIZE = 0.3
# Default `no_below` threshold used when filtering the vocabulary.
min_occurances = 2
# Name of the DataFrame column holding the whitespace-separated text to model.
target = 'GLOMUNSTEM'

# NOTE(review): `%matplotlib notebook` is also issued right after the imports.
%matplotlib notebook
# Show floats with four decimals in cell outputs.
%precision 4

# Directory the author table is read from.
BASEDIR = Path('../data')
# Directory the trained models and coherence files are written to.
OUT = Path('../output/atm/')



In [None]:
# Load the author table (one row per author/report attribution).
with open(BASEDIR / 'processed_authors.csv') as fd:
    af = read_csv(fd)
# Only a cell's last expression is echoed, so a bare `af.shape` here was
# evaluated and thrown away; display it explicitly.
display(af.shape)
af.head()

In [None]:
# One author sub-table per report type, keyed in order of first appearance.
authors_by_type = {
    report_type: af.loc[af.ReportType == report_type]
    for report_type in af.ReportType.unique()
}

In [None]:
# Read the normalised documents of every report type found in the author
# table, dropping rows that contain missing values.
documents_by_type = dict()
for t in af.ReportType.unique():
    norm_path = OUT / f'../norm_{t}.csv'
    documents_by_type[t] = read_csv(norm_path).dropna()

In [None]:
def author_document_downselect(documents, authors, ANOMALY_LABEL='Anomaly_ID'):
    """Keep only documents with a known author, and authors with a known document.

    The document IDs are stringified and prefixed with ``'A'`` so they can be
    compared with the IDs in ``authors``. Both frames are then filtered to
    the IDs they have in common.

    Parameters
    ----------
    documents : DataFrame
        Must contain the column ``ANOMALY_LABEL`` (un-prefixed IDs).
    authors : DataFrame
        Must contain the column ``ANOMALY_LABEL`` (``'A'``-prefixed IDs).
    ANOMALY_LABEL : str
        Name of the shared ID column.

    Returns
    -------
    (DataFrame, DataFrame)
        The filtered ``documents`` (with prefixed IDs) and filtered ``authors``.

    Notes
    -----
    The inputs are not modified. Previously the prefixed IDs were written
    into the caller's ``documents`` frame, so calling the function twice on
    the same frame produced ``'AA...'`` IDs that matched nothing.
    """
    # Work on a copy so the caller's frame keeps its original IDs.
    documents = documents.copy()
    documents[ANOMALY_LABEL] = 'A' + documents[ANOMALY_LABEL].apply(str)

    has_author = documents[ANOMALY_LABEL].isin(authors[ANOMALY_LABEL].unique())
    documents = documents[has_author]

    # Filter authors against the already reduced document set.
    has_document = authors[ANOMALY_LABEL].isin(documents[ANOMALY_LABEL].unique())
    authors = authors[has_document]
    return documents, authors


# Replace each type's documents and authors with their mutually matching subsets.
for document_type in documents_by_type:
    matched_documents, matched_authors = author_document_downselect(
        documents_by_type[document_type],
        authors_by_type[document_type],
    )
    documents_by_type[document_type] = matched_documents
    authors_by_type[document_type] = matched_authors


In [None]:
# Spot-check the ISA documents left after the author/document down-selection.
documents_by_type['ISA']

In [None]:
def get_train_test_and_vocab(documents, test_size, min_occurances=min_occurances,
                             random_state=None):
    """Split ``documents`` into train/test and build the training vocabulary.

    Parameters
    ----------
    documents : DataFrame
        Must contain the ``target`` text column.
    test_size : float or int
        Forwarded to ``train_test_split``.
    min_occurances : int
        ``no_below`` threshold passed to ``Dictionary.filter_extremes``;
        that method's other defaults still apply.
    random_state : int or None
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    (DataFrame, DataFrame, Dictionary)
        Train split, test split (both with a fresh 0..n-1 index) and the
        vocabulary built from the training texts only.
    """
    train, test = train_test_split(
        documents, test_size=test_size, random_state=random_state
    )
    # Fresh positional indices: downstream code uses the index label as the
    # document's position in the corpus.
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    # The vocabulary is built from the training split only.
    vocab = Dictionary(train[target].str.split())
    vocab.filter_extremes(no_below=min_occurances)

    return train, test, vocab


In [None]:
# Per-report-type containers filled in by the cells below.
model_by_type = {}
train_documents_by_type = {}
test_documents_by_type = {}
dictionary_by_type = {}

# author -> list of document indices, for the train and test splits.
train_author_table_by_type = {}
test_author_table_by_type = {}

def attribution_table(documents, relevent_authors):
    """Map every author to the index labels of the documents they authored.

    Parameters
    ----------
    documents : DataFrame
        Must contain an ``Anomaly_ID`` column; its index labels become the
        document identifiers in the result.
    relevent_authors : DataFrame
        Must contain ``Anomaly_ID`` and ``Users_ID`` columns, one row per
        (document, author) attribution.

    Returns
    -------
    dict
        ``{Users_ID: [document index label, ...]}`` with no duplicate labels
        per author. Authors without a matching document are omitted.
    """
    # Index the authors by anomaly once, rather than re-filtering the whole
    # author frame for every document.
    authors_by_anomaly = defaultdict(list)
    for anomaly_id, author in zip(relevent_authors['Anomaly_ID'],
                                  relevent_authors['Users_ID']):
        authors_by_anomaly[anomaly_id].append(author)

    store = defaultdict(set)
    for idx, anomaly_id in zip(documents.index, documents['Anomaly_ID']):
        # .get() avoids creating empty entries for documents with no author.
        for author in authors_by_anomaly.get(anomaly_id, ()):
            store[author].add(idx)

    return {k: list(v) for k, v in store.items()}


# For every report type: split the documents, build the vocabulary, and
# derive the author -> document tables of both splits.
for document_type in documents_by_type:
    train_split, test_split, vocabulary = get_train_test_and_vocab(
        documents_by_type[document_type], TEST_SIZE
    )
    train_documents_by_type[document_type] = train_split
    test_documents_by_type[document_type] = test_split
    dictionary_by_type[document_type] = vocabulary

    train_author_table_by_type[document_type] = attribution_table(
        train_split, authors_by_type[document_type]
    )
    test_author_table_by_type[document_type] = attribution_table(
        test_split, authors_by_type[document_type]
    )

    print(f'{document_type: <4}')

In [None]:
# Deliberate stop so "Run All" does not start the expensive sweep below.
# Raising a plain string is itself a TypeError ("exceptions must derive from
# BaseException"), which hides the intended message; raise a real exception.
raise RuntimeError("pause")

In [None]:
# Remove the artefacts of any previous sweep. The old pattern 'wide-*.' only
# matched names ending in a literal dot, so nothing was ever deleted and
# stale models leaked into the later 'wide-*.atm' globs.
for filename in glob(str(OUT / 'wide-*.*')):
    remove(filename)

# Grid sweep: one Author-Topic Model per (document type, iterations, topics).
for document_type in documents_by_type:
    corpus = tobows(
        train_documents_by_type[document_type],
        dictionary_by_type[document_type]
    )[target]

    for iterations in range(1, 352, 50):
        print(f'{document_type: <4}', f'{iterations:03}', end=' - ')
        for num_topics in range(1, 122, 3):

            model = ATM(corpus=list(corpus),
                        author2doc=train_author_table_by_type[document_type],
                        num_topics=num_topics,
                        iterations=iterations,
                       )
            print(num_topics, end=':')

            # This name layout is what get_i_t() parses back later.
            savename = f'wide-i{iterations:03}_t{num_topics:03}_d-{document_type}'
            model.save(str(OUT / f'{savename}.atm'))
        print()


In [None]:
def build_coherence(filename, coherence_type):
    """Build and save a CoherenceModel for the model stored at ``filename``.

    The document type is parsed from ``filename``; corpus, texts and
    dictionary come from that type's training split. The coherence model is
    saved next to the model file, with ``.atm`` replaced by
    ``.cm.<coherence_type>``, and is also returned.
    """
    _iterations, _num_topics, document_type = get_i_t(filename)
    train_documents = train_documents_by_type[document_type]
    vocabulary = dictionary_by_type[document_type]

    # holdout not used intentionally
    bow_corpus = tobows(train_documents, vocabulary)[target]

    trained_model = ATM.load(filename)
    coherence_model = CoherenceModel(
        model=trained_model,
        corpus=bow_corpus,
        texts=train_documents[target].apply(str.split),
        dictionary=vocabulary,
        coherence=coherence_type,
    )

    coherence_path = filename.replace('.atm', f'.cm.{coherence_type}')
    coherence_model.save(coherence_path)
    return coherence_model


# Coherence measures computed for every saved model of the sweep.
coherence_labels = ['c_v', 'u_mass']

for coherence_type in coherence_labels:
    print(coherence_type)
    saved_models = glob(str(OUT / 'wide-*.atm'))
    for filename in saved_models:
        build_coherence(filename, coherence_type)
        # One star per finished model as a progress indicator.
        print('*', end='')
    print()
    

In [None]:
def get_metrics(filename, coherence_labels):
    """Collect the diagnostics row for one saved model.

    Parameters
    ----------
    filename : str
        Path of a saved ``.atm`` model; the matching ``.cm.<label>`` files
        must already exist.
    coherence_labels : list of str
        Coherence types to load, in output order.

    Returns
    -------
    list
        ``[document_type, num_topics, iterations, <coherence per label>...]``.
    """
    iterations, num_topics, document_type = get_i_t(filename)

    # The train/test corpora and the model itself were previously rebuilt and
    # loaded here on every call but never used (the perplexity lines that
    # needed them are disabled). They are dropped; rebuild them with tobows()
    # and ATM.load(filename) if perplexity is re-enabled.
    coherences = [
        CoherenceModel.load(
            filename.replace('.atm', f'.cm.{ct}')
        ).get_coherence()
        for ct in coherence_labels
    ]

    return [document_type, num_topics, iterations] + coherences

# One diagnostics row per saved model; a star is printed per processed file.
acc = []
for filename in glob(str(OUT / 'wide-*.atm')):
    acc.append(get_metrics(filename, coherence_labels))
    print('*', end='')

In [None]:


# Tabulate the sweep diagnostics, persist them, and plot u_mass coherence
# against topic count (colour = training iterations).
diagnosis_columns = ['document_type', 'num_topics', 'iterations'] + coherence_labels
wdiagnosis = DataFrame(acc, columns=diagnosis_columns)
wdiagnosis.to_csv('./saveout.csv', index=False)

plt.scatter(
    wdiagnosis['num_topics'],
    wdiagnosis['u_mass'],
    c=wdiagnosis['iterations'],
)
plt.colorbar()


In [None]:
# Reload the saved diagnostics so this cell can run without repeating the sweep.
# DataFrame.from_csv was deprecated and later removed from pandas; read_csv
# (already imported) reads the index-less file the same way.
wdiagnosis = read_csv('./saveout.csv')
metric = 'c_v'
for document_type in wdiagnosis.document_type.unique():
    # One figure per document type: distribution of the metric per topic count.
    plt.figure()
    sns.boxplot(
        x="num_topics", y=metric,
        data=wdiagnosis[
            (wdiagnosis.document_type == document_type) &
            #(wdiagnosis.iterations > 50) &
            (wdiagnosis.num_topics < 100)
        ],
        palette="Set3")
    plt.xticks(rotation=70)
    plt.title(document_type)
    plt.savefig(f'{document_type}_{metric}.png')


In [None]:
# For each document type, show the ten best c_v rows among the 25-topic models.
for document_type in wdiagnosis.document_type.unique():
    of_this_type = wdiagnosis.document_type == document_type
    with_25_topics = wdiagnosis.num_topics == 25
    best_rows = (
        wdiagnosis[of_this_type & with_25_topics]
        .sort_values(['c_v'], ascending=False)
        .head(n=10)
    )
    display(document_type, best_rows)

In [None]:
# Deliberate stop: the candidates below are chosen by hand from the plots above.
# Raising a plain string is itself a TypeError ("exceptions must derive from
# BaseException"), which hides the intended message; raise a real exception.
raise RuntimeError("pause")

In [None]:
# Hand-picked hyper-parameters per document type.
election_by_type = dict(
    ISA=Candidate(iterations=351, num_topics=40),
    PFR=Candidate(iterations=251, num_topics=25),
    DPFR=Candidate(iterations=351, num_topics=25),
)

# Train one final Author-Topic Model per type with its elected settings.
for document_type, candidate in election_by_type.items():
    train_bows = tobows(
        train_documents_by_type[document_type],
        dictionary_by_type[document_type],
    )[target]

    model_by_type[document_type] = ATM(
        corpus=list(train_bows),
        author2doc=train_author_table_by_type[document_type],
        num_topics=candidate.num_topics,
        iterations=candidate.iterations,
    )


In [None]:
def get_document_topics(model, doc_bow):
    """Run ``model.inference`` on a single bag-of-words and return its gamma.

    The document is passed as a one-element chunk with empty author mappings,
    ``rhot=1.00`` and ``collect_sstats=True``. Only the first element of the
    returned pair is kept; the sufficient statistics are discarded.

    NOTE(review): what gamma means for a document with no author mapping
    depends on the model's ``inference`` implementation - confirm.
    """
    inference_arguments = dict(
        chunk=[doc_bow],
        author2doc=dict(),
        doc2author=dict(),
        rhot=1.00,
        collect_sstats=True,
    )
    gamma_chunk, _unused_sstats = model.inference(**inference_arguments)
    return gamma_chunk


def get_model_author_topic_vectors(model):
    """Build the dense (num_authors x num_topics) author-topic matrix.

    Row ``i`` holds the topic scores of the author with id ``i``, i.e.
    ``model.id2author[i]``. Topics that ``model.get_author_topics`` does not
    report for an author stay at 0.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(model.num_authors, model.num_topics)``.
    """
    author_topic_vectors = np.zeros(
        (model.num_authors, model.num_topics)
    )

    # Fill rows by author id, not by dict iteration order: callers translate
    # row numbers back to authors through model.id2author[row].
    for author_id, author in model.id2author.items():
        topics = model.get_author_topics(author)
        if not topics:
            # Nothing reported for this author; zip(*[]) would fail to unpack.
            continue
        topic_ids, scores = zip(*topics)
        author_topic_vectors[author_id, list(topic_ids)] = scores

    return author_topic_vectors


def get_sorted_authors(model, doc_bow, author_topic_vectors, metric=hellinger):
    """Rank all authors by the distance of their topic vector to the document.

    The document's vector comes from ``get_document_topics``. Distances to
    every row of ``author_topic_vectors`` are computed with ``metric`` and
    sorted ascending. Only the ranking of the first row of the document
    vector is used.

    Returns the authors (values of ``model.id2author``), closest first.
    """
    document_vector = get_document_topics(model, doc_bow)
    distances = cdist(document_vector, author_topic_vectors, metric=metric)
    closest_first = np.argsort(distances)[0]
    return [model.id2author[author_id] for author_id in closest_first]


def get_all_ranks_and_counts(model, publication_counts, authors, dictionary, documents):
    """Yield ``(publications, rank)`` for every true author of every document.

    Parameters
    ----------
    model : trained model
        Passed to ``get_model_author_topic_vectors`` / ``get_sorted_authors``.
    publication_counts : dict
        ``{author: publication count}``; authors missing here are skipped.
    authors : dict
        ``{author: [document index label, ...]}`` for ``documents``.
    dictionary : object with ``doc2bow``
        Used to convert the documents into bags-of-words.
    documents : DataFrame
        Documents to evaluate; index labels must match those in ``authors``.

    Yields
    ------
    (int, int)
        The author's publication count and the 0-based position of that
        author in the distance-sorted list for the document. Authors unknown
        to the model (absent from the ranking) are skipped.
    """
    author_topic_vectors = get_model_author_topic_vectors(model)

    # Invert author -> documents once, rather than scanning every author's
    # document list for every document. Authors keep their `authors` order.
    authors_by_document = defaultdict(list)
    for author, document_ids in authors.items():
        for document_id in set(document_ids):
            authors_by_document[document_id].append(author)

    for idx, row in tobows(documents, dictionary).iterrows():
        contenders = get_sorted_authors(model, row[target], author_topic_vectors)

        for author in authors_by_document.get(idx, ()):
            try:
                rank = contenders.index(author)
            except ValueError:
                # The author is not known to the model.
                continue

            try:
                publications = publication_counts[author]
            except KeyError:
                continue

            yield publications, rank


In [None]:
# Report type evaluated in the cells below.
TT = 'ISA'

# Number of training documents per author, for every report type.
publication_counts_by_type = {
    report_type: {
        author: len(document_ids)
        for author, document_ids in author_table.items()
    }
    for report_type, author_table in train_author_table_by_type.items()
}

# (publications, rank) pairs of the true authors on the training split ...
train = list(get_all_ranks_and_counts(
    model=model_by_type[TT],
    publication_counts=publication_counts_by_type[TT],
    authors=train_author_table_by_type[TT],
    dictionary=dictionary_by_type[TT],
    documents=train_documents_by_type[TT],
))

# ... and on the held-out split (publication counts still come from training).
test = list(get_all_ranks_and_counts(
    model=model_by_type[TT],
    publication_counts=publication_counts_by_type[TT],
    authors=test_author_table_by_type[TT],
    dictionary=dictionary_by_type[TT],
    documents=test_documents_by_type[TT],
))

In [None]:
# Sanity check of the training split for the selected type: its shape and
# the text of its first document.
print(train_documents_by_type[TT].shape)
print(train_documents_by_type[TT].iloc[0][target])

# NOTE(review): `chain` is only referenced by the commented-out inspections below.
from itertools import chain
# Uncomment to list the document indices attributed to any author in each split:
#set(chain(*train_author_table_by_type['ISA'].values()))
#set(chain(*test_author_table_by_type['ISA'].values()))


In [None]:
def plot_arc(arc):
    """Hex-bin joint plot of author publication count against achieved rank.

    Parameters
    ----------
    arc : iterable of (publications, rank)
        Pairs as produced by ``get_all_ranks_and_counts``.
    """
    data = DataFrame(columns=["publications", "rank"], data=arc)
    # jointplot is figure-level and creates its own figure, so the former
    # plt.figure() call only left a stray empty figure in the notebook.
    sns.jointplot(x=data['publications'], y=data['rank'],
                  kind='hex',
                  xlim=(0, 250), ylim=(0, 300), gridsize=20
                 )

plot_arc(train)
plot_arc(test)

In [None]:
def plot(arc, label, max_cap=500):
    """Scatter-plot top-``cap`` accuracy against ``cap``.

    For every ``cap`` in ``range(max_cap)`` the accuracy is the fraction of
    true authors whose rank is strictly below ``cap``.

    Parameters
    ----------
    arc : sequence of (publications, rank)
        Pairs as produced by ``get_all_ranks_and_counts``.
    label : str
        Figure title.
    max_cap : int
        Exclusive upper bound of the caps evaluated (default 500, the
        previously hard-coded value).
    """
    plt.figure()

    # `np.float` was a deprecated alias of the builtin and has been removed
    # from NumPy; the builtin `float` is the exact replacement.
    data = np.asarray(arc, dtype=float)
    # An empty `arc` gives a 1-D empty array, which cannot be column-indexed.
    ranks = data[:, 1] if data.size else np.empty(0)

    if ranks.size:
        caps = list(range(max_cap))
        # Ranks are never negative, so `ranks < cap` counts the hits in the top `cap`.
        accuracy = [np.mean(ranks < cap) for cap in caps]
    else:
        caps, accuracy = [], []

    # The former tp/fp/fn/tn, sensitivity and specificity values were never
    # used (and divided by zero when nothing fell outside the cap); removed.
    plt.scatter(caps, accuracy)
    plt.title(label)
    plt.xlabel('cap')
    #plt.xlim((0, 10))
    plt.ylabel('accuracy')

plot(train, f'{TT}-train')
plot(test, f'{TT}-test')

In [None]:
# Prepare a pyLDAvis visualisation for every elected model, using the full
# (train + test) document set of its type as the corpus.
prepared_data = {}
for document_type in election_by_type:
    vocabulary = dictionary_by_type[document_type]
    full_corpus = tobows(documents_by_type[document_type], vocabulary)[target]
    prepared_data[document_type] = ldavis.gensim.prepare(
        model_by_type[document_type],
        corpus=full_corpus,
        dictionary=vocabulary,
    )

In [None]:
# Show the prepared pyLDAvis visualisation of each elected model.
for report_type in election_by_type:
    display(prepared_data[report_type])