In [None]:
from collections import namedtuple, defaultdict, Counter
from glob import glob
from itertools import groupby
from operator import itemgetter
from pathlib import Path
import pickle

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models import LdaModel as LDA
from gensim.models import AuthorTopicModel as ATM

from matplotlib import pyplot as plt

import numpy as np

from pandas import DataFrame, read_csv, concat

import pyLDAvis as ldavis
import pyLDAvis.gensim

from sklearn.model_selection import train_test_split

import scipy

# Notebook display setup: pyLDAvis notebook mode and the matplotlib backend.
ldavis.enable_notebook()
%matplotlib notebook
#%precision 4

# Output directory. NOTE(review): `OUT`, defined further down in this cell,
# is bound to the same path; one of the two names is probably redundant.
out = Path('../output/')

# Record type with two fields: `iterations` and `num_topics`.
Candidate = namedtuple('Candidate', ['iterations', 'num_topics'])

def get_i_t(filename):
    """Parse two integers and a document type out of a file name.

    Only the final path component is inspected, so directory names that
    contain ``-`` do not disturb the parse. ``filename`` may be a ``str``
    or any path-like object.

    The base name must consist of exactly three ``-``-separated fields.
    The middle field must consist of exactly three ``_``-separated parts;
    the first character of each of the first two parts is dropped and the
    remainder is converted with ``int``. The document type is the last
    field up to its first ``.``.

    Returns:
        tuple: ``(int, int, str)`` in the order described above.

    Raises:
        ValueError: if the field counts do not match or the numeric parts
            are not valid integers.
    """
    # Work on the base name only: splitting the whole path on '-' breaks
    # as soon as any parent directory has a hyphen in its name.
    _, content, document_type = Path(filename).name.split('-')
    i, t, _ = content.split('_')
    return int(i[1:]), int(t[1:]), document_type.split('.')[0]

def get_texts(df):
    """Return the `target` column of `df` with each entry split on whitespace."""
    return df[target].str.split()


def tobows(df, d):
    """Apply `d.doc2bow` to every token list produced by `get_texts(df)`."""
    token_lists = get_texts(df)
    return token_lists.apply(d.doc2bow)


# Report types handled by this notebook.
report_types = ('ISA', 'PFR', 'DPFR')

# Fraction of rows passed to train_test_split as `test_size`.
test_size = 0.2
# Passed to Dictionary.filter_extremes as `no_below`.
min_occurances = 2
# Name of the text column that is tokenised.
target = 'GLOMUNSTEM'

# Notebook display settings: matplotlib backend and float output precision.
# NOTE(review): `%matplotlib notebook` was already issued earlier in this cell.
%matplotlib notebook
%precision 4

# Input and output directories, given relative to the working directory.
BASEDIR = Path('../data')
OUT = Path('../output/')



In [None]:
# Load the author records from the data directory.
with open(BASEDIR / 'processed_authors.csv') as fd:
    af = read_csv(fd)

# Only the final expression of a cell is echoed, so a bare `af.shape` in the
# middle of the cell was evaluated and thrown away. Print it explicitly.
print(af.shape)
af.head()

In [None]:
# One sub-frame of author rows per distinct ReportType value.
authors_by_type = {
    report_type: af.loc[af['ReportType'] == report_type]
    for report_type in af['ReportType'].unique()
}

In [None]:
# For every report type found in the author table, read the matching
# `norm_<type>.csv` from the output directory and drop rows with missing values.
documents_by_type = dict(
    (report_type, read_csv(OUT / f'norm_{report_type}.csv').dropna())
    for report_type in af['ReportType'].unique()
)

In [None]:
# Report type used by the inspection cells that follow.
t = 'ISA'
# Preview the first rows of that type's document table.
documents_by_type[t].head()

In [None]:
# Preview the Anomaly_ID values on the author side for report type `t`.
authors_by_type[t].Anomaly_ID.head()

In [None]:
# Preview the Anomaly_ID values on the document side for report type `t`,
# to compare their format against the author table.
documents_by_type[t].Anomaly_ID.head()

In [None]:
# Bring document Anomaly_IDs into the 'A'-prefixed form and keep only the
# documents and author rows that reference each other.
for document_type in documents_by_type:
    documents = documents_by_type[document_type]
    authors = authors_by_type[document_type]

    # Prefix each ID with 'A' exactly once. Without the guard, re-running
    # this cell turns 'A123' into 'AA123', nothing matches any more and both
    # tables are filtered down to nothing.
    # NOTE(review): assumes raw document IDs never start with 'A' on their
    # own — confirm against the source data.
    anomaly_ids = documents.Anomaly_ID.astype(str)
    anomaly_ids = anomaly_ids.where(
        anomaly_ids.str.startswith('A'), 'A' + anomaly_ids
    )
    # `assign` builds a new frame instead of writing through attribute
    # access into what may be a filtered slice.
    documents = documents.assign(Anomaly_ID=anomaly_ids)

    # Keep documents that have at least one author row ...
    documents = documents[
        documents.Anomaly_ID.isin(authors.Anomaly_ID.unique())
    ]
    # ... and author rows whose document survived that filter.
    authors = authors[
        authors.Anomaly_ID.isin(documents.Anomaly_ID.unique())
    ]

    documents_by_type[document_type] = documents
    authors_by_type[document_type] = authors


In [None]:
# Report how many documents are held for each document type.
for document_type, type_documents in documents_by_type.items():
    print(document_type, len(type_documents))

In [None]:
# Train/test split and a training-only vocabulary per document type.
# NOTE(review): a later cell rebinds train_by_type, test_by_type and
# dictionary_by_type to fresh dicts, so these results do not survive.
train_by_type, test_by_type, dictionary_by_type = {}, {}, {}

for document_type, type_documents in documents_by_type.items():
    train_part, test_part = train_test_split(
        type_documents, test_size=test_size
    )
    train_by_type[document_type] = train_part
    test_by_type[document_type] = test_part

    # Vocabulary from the training texts, then pruned of rare terms.
    vocabulary = Dictionary(get_texts(train_part))
    vocabulary.filter_extremes(no_below=min_occurances)
    dictionary_by_type[document_type] = vocabulary


In [None]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'election_by_type.pkl' if it comes from a trusted run.
with Path('election_by_type.pkl').open('rb') as fd:
    election_by_type = pickle.load(fd)

# Fresh per-type containers for the models, the splits and the vocabularies.
model_by_type = {}
train_by_type = {}
test_by_type = {}
dictionary_by_type = {}

# Per-type author -> document-index tables for each split.
train_author_table_by_type = {}
test_author_table_by_type = {}

def attribution_table(documents, relevent_authors):
    """Map each author to the index labels of the documents they are linked to.

    Args:
        documents: DataFrame with an ``Anomaly_ID`` column; its index labels
            become the document ids in the result.
        relevent_authors: DataFrame with ``Anomaly_ID`` and ``Users_ID``
            columns, one row per (document, author) link.

    Returns:
        dict: ``{Users_ID: [index label, ...]}``. Each list holds distinct
        labels in ascending order. Authors with no matching document are
        absent.
    """
    # Index the author links once. Filtering the whole author table for
    # every document row made the original cost rows(documents) x rows(authors).
    authors_by_anomaly = defaultdict(list)
    for anomaly_id, user_id in zip(
        relevent_authors.Anomaly_ID, relevent_authors.Users_ID
    ):
        authors_by_anomaly[anomaly_id].append(user_id)

    store = defaultdict(set)
    for idx, anomaly_id in documents.Anomaly_ID.items():
        for author in authors_by_anomaly.get(anomaly_id, ()):
            store[author].add(idx)

    # Sorted so the output does not depend on set iteration order.
    return {author: sorted(doc_ids) for author, doc_ids in store.items()}


# Fixed seed for the train/test split. A later cell picks a test document by
# position (`doc_id`), which only refers to the same document across kernel
# restarts if the split is reproducible.
SPLIT_SEED = 0

for document_type in election_by_type:
    # NOTE(review): this vocabulary is built from ALL documents of the type,
    # i.e. including the rows that end up in the test split — confirm that
    # is intended.
    dictionary_by_type[document_type] = Dictionary(
        documents_by_type[document_type][target].str.split()
    )

    print(f'{document_type: <4}')

    train_part, test_part = train_test_split(
        documents_by_type[document_type],
        test_size=test_size,
        random_state=SPLIT_SEED,
    )
    # Renumber rows 0..n-1 so index labels can serve as corpus positions in
    # the author -> document tables below.
    train_by_type[document_type] = train_part.reset_index(drop=True)
    test_by_type[document_type] = test_part.reset_index(drop=True)

    dictionary_by_type[document_type].filter_extremes(no_below=min_occurances)

    train_author_table_by_type[document_type] = attribution_table(
        train_by_type[document_type],
        authors_by_type[document_type]
    )

    test_author_table_by_type[document_type] = attribution_table(
        test_by_type[document_type],
        authors_by_type[document_type]
    )

In [None]:
# Fixed seed so a re-run trains the same model.
MODEL_SEED = 0

# Train one Author-Topic model per document type, using the iteration and
# topic counts stored for that type in `election_by_type`.
for document_type in election_by_type:
    c = election_by_type[document_type]

    corpus = tobows(
        train_by_type[document_type],
        dictionary_by_type[document_type]
    )

    atm = ATM(
        corpus=list(corpus),
        # Pass the dictionary explicitly. Without it the model derives its
        # vocabulary size from the training corpus alone, so a test document
        # encoded with this dictionary can contain term ids the model has no
        # column for.
        id2word=dictionary_by_type[document_type],
        author2doc=train_author_table_by_type[document_type],
        num_topics=c.num_topics,
        iterations=c.iterations,
        random_state=MODEL_SEED,
    )
    model_by_type[document_type] = atm
 

In [None]:
# Inspect the topics of the model trained for the 'ISA' document type.
model_by_type['ISA'].get_topics()

In [None]:
# List the columns available on the 'ISA' test split.
test_by_type['ISA'].columns

In [None]:
# Authors present in both the train and the test table for 'ISA', mapped to
# their test-document ids, plus a tally of how often each document id occurs
# across those authors.
author_candidates = {}
count_document_ids = Counter()

for user_id in train_author_table_by_type['ISA']:
    if user_id not in test_author_table_by_type['ISA']:
        continue
    test_document_ids = test_author_table_by_type['ISA'][user_id]
    author_candidates[user_id] = test_document_ids
    count_document_ids.update(test_document_ids)


In [None]:
# The 20 document ids that occur most often across the candidate authors.
count_document_ids.most_common(n=20)

In [None]:
# Deliberate stop so that "Run All" halts here, before the cells below.
# Raising a plain string is not valid in Python 3: it fails with a TypeError
# about exception types, which hides the fact that the stop is intentional.
raise RuntimeError("pause")

In [None]:
# Positional row of the 'ISA' test split to examine.
# NOTE(review): which document sits at row 285 depends on the train/test
# split; it only stays the same across runs if that split is seeded.
doc_id = 285
# Tokens of the selected document.
test_by_type['ISA'].iloc[doc_id].GLOMUNSTEM.split()

In [None]:
# Model used by the cells that follow.
isa_atm = model_by_type['ISA']

# Bag-of-words encoding of the selected test document. The column is read
# through `target` so it stays in step with the column the dictionary was
# built from.
doc = dictionary_by_type['ISA'].doc2bow(
    test_by_type['ISA'].iloc[doc_id][target].split()
)
# Kept as the last expression so the encoding is actually displayed; it was
# previously followed by an assignment and therefore never shown.
doc

In [None]:
# Dense matrix of author topic distributions: one row per author id, one
# column per topic.
author_topic_vectors = np.zeros(
    (isa_atm.num_authors, isa_atm.num_topics)
)

# Index rows by the model's own author id rather than by enumeration order:
# the ranking cell maps row numbers back through `isa_atm.id2author`, so the
# two must agree by construction.
for author_id, author in isa_atm.id2author.items():
    # minimum_probability=0 asks for the whole distribution. The default
    # threshold drops low-probability topics, leaving rows that no longer
    # sum to one.
    author_topics = isa_atm.get_author_topics(author, minimum_probability=0)
    for topic_id, probability in author_topics:
        author_topic_vectors[author_id, topic_id] = probability

In [None]:
# Run the model's inference step on the single selected document.
# NOTE(review): author2doc and doc2author are passed as empty dicts — confirm
# the installed gensim version accepts that for a document with no known
# authors.
gamma_chunk, sstats = isa_atm.inference(
    chunk=[doc], author2doc=dict(), doc2author=dict(), 
    rhot=1.00,
    collect_sstats=True
)

# Scale gamma so that its largest entry equals 1.
# NOTE(review): this divides by the maximum, not the sum, so the result is
# not a probability distribution; `doc_topics` is also not read by any of the
# following cells — confirm what was intended.
doc_topics = gamma_chunk / gamma_chunk.max()

In [None]:
import numpy as np
from scipy.spatial.distance import cdist

# 1 / sqrt(2), the scale factor of the Hellinger distance.
isr2 = 2.0 ** -.5


def hellinger(x, y):
    """Return the Hellinger distance between arrays `x` and `y`.

    Computed as ``isr2 * sqrt(sum((sqrt(x) - sqrt(y)) ** 2))``.
    """
    root_difference = np.sqrt(x) - np.sqrt(y)
    squared_total = (root_difference ** 2).sum()
    return isr2 * np.sqrt(squared_total)

# Hellinger distance compares probability distributions. The author rows are
# probabilities, while gamma is unnormalised, so scale each gamma row to sum
# to one before measuring distances.
doc_topic_distribution = gamma_chunk / gamma_chunk.sum(axis=1, keepdims=True)

# For each row, the author row numbers ordered from nearest to farthest.
author_scores = np.argsort(
    cdist(doc_topic_distribution, author_topic_vectors, metric=hellinger)
)

# Keep the `top_k` nearest authors for the first (only) document row.
top_k = 200
contenders = [
    isa_atm.id2author[idx]
    for idx in author_scores[0, :top_k]
]

print(*contenders[:10], sep='\n')
print(len(contenders))

In [None]:
from itertools import chain

# Walk the contenders in ranked order and print the rank of the first one
# linked to `doc_id` in the 'ISA' test table. Along the way print '*' for a
# contender absent from that table and '-' for one present but not linked to
# this document.
for idx, author in enumerate(contenders):
    if author not in test_author_table_by_type['ISA']:
        print('*', end='')
        continue

    if doc_id not in test_author_table_by_type['ISA'][author]:
        print('-', end='')
        continue

    print(idx)
    break
