In [None]:
import itertools
import os
import zipfile
import csv
import textwrap
import time
import multiprocessing

from difflib import SequenceMatcher
from pprint import pprint
from collections import Counter, deque

import spacy
import numpy
import pandas
import umap

import phasor

from headless import load_pages
from scipy.spatial import cKDTree
from sklearn.neighbors import BallTree
from pyhash import city_64

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, TapTool, OpenURL, ColumnDataSource
from bokeh.palettes import magma

# Send Bokeh output to the notebook so figures render inline in cell output.
output_notebook()

In [None]:
# Name of the workset; it is interpolated into the secure-volume paths below.
# Alternative value used previously: 'test-batch'.
dataset = 'rand-fiction-1k'

# Both tuples share one layout, matching the names unpacked at the bottom:
#   (volumes directory, FFT directory, SRP FFT directory, metadata CSV)
secure_paths = (
    f'/media/secure_volume/volumes/zip/{dataset}',
    f'/media/secure_volume/derived/{dataset}/fft',
    f'/media/secure_volume/derived/{dataset}/srp_fft',
    f'/media/secure_volume/worksets/{dataset}-hathifiles.csv',
)

test_paths = (
    '../ht-open-test-data/fiction_998/',
    '../ht-open-test-data/fiction_fft',
    '../ht-open-test-data/fiction_srp_fft',
    '../ht-open-test-data/fiction.csv',
)

# Choose which set of locations the rest of the notebook reads and writes.
# Swap `test_paths` for `secure_paths` to run against the secure volume.
path_to_volumes, path_to_fft, path_to_srp_fft, path_to_meta = test_paths

In [None]:
# for test_path in phasor.volume_paths(path_to_volumes)[:3]:
#     print('testing with {}'.format(test_path))
#     try:
#         phasor.test_fft_reshape(test_path, srp=False)
#     except Exception as e:
#         print('skipping {} -- {}: {}'.format(test_path, type(e), e))

# for test_path in phasor.volume_paths(path_to_volumes):
#     phasor.test_htid_conversion(test_path)

In [None]:
# Run phasor.save_embedding_ffts over the volume directory twice: first with
# srp=False targeting path_to_fft, then with srp=True targeting
# path_to_srp_fft.
# NOTE(review): `paths` is rebound by the second call, so only the srp=True
# call's return value is still reachable afterwards.
paths = phasor.save_embedding_ffts(path_to_volumes, path_to_fft, srp=False)
paths = phasor.save_embedding_ffts(path_to_volumes, path_to_srp_fft, srp=True)

In [None]:
# Load the stored FFT vectors together with their metadata table.
data, metadata = phasor.load_fft_metadata(
    path_to_fft,
    path_to_meta,
    end=20,
    csv_delim=',',
)

# Zero out every NaN entry, modifying the loaded array in place.
nan_mask = numpy.isnan(data)
data[nan_mask] = 0
# phasor.show_umap_bokeh(
#     phasor.slice_vec_bands(data, start=0, end=1),
#     metadata,
#     n_neighbors=50
#     # color_field='pub_date'
# )

In [None]:
# Keyword arguments shared by every band-sliced Deduplicator built below.
kwargs = {
    'n_neighbors': 10,
    'n_components': 5,
    'metric': 'euclidean',
    # 'random': True,
}

# One deduplicator over bands 0-9 taken together ...
dedupe_slice_full = phasor.Deduplicator(
    phasor.slice_vec_bands(data, start=0, end=10),
    **kwargs
)

# ... and one deduplicator per individual band.
dedupe_slices = []
for band in range(10):
    band_data = phasor.slice_vec_bands(data, start=band, end=band + 1)
    dedupe_slices.append(phasor.Deduplicator(band_data, **kwargs))

# Start from the band-0 deduplicator and merge bands 1-4 into it.
# NOTE(review): the exact semantics of Deduplicator(other) and .merge() live
# in phasor and are not visible here.
dedupe_boolean = phasor.Deduplicator(dedupe_slices[0])
for ds in dedupe_slices[1:5]:
    dedupe_boolean.merge(ds)

In [None]:
radius = 0.8

# Ask each deduplicator for its candidate pairs at the same radius.
pairs_boolean = dedupe_boolean(radius)
pairs_single = dedupe_slices[0](radius)
pairs_full = dedupe_slice_full(radius)

candidate_counts = (
    ("Boolean test:     ", pairs_boolean),
    ("Single-band test: ", pairs_single),
    ("Full-band test:   ", pairs_full),
)

print("Number of candidates found by each test")
print()
for label, found in candidate_counts:
    print(label, len(found))

In [None]:
def _same_title_pairs(pairs, titles):
    """Keep only the candidate pairs whose two volumes have identical titles.

    Parameters
    ----------
    pairs : iterable of (int, int)
        Candidate pairs given as row positions.
    titles : pandas.Series
        Title column, in the same row order the positions refer to.

    Returns
    -------
    set of frozenset
        Each surviving pair as a frozenset, so (a, b) and (b, a) collapse
        into a single entry.
    """
    # .iloc makes the positional lookup explicit; a bare integer key on a
    # Series with a non-integer index relies on a deprecated pandas fallback.
    return {
        frozenset((a, b)) for a, b in pairs
        if titles.iloc[a] == titles.iloc[b]
    }


# Fetch the column once instead of once per comparison.
volume_titles = metadata['title']

likely_true_positives_boolean = _same_title_pairs(pairs_boolean, volume_titles)
likely_true_positives_single = _same_title_pairs(pairs_single, volume_titles)
likely_true_positives_full = _same_title_pairs(pairs_full, volume_titles)

print("Number of likely duplicates (based on title) found by each test")
print()
print("Boolean test:     ", len(likely_true_positives_boolean))
print("Single-band test: ", len(likely_true_positives_single))
print("Full-band test:   ", len(likely_true_positives_full))
print()
print("Number of identically-titled volumes missed by single test, caught by boolean test:")
print(len(likely_true_positives_boolean - likely_true_positives_single))
print()
print("Number of identically-titled volumes missed by boolean test, caught by single test:")
print(len(likely_true_positives_single - likely_true_positives_boolean))
print()
print("NOTE: Many false positives will still appear in these counts because "
      "different volumes from multi-volume works may have the same title even "
      "though they do not contain the same content. This accounts for many of the "
      "matches captured by the single-band but not by the boolean test. The "
      "single-band test captures the broad semantic similarity between volumes of "
      "the same work, but can't make fine-grained distinctions between individual "
      "volumes of the work. A single ten-volume work can produce as many as one "
      "hundred false positives here, so this can give the impression that the "
      "boolean test has missed many duplicates. Hand check a few and you'll "
      "probably see that it hasn't.")

In [None]:
# Materialize the boolean-test candidates as a list; this is the input that
# is clustered further down in this cell.
pairs = list(pairs_boolean)

def get_root(links, ix):
    """Follow parent links from `ix` until reaching a self-linked entry.

    `links` is a list in which links[i] holds the parent of item i; an item
    whose entry equals its own index is a root. Returns that root index.
    `links` is not modified.
    """
    parent = links[ix]
    while parent != ix:
        ix = parent
        parent = links[ix]
    return ix

def set_root(links, ix, root):
    """Point `ix` and every ancestor on its chain directly at `root`.

    Walks the parent chain starting at `ix`; each visited entry of `links`,
    including the chain's old root, is overwritten with `root`. Mutates
    `links` in place and returns None.
    """
    node = ix
    while True:
        # Read the old parent before overwriting, so the walk can continue.
        parent = links[node]
        links[node] = root
        if parent == node:
            # `node` was the old root of the chain; nothing further to visit.
            break
        node = parent

def cluster_pairs(pairs, maxn=None):
    """Group item indices into clusters of transitively linked pairs.

    Every pair (a, b) links item a to item b; items connected through any
    chain of links end up in the same cluster. The links are tracked with
    get_root / set_root over a parent list.

    Parameters
    ----------
    pairs : iterable of (int, int)
        Index pairs to link. May be empty, and may be a one-shot iterator.
    maxn : int, optional
        Total number of items; indices run over range(maxn). Items that
        appear in no pair come back as one-item clusters. When omitted it
        is one more than the largest index found in `pairs` (zero when
        `pairs` is empty).

    Returns
    -------
    list of list of int
        One list of member indices per cluster, members in ascending order.
        An empty list when there are no items at all.
    """
    # Materialize once: `pairs` is scanned twice when maxn must be inferred,
    # which would exhaust a generator before any linking happened.
    pairs = list(pairs)
    if maxn is None:
        # default=-1 turns "no pairs" into maxn == 0 rather than ValueError.
        maxn = max((x for p in pairs for x in p), default=-1) + 1

    dupe_link = list(range(maxn))
    for a, b in pairs:
        a_root = get_root(dupe_link, a)
        set_root(dupe_link, a, a_root)
        set_root(dupe_link, b, a_root)

    # Bucket every item under its final root; dicts keep insertion order, so
    # clusters appear in order of their lowest-numbered member.
    dupe_clusters = {}
    for i in range(maxn):
        dupe_clusters.setdefault(get_root(dupe_link, i), []).append(i)

    return list(dupe_clusters.values())

# Cluster over all rows of `data`, so unpaired rows become one-item clusters.
dupe_clusters = cluster_pairs(pairs, len(data))

cluster_sizes = [len(c) for c in dupe_clusters]
print('Total number of items:', sum(cluster_sizes))
print('Number of clusters:', len(dupe_clusters))
print('Largest cluster:', max(cluster_sizes))
print('Number of one-item clusters:', cluster_sizes.count(1))

In [None]:
def sort_cluster(indices, data):
    """Order a cluster's members by Euclidean distance from its centroid.

    `indices` lists the rows of `data` that belong to the cluster. The
    centroid is the mean of those rows. Returns a list of
    (index, distance) tuples sorted by ascending distance; members at equal
    distance keep their order from `indices`.
    """
    centroid = sum(data[i] for i in indices) / len(indices)

    scored = []
    for member in indices:
        offset = data[member] - centroid
        scored.append((member, (offset ** 2).sum() ** 0.5))

    # list.sort is stable, so ties retain their input order.
    scored.sort(key=lambda pair: pair[1])
    return scored

def mean_dist(cluster_dist):
    """Average the distances in a sequence of (index, distance) tuples."""
    total = 0
    for _member, dist in cluster_dist:
        total += dist
    return total / len(cluster_dist)

# Rank the members of every multi-item cluster by distance from its centroid,
# then order the clusters by ascending mean member distance.
display_clusters = [sort_cluster(dc, data)
                    for dc in dupe_clusters if len(dc) > 1]
display_clusters.sort(key=mean_dist)

# Fetch each column once. Rows are read by position (.iloc) because cluster
# members are row offsets, the same way metadata.index[vol] is used below; a
# bare integer key on a Series with a non-integer index relies on a
# deprecated pandas fallback.
title_column = metadata['title']
author_column = metadata['author']

for i, dc in enumerate(display_clusters):
    print()
    print(f'Cluster {i}, {len(dc)} items:')
    for j, (vol, dist) in enumerate(dc):
        vol_id = metadata.index[vol]
        vol_ti = title_column.iloc[vol]
        vol_au = author_column.iloc[vol]
        print('    Item', j, ' ~~  distance from cluster centroid:', dist)
        print('   ', vol_au, ' ~~ ', vol_ti)
        print('   ', phasor.htid_url(vol_id))
        print()

In [None]:
# Unflatten every row of `data` and pre-divide by the number of rows, so that
# adding all the pieces together yields their mean.
data_freq_array = [phasor.unflatten_vec(r) / len(data) for r in data]  # 300 rows, 20 cols in each array, ~1000 arrays

# Accumulate into a copy. `+=` updates its left operand in place (assuming
# these are numpy arrays, as the slicing in later cells suggests), so
# starting from data_freq_array[0] itself would overwrite the first list
# entry with the running sum.
data_freq_mean = data_freq_array[0].copy()
for dfa in data_freq_array[1:]:
    data_freq_mean += dfa

In [None]:
# Magnitude of each complex entry of the mean array, skipping column 0.
freq_bands = data_freq_mean[:, 1:]
power_a = freq_bands.real
power_b = freq_bands.imag
power = (power_a * power_a + power_b * power_b) ** 0.5

# Average over rows using the actual row count rather than a hard-coded 300,
# so the result stays a true mean if the array has a different height.
mean_power = power.sum(axis=0) / power.shape[0]
plt.plot(mean_power)

In [None]:
# Normalize the mean curve by its own mean, and each column of `power` by
# that column's mean over rows.
scaled_mean = mean_power / mean_power.mean()
scaled_power = power / power.mean(axis=0)

# Per row: sum of squared differences from the scaled mean curve, each taken
# relative to the row's own scaled value.
scaled_diffs = numpy.array([
    (((row - scaled_mean) / row) ** 2).sum()
    for row in scaled_power
])

# Row positions ordered from smallest to largest scaled difference.
scaled_diffs_argsort = scaled_diffs.argsort()

# One figure per group of 30 rows, in sorted order. Slicing the sorted
# positions (instead of indexing a fixed 10 x 30 grid) covers every row and
# cannot run past the end when the row count is not exactly 300.
chunk_size = 30
for start in range(0, len(scaled_diffs_argsort), chunk_size):
    for row_ix in scaled_diffs_argsort[start:start + chunk_size]:
        spectrum = power[row_ix]
        plt.plot(spectrum / spectrum.mean())
    plt.show()




In [None]:
# Rank of each sorted position expressed as a fraction of the total count.
n_dims = len(scaled_diffs)
rank_fraction = numpy.arange(n_dims) / n_dims

# 1.0 for entries whose scaled difference ranks above the 0.99 fraction,
# 0.0 elsewhere; reshaped to a single row.
unstable_vec = numpy.zeros(n_dims, dtype=numpy.float64)
unstable_vec[scaled_diffs_argsort] = rank_fraction > 0.99
unstable_vec = unstable_vec.reshape(1, -1)

# 1.0 for entries whose scaled difference ranks below the 0.01 fraction,
# 0.0 elsewhere; reshaped to a single row.
stable_vec = numpy.zeros(n_dims, dtype=numpy.float64)
stable_vec[scaled_diffs_argsort] = rank_fraction < 0.01
stable_vec = stable_vec.reshape(1, -1)

def get_similar(vec, comp_word):
    """Return the vocabulary string closest to `vec` plus a word's vector.

    Looks up the vector stored for `comp_word` in phasor.en_nlp's vocabulary,
    adds it to `vec`, asks the vector table for the most similar entry, and
    maps the resulting key back to its string.

    NOTE(review): presumably phasor.en_nlp is a spaCy pipeline and `vec` is
    a (1, dim) array matching its vector width - confirm against phasor.
    """
    vocab = phasor.en_nlp.vocab
    word_key = vocab.strings[comp_word]
    query = vec + vocab.vectors[word_key]
    # First element of the result, then the first entry for the single query.
    best_key = vocab.vectors.most_similar(query)[0][0]
    return vocab.strings[best_key]

# get_similar(unstable_vec, "lost")