In [None]:
import itertools
import os

from pprint import pprint

import spacy
import numpy
import umap

from headless import load_pages
from scipy.spatial import KDTree

import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
# Load the medium English spaCy model with the tagger, parser and NER components
# disabled; the code below only reads per-token vectors (see load_one_sp_embedding).
en_nlp = spacy.load('en_core_web_md', disable=['tagger', 'parser', 'ner'])

In [None]:
# Default number of leading Fourier coefficients kept per embedding dimension.
# Used as the default `n_bands` of embedding_fft; unflatten_vec defaults to
# 2 * N_BANDS columns (one real and one imaginary value per band).
N_BANDS = 50

def volume_paths(path):
    """Return the volumes found directly inside ``path``.

    A volume is any entry that is a directory or whose name ends in
    ``.zip``.  The returned paths are prefixed with ``path`` and ordered
    by entry name.
    """
    found = []
    for entry in sorted(os.listdir(path)):
        candidate = os.path.join(path, entry)
        if os.path.isdir(candidate) or candidate.endswith('.zip'):
            found.append(candidate)
    return found

def numpy_paths(path):
    """Return the ``.npy`` files found directly inside ``path``.

    The returned paths are prefixed with ``path`` and ordered by file name.
    """
    found = []
    for entry in sorted(os.listdir(path)):
        candidate = os.path.join(path, entry)
        if candidate.endswith('.npy'):
            found.append(candidate)
    return found

def load_one_sp_embedding(volume_path):
    """Run one volume's pages through spaCy and collect its word vectors.

    The pages yielded by ``load_pages(volume_path)`` are streamed through
    ``en_nlp.pipe``.  The result is an array with one row per token, in
    reading order, where each row is that token's flattened vector.
    """
    token_vectors = []
    for page_doc in en_nlp.pipe(load_pages(volume_path)):
        for token in page_doc:
            token_vectors.append(token.vector.reshape(-1))
    return numpy.array(token_vectors)

def piecewise_avg(vec, n_groups):
    """Split ``vec`` into ``n_groups`` consecutive chunks and average each one.

    Chunk boundaries are placed at ``int(len(vec) / n_groups * k)`` for
    ``k = 1 .. n_groups``; the final boundary is forced to ``len(vec)`` so
    that floating-point rounding can never drop trailing elements.

    Returns a 1-d array with one mean per chunk.  Note that when ``vec`` has
    fewer elements than ``n_groups`` some chunks are empty, and their entry
    is the result of dividing an empty sum by zero (NaN for numpy arrays).
    """
    step = len(vec) / n_groups
    boundaries = [int(step * k) for k in range(1, n_groups + 1)]
    boundaries[-1] = len(vec)

    # Every chunk starts where the previous one ended; the first starts at 0.
    starts = [0] + boundaries[:-1]
    return numpy.array([vec[lo:hi].sum() / (hi - lo)
                        for lo, hi in zip(starts, boundaries)])
        
def embedding_fft(sp_embedding, n_bands=N_BANDS, flatten=False):
    """
    Fourier-transform every dimension of a document's word embeddings.

    `sp_embedding` is a 2-d array with one row per token of the
    document (as produced by `load_one_sp_embedding`) and one column
    per dimension of the underlying word-vector model.

    Each column is first reduced with `piecewise_avg` to a fixed
    number of samples -- the smallest power of two that is at least
    `10 * n_bands` -- then transformed with `numpy.fft.rfft`, and only
    the first `n_bands` coefficients are kept.

    Returns a complex array with one row per embedding dimension, or,
    when `flatten` is true, the result of `flatten_fft` on those rows.
    """
    # Smallest power of two >= 10 * n_bands.
    n_samples = 1
    while n_samples < n_bands * 10:
        n_samples *= 2

    spectra = [
        numpy.fft.rfft(piecewise_avg(sp_embedding[:, dim], n_samples))[:n_bands]
        for dim in range(sp_embedding.shape[1])
    ]

    return flatten_fft(spectra) if flatten else numpy.array(spectra)

def flatten_fft(emb_fft):
    """Flatten a complex fft array into one real vector.

    The values are taken in row-major order and each one contributes two
    consecutive entries: its real part followed by its imaginary part.
    """
    flat = numpy.array(emb_fft).reshape(-1)
    # Pair (real, imag) along a new last axis, then unroll the pairs in order.
    return numpy.stack((flat.real, flat.imag), axis=-1).reshape(-1)

def unflatten_vec(doc_vector, cols=N_BANDS * 2):
    """Turn a flattened document vector back into a complex fft array.

    This is the inverse of `flatten_fft`: `doc_vector` holds interleaved
    (real, imag) values, `cols` of them per row of the result (i.e. two
    per frequency band).

    The number of rows is inferred from the length of `doc_vector`, so the
    function works for any embedding dimensionality rather than only for
    300-dimensional vectors.

    Returns a complex array with `cols // 2` columns.
    Raises ValueError if `cols` is odd or does not divide the vector length.
    """
    if cols % 2:
        # An odd width cannot be split into (real, imag) pairs.
        raise ValueError('cols must be even, got {}'.format(cols))
    array = doc_vector.reshape(-1, cols)
    real = array[:, ::2]
    imag = array[:, 1::2]
    return real + imag * 1j
    
def test_fft_reshape(volume_path):
    """Assert that flattening and unflattening a volume's fft round-trips."""
    round_trip_ok = _test_fft_reshape_one(volume_path)
    assert round_trip_ok

def _test_fft_reshape_one(folder):
    """Check the flatten/unflatten round trip for a single volume.

    Computes the volume's fft array directly, and again by flattening it
    and passing the result through `unflatten_vec`; returns whether the
    two arrays are element-wise equal.
    """
    embedding = load_one_sp_embedding(folder)
    expected = embedding_fft(embedding, flatten=False)
    flattened = embedding_fft(embedding, flatten=True)
    return (expected == unflatten_vec(flattened)).all()

def vol_path_to_npy_path(vol_path):
    """Strip a trailing ``.zip`` from a volume path; other paths are unchanged."""
    if vol_path.endswith('.zip'):
        return vol_path[:-4]
    return vol_path

def save_sp_embeddings(source_path, dest_path=None):
    """Extract the word vectors of every volume and save them as numpy files.

    For each volume in `source_path` (see `volume_paths`) the token
    embedding array from `load_one_sp_embedding` is written with
    `numpy.save` to a file in `dest_path` named after the volume (with any
    `.zip` suffix removed).  `dest_path` defaults to `source_path` and is
    created if it does not exist yet.
    """
    dest_path = source_path if dest_path is None else dest_path
    # numpy.save does not create missing directories.
    os.makedirs(dest_path, exist_ok=True)

    # Volumes are processed one at a time so only one embedding is in memory.
    for vol_path in volume_paths(source_path):
        vol_name = os.path.split(vol_path)[-1]
        out_path = vol_path_to_npy_path(os.path.join(dest_path, vol_name))
        numpy.save(out_path, load_one_sp_embedding(vol_path))

def save_embedding_ffts(source_path, dest_path=None):
    """Compute the embedding fft of every volume and save it as a numpy file.

    For each volume in `source_path` (see `volume_paths`) the token
    embeddings are loaded with `load_one_sp_embedding`, transformed with
    `embedding_fft` (default arguments, i.e. not flattened), and written
    with `numpy.save` to a file in `dest_path` named after the volume (with
    any `.zip` suffix removed).  `dest_path` defaults to `source_path` and
    is created if it does not exist yet.
    """
    dest_path = source_path if dest_path is None else dest_path
    # numpy.save does not create missing directories.
    os.makedirs(dest_path, exist_ok=True)

    # Volumes are processed one at a time so only one embedding is in memory.
    for vol_path in volume_paths(source_path):
        vol_name = os.path.split(vol_path)[-1]
        out_path = vol_path_to_npy_path(os.path.join(dest_path, vol_name))
        numpy.save(out_path, embedding_fft(load_one_sp_embedding(vol_path)))

def load_embedding_fft_array(path):
    """Load every ``.npy`` fft file in ``path`` as one array of flat vectors.

    Row order follows `numpy_paths(path)`; each row is the `flatten_fft`
    of one saved file.
    """
    rows = []
    for npy_file in numpy_paths(path):
        rows.append(flatten_fft(numpy.load(npy_file)))
    return numpy.array(rows)

def load_embedding_ffts(path, flatten=True):
    """Load every ``.npy`` fft file in ``path`` into a dict keyed by name.

    Keys are the file names with their last four characters (the ``.npy``
    suffix) removed.  Values are the loaded arrays, passed through
    `flatten_fft` when `flatten` is true.
    """
    loaded = {}
    for npy_file in numpy_paths(path):
        _, file_name = os.path.split(npy_file)
        key = file_name[:-4]
        array = numpy.load(npy_file)
        loaded[key] = flatten_fft(array) if flatten else array
    return loaded

def show_dataset(folder, n=None):
    """Print the first 500 characters of the ``.txt`` files in ``folder``.

    Files are visited in sorted path order; only the first ``n`` are shown
    (all of them when ``n`` is None).
    """
    txt_paths = sorted(os.path.join(folder, name)
                       for name in os.listdir(folder)
                       if name.endswith('.txt'))
    for txt_path in txt_paths[:n]:
        with open(txt_path) as handle:
            contents = handle.read()
        print(contents[:500])

def show_umap(data):
    """Print the shape of ``data`` and scatter-plot its UMAP projection.

    The projection uses cosine distance with ``n_neighbors=20`` and
    ``min_dist=0.001``.  Points are coloured by their row position in
    ``data`` (scaled to the range [0, 1)) using the 'plasma' colour map.
    """
    print(data.shape)
    reducer = umap.UMAP(n_neighbors=20, min_dist=0.001, metric='cosine')
    projected = reducer.fit_transform(data)

    n_points = len(projected)
    colors = [rank / n_points for rank in range(n_points)]

    axes = plt.gca()
    axes.axis('equal')
    plt.scatter(projected[:, 0], projected[:, 1], c=colors, cmap='plasma')
    plt.show()

In [None]:
# Data locations. `path_to_volumes` is the folder of source volumes (scanned by
# volume_paths); `path_to_fft` is the folder of saved .npy fft files read by the
# load_embedding_* helpers.
# Alternative source location, currently disabled:
# path_to_volumes = '/media/secure_volume/workset/orig'
path_to_volumes = '../ht-open-test-data/gov_docs/'
path_to_fft = '../ht-open-test-data/gov_docs_fft'

In [None]:
# Round-trip check (flatten -> unflatten) on the first two volumes only.
for test_path in volume_paths(path_to_volumes)[:2]:
    print(test_path)
    try:
        test_fft_reshape(test_path)
    except Exception as e:
        # A failed bare `assert` has an empty str(), so print the repr to keep
        # the exception type visible instead of emitting a blank line.
        print('FAILED:', repr(e))
    else:
        print('ok')

In [None]:
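# Preprocessing step, left disabled: uncomment to (re)generate the fft .npy files
# in path_to_fft that the cells below load.
# save_embedding_ffts(path_to_volumes, path_to_fft)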

In [None]:
# Scatter plot of the UMAP projection of all flattened fft vectors in path_to_fft.
show_umap(load_embedding_fft_array(path_to_fft))

In [None]:
# Load the flattened fft vector of every volume, keyed by volume id.
data_dict = load_embedding_ffts(path_to_fft)

# Split into parallel tuples: files[i] is the id of the vector data[i].
files, data = zip(*data_dict.items())

# Reduce the vectors to 10 UMAP dimensions using cosine distance.
data_umap = umap.UMAP(
    n_neighbors=5,
    n_components=10,
    metric='cosine',
).fit_transform(data)

In [None]:
# Sanity check: should be (number of volumes, 10) given n_components=10.
data_umap.shape

In [None]:
# Index the UMAP coordinates in a k-d tree for the neighbour query below.
data_umap_kd = KDTree(data_umap)

In [None]:
# All index pairs whose points lie within distance 0.02 of each other in UMAP
# space; indices refer to rows of data_umap (and so to `files` / `data`).
# NOTE(review): 0.02 is an unexplained threshold -- presumably tuned by hand.
pairs = list(data_umap_kd.query_pairs(0.02))
pairs

In [None]:
# Euclidean distance between the flattened fft vectors of two volumes
# known to be duplicates, followed by a plot of the per-component difference.
diff = data_dict['mdp.39015001704199'] - data_dict['mdp.39015048784154']
print((diff * diff).sum() ** 0.5)
plt.plot(diff)

In [None]:
# Euclidean distance between the flattened fft vectors of a second pair of
# volumes known to be duplicates, followed by a plot of the per-component
# difference.
diff = data_dict['mdp.39015001704199'] - data_dict['nc01.ark+=13960=t2j68kf84']
print((diff * diff).sum() ** 0.5)
plt.plot(diff)

In [None]:
# Euclidean distance between the flattened fft vectors of an arbitrary
# (non-duplicate) pair, for comparison with the duplicate pairs; followed by
# a plot of the per-component difference.
diff = data_dict['mdp.39015001704199'] - data_dict['mdp.39015041912760']
print((diff * diff).sum() ** 0.5)
plt.plot(diff)

In [None]:
# NOTE(review): bare tuple of two volume ids; it is only displayed and not
# assigned or used elsewhere in the visible cells -- presumably a pair noted
# for later inspection.
'mdp.39015029726430', 'inu.39000000259478'

In [None]:
# Ten closest candidate pairs, ranked by Euclidean distance in the original
# (flattened fft) space.
pair_dists = numpy.array([((data[x] - data[y]) ** 2).sum() ** 0.5
                          for x, y in pairs])
smallest_ix = pair_dists.argsort()[:10]
print("closest pairs in base space")
pprint(list(zip([pairs[i] for i in smallest_ix], pair_dists[smallest_ix])))

# Ten closest candidate pairs, ranked by Euclidean distance in UMAP space.
# The ranking and the printed distances must both come from pair_umap_dists;
# reusing pair_dists here would just repeat the base-space listing.
pair_umap_dists = numpy.array([((data_umap[x] - data_umap[y]) ** 2).sum() ** 0.5
                               for x, y in pairs])
smallest_ix = pair_umap_dists.argsort()[:10]
print("closest pairs in umap space")
pprint(list(zip([pairs[i] for i in smallest_ix], pair_umap_dists[smallest_ix])))

In [None]:
# Volume ids for the first five candidate pairs.
pair_ids = [(files[i], files[j]) for i, j in pairs[:5]]
pair_ids

In [None]:
# One figure per candidate pair: per-component difference of the two
# flattened fft vectors.
for a, b in pair_ids:
    diff = data_dict[a] - data_dict[b]
    plt.plot(diff)
    plt.show()

In [None]:
# One complex fft array per volume (300 rows x N_BANDS columns with
# unflatten_vec's defaults), each pre-scaled by 1 / len(data) so that their
# sum is the mean over all volumes.
data_freq_array = [unflatten_vec(r) / len(data) for r in data]

# Sum into a fresh array. Accumulating with `+=` into data_freq_array[0]
# would overwrite the first volume's entry in place.
data_freq_mean = numpy.sum(data_freq_array, axis=0)

In [None]:
power_a = data_freq_mean[:, 2::2]
power_b = data_freq_mean[:, 3::2]
power = (power_a * power_a + power_b * power_b) ** 0.5

mean_power = power.sum(axis=0) / 300
plt.plot(mean_power)
plt.plot(power[2])

In [None]:
# Inverse FFT of the mean power curve, skipping its first two output samples.
# NOTE(review): ifft returns complex values; presumably only the real part
# ends up plotted -- confirm that is intended.
plt.plot(numpy.fft.ifft(mean_power)[2:])