In [None]:
import itertools
import os

import spacy
import numpy
import umap
from scipy.spatial import KDTree

import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
# Load the medium English spaCy model with the tagger, parser and NER components
# disabled; the functions below only read the per-token `.vector` from each doc.
en_nlp = spacy.load('en_core_web_md', disable=['tagger', 'parser', 'ner'])

In [None]:
def load_page_iter(folder):
    """Yield the text of every ``.txt`` file in ``folder``, one file at a time.

    Files are read in sorted filename order. ``os.listdir`` returns entries in
    arbitrary order, and ``load_one_embedding`` concatenates the pages into one
    token sequence, so the order has to be deterministic.
    NOTE(review): this assumes page files are named so that lexicographic
    order is reading order (e.g. zero-padded page numbers) -- confirm.

    Args:
        folder: Path of a directory containing UTF-8 encoded ``.txt`` files.

    Yields:
        The complete contents of each ``.txt`` file as a ``str``.
    """
    page_names = sorted(f for f in os.listdir(folder) if f.endswith('.txt'))
    for name in page_names:
        with open(os.path.join(folder, name), encoding='UTF-8') as ip:
            yield ip.read()

def load_one_embedding(folder):
    """Return the word vectors of every token on every page in ``folder``.

    The pages come from ``load_page_iter`` and are run through ``en_nlp.pipe``;
    each token contributes its flattened ``.vector`` as one row.

    Args:
        folder: Directory of ``.txt`` page files.

    Returns:
        A numpy array with one row per token, in page and token order.  When no
        tokens are found the result is an empty 1-D array.
    """
    token_vectors = []
    for page_doc in en_nlp.pipe(load_page_iter(folder)):
        for token in page_doc:
            token_vectors.append(token.vector.reshape(-1))
    return numpy.array(token_vectors)

def group_avg(vec, n_groups):
    """Downsample a 1-D sequence to ``n_groups`` values by averaging contiguous runs.

    The sequence is cut at ``int(len(vec) / n_groups * i)`` for
    ``i = 1 .. n_groups`` (the last cut is pinned to ``len(vec)``) and each run
    is replaced by its mean.

    When ``vec`` has fewer items than ``n_groups`` some runs would be empty,
    which previously produced ``0 / 0 = nan`` entries.  Such a run now takes the
    single item at its start position instead, so short inputs are effectively
    upsampled by repetition.  Inputs with at least ``n_groups`` items are
    averaged exactly as before.

    Args:
        vec: 1-D array-like of numbers.
        n_groups: Number of output values; must be at least 1.

    Returns:
        A numpy array of length ``n_groups`` holding the run means.

    Raises:
        ValueError: If ``vec`` is empty or ``n_groups`` is less than 1.
    """
    vec = numpy.asarray(vec)
    n_items = len(vec)
    if n_groups < 1:
        raise ValueError('n_groups must be at least 1')
    if n_items == 0:
        raise ValueError('cannot average an empty vector')

    size = n_items / n_groups
    means = []
    start = 0
    for i in range(1, n_groups + 1):
        # Pin the final boundary to the end so rounding never drops items.
        end = n_items if i == n_groups else int(size * i)
        # Guarantee at least one item per run (only matters for short inputs).
        stop = max(end, start + 1)
        means.append(vec[start:stop].sum() / (stop - start))
        start = end

    return numpy.array(means)
        
def fft_vec(vecs, n_bands=10, unpack=True):
    """Summarise each column of ``vecs`` by its lowest ``n_bands`` FFT coefficients.

    Every column is first reduced with ``group_avg`` to ``n_groups`` samples,
    where ``n_groups`` is the smallest power of two that is at least
    ``4 * n_bands``.  The real-input FFT of those samples is then truncated to
    its first ``n_bands`` coefficients.

    Args:
        vecs: 2-D array; each column is transformed independently.
        n_bands: Number of leading FFT coefficients kept per column.
        unpack: When true, return one flat real vector in which every complex
            coefficient is stored as a (real, imaginary) pair; otherwise return
            the complex coefficients as a 2-D array with one row per column.

    Returns:
        A flat real array (``unpack=True``) or a complex 2-D array.
    """
    # Smallest power of two >= 4 * n_bands.
    n_groups = 1
    while n_groups < n_bands * 4:
        n_groups *= 2

    spectra = numpy.array([
        numpy.fft.rfft(group_avg(vecs[:, col], n_groups))[:n_bands]
        for col in range(vecs.shape[1])
    ])

    if not unpack:
        return spectra

    # Interleave as real0, imag0, real1, imag1, ... in row-major order.
    flat = spectra.reshape(-1)
    return numpy.column_stack((flat.real, flat.imag)).reshape(-1)

def test_fft_reshape(folder):
    """Check the flat/complex FFT round trip on up to ten subdirectories of ``folder``.

    Raises:
        AssertionError: If ``_test_fft_reshape_one`` reports a mismatch for any
            of the checked subdirectories.
    """
    entries = (os.path.join(folder, name) for name in os.listdir(folder))
    subfolders = list(filter(os.path.isdir, entries))
    for subfolder in subfolders[:10]:
        assert _test_fft_reshape_one(subfolder)

def _test_fft_reshape_one(folder):
    """Return whether the flat FFT vector for ``folder`` reshapes back losslessly.

    The complex coefficients from ``fft_vec(..., unpack=False)`` are compared
    element-wise (exact equality) with the result of converting the flat vector
    back through ``fft_vec_to_array`` and ``fft_array_to_complex``.
    """
    vecs = load_one_embedding(folder)

    expected = fft_vec(vecs, unpack=False)
    flat = fft_vec(vecs)
    round_tripped = fft_array_to_complex(fft_vec_to_array(flat))

    return (expected == round_tripped).all()

def fft_vec_to_array(vec, n_dims=300, n_bands=10):
    """Reshape a flat vector from ``fft_vec`` into one row per embedding dimension.

    Each row holds ``n_bands`` (real, imaginary) pairs, i.e. ``2 * n_bands``
    columns.  The defaults reproduce the previously hard-coded ``(300, 20)``
    shape.

    Args:
        vec: Flat array of length ``n_dims * 2 * n_bands``.
        n_dims: Number of embedding dimensions (rows).
        n_bands: Number of FFT coefficients per dimension; pass the same value
            that was given to ``fft_vec``.

    Returns:
        ``vec`` reshaped to ``(n_dims, 2 * n_bands)``.

    Raises:
        ValueError: If the length of ``vec`` does not match the requested shape.
    """
    return vec.reshape(n_dims, 2 * n_bands)

def fft_array_to_complex(array):
    """Combine interleaved (real, imaginary) columns into complex values.

    Even-indexed columns are taken as real parts and the odd-indexed column that
    follows each one as the matching imaginary part.

    Args:
        array: 2-D array whose columns alternate real, imaginary, real, ...

    Returns:
        A complex array with the same number of rows and half as many columns.
    """
    return array[:, 0::2] + array[:, 1::2] * 1j

def load_embeddings_iter(files):
    """Lazily map each folder path in ``files`` to its token-embedding array.

    Nothing is loaded until the returned generator is consumed; each item is
    the result of ``load_one_embedding`` for the corresponding path.
    """
    embeddings = (load_one_embedding(path) for path in files)
    return embeddings

def load_fft_vecs_iter(embeddings):
    """Lazily map each embedding array to its flat FFT vector.

    Each item of the returned generator is ``fft_vec`` applied, with default
    arguments, to the corresponding item of ``embeddings``.
    """
    fft_vectors = (fft_vec(embedding) for embedding in embeddings)
    return fft_vectors

def save_embeddings(folder):
    """Save the raw token embeddings of every subdirectory of ``folder``.

    For a subdirectory ``<folder>/<name>`` the array returned by
    ``load_one_embedding`` is written to ``<folder>/<name>.npy``.

    Only directories are processed, as in ``save_fft_vecs``.  Previously every
    entry was passed on, so a plain file in ``folder`` (for example a ``.npy``
    written by an earlier run) made ``os.listdir`` fail inside
    ``load_page_iter``.

    NOTE(review): ``save_fft_vecs`` writes to the same ``<name>.npy`` paths, so
    running both on one folder overwrites one result with the other.

    Args:
        folder: Directory whose subdirectories each hold one volume's pages.
    """
    files = [os.path.join(folder, f) for f in os.listdir(folder)]
    files = [f for f in files if os.path.isdir(f)]
    for emb, f in zip(load_embeddings_iter(files), files):
        numpy.save(f + '.npy', emb)

def save_fft_vecs(folder):
    """Compute and save the flat FFT vector of every subdirectory of ``folder``.

    For a subdirectory ``<folder>/<name>`` the vector is written to
    ``<folder>/<name>.npy``.  Embeddings are produced lazily, so only one
    volume's embedding is in flight at a time.

    Args:
        folder: Directory whose subdirectories each hold one volume's pages.
    """
    files = [os.path.join(folder, f) for f in os.listdir(folder)]
    files = [f for f in files if os.path.isdir(f)]
    # The original called the undefined name `load_vecs_iter` (NameError).
    fft_vecs = load_fft_vecs_iter(load_embeddings_iter(files))
    for emb, f in zip(fft_vecs, files):
        numpy.save(f + '.npy', emb)
        
def save_fft_vecs_dest(folder, destfolder=None):
    """Compute FFT vectors for the subdirectories of ``folder`` and save them elsewhere.

    For a subdirectory ``<folder>/<name>`` the vector is written to
    ``<destfolder>/<name>.npy``.

    Args:
        folder: Directory whose subdirectories each hold one volume's pages.
        destfolder: Directory that receives the ``.npy`` files; defaults to
            ``folder``.  It is created if it does not exist yet.
    """
    destfolder = folder if destfolder is None else destfolder
    files = [os.path.join(folder, f) for f in os.listdir(folder)]
    files = [f for f in files if os.path.isdir(f)]
    newfiles = [os.path.join(destfolder, os.path.basename(f)) for f in files]
    # numpy.save does not create missing directories.  This runs after the
    # listing above so that a missing source folder still raises.
    os.makedirs(destfolder, exist_ok=True)
    # The original called the undefined name `load_vecs_iter` (NameError).
    fft_vecs = load_fft_vecs_iter(load_embeddings_iter(files))
    for emb, f in zip(fft_vecs, newfiles):
        numpy.save(f + '.npy', emb)

def load_fft_vec_array(folder):
    """Load every ``.npy`` file in ``folder`` and stack them into one array.

    Files are loaded in sorted filename order so that row ``i`` always refers to
    the same file; ``os.listdir`` alone returns entries in arbitrary order, and
    ``show_umap`` colours points by row index.

    Args:
        folder: Directory containing ``.npy`` files.

    Returns:
        A numpy array with one entry per ``.npy`` file, in sorted filename order.
    """
    names = sorted(f for f in os.listdir(folder) if f.endswith('.npy'))
    return numpy.array([numpy.load(os.path.join(folder, f)) for f in names])

def load_fft_vecs(folder):
    """Load every ``.npy`` file in ``folder`` into a dict keyed by file stem.

    The key is the filename with its trailing four characters (``.npy``)
    removed, so ``mdp.123.npy`` is stored under ``'mdp.123'``.

    Args:
        folder: Directory containing ``.npy`` files.

    Returns:
        A dict mapping each stem to the array loaded from that file.
    """
    vectors = {}
    for name in os.listdir(folder):
        if not name.endswith('.npy'):
            continue
        vectors[name[:-4]] = numpy.load(os.path.join(folder, name))
    return vectors
        
def load_dataset(folder, n=None):
    """Compute FFT vectors straight from the page text of the first ``n`` volumes.

    Args:
        folder: Directory whose subdirectories each hold one volume's pages.
        n: Maximum number of subdirectories to process, taken in sorted path
            order; ``None`` processes all of them.

    Returns:
        A numpy array with one flat FFT vector per processed subdirectory.
    """
    files = [os.path.join(folder, f) for f in os.listdir(folder)]
    # Skip plain files (e.g. saved .npy vectors): load_page_iter lists its
    # argument as a directory and would fail on them.
    files = [f for f in files if os.path.isdir(f)]
    files.sort()
    files = files[:n]
    # The original called the undefined name `load_vecs_iter` (NameError).
    return numpy.array(list(load_fft_vecs_iter(load_embeddings_iter(files))))

def show_dataset(folder, n=None):
    """Print the first 500 characters of the first ``n`` ``.txt`` files in ``folder``.

    Files are read as UTF-8, matching ``load_page_iter``; the previous
    ``open(f)`` used the platform default encoding and could fail or garble
    text on systems where that is not UTF-8.

    Args:
        folder: Directory containing ``.txt`` files.
        n: Maximum number of files to show, taken in sorted path order;
            ``None`` shows all of them.
    """
    files = sorted(os.path.join(folder, f)
                   for f in os.listdir(folder) if f.endswith('.txt'))
    for f in files[:n]:
        with open(f, encoding='UTF-8') as ip:
            print(ip.read()[0:500])

def show_umap(data):
    """Print the shape of ``data``, embed it with UMAP and scatter-plot the result.

    The embedding uses ``n_neighbors=20``, ``min_dist=0.001`` and the cosine
    metric.  Points are coloured with the ``plasma`` colormap according to
    their row position in ``data`` (row index divided by the number of rows).

    Args:
        data: 2-D array with one sample per row.
    """
    print(data.shape)
    vis = umap.UMAP(n_neighbors=20, min_dist=0.001, metric='cosine').fit_transform(data)

    n_points = len(vis)
    xs, ys = vis[:, 0], vis[:, 1]
    colours = [idx / n_points for idx in range(n_points)]

    plt.gca().axis('equal')
    plt.scatter(xs, ys, c=colours, cmap='plasma')
    plt.show()

In [None]:
# Root directory of the workset: one subdirectory of .txt pages per volume, plus
# the .npy vectors written by save_fft_vecs. Set the FFT_WORKSET_FOLDER
# environment variable to run on another machine without editing this cell.
folder = os.environ.get('FFT_WORKSET_FOLDER', '/media/secure_volume/workset/orig')

In [None]:
# Sanity check on up to ten volume directories: the flat FFT vector must reshape
# back into exactly the original complex coefficients (AssertionError otherwise).
test_fft_reshape(folder)

In [None]:
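# One-time preprocessing step: uncomment to (re)compute the FFT vector of every
# volume directory and save it as <volume>.npy in `folder`. The cells below read
# those .npy files.
# save_fft_vecs(folder)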

In [None]:
# Load every saved .npy vector from `folder`, embed the set with UMAP and
# scatter-plot the embedding.
show_umap(load_fft_vec_array(folder))

In [None]:
# Load the saved FFT vectors keyed by volume id, then split them into two
# aligned sequences: files[i] is the id of the vector in data[i].
data_dict = load_fft_vecs(folder)
files = tuple(data_dict)
data = numpy.array([data_dict[name] for name in files])
# 10-component UMAP embedding. random_state is fixed so that the fixed-radius
# pair search further down returns the same pairs on every run.
data_umap = umap.UMAP(n_neighbors=5, n_components=10, metric='cosine',
                      random_state=0).fit_transform(data)

In [None]:
# Expected: (number of loaded vectors, 10), since n_components=10 above.
data_umap.shape

In [None]:
# Index the UMAP coordinates in a k-d tree for the fixed-radius pair query below.
data_umap_kd = KDTree(data_umap)

In [None]:
# All pairs of row indices whose UMAP points lie within distance 0.02 of each
# other; presumably these are the candidate duplicate volumes.
pairs = data_umap_kd.query_pairs(0.02)
pairs

In [None]:
# dist between known duplicates:
diff = data_dict['mdp.39015001704199'] - data_dict['mdp.39015048784154']
print((diff * diff).sum() ** 0.5)
plt.plot(diff)

In [None]:
# dist between known duplicates:
diff = data_dict['mdp.39015001704199'] - data_dict['nc01.ark+=13960=t2j68kf84']
print((diff * diff).sum() ** 0.5)
plt.plot(diff)

In [None]:
# dist between random pair:
diff = data_dict['mdp.39015001704199'] - data_dict['mdp.39015041912760']
print((diff * diff).sum() ** 0.5)
plt.plot(diff)

In [None]:
# NOTE(review): bare tuple of two volume ids; it is only displayed, never
# assigned or used. Looks like a scratch note -- confirm whether it is needed.
'mdp.39015029726430', 'inu.39000000259478'

In [None]:
# dist between pairs in base space
pair_dists = [((data[x] - data[y]) ** 2).sum() ** 0.5 for x, y in pairs]
pair_dists

In [None]:
# dist between pairs in umap space
pair_umap_dists = [((data_umap[x] - data_umap[y]) ** 2).sum() ** 0.5 for x, y in pairs]
pair_umap_dists

In [None]:
# Translate the row-index pairs into volume ids; `files` is aligned with `data`.
pair_ids = [(files[x], files[y]) for x, y in pairs]
pair_ids

In [None]:
# One figure per pair: the element-wise difference of the two FFT vectors.
for a, b in pair_ids:
    diff = data_dict[a] - data_dict[b]
    plt.plot(diff)
    plt.show()

In [None]:
# Reshape every flat vector to (300, 20) and pre-divide by the number of
# vectors, so that summing the list gives the mean array over all volumes.
data_freq_array = [fft_vec_to_array(r) / len(data) for r in data]  # 300 rows, 20 cols in each array, ~1000 arrays
# Sum into a fresh array. The previous code bound data_freq_mean to
# data_freq_array[0] and accumulated with `+=`, which overwrote that list
# element in place with the running total.
data_freq_mean = numpy.sum(data_freq_array, axis=0)

In [None]:
# Columns alternate real, imaginary per FFT band. Starting at column 2 skips
# band 0 (the zero-frequency term) and keeps the remaining bands.
power_a = data_freq_mean[:, 2::2]  # real parts
power_b = data_freq_mean[:, 3::2]  # imaginary parts
# Magnitude of each complex coefficient.
power = (power_a * power_a + power_b * power_b) ** 0.5

# Average over the rows (embedding dimensions); the row count is taken from the
# array instead of a hard-coded 300.
mean_power = power.sum(axis=0) / power.shape[0]
plt.plot(mean_power)
# Row 2 on its own, for comparison with the mean.
plt.plot(power[2])

In [None]:
# Inverse FFT of the mean magnitude spectrum, skipping the first two samples.
# ifft returns complex values; take the real part explicitly rather than
# relying on matplotlib's implicit complex-to-float cast, which discards the
# imaginary part with a warning.
plt.plot(numpy.fft.ifft(mean_power)[2:].real)