In [1]:

import multiprocessing
import os
import re
import signal
from math import ceil
from os.path import join

import numpy as np
import torch
from numpy.random import choice
from torchtext.data import Field, TabularDataset

import glob
from io import StringIO

%matplotlib inline

In [198]:
if not os.path.exists('model_data/data'):
    os.mkdir('model_data/data')

In [199]:
base_path = 'model_data/data/'

## sklearn

In [18]:
sk_glob = sorted(glob.glob('sklearn/*'))

In [19]:
samples = []
for file in sk_glob:
    with open(file) as f:
        samples.append(f.read())

In [35]:
len(samples)

243

In [31]:

with open(f'{base_path}sklearn.csv', 'w') as f:
    doc_text = "text\n" + "\n".join(['"' + sample.replace('\n', ' ') + '"' for sample in samples])
    f.write(doc_text)


## Caret

In [127]:
caret_glob = sorted(glob.glob('caret/*'))

In [128]:
caret_samples = []
for file in caret_glob:
    with open(file) as f:
        caret_samples.append(f.read())

In [26]:
len(caret_samples)

141

In [32]:

with open(f'{base_path}caret.csv', 'w') as f:
    doc_text = "text\n" + "\n".join(['"' + sample.replace('\n', ' ') + '"' for sample in samples])
    f.write(doc_text)


## Numpy

In [195]:
numpy_glob = sorted(glob.glob('numpy/*'))

In [196]:
numpy_samples = []
for file in numpy_glob:
    with open(file) as f:
        numpy_samples.append(f.read())

In [197]:
len(numpy_samples)

22

In [200]:

with open(f'{base_path}numpy.csv', 'w') as f:
    doc_text = "text\n" + "\n".join(['"' + sample.replace('\n', ' ') + '"' for sample in samples])
    f.write(doc_text)


## SciPy

In [201]:
scipy_glob = sorted(glob.glob('scipy/*'))

In [202]:
scipy_samples = []
for file in scipy_glob:
    with open(file) as f:
        scipy_samples.append(f.read())

In [204]:
len(scipy_samples)

70

In [205]:

with open(f'{base_path}scipy.csv', 'w') as f:
    doc_text = "text\n" + "\n".join(['"' + sample.replace('\n', ' ') + '"' for sample in samples])
    f.write(doc_text)


## Total

In [206]:
os.listdir(base_path)

['caret.csv',
 'numpy.csv',
 'sklearn.csv',
 'all_data.csv',
 'scipy.csv',
 'all_data_model.dbow_numnoisewords.2_vecdim.100_batchsize.32_lr.0.001000_epoch.97_loss.0.750086.csv']

In [208]:
filenames = [
    'caret.csv',
    'numpy.csv',
    'sklearn.csv',
    'scipy.csv'
]
with open(base_path + 'all_data.csv', 'w') as outfile:
    outfile.write("text\n")
    for fname in filenames:
        with open(base_path + fname) as infile:
            for line in infile:
                if line.strip() == 'text':
                    print("text")
                    continue
                outfile.write(line)

text
text
text
text


## Output vs Input

In [41]:
input_lines = []
with open('model_data/data/all_data.csv') as f:
    for line in f:
        input_lines.append(line)

In [43]:
len(input_lines)

486

In [46]:
input_lines[0]

'text\n'

In [44]:
output_lines = []
with open('model_data/data/all_data_model.dbow_numnoisewords.2_vecdim.100_batchsize.32_lr.0.001000_epoch.97_loss.0.750086.csv') as f:
    for line in f:
        output_lines.append(line)

In [45]:
len(output_lines)

486

In [47]:
output_lines[0]

'd0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15,d16,d17,d18,d19,d20,d21,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31,d32,d33,d34,d35,d36,d37,d38,d39,d40,d41,d42,d43,d44,d45,d46,d47,d48,d49,d50,d51,d52,d53,d54,d55,d56,d57,d58,d59,d60,d61,d62,d63,d64,d65,d66,d67,d68,d69,d70,d71,d72,d73,d74,d75,d76,d77,d78,d79,d80,d81,d82,d83,d84,d85,d86,d87,d88,d89,d90,d91,d92,d93,d94,d95,d96,d97,d98,d99\n'

In [62]:
input_lines[100].strip('"').split(" ")[:15]

['sklearn_2_1_gaussian_mixture_models',
 '2.1.',
 'Gaussian',
 'mixture',
 'models',
 'modules/mixture.html',
 '',
 '2.1.2.',
 'Variational',
 'Bayesian',
 'Gaussian',
 'Mixture',
 '',
 'The',
 'BayesianGaussianMixture']

In [133]:
sklearn_index = []
for sample in samples:

    sample_split = sample.split("\n")
    page_id = sample_split[0]
    page_name = sample_split[1]
    link = sample_split[2]
    text = sample_split[3]
    try:
        field_name = re.search("^[0-9.]*[ ][A-Z][-a-zA-Z]*(?:\s+[A-Z][-a-zA-Z]*)*", text.strip()).group(0)
    except AttributeError:
        field_name = " ".join(text.strip().split(" ")[0:10])
    sklearn_index.append((page_id, page_name, link, field_name))


In [135]:
len(sklearn_index)

243

In [129]:
caret_index = []
for caret_sample in caret_samples:
    sample_split = caret_sample.split("\n")
    page_id = sample_split[0]
    page_name = sample_split[1]
    link = sample_split[2]
    text = sample_split[3]

    field_name = re.search("^[0-9.]*[ ][A-Z][-a-zA-Z]*(?:\s+[A-Z][-a-zA-Z]*)*", text.strip()).group(0)
    caret_index.append((page_id, page_name, link, field_name))


In [122]:
sample

('caret_11_subsampling_for_class_imbalances',
 '11 Subsampling For Class Imbalances',
 'subsampling-for-class-imbalances.html',
 '11.1 Subsampling')

In [111]:
re.search("^[0-9.]*[ ][A-Z][-a-zA-Z]*(?:\s+[A-Z][-a-zA-Z]*)*", text.strip()).group(0)

'7.0.9 Distance Weighted Discrimination'

In [139]:
matched = []
for line_idx in range(len(output_lines)):
    matched.append((input_lines[line_idx], output_lines[line_idx]))

In [145]:
matched_clean = [x for x in matched if not re.match(".*[A-Z].*", x[1])]

In [153]:
matched_clean_vectors = [x[1].split(",") for x in matched_clean[1:]]

In [161]:
vec_matrix = np.array(matched_clean_vectors)

In [162]:
vec_matrix.shape

(481, 100)

In [191]:
from scipy.spatial import distance

distances = distance.cdist([vec_matrix[5]], vec_matrix, "cosine")[0]
# min_index = np.argmin(distances)
ind = np.argpartition(a, 5)[:5]
sorted_ind = ind[np.argsort(distances[ind])]
min_distances = distances[sorted_ind]
max_similarity = [1 - x for x in min_distances]

In [192]:
max_similarity

[0.6670743594734331,
 0.6652293382107732,
 0.6196681421232006,
 0.5960319745742784,
 0.5836655087197274]