In [1]:

import multiprocessing
import os
import re
import signal
from math import ceil
from os.path import join

import numpy as np
import torch
from numpy.random import choice
from torchtext.data import Field, TabularDataset

import glob
from io import StringIO

%matplotlib inline

In [2]:
if not os.path.exists('model_data/data'):
    os.mkdir('model_data/data')

In [3]:
base_path = 'model_data/data/'

## sklearn

In [97]:
sk_glob = sorted(glob.glob('sklearn/*'))

In [98]:
sk_glob

['sklearn/sklearn_1_10_decision_trees_1_10_1_classification.txt',
 'sklearn/sklearn_1_10_decision_trees_1_10_2_regression.txt',
 'sklearn/sklearn_1_10_decision_trees_1_10_3_multi-output_problems.txt',
 'sklearn/sklearn_1_10_decision_trees_1_10_4_complexity.txt',
 'sklearn/sklearn_1_10_decision_trees_1_10_5_tips_on_practical_use.txt',
 'sklearn/sklearn_1_10_decision_trees_1_10_6_tree_algorithms_id3_c4_5_c5_0_and_cart.txt',
 'sklearn/sklearn_1_10_decision_trees_1_10_7_mathematical_formulation.txt',
 'sklearn/sklearn_1_10_decision_trees_1_10_8_minimal_cost-complexity_pruning.txt',
 'sklearn/sklearn_1_11_ensemble_methods_1_11_1_bagging_meta-estimator.txt',
 'sklearn/sklearn_1_11_ensemble_methods_1_11_2_forests_of_randomized_trees.txt',
 'sklearn/sklearn_1_11_ensemble_methods_1_11_3_adaboost.txt',
 'sklearn/sklearn_1_11_ensemble_methods_1_11_4_gradient_tree_boosting.txt',
 'sklearn/sklearn_1_11_ensemble_methods_1_11_5_histogram-based_gradient_boosting.txt',
 'sklearn/sklearn_1_11_ensemble_m

In [99]:
samples = []
for file in sk_glob:
    with open(file) as f:
        samples.append(f.read())

In [100]:
len(samples)

242

In [101]:

with open(f'{base_path}sklearn.csv', 'w') as f:
    doc_text = "text\n" + "\n".join(['"' + sample.replace('\n', ' ') + '"' for sample in samples])
    f.write(doc_text)


In [88]:
samples

['sklearn_1_10_decision_trees\n1.10. Decision Trees\nmodules/tree.html\n 1.10.1. Classification  DecisionTreeClassifier is a class capable of performing multi-class classification on a dataset. As with other classifiers, DecisionTreeClassifier takes as input two arrays: an array X, sparse or dense, of size holding the training samples, and an array Y of integer values, size , holding the class labels for the training samples: After being fitted, the model can then be used to predict the class of samples: Alternatively, the probability of each class can be predicted, which is the fraction of training samples of the same class in a leaf: DecisionTreeClassifier is capable of both binary (where the labels are [-1, 1]) classification and multiclass (where the labels are [0, …, K-1]) classification. Using the Iris dataset, we can construct a tree as follows: Once trained, you can plot the tree with the plot_tree function: We can also export the tree in Graphviz format using the export_graphv

In [89]:
def extractLabels(samples, package, language):
    reList = []
    for x in samples:
        if x.strip() == 'text':
            continue
        y = x.split("\n")
        page_id = y[0]
        section = y[1]
        url = y[2]
        text = y[3]
        try:
            field_name = re.search("^[0-9.]*[ ][A-Z][-a-zA-Z]*(?:\s+[A-Z][-a-zA-Z]*)*", text.strip()).group(0)
            if len(field_name) > 30:
                field_name = " ".join(text.strip().split(" ")[0:4])
        except AttributeError:
            field_name = " ".join(text.strip().split(" ")[0:4])
        reList.append((package, language, page_id, section, url, field_name))
    
    return reList

In [90]:
with open(f'{base_path}sklearn_labels.csv', 'w') as f:
    for tup in extractLabels(samples, 'sklearn', 'Python'):
        f.write('|'.join(tup) + '\n')

## Caret

In [4]:
caret_glob = sorted(glob.glob('caret/*'))

In [5]:
caret_samples = []
for file in caret_glob:
    with open(file) as f:
        caret_samples.append(f.read())

In [6]:
len(caret_samples)

141

In [7]:

with open(f'{base_path}caret.csv', 'w') as f:
    doc_text = "text\n" + "\n".join(['"' + sample.replace('\n', ' ') + '"' for sample in caret_samples])
    f.write(doc_text)


In [91]:
with open(f'{base_path}caret_labels.csv', 'w') as f:
    for tup in extractLabels(caret_samples, 'caret', 'R'):
        f.write('|'.join(tup) + '\n')

## Numpy

In [8]:
numpy_glob = sorted(glob.glob('numpy/*'))

In [9]:
numpy_samples = []
for file in numpy_glob:
    with open(file) as f:
        numpy_samples.append(f.read())

In [10]:
len(numpy_samples)

22

In [11]:

with open(f'{base_path}numpy.csv', 'w') as f:
    doc_text = "text\n" + "\n".join(['"' + sample.replace('\n', ' ') + '"' for sample in numpy_samples])
    f.write(doc_text)


In [92]:
with open(f'{base_path}numpy_labels.csv', 'w') as f:
    for tup in extractLabels(numpy_samples, 'numpy', 'Python'):
        f.write('|'.join(tup) + '\n')

## SciPy

In [12]:
scipy_glob = sorted(glob.glob('scipy/*'))

In [13]:
scipy_samples = []
for file in scipy_glob:
    with open(file) as f:
        scipy_samples.append(f.read())

In [14]:
len(scipy_samples)

70

In [15]:

with open(f'{base_path}scipy.csv', 'w') as f:
    doc_text = "text\n" + "\n".join(['"' + sample.replace('\n', ' ') + '"' for sample in scipy_samples])
    f.write(doc_text)


In [93]:
with open(f'{base_path}scipy_labels.csv', 'w') as f:
    for tup in extractLabels(scipy_samples, 'scipy', 'Python'):
        f.write('|'.join(tup) + '\n')

## Total

In [16]:
os.listdir(base_path)

['all_data_model.dbow_numnoisewords.2_vecdim.100_batchsize.32_lr.0.001000_epoch.95_loss.0.755801.csv',
 'caret.csv',
 'numpy.csv',
 'sklearn.csv',
 'all_data.csv',
 'scipy.csv',
 'all_data_model.dbow_numnoisewords.2_vecdim.100_batchsize.32_lr.0.001000_epoch.97_loss.0.750086.csv']

In [119]:
filenames = [
    'caret.csv',
    'numpy.csv',
    'sklearn.csv',
    'scipy.csv'
]
with open(base_path + 'all_data2.csv', 'w') as outfile:
    j = 0
    outfile.write("text\n")
    j +=1 
    for fname in filenames:
        i = 0
        print(fname)
        with open(base_path + fname) as infile:
            for line in infile:
                
                if line.strip() == 'text':
                    print("text")
                    continue
                outfile.write(line.strip() + '\n')
                i += 1
                j +=1
        print(i)
    print(j)

caret.csv
text
141
numpy.csv
text
22
sklearn.csv
text
242
scipy.csv
text
70
476


In [105]:
filenames = [
    'caret_labels.csv',
    'numpy_labels.csv',
    'sklearn_labels.csv',
    'scipy_labels.csv'
]
with open(base_path + 'all_labels.csv', 'w') as outfile:
    outfile.write("text\n")
    for fname in filenames:
        i = 0
        print(fname)
        with open(base_path + fname) as infile:
            for line in infile:
                
                if not line: continue
                if line.strip() == 'text':
                    print("text")
                    continue
                i += 1
                outfile.write(line)
            print(i)

caret_labels.csv
141
numpy_labels.csv
22
sklearn_labels.csv
242
scipy_labels.csv
70


In [112]:
data_line = []
with open(base_path + 'all_data2.csv') as f:
    for line in f:
        data_line.append(line)

In [113]:
label_line = []
with open(base_path + 'all_labels.csv') as f:
    for line in f:
        label_line.append(line)

In [114]:
sorted([len(x) for x in label_line])

[5,
 65,
 69,
 71,
 75,
 77,
 80,
 81,
 81,
 81,
 82,
 82,
 83,
 83,
 83,
 83,
 83,
 83,
 84,
 84,
 84,
 85,
 85,
 86,
 86,
 87,
 87,
 87,
 88,
 88,
 89,
 89,
 89,
 90,
 90,
 92,
 92,
 94,
 94,
 95,
 95,
 95,
 95,
 96,
 97,
 97,
 98,
 98,
 98,
 99,
 99,
 99,
 100,
 100,
 100,
 100,
 100,
 101,
 101,
 101,
 101,
 102,
 102,
 102,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 104,
 104,
 104,
 104,
 104,
 104,
 104,
 104,
 104,
 104,
 104,
 104,
 105,
 105,
 105,
 105,
 105,
 105,
 105,
 105,
 105,
 105,
 105,
 105,
 105,
 105,
 106,
 106,
 106,
 106,
 107,
 107,
 107,
 107,
 107,
 107,
 107,
 107,
 107,
 107,
 107,
 108,
 108,
 109,
 109,
 109,
 109,
 109,
 110,
 110,
 110,
 110,
 110,
 110,
 111,
 111,
 111,
 111,
 111,
 111,
 111,
 111,
 111,
 111,
 111,
 112,
 112,
 112,
 112,
 112,
 112,
 112,
 112,
 112,
 113,
 113,
 113,
 113,
 113,
 113,
 113,
 113,
 113,
 113,
 114,
 114,
 114,
 114,
 114,
 114,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 11

In [115]:
print(len(label_line)); print(len(data_line))

476
473


## Output vs Input

In [81]:
input_lines = []
with open('model_data/data/all_data.csv') as f:
    for line in f:
        input_lines.append(line)

In [82]:
len(input_lines)

474

In [83]:
input_lines[0]

'text\n'

In [44]:
output_lines = []
with open('model_data/data/all_data_model.dbow_numnoisewords.2_vecdim.100_batchsize.32_lr.0.001000_epoch.97_loss.0.750086.csv') as f:
    for line in f:
        output_lines.append(line)

In [45]:
len(output_lines)

486

In [47]:
output_lines[0]

'd0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15,d16,d17,d18,d19,d20,d21,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31,d32,d33,d34,d35,d36,d37,d38,d39,d40,d41,d42,d43,d44,d45,d46,d47,d48,d49,d50,d51,d52,d53,d54,d55,d56,d57,d58,d59,d60,d61,d62,d63,d64,d65,d66,d67,d68,d69,d70,d71,d72,d73,d74,d75,d76,d77,d78,d79,d80,d81,d82,d83,d84,d85,d86,d87,d88,d89,d90,d91,d92,d93,d94,d95,d96,d97,d98,d99\n'

In [62]:
input_lines[100].strip('"').split(" ")[:15]

['sklearn_2_1_gaussian_mixture_models',
 '2.1.',
 'Gaussian',
 'mixture',
 'models',
 'modules/mixture.html',
 '',
 '2.1.2.',
 'Variational',
 'Bayesian',
 'Gaussian',
 'Mixture',
 '',
 'The',
 'BayesianGaussianMixture']

In [133]:
sklearn_index = []
for sample in samples:

    sample_split = sample.split("\n")
    page_id = sample_split[0]
    page_name = sample_split[1]
    link = sample_split[2]
    text = sample_split[3]
    try:
        field_name = re.search("^[0-9.]*[ ][A-Z][-a-zA-Z]*(?:\s+[A-Z][-a-zA-Z]*)*", text.strip()).group(0)
    except AttributeError:
        field_name = " ".join(text.strip().split(" ")[0:10])
    sklearn_index.append((page_id, page_name, link, field_name))


In [135]:
len(sklearn_index)

243

In [129]:
caret_index = []
for caret_sample in caret_samples:
    sample_split = caret_sample.split("\n")
    page_id = sample_split[0]
    page_name = sample_split[1]
    link = sample_split[2]
    text = sample_split[3]

    field_name = re.search("^[0-9.]*[ ][A-Z][-a-zA-Z]*(?:\s+[A-Z][-a-zA-Z]*)*", text.strip()).group(0)
    caret_index.append((page_id, page_name, link, field_name))


In [122]:
sample

('caret_11_subsampling_for_class_imbalances',
 '11 Subsampling For Class Imbalances',
 'subsampling-for-class-imbalances.html',
 '11.1 Subsampling')

In [111]:
re.search("^[0-9.]*[ ][A-Z][-a-zA-Z]*(?:\s+[A-Z][-a-zA-Z]*)*", text.strip()).group(0)

'7.0.9 Distance Weighted Discrimination'

In [139]:
matched = []
for line_idx in range(len(output_lines)):
    matched.append((input_lines[line_idx], output_lines[line_idx]))

In [145]:
matched_clean = [x for x in matched if not re.match(".*[A-Z].*", x[1])]

In [153]:
matched_clean_vectors = [x[1].split(",") for x in matched_clean[1:]]

In [161]:
vec_matrix = np.array(matched_clean_vectors)

In [162]:
vec_matrix.shape

(481, 100)

In [191]:
from scipy.spatial import distance

distances = distance.cdist([vec_matrix[5]], vec_matrix, "cosine")[0]
# min_index = np.argmin(distances)
ind = np.argpartition(a, 5)[:5]
sorted_ind = ind[np.argsort(distances[ind])]
min_distances = distances[sorted_ind]
max_similarity = [1 - x for x in min_distances]

In [192]:
max_similarity

[0.6670743594734331,
 0.6652293382107732,
 0.6196681421232006,
 0.5960319745742784,
 0.5836655087197274]