CPSC 599 NLP Project Experimentation

from util import change_extension, sliding_window
from collections import defaultdict
import nltk
import spacy
from spacy.tokens import Token, DocBin
import en_core_web_sm
import re
import json
import os
import glob
import components

def show_spans(doc):
    # Print the doc text followed by each suggested span in the 'sc' group,
    # with its label (the link) and token offsets.
    print(doc)
    print(*((s, s.label_, s.start, s.end) for s in doc.spans['sc']))
data_dir = 'data/html/processed/'
pattern = os.path.join(data_dir, '**', '*' + '.txt')
all_txt = glob.glob(pattern, recursive=True)
texts = []
for txt in all_txt:
    with open(txt, 'r') as f:
        texts.append(f.read())

# TODO: update to apply same preprocessing to these texts before making predictions
print(all_txt.index('data/html/processed/unsupervised_learning.txt'))
for doc in trained.pipe(texts[:10]):
    l = len(doc.spans['sc'])
    if l > 0:
        show_spans(doc)

exs = ['One needs to fit the model to the data.', 'Refer to the glossary'
       , 'This was provided by Thomas Fan'
       , 'To provide a good fit the model needs to be trained.'
       , 'A pipeline can be used to improve the model by using cross validation.'
       , 'This is referred to as a warm start'
       , 'In classification the model is trained to predict a number of classes'
       , 'Classification algorithms usually also offer a way to quantify certainty of a prediction, either using decision_function or predict_proba'
       , 'All built-in estimators also have a set_params method, which sets data-independent parameters (overriding previous parameter values passed to __init__).'
       , 'y might be ignored in the case of unsupervised learning. However, to make it possible to use the estimator as part of a pipeline that can mix both supervised and unsupervised transformers, even unsupervised estimators need to accept a y=None keyword argument in the second position that is just ignored by the estimator.'
       , 'When fit is called, any previous call to fit should be ignored. In general, calling estimator.fit(X1) and then estimator.fit(X2) should be the same as only calling estimator.fit(X2). However, this may not be true in practice when fit depends on some random process, see random_state. Another exception to this rule is when the hyper-parameter warm_start is set to True for estimators that support it.'
       , 'Attributes that have been estimated from the data must always have a name ending with trailing underscore, for example the coefficients of some regression estimator would be stored in a coef_ attribute after fit has been called.'
       , 'Often, the subestimator has a name (as e.g. named steps in a Pipeline object), in which case the key should become <name>__C, <name>__class_weight, etc.'
       , 'Often, the subestimator has a name (as e.g. named steps in Pipeline objects), in which case the key should become <name>__C, <name>__class_weight, etc.'
       , 'For an estimator to be usable together with pipeline.Pipeline in any but the last step, it needs to provide a fit or fit_transform function. To be able to evaluate the pipeline on any data but the training set, it also needs to provide a transform function. There are no special requirements for the last step in a pipeline, except that it has a fit function. All fit and fit_transform functions must take arguments X, y, even if y is not used. Similarly, for score to be usable, the last step of the pipeline needs to have a score function that accepts an optional y.'
       , 'In a Pipeline object you can use GridSearchCV or RandomForestClassifier'
       , 'Whether you are proposing an estimator for inclusion in scikit-learn, developing a separate package compatible with scikit-learn, or implementing custom components for your own projects, this chapter details how to develop objects that safely interact with scikit-learn Pipelines and model selection tools.'
       , 'The base object, implements a fit method to learn from data'
       , 'Elements of the scikit-learn API are described more definitively in the Glossary of Common Terms and API Elements.']
for doc in trained.pipe(exs):
    show_spans(doc)
    print()

db = DocBin().from_disk('test.spacy')
docs = list(db.get_docs(trained.vocab))
common_labels = ['https://sphinx-gallery.github.io']
desired_labels = [
    # 'glossary.html#term-fit',
    # 'glossary.html#term-random_state',
    # 'glossary.html#term-n_jobs',
    # 'https://github.com/scikit-learn/scikit-learn/blob/449940985/sklearn/base.py#l153',
    'sklearn.pipeline.pipeline.html#sklearn.pipeline.pipeline'
]
c = 0
for doc, orig_spans in zip(trained.pipe(d.text for d in docs), (d.spans['sc'] for d in docs)):
    spans = doc.spans['sc']
    # if len(spans) > 0 and any(s.label_ not in common_labels for s in spans):
    # if len(spans) > 0 and any(s.label_ in desired_labels for s in spans):
    links = set(s.label_ for s in spans)
    orig_links = set(s.label_ for s in orig_spans)
    if orig_links != links:
        c += 1
        print(*((s, s.label_, s.start, s.end) for s in orig_spans))
        show_spans(doc)
        if c > 5:
            break

Exploring preprocessing

data_dir = 'data/html/processed/'
pattern = os.path.join(data_dir, '**', '*' + '.txt')
all_txt = glob.glob(pattern, recursive=True)
print(len(all_txt))
all_linkdata = []
all_links = []
for path in all_txt:
    with open(change_extension(path, '.linkdata.json'), 'r') as f:
        linkdata = json.load(f)
        all_linkdata.append(linkdata)
        for v in linkdata.values():
            all_links.append(v['link'])

all_links_set = set(all_links)

There are a few links that are the same except for capitalization:

link_forms = defaultdict(list)
for link in all_links_set:
    link_forms[link.lower()].append(link)
for links in link_forms.values():
    if len(links) > 1:
        print(links)

Technically, URL paths and fragments are case sensitive, but in this dataset it appears safe to treat the links as case insensitive. The only questionable case is “term-y” vs “term-Y”, but both lead to the same place (here). If this becomes a user-facing product, it may be worth exposing settings for case sensitivity: insensitive or sensitive by default, with per-link exceptions.

all_links_lower = [l.lower() for l in all_links]
all_links_lower_set = set(all_links_lower)
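If case sensitivity were made configurable, the lowercasing step might look something like this (a sketch only; the case_sensitive option and exceptions set are hypothetical):

def normalize_case(link, case_sensitive=False, exceptions=frozenset()):
    # Lowercase a link unless case sensitivity is enabled or the link is listed
    # as an exception. Both settings are hypothetical user-facing options.
    if case_sensitive or link in exceptions:
        return link
    return link.lower()

# e.g. normalize_case('glossary.html#term-Y') -> 'glossary.html#term-y'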

Next we’ll explore normalization of the URLs.

Initial exploration indicated that the only part of the URL that can be allowed to vary while still considering two links equivalent is the scheme, in particular “http” vs “https”. The host (netloc) cannot vary because of cases like “github.com/username” vs “twitter.com/username”. The query/params cannot vary because some sites serve different pages depending on the query, for example YouTube or mybinder. The fragment cannot vary because it may point to different places within the same page. So we’ll count the cases where only the scheme differs across examples:
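For reference, this is how urlparse splits a URL into the components mentioned above (the URL itself is just an illustration):

from urllib.parse import urlparse

# ParseResult(scheme='https', netloc='example.com', path='/a/b.html',
#             params='', query='v=1', fragment='section')
print(urlparse('https://example.com/a/b.html?v=1#section'))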

from urllib.parse import urlparse, urlunparse

link_paths = defaultdict(list)

for l in all_links_lower_set:
    parse = urlparse(l)
    p = parse.netloc + parse.path + parse.fragment + parse.query + parse.params
    if p != '':
        link_paths[p].append((l, parse))

change_scheme = {}
for links in link_paths.values():
    if len(links) > 1:
        print(links)
        has_scheme = lambda scheme, ls: filter(lambda l: l[1].scheme == scheme, ls)
        for httpsl, _ in has_scheme('https', links):
            for l, p in links:
                if p.scheme == 'http':
                    change_scheme[l] = httpsl

Not that many, but still worth taking into account. Changing the scheme could actually break a link, so we’ll only switch to “https” where an “https” variant of the link already exists. We’ll also round-trip each URL through the parser, which normalizes alternate forms of the same link (in particular '' and '#').

print(urlunparse(urlparse('#')))
all_links_lower_norm = []
for l in all_links_lower:
    scheme_updated = change_scheme.get(l) or l
    all_links_lower_norm.append(urlunparse(urlparse(scheme_updated)))
print(len(change_scheme))
print(len(set(all_links_lower)))
all_links_lower_norm_set = set(all_links_lower_norm)
print(len(all_links_lower_norm_set))

Relative links

Finally we look at normalizing relative paths:

def norm_relative_link(link):
    if link.startswith('./') or link.startswith('../'):
        return re.sub(r'^(\.?\./)+', '', link)
    return link

link_tails = defaultdict(list)

for l in all_links_lower_norm_set:
    norm = norm_relative_link(l)
    link_tails[norm].append(l)

multis = [(norm, links) for norm, links in link_tails.items() if len(links) > 1]

print('Number of links with the same tail but different strings:', len(multis))
for norm, links in multis[:5]:
    if len(links) > 1:
        print(norm, links)

We see there are a lot of cases where two different strings refer to the same location because of the relative link structure. Relative paths are something that can be rebuilt automatically, so we’ll normalize these away as well:

all_links_processed = [norm_relative_link(l) for l in all_links_lower_norm if l != '']
print(len(all_links_set))
print(len(set(all_links_processed)))

We see that we have reduced the total number of classes by about 800, but this is likely still too many.
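For reference, the normalization steps explored so far can be collected into a single helper (a sketch only, reusing the change_scheme map and norm_relative_link defined above):

def normalize_link(link, scheme_map=change_scheme):
    # Apply the full normalization explored above to one raw link.
    link = link.lower()                  # treat links as case insensitive
    link = scheme_map.get(link, link)    # switch to https where an https variant exists
    link = urlunparse(urlparse(link))    # round-trip to collapse forms like '#' and ''
    return norm_relative_link(link)      # strip leading './' and '../'

Empty links still need to be filtered out afterwards, as in the list comprehension above. At prediction time the relative form could presumably be rebuilt from the normalized link and the path of the page being edited (e.g. with posixpath.relpath), but that is left to the application side.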

Reducing the number of classes

We look at the number of examples we have for each link:

counter = defaultdict(int)

for link in all_links_processed:
    counter[link] += 1

def count_eq_to(counts, n):
    return [k for k, v in counts.items() if v == n]

freq_links = []
print('Total links:', len(counter))
for n in range(50):
    links = count_eq_to(counter, n)
    c = len(links)
    print(f'Number of links with {n} example(s):', c, sep='\t')
    if n > 25:
        freq_links += links

print('Links with more than 25 examples:', freq_links)

We see that the majority of links have only a few examples, while a minority are used heavily. Fortunately for this application, we can focus our effort on classifying the most heavily used links, as those are the links most likely to be needed again.

We can look at the number of classification classes depending on our minimum threshold for the number of available examples:

def count_ge_to(counts, n):
    return [k for k, v in counts.items() if v >= n]

for n in range(50):
    links = count_ge_to(counter, n)
    print(f'Number of classes with at least {n} example(s):', len(links), sep='\t')

We can visualize the relationship between the number of examples and the number of classes:

import sys
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
x = range(50)
ax.plot(x, [len(count_ge_to(counter, n)) for n in x])
ax.set_xlabel('Num examples')
ax.set_ylabel('Num classes')
plt.ylim([0, 800])
fig.tight_layout()

plt.savefig(sys.stdout.buffer)  # write the figure to stdout as binary image data

Train/test splitting

We can make preprocessed data splits using:

python train_test_split.py data/html/processed/ <min_examples> <test_size>

The script splits the documents while trying to keep a reasonable number of examples per class in both the training and test sets, but the resulting split is not perfect.
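As a rough illustration of the balancing problem (not the actual script), a greedy document-level split could look like the sketch below, where doc_links pairs each document id with the set of links it contains:

import random
from collections import defaultdict

def greedy_split(doc_links, test_size, seed=0):
    # Illustrative only: doc_links is a list of (doc_id, links) pairs.
    docs = list(doc_links)
    random.Random(seed).shuffle(docs)
    n_test_target = int(len(docs) * test_size)
    # Number of documents containing each link that are not (yet) in the test set.
    remaining = defaultdict(int)
    for _, links in docs:
        for link in links:
            remaining[link] += 1
    train, test = [], []
    for doc_id, links in docs:
        # Keep a document for training if the test set is full, or if sending it
        # to the test set would leave some of its links with no training examples.
        if len(test) >= n_test_target or any(remaining[l] <= 1 for l in links):
            train.append(doc_id)
        else:
            test.append(doc_id)
            for link in links:
                remaining[link] -= 1
    return train, test

The real script also takes a minimum-examples threshold, so the sketch above only conveys the kind of trade-off being made.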

We’ll explore which split parameters provide the best balance between the number of classes and the number of examples in the train and test sets.

def show_data_summary(paths, desired_examples, show_classes):
    docs_data = []
    for p in paths:
        with open(p, 'r') as f:
            link_data = json.load(f)
            docs_data.append(link_data)
    totals = defaultdict(int)
    for v in (v for d in docs_data for v in d.values()):
        link = v['link']
        if link is not None:
            totals[link] += 1
    print('Total classes:\t', len(totals))
    print('Total classes with desired number of examples:\t', len({k: v for k, v in totals.items() if v >= desired_examples}))
    if show_classes:
        print('Classes:', sorted([(v, k) for k, v in totals.items()], reverse=True))
    return len(totals)
def summarize_data_split(min_examples, test_size, show_classes=False):
    pattern = os.path.join('split', f'train-{min_examples}-{test_size*100:.0f}', '**', '*' + '.linkdata.json')
    trainpaths = glob.glob(pattern, recursive=True)
    pattern = os.path.join('split', f'test-{min_examples}-{test_size*100:.0f}', '**', '*' + '.linkdata.json')
    testpaths = glob.glob(pattern, recursive=True)
    print('Min examples:', min_examples, '| Test size:', test_size, f'| Train/test counts: {len(trainpaths)}/{len(testpaths)}')
    print('(train)')
    trainclasses = show_data_summary(trainpaths, 2, show_classes)
    print('(test)')
    testclasses = show_data_summary(testpaths, 2, show_classes)
    print('Num classes with no test examples:', trainclasses - testclasses)
    print()
summarize_data_split(5, 0.05)
summarize_data_split(10, 0.05)
summarize_data_split(15, 0.05)
summarize_data_split(20, 0.05, show_classes=False)
summarize_data_split(25, 0.05)
summarize_data_split(30, 0.05)

summarize_data_split(5, 0.07)
summarize_data_split(10, 0.07)
summarize_data_split(15, 0.07)
summarize_data_split(20, 0.07, show_classes=False)
summarize_data_split(25, 0.07)
summarize_data_split(30, 0.07)

summarize_data_split(5, 0.15)
summarize_data_split(10, 0.15)
summarize_data_split(15, 0.15)
summarize_data_split(20, 0.15, show_classes=False)
summarize_data_split(25, 0.15)
summarize_data_split(30, 0.15)

summarize_data_split(5, 0.33)
summarize_data_split(10, 0.33)
summarize_data_split(15, 0.33)
summarize_data_split(20, 0.33, show_classes=False)
summarize_data_split(25, 0.33)
summarize_data_split(30, 0.33)

Removing duplicates

test_db = DocBin().from_disk('data/iter6/test.spacy')
train_db = DocBin().from_disk('data/iter6/train.spacy')
test_docs = list(test_db.get_docs(trained.vocab))
train_docs = list(train_db.get_docs(trained.vocab))
print(len(test_docs), len(train_docs))
def remove_duplicates(docs, seen):
    # A doc counts as a duplicate (and is dropped) if any lowercased 8-token window
    # overlapping one of its spans has already been seen with the same link label;
    # unseen (window, label) pairs are added to `seen` as the docs are scanned.
    new_docs = []
    for i, doc in enumerate(docs):
        # print('Doc:', i)
        spans = doc.spans['sc']
        spans_set = set(spans)
        spans_used = set()
        no_duplicates = True
        for tokens in sliding_window(doc, 8):
            ts = (' '.join(t.text.strip() for t in tokens)).lower()
            all_spans_used = False
            for span in filter(lambda s: any(t in s for t in tokens), spans):
                if (ts, span.label) in seen:
                    # print('Duplicate found:', ts)
                    no_duplicates = False
                    all_spans_used = spans_used == spans_set
                else:
                    seen.add((ts, span.label))
                    spans_used.add(span)
            if all_spans_used:
                break
        if no_duplicates:
            new_docs.append(doc)
    return new_docs, seen
seen = set()
new_test_docs, seen = remove_duplicates(test_docs, seen)
print(len(test_docs), len(new_test_docs))
new_train_docs, seen = remove_duplicates(train_docs, seen)
print(len(train_docs), len(new_train_docs))
def show_docs_summary(docs, desired_examples, show_classes):
    totals = defaultdict(int)
    for link in (s.label_ for d in docs for s in d.spans['sc']):
        totals[link] += 1
    print('Total classes:\t', len(totals))
    print('Total classes with desired number of examples:\t', len({k: v for k, v in totals.items() if v >= desired_examples}))
    if show_classes:
        print('Classes:', sorted([(v, k) for k, v in totals.items()], reverse=True))
    return len(totals)
def summarize_docs_split(train_docs, test_docs, show_classes=False):
    print(f'Train/test counts: {len(train_docs)}/{len(test_docs)}')
    print('(train)')
    trainclasses = show_docs_summary(train_docs, 2, show_classes)
    print('(test)')
    testclasses = show_docs_summary(test_docs, 1, show_classes)
    print('Num classes with no test examples:', trainclasses - testclasses)
    print()
summarize_data_split(30, 0.05)
print()
summarize_docs_split(new_train_docs, new_test_docs)
summarize_docs_split(train_docs, test_docs)

def make_db(docs, name):
    db = DocBin()
    for doc in docs:
        db.add(doc)
    db.to_disk(f'{name}.spacy')
make_db(new_train_docs, 'train-no-dups')
make_db(new_test_docs, 'test-no-dups')