
Commit

K-means, mean shift clustering in sparse categorical ("LE") vector space (#126)
OlegBaskov committed Nov 10, 2018
1 parent 2cb9c1e commit 75675e7
Showing 12 changed files with 187,225 additions and 1,062 deletions.
35,107 changes: 35,107 additions & 0 deletions data/CDS-caps-br-text/LG-English-clean-clean/br-text-clean-clean.ull

Large diffs are not rendered by default.

41,914 changes: 41,914 additions & 0 deletions data/CDS-caps-br-text/LG-English-clean/br-text-txt_U-clean.ull

Large diffs are not rendered by default.

56,547 changes: 56,547 additions & 0 deletions data/CDS-caps-br-text/LG-English/br-text.txt_U.ull

Large diffs are not rendered by default.

52,957 changes: 52,957 additions & 0 deletions data/CDS-caps-br-text/R=6-Weight=6:R-mst-weight=+1:R/br-text-parses-win6-omdist.ull

Large diffs are not rendered by default.

526 changes: 0 additions & 526 deletions notebooks/Child-Directed-Speech-ALE-2018-10-24.ipynb

This file was deleted.

571 changes: 571 additions & 0 deletions notebooks/LE-clustering-KLE-MLE-2018-11-10_.ipynb

Large diffs are not rendered by default.

428 changes: 0 additions & 428 deletions notebooks/POC-English-2018-10-21.ipynb

This file was deleted.

80 changes: 29 additions & 51 deletions src/grammar_learner/category_learner.py
@@ -1,9 +1,9 @@
# language-learning/src/category_learner.py # 81102
# language-learning/src/category_learner.py # 81110
import numpy as np
import pandas as pd
from copy import deepcopy
from collections import OrderedDict
from .utl import UTC # , round1,round2,round3,round4,round5
from .utl import UTC, kwa # , round1,round2,round3,round4,round5
from .read_files import check_dir # , check_mst_files
from .hyperwords import vector_space_dim, pmisvd
from .clustering import cluster_id, best_clusters, group_links, random_clusters
@@ -52,39 +52,21 @@ def add_disjuncts(cats, links, **kwargs):

def learn_categories(links, **kwargs): # 80802 poc05 restructured learner.py
# links == pd.DataFrame(columns = ['word', 'link', 'count'])
def kwa(v, k):
return kwargs[k] if k in kwargs else v

cats_file = kwa('/output', 'output_categories') # to define tmpath
tmpath = kwa('', 'tmpath')
parse_mode = kwa('given', 'parse_mode')
left_wall = kwa('', 'left_wall')
period = kwa(False, 'period')
cats_file = kwa('/output', 'output_categories', **kwargs) # to define tmpath
tmpath = kwa('', 'tmpath', **kwargs)
context = kwa(1, 'context')
window = kwa('mst', 'window')
weighting = kwa('ppmi', 'weighting')
# ? distance = kwa(??, 'distance')
group = kwa(True, 'group')
word_space = kwa('vectors', 'word_space')
dim_max = kwa(100, 'dim_max')
sv_min = kwa(0.1, 'sv_min')
dim_reduction = kwa('svm', 'dim_reduction')
algorithm = kwa('kmeans', 'clustering') # ⇒ best_clusters
cluster_range = kwa((2, 50, 2), 'cluster_range') # ⇒ best_clusters
cluster_criteria = kwa('silhouette', 'cluster_criteria')
cluster_level = kwa(0.9, 'cluster_level')
generalization = kwa('off', 'categories_generalization')
merge = kwa(0.8, 'categories_merge')
aggregate = kwa(0.2, 'categories_aggregation')
grammar_rules = kwa(1, 'grammar_rules')
verbose = kwa('none', 'verbose')
word_space = kwa('vectors', 'word_space', **kwargs)
dim_max = kwa(100, 'dim_max', **kwargs)
sv_min = kwa(0.1, 'sv_min', **kwargs)
algorithm = kwa('kmeans', 'clustering', **kwargs) # ⇒ best_clusters
verbose = kwa('none', 'verbose', **kwargs)

log = OrderedDict()
log.update({'category_learner': '80803-81101'})
log.update({'category_learner': '80803-81110'})
if verbose in ['max', 'debug']:
print(UTC(), ':: category_learner: word_space/algorithm:', word_space, '/', algorithm)

if tmpath == '' or tmpath == 'auto': # temporary files path
if tmpath == '' or tmpath == 'auto':
if '.' not in cats_file:
tmpath = cats_file
else:
@@ -99,7 +81,7 @@ def kwa(v, k):

cdf = pd.DataFrame(columns=['cluster', 'cluster_words'])

# Random Clusters # 80825
# Random Clusters
if algorithm == 'random':
log.update({'clustering': 'random'})
cdf = random_clusters(links, **kwargs)
@@ -128,8 +110,6 @@ def kwa(v, k):
log.update({'vector_space_dim': dim})
if verbose in ['mid', 'max', 'debug']:
print(UTC(), ':: category_learner: vector space dimensionality:', dim, '⇒ pmisvd')
# -vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
# -vdf, sv, re01 = pmisvd(links, tmpath, tmpath, dim) # 81021:
vdf, sv, re01 = pmisvd(links, dict_path, tmpath, dim)
log.update(re01)
if verbose in ['max', 'debug']:
@@ -139,24 +119,25 @@ def kwa(v, k):

# Sparse word space, agglomerative clustering 81021, ... ⇒ any clustering
elif word_space[0] == 's': # sparse
log.update({'clustering': 'agglomerative'})
log.update({'word_space': 'sparse'})
linx, words, features = clean_links(links, **kwargs)
print(
f'{len(links)} links: {len(set(links["word"].tolist()))} unique words, {len(set(links["link"].tolist()))} links')
print(f'{len(links)} links: {len(set(links["word"].tolist()))} unique words, {len(set(links["link"].tolist()))} unique links')
print(f'words: len {len(words)}, min {min(words)}, max {max(words)}')
print(f'features: len {len(features)}, min {min(features)}, max {max(features)}')
print(f'features: {features}')
print(f'linx: {linx}')

counts = co_occurrence_matrix(linx, **kwargs)
print(f'counts: {counts}')
if verbose in ['max', 'debug']:
print(f'counts: {counts}')
cd = categorical_distribution(counts, **kwargs)
print(f'counts.shape {counts.shape},cd.shape {cd.shape}')

if verbose in ['max', 'debug']:
print(f'counts.shape {counts.shape}, cd.shape {cd.shape}')
labels, metrics, centroids = optimal_clusters(cd, **kwargs) # skl_clustering.py
if verbose in ['max', 'debug']:
print(f'labels: {labels},\n{len(sorted(np.unique(labels)))} unique: {sorted(np.unique(labels))}')

print(f'labels: {labels},\n{len(sorted(np.unique(labels)))} unique: {sorted(np.unique(labels))}')

log.update({'silhouette': metrics['silhouette_index']})
log.update(metrics)
# labels ⇒ cdf (legacy, extracted from agglomerative_clustering):
cdf['cluster'] = sorted(np.unique(labels)) # set(labels)
clusters = {x: [] for x in cdf['cluster'].tolist()}
@@ -166,18 +147,16 @@ def kwa(v, k):
cdf['cluster'] = range(1, len(cdf) + 1)
cdf['cluster'] = cdf['cluster'].apply(lambda x: cluster_id(x, len(cdf)))


else: # overkill: ILE
else: # random clusters
if verbose in ['max', 'debug']:
print(UTC(), ':: category_learner ⇒ else ⇒ ILE group_links')
cdf = group_links(links, verbose)
log.update({'clustering': 'else: ILE'})
print(UTC(), ':: category_learner ⇒ else ⇒ random clusters')
cdf = random_clusters(links, **kwargs)
log.update({'clustering': 'random'})

log.update({'n_clusters': len(cdf)})

print('\ncategory_learner: log:\n', log)

return cdf2cats(cdf, **kwargs), log # 81020: cdf2cats
if verbose in ['max', 'debug']:
print('\ncategory_learner: log:\n', log)
return cdf2cats(cdf, **kwargs), log


def cats2list(cats):
@@ -227,7 +206,6 @@ def cdf2cats(clusters, **kwargs):
cats['quality'] = [0 for x in cats['words']]
cats['similarities'] = [[0 for y in x] for x in cats['words']]
cats['children'] = [0 for x in cats['words']]

return cats

# Notes:
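The recurring change in this file (and in grammar_inducer.py and learner.py below) replaces each function's local kwa closure with a shared helper imported from .utl. That helper's body is not part of this diff; a minimal sketch, inferred from the deleted inline closures and the new **kwargs call sites, would be:

# Presumed shape of the shared helper in src/grammar_learner/utl.py,
# not shown in this diff; inferred from the deleted inline closures.
def kwa(v, k, **kwargs):
    # Return kwargs[k] when the key is present, else the default v.
    return kwargs[k] if k in kwargs else v

# New call-site style used throughout this commit:
# word_space = kwa('vectors', 'word_space', **kwargs)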
31 changes: 15 additions & 16 deletions src/grammar_learner/grammar_inducer.py
@@ -1,18 +1,14 @@
# language-learning/src/grammar_inducer.py # 81102
# language-learning/src/grammar_inducer.py # 81110
from copy import deepcopy
from collections import Counter
from typing import List, Tuple
from .utl import UTC
from .utl import UTC, kwa


def induce_grammar(categories, **kwargs): # 81025
# categories == {'cluster': [], 'words': [], ...}
def kwa(v, k):
return kwargs[k] if k in kwargs else v

max_disjuncts = kwa(1000, 'max_disjuncts') # 81025
verbose = kwa('none', 'verbose')

max_disjuncts = kwa(100000, 'max_disjuncts', **kwargs)
verbose = kwa('none', 'verbose', **kwargs)
if verbose in ['max', 'debug']:
print(UTC(), ':: induce_grammar: categories.keys():', categories.keys())

@@ -59,19 +55,22 @@ def kwa(v, k):
print('induce_grammar: rules["disjuncts"][' + str(cluster) + ']', len(rules['disjuncts'][cluster]),
'rules,', len(dj_counts), 'total unique disjuncts')

# 81025 add only top (filtered) disjuncts:
# Add only top-frequency disjuncts:
top_djs = set([x[0] for x in dj_counts.most_common(max_disjuncts)])

for cluster in clusters:
rules['disjuncts'][cluster] = top_djs & rules['disjuncts'][cluster]

if verbose in ['debug']:
print(max_disjuncts, 'top_djs:', top_djs)
for cluster in clusters:
rules['disjuncts'][cluster] = top_djs & rules['disjuncts'][cluster]
print(max_disjuncts, 'max_disjuncts, len(top_djs):', len(top_djs))
print('\nrules:')
nr = 0
for cluster in clusters:
print('\n', cluster, len(rules['disjuncts'][cluster]))
nr += len(rules['disjuncts'][cluster])
print('Total:', nr)
rule_lengths = {x: len(rules['disjuncts'][x]) for x in clusters}
#for cluster in clusters:
# print('\n', cluster, len(rules['disjuncts'][cluster]))
# nr += len(rules['disjuncts'][cluster])
print('Rule lengths:', rule_lengths, 'total', sum(rule_lengths.values()),
'rules ⇒ average', round(sum(rule_lengths.values())/len(rule_lengths)), 'rules/cluster')

# rules['djs'] = deepcopy(rules['disjuncts']) # no need: conversion in g12n
# TODO?: check jaccard with tuples else replace with numbers?
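The top-disjunct filter above reduces to a Counter whitelist intersected with each cluster's disjunct set. A toy illustration of the pattern, with hypothetical data (not from the repository):

from collections import Counter

# Hypothetical clusters and their disjuncts, for illustration only.
disjuncts = {'C01': {'a- & b+', 'c+'}, 'C02': {'c+', 'd-'}, 'C03': {'c+', 'a- & b+'}}
dj_counts = Counter(dj for djs in disjuncts.values() for dj in djs)
top_djs = set(x[0] for x in dj_counts.most_common(2))  # max_disjuncts = 2
filtered = {cluster: top_djs & djs for cluster, djs in disjuncts.items()}
print(filtered)  # the rare disjunct 'd-' is dropped from every cluster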
28 changes: 11 additions & 17 deletions src/grammar_learner/learner.py
@@ -1,12 +1,12 @@
# language-learning/src/learner.py # 81021
# language-learning/src/learner.py # 81106
import os
from copy import deepcopy
import pickle, numpy as np, pandas as pd
from shutil import copy2 as copy
from IPython.display import display
from collections import OrderedDict
from .widgets import html_table
from .utl import UTC
from .utl import UTC, kwa
from .read_files import check_dir, check_mst_files
from .pparser import files2links
from .category_learner import learn_categories, add_disjuncts, cats2list
@@ -18,16 +18,13 @@


def learn_grammar(**kwargs):
log = OrderedDict({'start': str(UTC()), 'learn_grammar': 'v.0.6.80929'})

def kwa(v, k):
return kwargs[k] if k in kwargs else v
log = OrderedDict({'start': str(UTC()), 'learn_grammar': 'v.0.7.81109'})

input_parses = kwargs['input_parses']
output_grammar = kwargs['output_grammar']
output_categories = kwa('', 'output_categories')
output_statistics = kwa('', 'output_statistics')
temp_dir = kwa('', 'temp_dir')
output_categories = kwa('', 'output_categories', **kwargs)
output_statistics = kwa('', 'output_statistics', **kwargs)
temp_dir = kwa('', 'temp_dir', **kwargs)
if os.path.isdir(output_grammar):
prj_dir = output_grammar
else:
@@ -43,14 +40,11 @@ def kwa(v, k):
if os.path.isdir(temp_dir):
kwargs['tmpath'] = temp_dir

context = kwa(1, 'context')
word_space = kwa('vectors', 'word_space')
clustering = kwa('kmeans', 'clustering') # TODO: update
cluster_range = kwa((2, 48, 1), 'cluster_range')
cats_gen = kwa('off', 'categories_generalization')
grammar_rules = kwa(1, 'grammar_rules')
verbose = kwa('none', 'verbose')
tmpath = kwa('', 'tmpath')
context = kwa(1, 'context', **kwargs)
clustering = kwa('kmeans', 'clustering', **kwargs) # TODO: update
cats_gen = kwa('off', 'categories_generalization', **kwargs)
grammar_rules = kwa(1, 'grammar_rules', **kwargs)
verbose = kwa('none', 'verbose', **kwargs)

files, re01 = check_mst_files(input_parses, verbose)
log.update(re01)
30 changes: 24 additions & 6 deletions src/grammar_learner/pqa_table.py
@@ -1,4 +1,4 @@
# language-learning/src/grammar_learner/pqa_table.py # 81022
# language-learning/src/grammar_learner/pqa_table.py # 81109
# Test Grammar Learner to fill in ULL Project Plan Parses spreadsheet
import os, sys, time
from ..common import handle_path_string
@@ -8,8 +8,12 @@
from .learner import learn_grammar


def params(corpus, dataset, module_path, out_dir, **kwargs): # 81022
def params(corpus, dataset, module_path, out_dir, **kwargs): # 81109
input_parses = module_path + '/data/' + corpus + '/' + dataset
if type(kwargs['clustering']) is str:
clustering = kwargs['clustering']
else:
clustering = kwargs['clustering'][0]
if check_dir(input_parses, create=False, verbose='min'):
batch_dir = out_dir + '/' + corpus
spaces = ['w', 'c', 'd'] # 'words','connectors', 'disjuncts'
@@ -27,7 +31,15 @@ def params(corpus, dataset, module_path, out_dir, **kwargs): # 81022
elif kwargs['word_space'] == 'discrete':
wtf = 'ILE'
elif kwargs['word_space'] == 'sparse':
wtf = 'ALE'
# wtf = 'ALE' # 81109:
if clustering == 'agglomerative':
wtf = 'ALE'
elif clustering in ['k-means', 'kmeans']:
wtf = 'KLE'
elif clustering[:4] == 'mean': # ['mean shift', 'mean_shift']:
wtf = 'MLE'
else:
wtf = '?LE'
else: wtf = '???'
if kwargs['left_wall'] in ['', 'none']:
left_wall = 'no-LW'
@@ -81,7 +93,7 @@ def table_rows(lines, out_dir, cp, rp, runs=(1, 1), **kwargs): # 81021
header = ['Line', 'Corpus', 'Parsing', 'LW', 'RW', 'Gen.', 'Space',
'Rules', 'Silhouette', 'PA', 'PQ', 'F1']
spaces = ''
if kwargs['clustering'] == 'random': # 80825 Random clusters
if kwargs['clustering'] == 'random':
spaces += 'RND'
else:
if kwargs['context'] == 1:
@@ -93,7 +105,14 @@ def kwa(v, k):
elif kwargs['word_space'] == 'discrete':
spaces += 'ILE'
elif kwargs['word_space'] == 'sparse':
spaces += 'ALE'
if kwargs['clustering'][0] == 'agglomerative':
spaces += 'ALE'
elif kwargs['clustering'][0] in ['k-means', 'kmeans']:
spaces += 'KLE'
elif kwargs['clustering'][0][:4] == 'mean': # ['mean shift', 'mean_shift']:
spaces += 'MLE'
else:
spaces += '?LE'
else:
spaces += '???'
if kwargs['grammar_rules'] == 1:
@@ -216,4 +235,3 @@ def table_rows(lines, out_dir, cp, rp, runs=(1, 1), **kwargs): # 81021
# -1: connectors #Cxx: {C01Cxx- or ... CnCxx-} and {CxxC01+ or ... CxxCn+}
# -2: disjuncts #Cxx: (C01Cxx-) or (C02Cxx-) ... or (CxxCn+)
# 81018: unified table_rows, ready for next test_grammar, table: PA/PQ/F1
# TODO: add new test_grammar; add new metrics from updated learn_grammar
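The KLE and MLE labels above correspond to the new k-means and mean-shift paths through optimal_clusters in skl_clustering.py, whose body is not included in this diff. A minimal sketch of what that dispatch could look like, assuming scikit-learn estimators and silhouette-based selection (names and defaults here are guesses, not the repository's code):

import numpy as np
from sklearn.cluster import KMeans, MeanShift
from sklearn.metrics import silhouette_score

def optimal_clusters_sketch(cd, clustering='kmeans', cluster_range=(2, 50, 2), **kwargs):
    X = np.asarray(cd)
    if clustering[:4] == 'mean':           # 'mean shift' / 'mean_shift'
        model = MeanShift().fit(X)         # chooses the cluster count itself
    else:                                  # 'kmeans' / 'k-means'
        scored = []
        for k in range(*cluster_range):    # scan the requested cluster range
            m = KMeans(n_clusters=k, random_state=0).fit(X)
            scored.append((silhouette_score(X, m.labels_), m))
        model = max(scored, key=lambda t: t[0])[1]
    labels, centroids = model.labels_, model.cluster_centers_
    n_unique = len(np.unique(labels))
    metrics = {'silhouette_index':
               float(silhouette_score(X, labels)) if n_unique > 1 else 0.0}
    return labels, metrics, centroids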
