
Commit

K-means, mean shift clustering in sparse categorical ("LE") vector space (#126)
OlegBaskov committed Nov 10, 2018
1 parent 2cb9c1e commit 75675e7
Showing 12 changed files with 187,225 additions and 1,062 deletions.
35,107 changes: 35,107 additions & 0 deletions data/CDS-caps-br-text/LG-English-clean-clean/br-text-clean-clean.ull

Large diffs are not rendered by default.

41,914 changes: 41,914 additions & 0 deletions data/CDS-caps-br-text/LG-English-clean/br-text-txt_U-clean.ull

Large diffs are not rendered by default.

56,547 changes: 56,547 additions & 0 deletions data/CDS-caps-br-text/LG-English/br-text.txt_U.ull

Large diffs are not rendered by default.

52,957 changes: 52,957 additions & 0 deletions data/CDS-caps-br-text/R=6-Weight=6:R-mst-weight=+1:R/br-text-parses-win6-omdist.ull

Large diffs are not rendered by default.

526 changes: 0 additions & 526 deletions notebooks/Child-Directed-Speech-ALE-2018-10-24.ipynb

This file was deleted.

571 changes: 571 additions & 0 deletions notebooks/LE-clustering-KLE-MLE-2018-11-10_.ipynb

Large diffs are not rendered by default.

428 changes: 0 additions & 428 deletions notebooks/POC-English-2018-10-21.ipynb

This file was deleted.

80 changes: 29 additions & 51 deletions src/grammar_learner/category_learner.py
@@ -1,9 +1,9 @@
# language-learning/src/category_learner.py # 81102
# language-learning/src/category_learner.py # 81110
import numpy as np
import pandas as pd
from copy import deepcopy
from collections import OrderedDict
from .utl import UTC # , round1,round2,round3,round4,round5
from .utl import UTC, kwa # , round1,round2,round3,round4,round5
from .read_files import check_dir # , check_mst_files
from .hyperwords import vector_space_dim, pmisvd
from .clustering import cluster_id, best_clusters, group_links, random_clusters
@@ -52,39 +52,21 @@ def add_disjuncts(cats, links, **kwargs):

def learn_categories(links, **kwargs): # 80802 poc05 restructured learner.py
# links == pd.DataFrame(columns = ['word', 'link', 'count'])
def kwa(v, k):
return kwargs[k] if k in kwargs else v

cats_file = kwa('/output', 'output_categories') # to define tmpath
tmpath = kwa('', 'tmpath')
parse_mode = kwa('given', 'parse_mode')
left_wall = kwa('', 'left_wall')
period = kwa(False, 'period')
cats_file = kwa('/output', 'output_categories', **kwargs) # to define tmpath
tmpath = kwa('', 'tmpath', **kwargs)
context = kwa(1, 'context')
window = kwa('mst', 'window')
weighting = kwa('ppmi', 'weighting')
# ? distance = kwa(??, 'distance')
group = kwa(True, 'group')
word_space = kwa('vectors', 'word_space')
dim_max = kwa(100, 'dim_max')
sv_min = kwa(0.1, 'sv_min')
dim_reduction = kwa('svm', 'dim_reduction')
algorithm = kwa('kmeans', 'clustering') # ⇒ best_clusters
cluster_range = kwa((2, 50, 2), 'cluster_range') # ⇒ best_clusters
cluster_criteria = kwa('silhouette', 'cluster_criteria')
cluster_level = kwa(0.9, 'cluster_level')
generalization = kwa('off', 'categories_generalization')
merge = kwa(0.8, 'categories_merge')
aggregate = kwa(0.2, 'categories_aggregation')
grammar_rules = kwa(1, 'grammar_rules')
verbose = kwa('none', 'verbose')
word_space = kwa('vectors', 'word_space', **kwargs)
dim_max = kwa(100, 'dim_max', **kwargs)
sv_min = kwa(0.1, 'sv_min', **kwargs)
algorithm = kwa('kmeans', 'clustering', **kwargs) # ⇒ best_clusters
verbose = kwa('none', 'verbose', **kwargs)

log = OrderedDict()
log.update({'category_learner': '80803-81101'})
log.update({'category_learner': '80803-81110'})
if verbose in ['max', 'debug']:
print(UTC(), ':: category_learner: word_space/algorithm:', word_space, '/', algorithm)

if tmpath == '' or tmpath == 'auto': # temporary files path
if tmpath == '' or tmpath == 'auto':
if '.' not in cats_file:
tmpath = cats_file
else:
@@ -99,7 +81,7 @@ def kwa(v, k):

cdf = pd.DataFrame(columns=['cluster', 'cluster_words'])

# Random Clusters # 80825
# Random Clusters
if algorithm == 'random':
log.update({'clustering': 'random'})
cdf = random_clusters(links, **kwargs)
@@ -128,8 +110,6 @@ def kwa(v, k):
log.update({'vector_space_dim': dim})
if verbose in ['mid', 'max', 'debug']:
print(UTC(), ':: category_learner: vector space dimensionality:', dim, '⇒ pmisvd')
# -vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
# -vdf, sv, re01 = pmisvd(links, tmpath, tmpath, dim) # 81021:
vdf, sv, re01 = pmisvd(links, dict_path, tmpath, dim)
log.update(re01)
if verbose in ['max', 'debug']:
@@ -139,24 +119,25 @@ def kwa(v, k):

# Sparse word space, agglomerative clustering 81021, ... ⇒ any clustering
elif word_space[0] == 's': # sparse
log.update({'clustering': 'agglomerative'})
log.update({'word_space': 'sparse'})
linx, words, features = clean_links(links, **kwargs)
print(
f'{len(links)} links: {len(set(links["word"].tolist()))} unique words, {len(set(links["link"].tolist()))} links')
print(f'{len(links)} links: {len(set(links["word"].tolist()))} unique words, {len(set(links["link"].tolist()))} unique links')
print(f'words: len {len(words)}, min {min(words)}, max {max(words)}')
print(f'features: len {len(features)}, min {min(features)}, max {max(features)}')
print(f'features: {features}')
print(f'linx: {linx}')

counts = co_occurrence_matrix(linx, **kwargs)
print(f'counts: {counts}')
if verbose in ['max', 'debug']:
print(f'counts: {counts}')
cd = categorical_distribution(counts, **kwargs)
print(f'counts.shape {counts.shape},cd.shape {cd.shape}')

if verbose in ['max', 'debug']:
print(f'counts.shape {counts.shape}, cd.shape {cd.shape}')
labels, metrics, centroids = optimal_clusters(cd, **kwargs) # skl_clustering.py
if verbose in ['max', 'debug']:
print(f'labels: {labels},\n{len(sorted(np.unique(labels)))} unique: {sorted(np.unique(labels))}')

print(f'labels: {labels},\n{len(sorted(np.unique(labels)))} unique: {sorted(np.unique(labels))}')

log.update({'silhouette': metrics['silhouette_index']})
log.update(metrics)
# labels ⇒ cdf (legacy, extracted from agglomerative_clustering):
cdf['cluster'] = sorted(np.unique(labels)) # set(labels)
clusters = {x: [] for x in cdf['cluster'].tolist()}
@@ -166,18 +147,16 @@ def kwa(v, k):
cdf['cluster'] = range(1, len(cdf) + 1)
cdf['cluster'] = cdf['cluster'].apply(lambda x: cluster_id(x, len(cdf)))


else: # overkill: ILE
else: # random clusters
if verbose in ['max', 'debug']:
print(UTC(), ':: category_learner ⇒ else ⇒ ILE group_links')
cdf = group_links(links, verbose)
log.update({'clustering': 'else: ILE'})
print(UTC(), ':: category_learner ⇒ else ⇒ random clusters')
cdf = random_clusters(links, **kwargs)
log.update({'clustering': 'random'})

log.update({'n_clusters': len(cdf)})

print('\ncategory_learner: log:\n', log)

return cdf2cats(cdf, **kwargs), log # 81020: cdf2cats
if verbose in ['max', 'debug']:
print('\ncategory_learner: log:\n', log)
return cdf2cats(cdf, **kwargs), log


def cats2list(cats):
@@ -227,7 +206,6 @@ def cdf2cats(clusters, **kwargs):
cats['quality'] = [0 for x in cats['words']]
cats['similarities'] = [[0 for y in x] for x in cats['words']]
cats['children'] = [0 for x in cats['words']]

return cats

# Notes:
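The recurring change in this file (and in grammar_inducer.py and learner.py below) replaces each function's local kwa closure with a shared helper imported from .utl. That helper's body is not part of this diff; a minimal sketch, inferred from the deleted inline closures and the new **kwargs call sites, would be:

# Presumed shape of the shared helper in src/grammar_learner/utl.py,
# not shown in this diff; inferred from the deleted inline closures.
def kwa(v, k, **kwargs):
    # Return kwargs[k] when the key is present, else the default v.
    return kwargs[k] if k in kwargs else v

# New call-site style used throughout this commit:
# word_space = kwa('vectors', 'word_space', **kwargs)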
31 changes: 15 additions & 16 deletions src/grammar_learner/grammar_inducer.py
@@ -1,18 +1,14 @@
# language-learning/src/grammar_inducer.py # 81102
# language-learning/src/grammar_inducer.py # 81110
from copy import deepcopy
from collections import Counter
from typing import List, Tuple
from .utl import UTC
from .utl import UTC, kwa


def induce_grammar(categories, **kwargs): # 81025
# categories == {'cluster': [], 'words': [], ...}
def kwa(v, k):
return kwargs[k] if k in kwargs else v

max_disjuncts = kwa(1000, 'max_disjuncts') # 81025
verbose = kwa('none', 'verbose')

max_disjuncts = kwa(100000, 'max_disjuncts', **kwargs)
verbose = kwa('none', 'verbose', **kwargs)
if verbose in ['max', 'debug']:
print(UTC(), ':: induce_grammar: categories.keys():', categories.keys())

@@ -59,19 +55,22 @@ def kwa(v, k):
print('induce_grammar: rules["disjuncts"][' + str(cluster) + ']', len(rules['disjuncts'][cluster]),
'rules,', len(dj_counts), 'total unique disjuncts')

# 81025 add only top (filtered) disjuncts:
# Add only top-frequency disjuncts:
top_djs = set([x[0] for x in dj_counts.most_common(max_disjuncts)])

for cluster in clusters:
rules['disjuncts'][cluster] = top_djs & rules['disjuncts'][cluster]

if verbose in ['debug']:
print(max_disjuncts, 'top_djs:', top_djs)
for cluster in clusters:
rules['disjuncts'][cluster] = top_djs & rules['disjuncts'][cluster]
print(max_disjuncts, 'max_disjuncts, len(top_djs):', len(top_djs))
print('\nrules:')
nr = 0
for cluster in clusters:
print('\n', cluster, len(rules['disjuncts'][cluster]))
nr += len(rules['disjuncts'][cluster])
print('Total:', nr)
rule_lengths = {x: len(rules['disjuncts'][x]) for x in clusters}
#for cluster in clusters:
# print('\n', cluster, len(rules['disjuncts'][cluster]))
# nr += len(rules['disjuncts'][cluster])
print('Rule lengths:', rule_lengths, 'total', sum(rule_lengths.values()),
'rules ⇒ average', round(sum(rule_lengths.values())/len(rule_lengths)), 'rules/cluster')

# rules['djs'] = deepcopy(rules['disjuncts']) # no need: conversion in g12n
# TODO?: check jaccard with tuples else replace with numbers?
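The top-disjunct filter above reduces to a Counter whitelist intersected with each cluster's disjunct set. A toy illustration of the pattern, with hypothetical data (not from the repository):

from collections import Counter

# Hypothetical clusters and their disjuncts, for illustration only.
disjuncts = {'C01': {'a- & b+', 'c+'}, 'C02': {'c+', 'd-'}, 'C03': {'c+', 'a- & b+'}}
dj_counts = Counter(dj for djs in disjuncts.values() for dj in djs)
top_djs = set(x[0] for x in dj_counts.most_common(2))  # max_disjuncts = 2
filtered = {cluster: top_djs & djs for cluster, djs in disjuncts.items()}
print(filtered)  # the rare disjunct 'd-' is dropped from every cluster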
28 changes: 11 additions & 17 deletions src/grammar_learner/learner.py
@@ -1,12 +1,12 @@
# language-learning/src/learner.py # 81021
# language-learning/src/learner.py # 81106
import os
from copy import deepcopy
import pickle, numpy as np, pandas as pd
from shutil import copy2 as copy
from IPython.display import display
from collections import OrderedDict
from .widgets import html_table
from .utl import UTC
from .utl import UTC, kwa
from .read_files import check_dir, check_mst_files
from .pparser import files2links
from .category_learner import learn_categories, add_disjuncts, cats2list
@@ -18,16 +18,13 @@


def learn_grammar(**kwargs):
log = OrderedDict({'start': str(UTC()), 'learn_grammar': 'v.0.6.80929'})

def kwa(v, k):
return kwargs[k] if k in kwargs else v
log = OrderedDict({'start': str(UTC()), 'learn_grammar': 'v.0.7.81109'})

input_parses = kwargs['input_parses']
output_grammar = kwargs['output_grammar']
output_categories = kwa('', 'output_categories')
output_statistics = kwa('', 'output_statistics')
temp_dir = kwa('', 'temp_dir')
output_categories = kwa('', 'output_categories', **kwargs)
output_statistics = kwa('', 'output_statistics', **kwargs)
temp_dir = kwa('', 'temp_dir', **kwargs)
if os.path.isdir(output_grammar):
prj_dir = output_grammar
else:
@@ -43,14 +40,11 @@ def kwa(v, k):
if os.path.isdir(temp_dir):
kwargs['tmpath'] = temp_dir

context = kwa(1, 'context')
word_space = kwa('vectors', 'word_space')
clustering = kwa('kmeans', 'clustering') # TODO: update
cluster_range = kwa((2, 48, 1), 'cluster_range')
cats_gen = kwa('off', 'categories_generalization')
grammar_rules = kwa(1, 'grammar_rules')
verbose = kwa('none', 'verbose')
tmpath = kwa('', 'tmpath')
context = kwa(1, 'context', **kwargs)
clustering = kwa('kmeans', 'clustering', **kwargs) # TODO: update
cats_gen = kwa('off', 'categories_generalization', **kwargs)
grammar_rules = kwa(1, 'grammar_rules', **kwargs)
verbose = kwa('none', 'verbose', **kwargs)

files, re01 = check_mst_files(input_parses, verbose)
log.update(re01)
30 changes: 24 additions & 6 deletions src/grammar_learner/pqa_table.py
@@ -1,4 +1,4 @@
# language-learning/src/grammar_learner/pqa_table.py # 81022
# language-learning/src/grammar_learner/pqa_table.py # 81109
# Test Grammar Learner to fill in ULL Project Plan Parses spreadsheet
import os, sys, time
from ..common import handle_path_string
@@ -8,8 +8,12 @@
from .learner import learn_grammar


def params(corpus, dataset, module_path, out_dir, **kwargs): # 81022
def params(corpus, dataset, module_path, out_dir, **kwargs): # 81109
input_parses = module_path + '/data/' + corpus + '/' + dataset
if type(kwargs['clustering']) is str:
clustering = kwargs['clustering']
else:
clustering = kwargs['clustering'][0]
if check_dir(input_parses, create=False, verbose='min'):
batch_dir = out_dir + '/' + corpus
spaces = ['w', 'c', 'd'] # 'words','connectors', 'disjuncts'
@@ -27,7 +31,15 @@ def params(corpus, dataset, module_path, out_dir, **kwargs): # 81022
elif kwargs['word_space'] == 'discrete':
wtf = 'ILE'
elif kwargs['word_space'] == 'sparse':
wtf = 'ALE'
# wtf = 'ALE' # 81109:
if clustering == 'agglomerative':
wtf = 'ALE'
elif clustering in ['k-means', 'kmeans']:
wtf = 'KLE'
elif clustering[:4] == 'mean': # ['mean shift', 'mean_shift']:
wtf = 'MLE'
else:
wtf = '?LE'
else: wtf = '???'
if kwargs['left_wall'] in ['', 'none']:
left_wall = 'no-LW'
@@ -81,7 +93,7 @@ def table_rows(lines, out_dir, cp, rp, runs=(1, 1), **kwargs): # 81021
header = ['Line', 'Corpus', 'Parsing', 'LW', 'RW', 'Gen.', 'Space',
'Rules', 'Silhouette', 'PA', 'PQ', 'F1']
spaces = ''
if kwargs['clustering'] == 'random': # 80825 Random clusters
if kwargs['clustering'] == 'random':
spaces += 'RND'
else:
if kwargs['context'] == 1:
@@ -93,7 +105,14 @@ def kwa(v, k):
elif kwargs['word_space'] == 'discrete':
spaces += 'ILE'
elif kwargs['word_space'] == 'sparse':
spaces += 'ALE'
if kwargs['clustering'][0] == 'agglomerative':
spaces += 'ALE'
elif kwargs['clustering'][0] in ['k-means', 'kmeans']:
spaces += 'KLE'
elif kwargs['clustering'][0][:4] == 'mean': # ['mean shift', 'mean_shift']:
spaces += 'MLE'
else:
spaces += '?LE'
else:
spaces += '???'
if kwargs['grammar_rules'] == 1:
@@ -216,4 +235,3 @@ def table_rows(lines, out_dir, cp, rp, runs=(1, 1), **kwargs): # 81021
# -1: connectors #Cxx: {C01Cxx- or ... CnCxx-} and {CxxC01+ or ... CxxCn+}
# -2: disjuncts #Cxx: (C01Cxx-) or (C02Cxx-) ... or (CxxCn+)
# 81018: unified table_rows, ready for next test_grammar, table: PA/PQ/F1
# TODO: add new test_grammar; add new metrics from updated learn_grammar
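The KLE and MLE labels above correspond to the new k-means and mean-shift paths through optimal_clusters in skl_clustering.py, whose body is not included in this diff. A minimal sketch of what that dispatch could look like, assuming scikit-learn estimators and silhouette-based selection (names and defaults here are guesses, not the repository's code):

import numpy as np
from sklearn.cluster import KMeans, MeanShift
from sklearn.metrics import silhouette_score

def optimal_clusters_sketch(cd, clustering='kmeans', cluster_range=(2, 50, 2), **kwargs):
    X = np.asarray(cd)
    if clustering[:4] == 'mean':           # 'mean shift' / 'mean_shift'
        model = MeanShift().fit(X)         # chooses the cluster count itself
    else:                                  # 'kmeans' / 'k-means'
        scored = []
        for k in range(*cluster_range):    # scan the requested cluster range
            m = KMeans(n_clusters=k, random_state=0).fit(X)
            scored.append((silhouette_score(X, m.labels_), m))
        model = max(scored, key=lambda t: t[0])[1]
    labels, centroids = model.labels_, model.cluster_centers_
    n_unique = len(np.unique(labels))
    metrics = {'silhouette_index':
               float(silhouette_score(X, labels)) if n_unique > 1 else 0.0}
    return labels, metrics, centroids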
