From f7beeaccbade430390193b8d75afba13e51166ce Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Sat, 11 Aug 2018 18:40:23 +0200 Subject: [PATCH 1/4] Migrate to sourced.ml - 1 Signed-off-by: Alexander Bezzubov --- requirements.txt | 2 +- setup.py | 4 ++-- tmsc/__main__.py | 6 +++--- tmsc/environment.py | 5 ++--- tmsc/topic_detector.py | 10 ++++++---- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index ce0d015..a8beb9c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -ast2vec>=0.3.8-alpha +sourced-ml==0.5.1 diff --git a/setup.py b/setup.py index 862350f..8f9537e 100644 --- a/setup.py +++ b/setup.py @@ -17,8 +17,8 @@ "console_scripts": ["tmsc=tmsc.__main__:main"], }, keywords=["machine learning on source code", "topic modeling", - "github", "bblfsh", "babelfish", "ast2vec"], - install_requires=["ast2vec>=0.3.8-alpha"], + "github", "bblfsh", "babelfish"], + install_requires=["sourced-ml>=0.5.1"], package_data={"": ["LICENSE.md", "README.md"]}, classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tmsc/__main__.py b/tmsc/__main__.py index c4f2a56..fdac0c3 100644 --- a/tmsc/__main__.py +++ b/tmsc/__main__.py @@ -3,13 +3,13 @@ import logging import sys -from ast2vec import Topics, DocumentFrequencies, DEFAULT_BBLFSH_TIMEOUT -from ast2vec.bow import BOWBase +from sourced.ml.models import BOW, Topics, DocumentFrequencies from modelforge.backends import create_backend from tmsc.environment import initialize from tmsc.topic_detector import TopicDetector +DEFAULT_BBLFSH_TIMEOUT = 10 def main(): parser = argparse.ArgumentParser() @@ -52,7 +52,7 @@ def main(): args.df = DocumentFrequencies(log_level=args.log_level).load( source=args.df, backend=backend) if args.bow is not None: - args.bow = BOWBase(log_level=args.log_level).load(source=args.bow, backend=backend) + args.bow = BOW(log_level=args.log_level).load(source=args.bow, backend=backend) sr = TopicDetector( topics=args.topics, docfreq=args.df, bow=args.bow, verbosity=args.log_level, prune_df_threshold=args.prune_df, gcs_bucket=args.gcs, repo2bow_kwargs={ diff --git a/tmsc/environment.py b/tmsc/environment.py index fdbc0e5..111c20e 100644 --- a/tmsc/environment.py +++ b/tmsc/environment.py @@ -1,7 +1,6 @@ import logging from modelforge.logs import setup_logging -from ast2vec import ensure_bblfsh_is_running_noexc, install_enry __initialized__ = False @@ -22,6 +21,6 @@ def initialize(log_level=logging.INFO, enry="./enry"): if __initialized__: return setup_logging(log_level) - ensure_bblfsh_is_running_noexc() - install_enry(target=enry, warn_exists=False) +# ensure_bblfsh_is_running_noexc() +# install_enry(target=enry, warn_exists=False) __initialized__ = True diff --git a/tmsc/topic_detector.py b/tmsc/topic_detector.py index 1d0e463..da400ae 100644 --- a/tmsc/topic_detector.py +++ b/tmsc/topic_detector.py @@ -1,9 +1,11 @@ import logging import re -from ast2vec import Topics, Repo2Base, DocumentFrequencies -from ast2vec.bow import BOWBase +from sourced.ml.models import BOW, Topics, DocumentFrequencies + +from ast2vec import Repo2Base from ast2vec.model2.uast2bow import Uasts2BOW + from modelforge.backends import create_backend import numpy from scipy.sparse import csr_matrix @@ -15,7 +17,7 @@ class Repo2BOW(Repo2Base): """ Implements the step repository -> :class:`ast2vec.nbow.NBOW`. """ - MODEL_CLASS = BOWBase + MODEL_CLASS = BOW def __init__(self, vocabulary, docfreq, **kwargs): super().__init__(**kwargs) @@ -62,7 +64,7 @@ def __init__(self, topics=None, docfreq=None, bow=None, verbosity=logging.DEBUG, self._docfreq = self._docfreq.prune(prune_df_threshold) self._log.info("Loaded docfreq model: %s", self._docfreq) if bow is not None: - assert isinstance(bow, BOWBase) + assert isinstance(bow, BOW) self._bow = bow if self._topics.matrix.shape[1] != self._bow.matrix.shape[1]: raise ValueError("Models do not match: topics has %s tokens while bow has %s" % From b43e3c585f2e8a100722f9bb3ae2f9028088a558 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Sun, 12 Aug 2018 08:00:43 +0200 Subject: [PATCH 2/4] Migrate to sourced.ml - 2 Signed-off-by: Alexander Bezzubov --- requirements.txt | 1 + setup.py | 2 +- tmsc/__main__.py | 28 +++++++++----- tmsc/topic_detector.py | 86 ++++++++++++++++++------------------------ tmsc/uast2bow.py | 55 +++++++++++++++++++++++++++ 5 files changed, 111 insertions(+), 61 deletions(-) create mode 100644 tmsc/uast2bow.py diff --git a/requirements.txt b/requirements.txt index a8beb9c..14e430a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ sourced-ml==0.5.1 +ast2vec>=0.3.8-alpha diff --git a/setup.py b/setup.py index 8f9537e..18cf304 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ }, keywords=["machine learning on source code", "topic modeling", "github", "bblfsh", "babelfish"], - install_requires=["sourced-ml>=0.5.1"], + install_requires=["sourced-ml>=0.5.1", "ast2vec>=0.3.8-alpha"], package_data={"": ["LICENSE.md", "README.md"]}, classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tmsc/__main__.py b/tmsc/__main__.py index fdac0c3..60e175d 100644 --- a/tmsc/__main__.py +++ b/tmsc/__main__.py @@ -2,14 +2,16 @@ import json import logging import sys +import os from sourced.ml.models import BOW, Topics, DocumentFrequencies from modelforge.backends import create_backend +from modelforge.index import GitIndex from tmsc.environment import initialize from tmsc.topic_detector import TopicDetector -DEFAULT_BBLFSH_TIMEOUT = 10 +DEFAULT_BBLFSH_TIMEOUT = 20 def main(): parser = argparse.ArgumentParser() @@ -33,6 +35,10 @@ def main(): parser.add_argument("--prune-df", default=20, type=int, help="Minimum number of times an identifer must occur in different " "documents to be taken into account.") + parser.add_argument("--index_repo", default="https://github.com/src-d/models", + help="Models index repository.") + parser.add_argument("--index_cache", default=os.path.join(BOW.cache_dir(), "models"), + help="Local cache of models index repository") parser.add_argument("-n", "--nnn", default=10, type=int, help="Number of topics to print.") parser.add_argument("-f", "--format", default="human", choices=["json", "human"], @@ -42,24 +48,26 @@ def main(): if args.linguist is None: args.linguist = "./enry" initialize(args.log_level, enry=args.linguist) + if args.gcs: backend = create_backend(args="bucket=" + args.gcs) else: - backend = create_backend() - if args.topics is not None: - args.topics = Topics(log_level=args.log_level).load(source=args.topics, backend=backend) - if args.df is not None: - args.df = DocumentFrequencies(log_level=args.log_level).load( - source=args.df, backend=backend) - if args.bow is not None: - args.bow = BOW(log_level=args.log_level).load(source=args.bow, backend=backend) + git_index = GitIndex(index_repo=args.index_repo, cache=args.index_cache, log_level=args.log_level) + backend = create_backend(git_index=git_index) + + args.topics = Topics(log_level=args.log_level).load(source=args.topics, backend=backend) #source=args.topics + args.df = DocumentFrequencies(log_level=args.log_level).load(source=args.df, backend=backend) + args.bow = BOW(log_level=args.log_level).load(source=args.bow, backend=backend) + sr = TopicDetector( topics=args.topics, docfreq=args.df, bow=args.bow, verbosity=args.log_level, - prune_df_threshold=args.prune_df, gcs_bucket=args.gcs, repo2bow_kwargs={ + prune_df_threshold=args.prune_df, repo2bow_kwargs={ "linguist": args.linguist, "bblfsh_endpoint": args.bblfsh, "timeout": args.timeout}) + topics = sr.query(args.input, size=args.nnn) + if args.format == "json": json.dump({"repository": args.input, "topics": topics}, sys.stdout) elif args.format == "human": diff --git a/tmsc/topic_detector.py b/tmsc/topic_detector.py index da400ae..c96edd8 100644 --- a/tmsc/topic_detector.py +++ b/tmsc/topic_detector.py @@ -1,17 +1,16 @@ import logging import re -from sourced.ml.models import BOW, Topics, DocumentFrequencies - from ast2vec import Repo2Base -from ast2vec.model2.uast2bow import Uasts2BOW +#from ast2vec.model2.uast2bow import Uasts2BOW #?replace \w sourced.ml + +from sourced.ml.models import BOW, Topics, DocumentFrequencies -from modelforge.backends import create_backend import numpy from scipy.sparse import csr_matrix from tmsc.environment import initialize - +from tmsc.uast2bow import Uasts2BOW class Repo2BOW(Repo2Base): """ @@ -33,77 +32,63 @@ class TopicDetector: r"(https://|ssh://git@|git://)(github.com/[^/]+/[^/]+)(|.git|/)") def __init__(self, topics=None, docfreq=None, bow=None, verbosity=logging.DEBUG, - prune_df_threshold=1, gcs_bucket=None, initialize_environment=True, - repo2bow_kwargs=None): - if initialize_environment: - initialize() + prune_df_threshold=1, repo2bow_kwargs=None): + self._log = logging.getLogger("topic_detector") self._log.setLevel(verbosity) - if gcs_bucket: - backend = create_backend(args="bucket=" + gcs_bucket) - else: - backend = create_backend() - if topics is None: - self._topics = Topics(log_level=verbosity).load(backend=backend) - else: - assert isinstance(topics, Topics) - self._topics = topics + + if not topics: + raise ValueError("Please provide a Topic model") + assert isinstance(topics, Topics) + self._topics = topics self._log.info("Loaded topics model: %s", self._topics) - if docfreq is None: - if docfreq is not False: - self._docfreq = DocumentFrequencies(log_level=verbosity).load( - source=self._topics.dep("docfreq")["uuid"], backend=backend) - else: - self._docfreq = None - self._log.warning("Disabled document frequencies - you will " - "not be able to query custom repositories.") + + if not docfreq: + self._docfreq = None + self._log.warning("Disabled document frequencies - you will " + "not be able to query arbitrary repositories.") + self._repo2bow = None else: assert isinstance(docfreq, DocumentFrequencies) self._docfreq = docfreq - if self._docfreq is not None: self._docfreq = self._docfreq.prune(prune_df_threshold) - self._log.info("Loaded docfreq model: %s", self._docfreq) - if bow is not None: + self._log.info("Loaded docfreq model: %s", self._docfreq) + self._repo2bow = Repo2BOW( + {t: i for i, t in enumerate(self._topics.tokens)}, self._docfreq, + **(repo2bow_kwargs or {})) + + if not bow: + self._bow = None + self._log.warning("No BOW cache was loaded.") + else: assert isinstance(bow, BOW) self._bow = bow if self._topics.matrix.shape[1] != self._bow.matrix.shape[1]: raise ValueError("Models do not match: topics has %s tokens while bow has %s" % (self._topics.matrix.shape[1], self._bow.matrix.shape[1])) self._log.info("Attached BOW model: %s", self._bow) - else: - self._bow = None - self._log.warning("No BOW cache was loaded.") - if self._docfreq is not None: - self._repo2bow = Repo2BOW( - {t: i for i, t in enumerate(self._topics.tokens)}, self._docfreq, - **(repo2bow_kwargs or {})) - else: - self._repo2bow = None def query(self, url_or_path_or_name, size=5): if size > len(self._topics): raise ValueError("size may not be greater than the number of topics - %d" % len(self._topics)) - if self._bow is not None: + if self._bow: try: - repo_index = self._bow.repository_index_by_name( + repo_index = self._bow.repository_index_by_name( #TODO url_or_path_or_name) except KeyError: - repo_index = -1 - if repo_index == -1: match = self.GITHUB_URL_RE.match(url_or_path_or_name) - if match is not None: + if match: name = match.group(2) try: - repo_index = self._bow.repository_index_by_name(name) + repo_index = self._bow.repository_index_by_name(name) #TODO except KeyError: pass - else: - repo_index = -1 - if repo_index >= 0: - token_vector = self._bow.matrix[repo_index] - else: - if self._docfreq is None: + if repo_index: + token_vector = self._bow.matrix[repo_index] + + if not token_vector: + if not self._docfreq: raise ValueError("You need to specify document frequencies model to process " "custom repositories") bow_dict = self._repo2bow.convert_repository(url_or_path_or_name) @@ -111,6 +96,7 @@ def query(self, url_or_path_or_name, size=5): for i, v in bow_dict.items(): token_vector[i] = v token_vector = csr_matrix(token_vector) + topic_vector = -numpy.squeeze(self._topics.matrix.dot(token_vector.T).toarray()) order = numpy.argsort(topic_vector) result = [] diff --git a/tmsc/uast2bow.py b/tmsc/uast2bow.py new file mode 100644 index 0000000..2d28685 --- /dev/null +++ b/tmsc/uast2bow.py @@ -0,0 +1,55 @@ +from collections import defaultdict +import marshal +import math +import types + +from sourced.ml.models import BOW, DocumentFrequencies +from sourced.ml.algorithms.uast_ids_to_bag import UastIds2Bag + +class Uasts2BOW: + def __init__(self, vocabulary: dict, docfreq: DocumentFrequencies, + getter: callable): + self._docfreq = docfreq + self._uast2bag = UastIds2Bag(vocabulary) #TODO replace with sourced.ml. + self._reverse_vocabulary = [None] * len(vocabulary) + for key, val in vocabulary.items(): + self._reverse_vocabulary[val] = key + self._getter = getter + + @property + def vocabulary(self): + return self._uast2bag.token2index #.vocabulary + + @property + def docfreq(self): + return self._docfreq + + def __call__(self, file_uast_generator): + freqs = defaultdict(int) + for file_uast in file_uast_generator: + bag = self._uast2bag(self._getter(file_uast)) #.uast_to_bag + for key, freq in bag.items(): + freqs[key] += freq + missing = [] + for key, val in freqs.items(): + try: + freqs[key] = math.log(1 + val) * math.log( + self._docfreq.docs / self._docfreq[self._reverse_vocabulary[key]]) + except KeyError: + missing.append(key) + for key in missing: + del freqs[key] + return dict(freqs) + + def __getstate__(self): + state = self.__dict__.copy() + if isinstance(self._getter, types.FunctionType) \ + and self._getter.__name__ == (lambda: None).__name__: + assert self._getter.__closure__ is None + state["_getter"] = marshal.dumps(self._getter.__code__) + return state + + def __setstate__(self, state): + self.__dict__ = state + if isinstance(self._getter, bytes): + self._getter = types.FunctionType(marshal.loads(self._getter), globals()) From 742555c2d466f80e31095b1b33cc2d187db9d9be Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Sun, 12 Aug 2018 22:05:28 +0200 Subject: [PATCH 3/4] Migrate to sourced.ml - 3 Signed-off-by: Alexander Bezzubov --- tmsc/__main__.py | 2 +- tmsc/topic_detector.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tmsc/__main__.py b/tmsc/__main__.py index 60e175d..7496653 100644 --- a/tmsc/__main__.py +++ b/tmsc/__main__.py @@ -57,7 +57,7 @@ def main(): args.topics = Topics(log_level=args.log_level).load(source=args.topics, backend=backend) #source=args.topics args.df = DocumentFrequencies(log_level=args.log_level).load(source=args.df, backend=backend) - args.bow = BOW(log_level=args.log_level).load(source=args.bow, backend=backend) + #args.bow = BOW(log_level=args.log_level).load(source=args.bow, backend=backend) sr = TopicDetector( topics=args.topics, docfreq=args.df, bow=args.bow, verbosity=args.log_level, diff --git a/tmsc/topic_detector.py b/tmsc/topic_detector.py index c96edd8..2f2104d 100644 --- a/tmsc/topic_detector.py +++ b/tmsc/topic_detector.py @@ -72,21 +72,22 @@ def query(self, url_or_path_or_name, size=5): if size > len(self._topics): raise ValueError("size may not be greater than the number of topics - %d" % len(self._topics)) + token_vector = None if self._bow: try: - repo_index = self._bow.repository_index_by_name( #TODO + repo_index = self._bow.documents_index( url_or_path_or_name) except KeyError: match = self.GITHUB_URL_RE.match(url_or_path_or_name) if match: name = match.group(2) try: - repo_index = self._bow.repository_index_by_name(name) #TODO + repo_index = self._bow.documents_index(name) except KeyError: pass if repo_index: token_vector = self._bow.matrix[repo_index] - + if not token_vector: if not self._docfreq: raise ValueError("You need to specify document frequencies model to process " From f1902e2e053f073bc38ebc4e8fbb471afe0ca7b0 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Sun, 12 Aug 2018 22:41:08 +0200 Subject: [PATCH 4/4] Migrate to sourced.ml - 4 Signed-off-by: Alexander Bezzubov --- tmsc/topic_detector.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tmsc/topic_detector.py b/tmsc/topic_detector.py index 2f2104d..7838824 100644 --- a/tmsc/topic_detector.py +++ b/tmsc/topic_detector.py @@ -1,13 +1,15 @@ import logging +import sys import re -from ast2vec import Repo2Base -#from ast2vec.model2.uast2bow import Uasts2BOW #?replace \w sourced.ml - -from sourced.ml.models import BOW, Topics, DocumentFrequencies +# compatibility with old ast2vec version that depends on old modelforge +sys.modules["modelforge.generate_meta"] = None +sys.modules["modelforge.model.write_model"] = None import numpy from scipy.sparse import csr_matrix +from ast2vec.repo2.base import Repo2Base +from sourced.ml.models import BOW, Topics, DocumentFrequencies from tmsc.environment import initialize from tmsc.uast2bow import Uasts2BOW