Add quality report on the artificial noisy dataset #366

Merged
merged 4 commits on Dec 3, 2018
49 changes: 27 additions & 22 deletions lookout/style/format/cmdline_tools.py
@@ -10,7 +10,7 @@
from lookout.style.format.benchmarks.evaluate_smoke import evaluate_smoke_entry
from lookout.style.format.benchmarks.generate_smoke import generate_smoke_entry
from lookout.style.format.quality_report import quality_report
from lookout.style.format.robustness import plot_pr_curve, style_robustness_report
from lookout.style.format.quality_report_noisy import quality_report_noisy
from lookout.style.format.rule_stat import print_rules_report


@@ -67,6 +67,18 @@ def add_true_noisy_repos_args(my_parser: ArgumentParser):
"modified by adding artificial style mistakes.")


def add_rules_thresholds(my_parser: ArgumentParser):
"""
Add threshold arguments to filter rules.

:param my_parser: Parser to add the arguments to.
"""
my_parser.add_argument("--confidence-threshold", type=float, default=0.95,
help="Confidence threshold to filter relevant rules.")
my_parser.add_argument("--support-threshold", type=int, default=80,
help="Support threshold to filter relevant rules.")


def create_parser() -> ArgumentParser:
"""
Create a parser for the lookout.style.format utility.
@@ -101,27 +113,20 @@ def add_parser(name, help):
add_bblfsh_arg(rule_parser)
add_model_args(rule_parser)

# Style robustness quality report, includes precision, recall and F1-score
robust_parser = add_parser("robust-eval", "Quality report made by analyzing how well the "
"is able to fix random style mistakes among a model "
"repository: includes precision, recall and "
"F1-score.")
robust_parser.set_defaults(handler=style_robustness_report)
add_true_noisy_repos_args(robust_parser)
add_bblfsh_arg(robust_parser)
add_model_args(robust_parser)

# Plot Precision and Recall curves
pr_curve_parser = add_parser("pr-curve", "Plot Precision/Recall curves with different rules "
"selected based on their confidence.")
pr_curve_parser.set_defaults(handler=plot_pr_curve)
add_true_noisy_repos_args(pr_curve_parser)
add_bblfsh_arg(pr_curve_parser)
add_model_args(pr_curve_parser)
pr_curve_parser.add_argument("--support-threshold", type=int, default=0,
help="Support threshold to filter relevant rules.")
pr_curve_parser.add_argument("-o", "--output", required=True, type=str,
help="Path to the output figure. Could be a png or svg file.")
# Generate the quality report based on the artificial noisy dataset
quality_report_noisy_parser = add_parser("quality-report-noisy", "Quality report on the "
"artificial noisy dataset")
quality_report_noisy_parser.set_defaults(handler=quality_report_noisy)
add_true_noisy_repos_args(quality_report_noisy_parser)
add_bblfsh_arg(quality_report_noisy_parser)
add_model_args(quality_report_noisy_parser)
add_rules_thresholds(quality_report_noisy_parser)
quality_report_noisy_parser.add_argument("--precision-threshold", type=float, default=0.95,
help="Precision threshold tolerated for the model.")
quality_report_noisy_parser.add_argument(
"--dir-output", required=True, type=str,
help="Path to the output directory where to store the quality report and the "
"precision-recall curve.")

# Generate dataset of different styles in code for smoke testing.
gen_smoke_parser = add_parser("gen-smoke-dataset",
lookout/style/format/quality_report_noisy.py
@@ -3,10 +3,12 @@
from difflib import SequenceMatcher
import glob
import logging
import os
import sys
from typing import Iterable, List, Mapping, NamedTuple, Set, Tuple

from bblfsh import BblfshClient
import jinja2
import numpy

from lookout.style.format.feature_extractor import FeatureExtractor
@@ -204,85 +206,17 @@ def compute_metrics(changes_count: int, predictions_count: int, true_positive: i
return precision, recall, f1_score
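The body of `compute_metrics` is outside this diff; for readers of the report code below, a sketch of what it presumably derives from the three counts (the exact zero-division handling is an assumption):

def compute_metrics_sketch(changes_count: int, predictions_count: int, true_positive: int):
    # Precision: fraction of the model's predicted fixes that hit an artificial mistake.
    precision = true_positive / predictions_count if predictions_count else 0.0
    # Recall: fraction of the artificial mistakes that the model managed to fix.
    recall = true_positive / changes_count if changes_count else 0.0
    # F1 score: harmonic mean of precision and recall.
    f1_score = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1_score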


def style_robustness_report(true_repo: str, noisy_repo: str, bblfsh: str, language: str,
model_path: str) -> None:
"""
Print the quality report of a model tested on a given repository.

The test consists of adding random style mistakes to the given repo and checking how well
the model is able to fix them according to the style of the original repository.

:param true_repo: Path to the original repository we want to test the model on.
:param noisy_repo: Path to the noisy version of the repository where 1 style mistake is \
randomly added in every file.
:param bblfsh: Babelfish client. Babelfish server should be started accordingly.
:param language: Language to consider, others will be discarded.
:param model_path: Path to the model to test. It should be previously trained on the original \
repository located in ':param true_repo:'.
"""
log = logging.getLogger("style_robustness_report")

true_content = get_content_from_repo(true_repo)
noisy_content = get_content_from_repo(noisy_repo)
true_files, noisy_files, lines_changed = get_difflib_changes(true_content, noisy_content)
log.info("Number of files modified by adding style noise: %d / %d", len(true_files),
len(true_content))
del true_content, noisy_content

client = BblfshClient(bblfsh)
analyzer = FormatModel().load(model_path)
rules = analyzer[language]
feature_extractor = FeatureExtractor(language=language,
**rules.origin_config["feature_extractor"])
vnodes_y_true = files2vnodes(true_files, feature_extractor, client)
mispreds_noise = files2mispreds(noisy_files, feature_extractor, rules, client, log)
diff_mispreds = get_diff_mispreds(mispreds_noise, lines_changed)
changes_count = len(lines_changed)
log.info("Number of artificial mistakes potentially fixed by the model "
"(diff of mispredictions): %d / %d", len(diff_mispreds), changes_count)
style_fixes = get_style_fixes(diff_mispreds, vnodes_y_true, true_files, noisy_files,
feature_extractor)
log.info("style-analyzer fixes in the noisy repos: %d / %d -> %.1f %%",
len(style_fixes), changes_count, 100 * len(style_fixes) / changes_count)

precision, recall, f1_score = compute_metrics(changes_count=changes_count,
predictions_count=len(diff_mispreds),
true_positive=len(style_fixes))
print("precision:", round(precision, 3))
print("recall:", round(recall, 3))
print("F1 score:", round(f1_score, 3))

print()
print("list of files where the style-analyzer succeeds in fixing the random noise:")
for mispred in style_fixes:
print(mispred.node.path)


def filter_relevant_rules(rules: Iterable[Rules], support_threshold: int, log: logging.Logger
) -> Iterable[Tuple[int, float]]:
"""
Filter relevant rules that have a support higher than `support_threshold`.

:param rules: List of `Rules` from the model.
:param support_threshold: Support threshold to filter relevant rules.
:param log: Logger.
:return: List of (rule index, confidence) pairs kept according to `support_threshold`.
"""
log.info("Filtering rules with support higher than %d", support_threshold)
rules_id = [(i, r.stats.conf, r.stats.support) for i, r in enumerate(rules)
if r.stats.support > support_threshold]
rules_selection = sorted(rules_id, key=lambda k: k[1], reverse=True)
log.info("Number of rules decreased from %d to %d", len(rules), len(rules_selection))
return rules_selection


def plot_curve(x: numpy.ndarray, y: numpy.ndarray, output: str) -> None:
def plot_curve(repo: str, x: numpy.ndarray, y: numpy.ndarray, precision_threshold: float,
path_to_figure: str) -> None:
"""
Plot y versus x as lines and markers using matplotlib.

:param repo: Name of the repository we plot the precision-recall curve of.
:param x: 1-D numpy array containing the x coordinates.
:param y: 1-D numpy array containing the y coordinates.
:param output: Path to the output figure, could be either a png or svg file.
:param precision_threshold: Precision threshold tolerated by the model. \
Limit drawn as a red horizontal line on the figure.
:param path_to_figure: Path to the output figure, in png format.
"""
try:
import matplotlib
@@ -293,17 +227,21 @@ def plot_curve(x: numpy.ndarray, y: numpy.ndarray, output: str) -> None:
plt.figure(figsize=(15, 10))
ax = plt.subplot(111)
ax.plot(x, y, marker="x", linestyle="--")
handle = plt.axhline(precision_threshold, color="r")
ax.legend([handle], ["precision threshold"], fontsize=17)
ax.set_title("Precision-recall curve on the %s repository" % repo, fontsize=17)
ax.set_ylabel("Precision", fontsize=17, labelpad=15)
ax.set_xlabel("Recall", fontsize=17, labelpad=15)
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
plt.savefig(output)
plt.savefig(path_to_figure)
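A usage sketch mirroring the call made later in `quality_report_noisy`; the series and the output file name are placeholders, and matplotlib is assumed to be installed:

import numpy

recalls = numpy.asarray([0.05, 0.10, 0.15])      # hypothetical recall values (x axis)
precisions = numpy.asarray([0.99, 0.97, 0.94])   # hypothetical precision values (y axis)
plot_curve("jquery", recalls, precisions, precision_threshold=0.95,
           path_to_figure="pr_curve_jquery.png")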


def plot_pr_curve(true_repo: str, noisy_repo: str, bblfsh: str, language: str,
model_path: str, support_threshold: int, output: str) -> None:
def quality_report_noisy(true_repo: str, noisy_repo: str, bblfsh: str, language: str,
model_path: str, confidence_threshold: float, support_threshold: int,
precision_threshold: float, dir_output: str) -> None:
"""
Plot a precision/recall curve with rules having higher support than `support_threshold`.
Generate a quality report on the artificial noisy dataset including a precision-recall curve.

:param true_repo: Path to the original repository we want to test the model on.
:param noisy_repo: Path to the noisy version of the repository where 1 style mistake is \
@@ -312,10 +250,14 @@ def plot_pr_curve(true_repo: str, noisy_repo: str, bblfsh: str, language: str,
:param language: Language to consider, others will be discarded.
:param model_path: Path to the model to test. It should be previously trained on the original \
repository located in ':param true_repo:'.
:param confidence_threshold: Confidence threshold to filter relevant rules.
:param support_threshold: Support threshold to filter relevant rules.
:param output: Path to the output figure. Can be either a png or svg file.
:param precision_threshold: Precision threshold tolerated by the model. \
Limit drawn as a red horizontal line on the figure.
:param dir_output: Path to the output directory in which to store the quality report in Markdown \
and the precision-recall curve in png format.
"""
log = logging.getLogger("plot_pr_curve")
log = logging.getLogger("quality_report_noisy")

true_content = get_content_from_repo(true_repo)
noisy_content = get_content_from_repo(noisy_repo)
@@ -335,18 +277,57 @@ def quality_report_noisy(true_repo: str, noisy_repo: str, bblfsh: str, language:
changes_count = len(lines_changed)

precisions, recalls = [], []
rules_selection = filter_relevant_rules(rules.rules, support_threshold, log)
for i in range(len(rules_selection)):
n_rules = len(rules.rules)
rules_id = [(i, r.stats.conf) for i, r in enumerate(rules.rules)
if r.stats.conf > confidence_threshold and r.stats.support > support_threshold]
rules_id = sorted(rules_id, key=lambda k: k[1], reverse=True)
for i in range(len(rules.rules)):
filtered_mispreds = {k: m for k, m in diff_mispreds.items()
if any(r[0] == m.rule for r in rules_selection[:i + 1])}
if any(r[0] == m.rule for r in rules_id[:i + 1])}
style_fixes = get_style_fixes(filtered_mispreds, vnodes_y_true,
true_files, noisy_files, feature_extractor)
precision, recall, f1_score = compute_metrics(changes_count=changes_count,
predictions_count=len(filtered_mispreds),
true_positive=len(style_fixes))
precisions.append(round(precision, 3))
recalls.append(round(recall, 3))

print("recall x:", recalls)
print("precision y:", precisions)
plot_curve(numpy.asarray(recalls), numpy.asarray(precisions), output)

# Compute some stats and quality metrics for the model's evaluation
n_mistakes = len(true_files)
prec_max_rec = precisions[-1]
max_rec = max(recalls)
n_rules_filtered = len(rules_id)
# Compute the recall achieved at the given precision threshold.
rec_threshold_prec = 0
for prec, rec in zip(precisions, recalls):
if prec < precision_threshold:
break
rec_threshold_prec = rec

# Plot the precision-recall curve
path_to_figure = os.path.join(dir_output, "pr_curve_jquery.png")
plot_curve("jquery", numpy.asarray(recalls), numpy.asarray(precisions), precision_threshold,
path_to_figure)

# Render the Markdown template for the report with jinja2
loader = jinja2.FileSystemLoader(("/", os.path.dirname(__file__), os.getcwd()),
followlinks=True)
env = jinja2.Environment(
trim_blocks=True,
lstrip_blocks=True,
keep_trailing_newline=True,
)
env.globals.update(range=range)
template = loader.load(env, "templates/noisy_quality_report.md.jinja2")
report = template.render(n_mistakes=n_mistakes, rec_threshold_prec=rec_threshold_prec,
prec_max_rec=prec_max_rec, max_rec=max_rec,
confidence_threshold=confidence_threshold,
support_threshold=support_threshold,
n_rules=n_rules, n_rules_filtered=n_rules_filtered,
path_to_figure=path_to_figure)

# Write the quality report
path_to_report = os.path.join(dir_output, "report_noise.md")
with open(path_to_report, "w", encoding="utf-8") as f:
f.write(report)
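To make the precision-threshold loop above concrete, here is a self-contained rerun of its logic on hypothetical values: with a precision threshold of 0.95, the reported recall is that of the last point whose precision still meets the threshold.

# Hypothetical series, for illustration only; points follow rules sorted by decreasing confidence.
precisions = [0.99, 0.97, 0.96, 0.93, 0.90]
recalls = [0.05, 0.09, 0.12, 0.16, 0.20]
precision_threshold = 0.95

rec_threshold_prec = 0
for prec, rec in zip(precisions, recalls):
    if prec < precision_threshold:
        break
    rec_threshold_prec = rec
print(rec_threshold_prec)  # 0.12 -- best recall reachable while precision stays >= 0.95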
15 changes: 15 additions & 0 deletions lookout/style/format/templates/noisy_quality_report.md.jinja2
@@ -0,0 +1,15 @@
# Quality report on the artificial noisy dataset

### Rules filtering thresholds

* `{{ "Confidence: %s " % (confidence_threshold) }}`
* `{{ "Support: %s" % (support_threshold) }}`

### Metrics table for the *jquery/jquery* repository


| number of mistakes | recall at precision threshold | precision at max recall | max recall | number of rules (filtered / total) |
|:------------------:|:-----------------------------:|:-----------------------:|:----------:|:----------------------------------:|
| {{n_mistakes}} | {{rec_threshold_prec}} | {{prec_max_rec}} | {{max_rec}} | `{{ n_rules_filtered }} / {{ n_rules }}` |

![Precision-Recall curve]({{path_to_figure}})
3 changes: 1 addition & 2 deletions lookout/style/format/tests/test_main.py
@@ -10,8 +10,7 @@ def test_handlers(self):
action2handler = {
"eval": "quality_report",
"rule": "print_rules_report",
"robust-eval": "style_robustness_report",
"pr-curve": "plot_pr_curve",
"quality-report-noisy": "quality_report_noisy",
"gen-smoke-dataset": "generate_smoke_entry",
"eval-smoke-dataset": "evaluate_smoke_entry",
}
66 changes: 66 additions & 0 deletions lookout/style/format/tests/test_quality_report_noisy.py
@@ -0,0 +1,66 @@
import os
from pathlib import Path
import re
import sys
import tarfile
import tempfile
import unittest

from lookout.style.format.quality_report_noisy import quality_report_noisy
from lookout.style.format.tests.test_quality_report import Capturing


class RobustnessTests(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.bblfsh = "0.0.0.0:9432"
cls.language = "javascript"

cls.parent_loc = Path(__file__).parent.resolve()
cls.base_dir_ = tempfile.TemporaryDirectory(dir=str(cls.parent_loc))
cls.base_dir = cls.base_dir_.name

with tarfile.open(str(cls.parent_loc / "jquery.tar.xz")) as tar:
tar.extractall(path=cls.base_dir)
cls.jquery_dir = os.path.join(cls.base_dir, "jquery")
with tarfile.open(str(cls.parent_loc / "jquery_noisy.tar.xz")) as tar:
tar.extractall(path=cls.base_dir)
cls.jquery_noisy_dir = os.path.join(cls.base_dir, "jquery_noisy")
cls.input_pattern = os.path.join(cls.jquery_dir, "**", "*.js")
cls.input_pattern_noisy = os.path.join(cls.jquery_noisy_dir, "**", "*.js")
cls.model_path = str(Path(__file__).parent.resolve() / "model_jquery.asdf")

@classmethod
def tearDownClass(cls):
cls.base_dir_.cleanup()

@unittest.skipIf(sys.version_info.minor == 5, "Python 3.5 is not yet supported by difflib")
def test_quality_report_noisy(self):
with Capturing() as output:
try:
quality_report_noisy(true_repo=self.input_pattern,
noisy_repo=self.input_pattern_noisy,
bblfsh=self.bblfsh,
language=self.language,
model_path=self.model_path,
confidence_threshold=0.8,
support_threshold=20,
precision_threshold=0.95,
dir_output=tempfile.tempdir)
except SystemExit:
self.skipTest("Matplotlib is required to run this test")
pattern = re.compile(r"((?:recall x)|(?:precision y)): \[(\d+.\d+(, \d+.\d+)+)\]")
metrics = {}
for line in output:
match = pattern.search(line)
if match:
metric, scores_string = list(match.groups())[:2]
scores_string = scores_string.split(", ")
scores = [float(f) for f in scores_string]
metrics[metric] = scores
self.assertGreater(metrics["recall x"][-1], 0)
self.assertGreater(metrics["precision y"][-1], 0)


if __name__ == "__main__":
unittest.main()