Add quality report on the artificial noisy dataset #366

Merged
merged 4 commits on Dec 3, 2018
49 changes: 27 additions & 22 deletions lookout/style/format/cmdline_tools.py
@@ -10,7 +10,7 @@
from lookout.style.format.benchmarks.evaluate_smoke import evaluate_smoke_entry
from lookout.style.format.benchmarks.generate_smoke import generate_smoke_entry
from lookout.style.format.quality_report import quality_report
from lookout.style.format.robustness import plot_pr_curve, style_robustness_report
from lookout.style.format.quality_report_noisy import quality_report_noisy
from lookout.style.format.rule_stat import print_rules_report


@@ -67,6 +67,18 @@ def add_true_noisy_repos_args(my_parser: ArgumentParser):
"modified by adding artificial style mistakes.")


def add_rules_thresholds(my_parser: ArgumentParser):
"""
Add threshold arguments to filter rules.

:param my_parser: Parser to add the arguments to.
"""
my_parser.add_argument("--confidence-threshold", type=float, default=0.95,
help="Confidence threshold to filter relevant rules.")
my_parser.add_argument("--support-threshold", type=int, default=80,
help="Support threshold to filter relevant rules.")


def create_parser() -> ArgumentParser:
"""
Create a parser for the lookout.style.format utility.
@@ -101,27 +113,20 @@ def add_parser(name, help):
add_bblfsh_arg(rule_parser)
add_model_args(rule_parser)

# Style robustness quality report, includes precision, recall and F1-score
robust_parser = add_parser("robust-eval", "Quality report made by analyzing how well the "
"is able to fix random style mistakes among a model "
"repository: includes precision, recall and "
"F1-score.")
robust_parser.set_defaults(handler=style_robustness_report)
add_true_noisy_repos_args(robust_parser)
add_bblfsh_arg(robust_parser)
add_model_args(robust_parser)

# Plot Precision and Recall curves
pr_curve_parser = add_parser("pr-curve", "Plot Precision/Recall curves with different rules "
"selected based on their confidence.")
pr_curve_parser.set_defaults(handler=plot_pr_curve)
add_true_noisy_repos_args(pr_curve_parser)
add_bblfsh_arg(pr_curve_parser)
add_model_args(pr_curve_parser)
pr_curve_parser.add_argument("--support-threshold", type=int, default=0,
help="Support threshold to filter relevant rules.")
pr_curve_parser.add_argument("-o", "--output", required=True, type=str,
help="Path to the output figure. Could be a png or svg file.")
# Generate the quality report based on the artificial noisy dataset
quality_report_noisy_parser = add_parser("quality-report-noisy", "Quality report on the "
"artificial noisy dataset")
quality_report_noisy_parser.set_defaults(handler=quality_report_noisy)
add_true_noisy_repos_args(quality_report_noisy_parser)
add_bblfsh_arg(quality_report_noisy_parser)
add_model_args(quality_report_noisy_parser)
add_rules_thresholds(quality_report_noisy_parser)
quality_report_noisy_parser.add_argument("--precision-threshold", type=float, default=0.95,
help="Precision threshold tolerated for the model.")
quality_report_noisy_parser.add_argument(
"--dir-output", required=True, type=str,
help="Path to the output directory where to store the quality report and the "
"precision-recall curve.")

# Generate dataset of different styles in code for smoke testing.
gen_smoke_parser = add_parser("gen-smoke-dataset",
lookout/style/format/quality_report_noisy.py
@@ -3,10 +3,12 @@
from difflib import SequenceMatcher
import glob
import logging
import os
import sys
from typing import Iterable, List, Mapping, NamedTuple, Set, Tuple

from bblfsh import BblfshClient
import jinja2
import numpy

from lookout.style.format.feature_extractor import FeatureExtractor
@@ -204,85 +206,17 @@ def compute_metrics(changes_count: int, predictions_count: int, true_positive: i
return precision, recall, f1_score
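The body of `compute_metrics` is outside this diff; for readers of the report code below, a sketch of what it presumably derives from the three counts (the exact zero-division handling is an assumption):

def compute_metrics_sketch(changes_count: int, predictions_count: int, true_positive: int):
    # Precision: fraction of the model's predicted fixes that hit an artificial mistake.
    precision = true_positive / predictions_count if predictions_count else 0.0
    # Recall: fraction of the artificial mistakes that the model managed to fix.
    recall = true_positive / changes_count if changes_count else 0.0
    # F1 score: harmonic mean of precision and recall.
    f1_score = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1_score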


def style_robustness_report(true_repo: str, noisy_repo: str, bblfsh: str, language: str,
model_path: str) -> None:
"""
Print the quality report of a model tested on a given repository.

The test consists of adding random style mistakes to the given repo and checking how well
the model is able to fix them according to the style of the original repository.

:param true_repo: Path to the original repository we want to test the model on.
:param noisy_repo: Path to the noisy version of the repository where 1 style mistake is \
randomly added in every file.
:param bblfsh: Babelfish client. Babelfish server should be started accordingly.
:param language: Language to consider, others will be discarded.
:param model_path: Path to the model to test. It should be previously trained on the original \
repository located in ':param true_repo:'.
"""
log = logging.getLogger("style_robustness_report")

true_content = get_content_from_repo(true_repo)
noisy_content = get_content_from_repo(noisy_repo)
true_files, noisy_files, lines_changed = get_difflib_changes(true_content, noisy_content)
log.info("Number of files modified by adding style noise: %d / %d", len(true_files),
len(true_content))
del true_content, noisy_content

client = BblfshClient(bblfsh)
analyzer = FormatModel().load(model_path)
rules = analyzer[language]
feature_extractor = FeatureExtractor(language=language,
**rules.origin_config["feature_extractor"])
vnodes_y_true = files2vnodes(true_files, feature_extractor, client)
mispreds_noise = files2mispreds(noisy_files, feature_extractor, rules, client, log)
diff_mispreds = get_diff_mispreds(mispreds_noise, lines_changed)
changes_count = len(lines_changed)
log.info("Number of artificial mistakes potentially fixed by the model "
"(diff of mispredictions): %d / %d", len(diff_mispreds), changes_count)
style_fixes = get_style_fixes(diff_mispreds, vnodes_y_true, true_files, noisy_files,
feature_extractor)
log.info("style-analyzer fixes in the noisy repos: %d / %d -> %.1f %%",
len(style_fixes), changes_count, 100 * len(style_fixes) / changes_count)

precision, recall, f1_score = compute_metrics(changes_count=changes_count,
predictions_count=len(diff_mispreds),
true_positive=len(style_fixes))
print("precision:", round(precision, 3))
print("recall:", round(recall, 3))
print("F1 score:", round(f1_score, 3))

print()
print("list of files where the style-analyzer succeeds in fixing the random noise:")
for mispred in style_fixes:
print(mispred.node.path)


def filter_relevant_rules(rules: Iterable[Rules], support_threshold: int, log: logging.Logger
) -> Iterable[Tuple[int, float]]:
"""
Filter relevant rules that have a support higher than `support_threshold`.

:param rules: List of `Rules` from the model.
:param support_threshold: Support threshold to filter relevant rules.
:param log: Logger.
:return: List of (rule index, confidence) pairs kept according to `support_threshold`.
"""
log.info("Filtering rules with support higher than %d", support_threshold)
rules_id = [(i, r.stats.conf, r.stats.support) for i, r in enumerate(rules)
if r.stats.support > support_threshold]
rules_selection = sorted(rules_id, key=lambda k: k[1], reverse=True)
log.info("Number of rules decreased from %d to %d", len(rules), len(rules_selection))
return rules_selection


def plot_curve(x: numpy.ndarray, y: numpy.ndarray, output: str) -> None:
def plot_curve(repo: str, x: numpy.ndarray, y: numpy.ndarray, precision_threshold: float,
path_to_figure: str) -> None:
"""
Plot y versus x as lines and markers using matplotlib.

:param repo: Name of the repository we plot the precision-recall curve of.
:param x: 1-D numpy array containing the x coordinates.
:param y: 1-D numpy array containing the y coordinates.
:param output: Path to the output figure, could be either a png or svg file.
:param precision_threshold: Precision threshold tolerated by the model. \
Limit drawn as a red horizontal line on the figure.
:param path_to_figure: Path to the output figure, in png format.
"""
try:
import matplotlib
@@ -293,17 +227,21 @@ def plot_curve(x: numpy.ndarray, y: numpy.ndarray, output: str) -> None:
plt.figure(figsize=(15, 10))
ax = plt.subplot(111)
ax.plot(x, y, marker="x", linestyle="--")
handle = plt.axhline(precision_threshold, color="r")
ax.legend([handle], ["precision threshold"], fontsize=17)
ax.set_title("Precision-recall curve on the %s repository" % repo, fontsize=17)
ax.set_ylabel("Precision", fontsize=17, labelpad=15)
ax.set_xlabel("Recall", fontsize=17, labelpad=15)
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
plt.savefig(output)
plt.savefig(path_to_figure)
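A usage sketch mirroring the call made later in `quality_report_noisy`; the series and the output file name are placeholders, and matplotlib is assumed to be installed:

import numpy

recalls = numpy.asarray([0.05, 0.10, 0.15])      # hypothetical recall values (x axis)
precisions = numpy.asarray([0.99, 0.97, 0.94])   # hypothetical precision values (y axis)
plot_curve("jquery", recalls, precisions, precision_threshold=0.95,
           path_to_figure="pr_curve_jquery.png")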


def plot_pr_curve(true_repo: str, noisy_repo: str, bblfsh: str, language: str,
model_path: str, support_threshold: int, output: str) -> None:
def quality_report_noisy(true_repo: str, noisy_repo: str, bblfsh: str, language: str,
model_path: str, confidence_threshold: float, support_threshold: int,
precision_threshold: float, dir_output: str) -> None:
"""
Plot a precision/recall curve with rules having higher support than `support_threshold`.
Generate a quality report on the artificial noisy dataset including a precision-recall curve.

:param true_repo: Path to the original repository we want to test the model on.
:param noisy_repo: Path to the noisy version of the repository where 1 style mistake is \
@@ -312,10 +250,14 @@ def plot_pr_curve(true_repo: str, noisy_repo: str, bblfsh: str, language: str,
:param language: Language to consider, others will be discarded.
:param model_path: Path to the model to test. It should be previously trained on the original \
repository located in ':param true_repo:'.
:param confidence_threshold: Confidence threshold to filter relevant rules.
:param support_threshold: Support threshold to filter relevant rules.
:param output: Path to the output figure. Can be either a png or svg file.
:param precision_threshold: Precision threshold tolerated by the model. \
Limit drawn as a red horizontal line on the figure.
:param dir_output: Path to the output directory in which to store the quality report in Markdown \
and the precision-recall curve in png format.
"""
log = logging.getLogger("plot_pr_curve")
log = logging.getLogger("quality_report_noisy")

true_content = get_content_from_repo(true_repo)
noisy_content = get_content_from_repo(noisy_repo)
@@ -335,18 +277,57 @@ def quality_report_noisy(true_repo: str, noisy_repo: str, bblfsh: str, language:
changes_count = len(lines_changed)

precisions, recalls = [], []
rules_selection = filter_relevant_rules(rules.rules, support_threshold, log)
for i in range(len(rules_selection)):
n_rules = len(rules.rules)
rules_id = [(i, r.stats.conf) for i, r in enumerate(rules.rules)
if r.stats.conf > confidence_threshold and r.stats.support > support_threshold]
rules_id = sorted(rules_id, key=lambda k: k[1], reverse=True)
for i in range(len(rules.rules)):
filtered_mispreds = {k: m for k, m in diff_mispreds.items()
if any(r[0] == m.rule for r in rules_selection[:i + 1])}
if any(r[0] == m.rule for r in rules_id[:i + 1])}
style_fixes = get_style_fixes(filtered_mispreds, vnodes_y_true,
true_files, noisy_files, feature_extractor)
precision, recall, f1_score = compute_metrics(changes_count=changes_count,
predictions_count=len(filtered_mispreds),
true_positive=len(style_fixes))
precisions.append(round(precision, 3))
recalls.append(round(recall, 3))

print("recall x:", recalls)
print("precision y:", precisions)
plot_curve(numpy.asarray(recalls), numpy.asarray(precisions), output)

# Compute some stats and quality metrics for the model's evaluation
n_mistakes = len(true_files)
prec_max_rec = precisions[-1]
max_rec = max(recalls)
n_rules_filtered = len(rules_id)
# Compute the recall achieved at the given precision threshold.
rec_threshold_prec = 0
for prec, rec in zip(precisions, recalls):
if prec < precision_threshold:
break
rec_threshold_prec = rec

# Plot the precision-recall curve
path_to_figure = os.path.join(dir_output, "pr_curve_jquery.png")
plot_curve("jquery", numpy.asarray(recalls), numpy.asarray(precisions), precision_threshold,
path_to_figure)

# Render the Markdown template for the report with jinja2
loader = jinja2.FileSystemLoader(("/", os.path.dirname(__file__), os.getcwd()),
followlinks=True)
env = jinja2.Environment(
trim_blocks=True,
lstrip_blocks=True,
keep_trailing_newline=True,
)
env.globals.update(range=range)
template = loader.load(env, "templates/noisy_quality_report.md.jinja2")
report = template.render(n_mistakes=n_mistakes, rec_threshold_prec=rec_threshold_prec,
prec_max_rec=prec_max_rec, max_rec=max_rec,
confidence_threshold=confidence_threshold,
support_threshold=support_threshold,
n_rules=n_rules, n_rules_filtered=n_rules_filtered,
path_to_figure=path_to_figure)

# Write the quality report
path_to_report = os.path.join(dir_output, "report_noise.md")
with open(path_to_report, "w", encoding="utf-8") as f:
f.write(report)
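To make the precision-threshold loop above concrete, here is a self-contained rerun of its logic on hypothetical values: with a precision threshold of 0.95, the reported recall is that of the last point whose precision still meets the threshold.

# Hypothetical series, for illustration only; points follow rules sorted by decreasing confidence.
precisions = [0.99, 0.97, 0.96, 0.93, 0.90]
recalls = [0.05, 0.09, 0.12, 0.16, 0.20]
precision_threshold = 0.95

rec_threshold_prec = 0
for prec, rec in zip(precisions, recalls):
    if prec < precision_threshold:
        break
    rec_threshold_prec = rec
print(rec_threshold_prec)  # 0.12 -- best recall reachable while precision stays >= 0.95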
15 changes: 15 additions & 0 deletions lookout/style/format/templates/noisy_quality_report.md.jinja2
@@ -0,0 +1,15 @@
# Quality report on the artificial noisy dataset

### Rules filtering thresholds

* `{{ "Confidence: %s " % (confidence_threshold) }}`
* `{{ "Support: %s" % (support_threshold) }}`

### Metrics table for the *jquery/jquery* repository


| number of mistakes | recall at precision threshold | precision at max recall | max recall | number of rules (filtered / total) |
|:------------------:|:-----------------------------:|:-----------------------:|:----------:|:----------------------------------:|
| {{n_mistakes}} | {{rec_threshold_prec}} | {{prec_max_rec}} | {{max_rec}} | `{{ n_rules_filtered }} / {{ n_rules }}` |

![Precision-Recall curve]({{path_to_figure}})
3 changes: 1 addition & 2 deletions lookout/style/format/tests/test_main.py
@@ -10,8 +10,7 @@ def test_handlers(self):
action2handler = {
"eval": "quality_report",
"rule": "print_rules_report",
"robust-eval": "style_robustness_report",
"pr-curve": "plot_pr_curve",
"quality-report-noisy": "quality_report_noisy",
"gen-smoke-dataset": "generate_smoke_entry",
"eval-smoke-dataset": "evaluate_smoke_entry",
}
66 changes: 66 additions & 0 deletions lookout/style/format/tests/test_quality_report_noisy.py
@@ -0,0 +1,66 @@
import os
from pathlib import Path
import re
import sys
import tarfile
import tempfile
import unittest

from lookout.style.format.quality_report_noisy import quality_report_noisy
from lookout.style.format.tests.test_quality_report import Capturing


class RobustnessTests(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.bblfsh = "0.0.0.0:9432"
cls.language = "javascript"

cls.parent_loc = Path(__file__).parent.resolve()
cls.base_dir_ = tempfile.TemporaryDirectory(dir=str(cls.parent_loc))
cls.base_dir = cls.base_dir_.name

with tarfile.open(str(cls.parent_loc / "jquery.tar.xz")) as tar:
tar.extractall(path=cls.base_dir)
cls.jquery_dir = os.path.join(cls.base_dir, "jquery")
with tarfile.open(str(cls.parent_loc / "jquery_noisy.tar.xz")) as tar:
tar.extractall(path=cls.base_dir)
cls.jquery_noisy_dir = os.path.join(cls.base_dir, "jquery_noisy")
cls.input_pattern = os.path.join(cls.jquery_dir, "**", "*.js")
cls.input_pattern_noisy = os.path.join(cls.jquery_noisy_dir, "**", "*.js")
cls.model_path = str(Path(__file__).parent.resolve() / "model_jquery.asdf")

@classmethod
def tearDownClass(cls):
cls.base_dir_.cleanup()

@unittest.skipIf(sys.version_info.minor == 5, "Python 3.5 is not yet supported by difflib")
def test_quality_report_noisy(self):
with Capturing() as output:
try:
quality_report_noisy(true_repo=self.input_pattern,
noisy_repo=self.input_pattern_noisy,
bblfsh=self.bblfsh,
language=self.language,
model_path=self.model_path,
confidence_threshold=0.8,
support_threshold=20,
precision_threshold=0.95,
dir_output=tempfile.tempdir)
except SystemExit:
self.skipTest("Matplotlib is required to run this test")
pattern = re.compile(r"((?:recall x)|(?:precision y)): \[(\d+.\d+(, \d+.\d+)+)\]")
metrics = {}
for line in output:
match = pattern.search(line)
if match:
metric, scores_string = list(match.groups())[:2]
scores_string = scores_string.split(", ")
scores = [float(f) for f in scores_string]
metrics[metric] = scores
self.assertGreater(metrics["recall x"][-1], 0)
self.assertGreater(metrics["precision y"][-1], 0)


if __name__ == "__main__":
unittest.main()