This repository has been archived by the owner on May 22, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 44
/
args.py
152 lines (125 loc) · 6.26 KB
/
args.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import argparse
import json
import logging
from typing import Optional, Union, Iterable
import sys
from sourced.ml import extractors
from sourced.ml.transformers import BOWWriter, Moder
from sourced.ml.utils.engine import add_engine_args
class ArgumentDefaultsHelpFormatterNoNone(argparse.ArgumentDefaultsHelpFormatter):
    """
    Help formatter that appends the default value to an argument's help text,
    except when that default is None.
    """

    def _get_help_string(self, action):
        """
        Build the help text for a single argparse action.

        :param action: The argparse action whose help is being rendered.
        :return: `action.help` untouched when the default is None, otherwise \
                 the string produced by the parent formatter.
        """
        has_default = action.default is not None
        return super()._get_help_string(action) if has_default else action.help
def handle_input_arg(input_arg: Union[str, Iterable[str]],
                     log: Optional[logging.Logger] = None) -> Iterable[str]:
    """
    Process input arguments and return an iterator over input files.

    :param input_arg: A single path, an iterable of paths, or `-` (also `['-']`) \
                      to read the file paths from stdin, one per line.
    :param log: Logger if you want to log handling process; its `info` method \
                is used.
    :return: An iterator over input files. Lines read from stdin are stripped \
             of surrounding whitespace and blank lines are skipped.
    """
    log = log.info if log else (lambda *x: None)
    if input_arg == "-" or input_arg == ["-"]:
        log("Reading file paths from stdin.")
        for line in sys.stdin:
            path = line.strip()
            # A blank line (e.g. a trailing newline) is not a file path; yielding
            # "" would make the consumer try to open an empty path.
            if path:
                yield path
    elif isinstance(input_arg, str):
        # A lone string is one path, not an iterable of characters.
        yield input_arg
    else:
        yield from input_arg
def add_repartitioner_arg(my_parser: argparse.ArgumentParser):
    """
    Register the repartitioning options `--partitions` and `--shuffle`.

    :param my_parser: Parser to attach the arguments to.
    """
    partitions_help = ("Performs data repartition to specified number of partitions. "
                       "Nothing happens if parameter is unset.")
    shuffle_help = "Use RDD.repartition() instead of RDD.coalesce()."

    register = my_parser.add_argument
    register("--partitions", type=int, default=None, required=False, help=partitions_help)
    register("--shuffle", action="store_true", help=shuffle_help)
def add_split_stem_arg(my_parser: argparse.ArgumentParser):
    """
    Register the boolean `--split` flag which turns on identifier splitting.

    :param my_parser: Parser to attach the argument to.
    """
    split_help = ("Split identifiers based on special characters or case changes. "
                  "For example split 'ThisIs_token' to ['this', 'is', 'token'].")
    my_parser.add_argument("--split", help=split_help, action="store_true")
def add_vocabulary_size_arg(my_parser: argparse.ArgumentParser):
    """
    Register the integer option `-v` / `--vocabulary-size` (default 10000000).

    :param my_parser: Parser to attach the argument to.
    """
    options = {
        "type": int,
        "default": 10000000,
        "help": "The maximum vocabulary size.",
    }
    my_parser.add_argument("-v", "--vocabulary-size", **options)
def add_min_docfreq(my_parser: argparse.ArgumentParser):
    """
    Register the integer option `--min-docfreq` (default 1).

    :param my_parser: Parser to attach the argument to.
    """
    docfreq_help = "The minimum document frequency of each feature."
    my_parser.add_argument("--min-docfreq", help=docfreq_help, type=int, default=1)
def add_repo2_args(my_parser: argparse.ArgumentParser, default_packages=None):
    """
    Register the arguments shared by the repository-processing commands:
    `--repositories`, `--parquet`, `--graph`, `--languages`, `--blacklist`,
    followed by the `--dzhigurda` option and the engine arguments.

    :param my_parser: Parser to attach the arguments to.
    :param default_packages: Passed through unchanged to `add_engine_args`.
    """
    # TODO(zurk): get languages from bblfsh directly as soon as
    # https://github.com/bblfsh/client-scala/issues/98 resolved
    supported_languages = [
        "Java", "Python", "Go", "JavaScript", "TypeScript", "Ruby", "Bash", "Php",
    ]
    blacklist_help = ("Exclude the languages in --languages from the analysis "
                      "instead of filtering by default.")

    register = my_parser.add_argument
    register("-r", "--repositories", help="The path to the repositories.", required=True)
    register("--parquet", help="Use Parquet files as input.", action="store_true")
    register("--graph", help="Write the tree in Graphviz format to this file.")
    # Default value for --languages arg should be None.
    # Otherwise if you process parquet files without 'lang' column, you will
    # fail to process it with any --languages argument.
    register("-l", "--languages", choices=supported_languages, nargs="+", default=None,
             help="The programming languages to analyse.")
    register("--blacklist", help=blacklist_help, action="store_true")

    add_dzhigurda_arg(my_parser)
    add_engine_args(my_parser, default_packages)
def add_df_args(my_parser: argparse.ArgumentParser, required=True):
    """
    Register the document-frequency related arguments: `--min-docfreq`, the
    mutually exclusive pair `--docfreq-out` / `--docfreq-in`, and
    `--vocabulary-size`.

    :param my_parser: Parser to attach the arguments to.
    :param required: Whether one of `--docfreq-out` / `--docfreq-in` must be \
                     given on the command line.
    """
    # Reuse the shared helper instead of duplicating the `--min-docfreq`
    # definition, so the flag is declared in exactly one place.
    add_min_docfreq(my_parser)
    # A model is either generated and saved, or loaded - never both.
    df_group = my_parser.add_mutually_exclusive_group(required=required)
    df_group.add_argument(
        "--docfreq-out", help="Path to save generated DocumentFrequencies model.")
    df_group.add_argument(
        "--docfreq-in", help="Path to load pre-generated DocumentFrequencies model.")
    add_vocabulary_size_arg(my_parser)
def add_feature_args(my_parser: argparse.ArgumentParser, required=True):
    """
    Register the feature-extraction arguments: `--mode`, `--quant`, `--feature`
    and one `--<extractor name>-<option>` argument per entry in every registered
    extractor's `OPTS`.

    :param my_parser: Parser to attach the arguments to.
    :param required: Whether `--feature` must be given on the command line.
    """
    register = my_parser.add_argument
    register("-x", "--mode", default="file", choices=Moder.Options.__all__,
             help="What to select for analysis.")
    register("--quant", help="[IN/OUT] The path to the QuantizationLevels model.")

    known_extractors = list(extractors.__extractors__.values())
    feature_names = [extractor.NAME for extractor in known_extractors]
    register("-f", "--feature", choices=feature_names, nargs="+", required=required,
             help="The feature extraction scheme to apply.")

    # Every extractor option becomes its own flag; values given on the command
    # line are parsed as JSON.
    for extractor in known_extractors:
        for option, default_value in extractor.OPTS.items():
            flag = "--%s-%s" % (extractor.NAME, option)
            register(flag, type=json.loads, default=default_value,
                     help="%s's kwarg" % extractor.__name__)
def add_bow_args(my_parser: argparse.ArgumentParser):
    """
    Register the Bag-Of-Words output arguments: `--bow`, `--batch` and
    `--num-iterations`.

    :param my_parser: Parser to attach the arguments to.
    """
    iterations_help = (
        "After partitioning by document we run the pipeline on each partition separately "
        "in a loop. This number indicates the number of partitions.")

    register = my_parser.add_argument
    register("--bow", help="[OUT] The path to the Bag-Of-Words model.", required=True)
    register("--batch", type=int, default=BOWWriter.DEFAULT_CHUNK_SIZE,
             help="The maximum size of a single BOW file in bytes.")
    register("--num-iterations", type=int, default=1, help=iterations_help)
def add_cached_index_arg(my_parser: argparse.ArgumentParser, create: bool = False):
    """
    Register the required `--cached-index-path` argument.

    :param my_parser: Parser to attach the argument to.
    :param create: Selects the tag shown in the help text: `[OUT]` when true, \
                   `[IN]` otherwise.
    """
    if create:
        direction = "OUT"
    else:
        direction = "IN"
    index_help = "[%s] Path to the docfreq model holding the document's index." % direction
    my_parser.add_argument("--cached-index-path", help=index_help, required=True, default=None)
def add_dzhigurda_arg(my_parser):
    """
    Register the integer option `--dzhigurda` (default 0), which controls how
    many additional commits behind HEAD are looked over.

    :param my_parser: Parser to attach the argument to.
    """
    dzhigurda_help = (
        "Number of the additional commits look over in the history starting from "
        "the HEAD commits. 0 corresponds to HEAD only commits, 1 to HEAD and HEAD~1, "
        "2 to HEAD, HEAD~1 and HEAD~2, etc. "
        "With `--dzhigurda -1` we keep all possible commits for each document.")
    my_parser.add_argument("--dzhigurda", type=int, default=0, help=dzhigurda_help)