import argparse
import json
import logging
import sys
from time import time

from igraph import Graph
from modelforge.logs import setup_logging
from sourced.ml.repo2 import wmhash

from gemini.bags import source2bags
from gemini.cassandra_utils import reset_db, sha1_to_url
from gemini.graph import find_connected_components, dumpcc, detect_communities
from gemini.hasher import hash_batches
from gemini.query import query
from gemini.warmup import warmup


CASSANDRA_PACKAGE = "com.datastax.spark:spark-cassandra-connector_2.11:2.0.3"

def get_parser() -> argparse.ArgumentParser:
    """
    Create the main parser.

    :return: the configured ArgumentParser.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--log-level", default="INFO",
                        choices=logging._nameToLevel,
                        help="Logging verbosity.")
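
    # The nested helpers below attach shared argument groups (Spark, engine,
    # feature extraction, Cassandra, Weighted MinHash) to several subparsers.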

    def add_spark_args(my_parser):
        my_parser.add_argument(
            "-s", "--spark", default="local[*]", help="Spark's master address.")
        my_parser.add_argument(
            "--config", nargs="+", default=[], help="Spark configuration (key=value).")
        my_parser.add_argument(
            "--package", nargs="+", default=[CASSANDRA_PACKAGE],
            help="Additional Spark packages.")
        my_parser.add_argument(
            "--spark-local-dir", default="/tmp/spark", help="Spark local directory.")

    def add_engine_args(my_parser):
        add_spark_args(my_parser)
        my_parser.add_argument(
            "--bblfsh", default="localhost", help="Babelfish server's address.")
        my_parser.add_argument(
            "--engine", default="0.2.0", help="source{d} engine version.")
        my_parser.add_argument("--explain", action="store_true",
                               help="Print the PySpark execution plans.")
        my_parser.add_argument("--pause", action="store_true",
                               help="Do not terminate in the end.")
        my_parser.add_argument("--dzhigurda", default=0, type=int,
                               help="Index of the examined commit in the history.")

    def add_features_arg(my_parser, required: bool, suffix="."):
        my_parser.add_argument(
            "-f", "--feature", nargs="+",
            choices=[ex.NAME for ex in wmhash.__extractors__.values()],
            required=required, help="The feature extraction scheme to apply" + suffix)
        for ex in wmhash.__extractors__.values():
            for opt, val in ex.OPTS.items():
                my_parser.add_argument(
                    "--%s-%s" % (ex.NAME, opt), default=val, type=json.loads,
                    help="%s's kwarg" % ex.__name__)

    def add_cassandra_args(my_parser):
        my_parser.add_argument(
            "--cassandra", default="0.0.0.0:9042", help="Cassandra's host:port.")
        my_parser.add_argument("--keyspace", default="gemini",
                               help="Cassandra's keyspace.")
        my_parser.add_argument(
            "--tables", help="Table name mapping (JSON): bags, hashes, hashtables, hashtables2.")

    def add_wmh_args(my_parser, params_help: str, add_hash_size: bool, required: bool):
        if add_hash_size:
            my_parser.add_argument("--size", type=int, default=128, help="Hash size.")
        my_parser.add_argument("-p", "--params", required=required, help=params_help)
        my_parser.add_argument("-t", "--threshold", required=required, type=float,
                               help="Jaccard similarity threshold.")
        my_parser.add_argument("--false-positive-weight", type=float, default=0.5,
                               help="Adjusts the relative importance of minimizing "
                                    "the false positive count when optimizing for "
                                    "the Jaccard similarity threshold.")
        my_parser.add_argument("--false-negative-weight", type=float, default=0.5,
                               help="Adjusts the relative importance of minimizing "
                                    "the false negative count when optimizing for "
                                    "the Jaccard similarity threshold.")

    subparsers = parser.add_subparsers(help="Commands", dest="command")

    source2bags_parser = subparsers.add_parser(
        "bags", help="Convert source code to weighted sets.")
    source2bags_parser.set_defaults(handler=source2bags)
    source2bags_parser.add_argument(
        "-r", "--repositories", required=True,
        help="The path to the repositories.")
    source2bags_parser.add_argument(
        "--batches", required=True,
        help="[OUT] The path to the Parquet files with bag batches.")
    source2bags_parser.add_argument(
        "--docfreq", required=True,
        help="[OUT] The path to the OrderedDocumentFrequencies model.")
    source2bags_parser.add_argument(
        "--vocabulary-size", default=10000000, type=int,
        help="The maximum vocabulary size.")
    source2bags_parser.add_argument(
        "--min-docfreq", default=1, type=int,
        help="The minimum document frequency of each element.")
    add_features_arg(source2bags_parser, True)
    source2bags_parser.add_argument(
        "-l", "--language", choices=("Java", "Python"),
        help="The programming language to analyse.")
    source2bags_parser.add_argument(
        "--persist", default=None, help="Persistence type (StorageClass.*).")
    source2bags_parser.add_argument(
        "--graph", help="Write the tree in Graphviz format to this file.")
    add_cassandra_args(source2bags_parser)
    add_engine_args(source2bags_parser)
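
    # Example invocation (hypothetical paths; assumes the package is run as
    # "python3 -m gemini", and the feature names come from the installed
    # sourced.ml extractors):
    #   python3 -m gemini bags -r ./repos --batches ./batches \
    #       --docfreq docfreq.asdf -f id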

    warmup_parser = subparsers.add_parser(
        "warmup", help="Initialize source{d} engine.")
    warmup_parser.set_defaults(handler=warmup)
    add_engine_args(warmup_parser)
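
    # Example invocation (assumes a local Spark master):
    #   python3 -m gemini warmup -s "local[*]"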

    hash_parser = subparsers.add_parser(
        "hash", help="Run MinHashCUDA on the bag batches.")
    hash_parser.set_defaults(handler=hash_batches)
    hash_parser.add_argument("input",
                             help="Path to the directory with Parquet files.")
    hash_parser.add_argument("--seed", type=int, default=int(time()),
                             help="Random generator's seed.")
    hash_parser.add_argument("--mhc-verbosity", type=int, default=1,
                             help="MinHashCUDA logs verbosity level.")
    hash_parser.add_argument("--devices", type=int, default=0,
                             help="Bitwise OR-ed indices of the NVIDIA devices to use. "
                                  "0 means all.")
    add_wmh_args(hash_parser, "Path to the output file with WMH parameters.", True, True)
    add_cassandra_args(hash_parser)
    add_spark_args(hash_parser)
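
    # Example invocation (hypothetical paths; -p and -t are required here):
    #   python3 -m gemini hash ./batches -p params.asdf -t 0.8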

    query_parser = subparsers.add_parser("query", help="Query for similar files.")
    query_parser.set_defaults(handler=query)
    mode_group = query_parser.add_mutually_exclusive_group(required=True)
    mode_group.add_argument("-i", "--id", help="Query for this id (id mode).")
    mode_group.add_argument("-c", "--file", help="Query for this file (file mode).")
    query_parser.add_argument("--docfreq", help="Path to OrderedDocumentFrequencies (file mode).")
    query_parser.add_argument(
        "--bblfsh", default="localhost:9432", help="Babelfish server's endpoint.")
    add_features_arg(query_parser, False, " (file mode).")
    query_parser.add_argument("-x", "--precise", action="store_true",
                              help="Calculate the precise set.")
    query_parser.add_argument("-o", "--format", choices=("human", "json"),
                              help="Output format.")
    add_wmh_args(query_parser, "Path to the Weighted MinHash parameters.", False, False)
    add_cassandra_args(query_parser)
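
    # Example invocations (hypothetical id and paths):
    #   python3 -m gemini query -i <sha1>                        # id mode
    #   python3 -m gemini query -c ./file.py --docfreq docfreq.asdf \
    #       -p params.asdf                                       # file mode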

    db_parser = subparsers.add_parser("resetdb", help="Destructively initialize the database.")
    db_parser.set_defaults(handler=reset_db)
    add_cassandra_args(db_parser)
    db_parser.add_argument(
        "--hashes-only", action="store_true",
        help="Only clear the tables: hashes, hashtables, hashtables2. Do not touch the rest.")
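
    # Example invocation (uses the declared Cassandra defaults):
    #   python3 -m gemini resetdb --cassandra 0.0.0.0:9042 --keyspace gemini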

    cc_parser = subparsers.add_parser(
        "cc", help="Load the similar pairs of files and run the connected components analysis.")
    cc_parser.set_defaults(handler=find_connected_components)
    add_cassandra_args(cc_parser)
    cc_parser.add_argument("-o", "--output",
                           help="Path to save the ASDF file with the connected components.")
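
    # Example invocation (hypothetical output path):
    #   python3 -m gemini cc -o ccs.asdf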

    dumpcc_parser = subparsers.add_parser(
        "dumpcc", help="Output the connected components to stdout.")
    dumpcc_parser.set_defaults(handler=dumpcc)
    dumpcc_parser.add_argument("input",
                               help="Path to the ASDF file with the connected components.")
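
    # Example invocation (reads the file written by "cc"):
    #   python3 -m gemini dumpcc ccs.asdf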

    community_parser = subparsers.add_parser(
        "cmd", help="Run the community detection analysis on the connected components "
                    "from \"cc\".")
    community_parser.set_defaults(handler=detect_communities)
    community_parser.add_argument("-i", "--input", required=True,
                                  help="The path to the connected components ASDF model.")
    community_parser.add_argument("-o", "--output", required=True,
                                  help="The output path to the communities ASDF model.")
    community_parser.add_argument("--edges", choices=("linear", "quadratic", "1", "2"),
                                  default="linear",
                                  help="The method to generate the graph's edges: bipartite - "
                                       "linear and fast, but may not suit some of the community "
                                       "detection algorithms, or all-to-all within a bucket - "
                                       "quadratic and slow, but suits all the algorithms.")
    cmd_choices = [k[10:] for k in dir(Graph) if k.startswith("community_")]
    community_parser.add_argument("-a", "--algorithm", choices=cmd_choices,
                                  default="walktrap",
                                  help="The community detection algorithm to apply.")
    community_parser.add_argument("-p", "--params", type=json.loads, default={},
                                  help="Parameters for the algorithm (**kwargs, JSON format).")
    community_parser.add_argument("--no-spark", action="store_true", help="Do not use Spark.")
    add_spark_args(community_parser)
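
    # Example invocation (hypothetical paths; walktrap is the default igraph
    # algorithm):
    #   python3 -m gemini cmd -i ccs.asdf -o communities.asdf -a walktrap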

    urls_parser = subparsers.add_parser("urls",
                                        help="Convert all the SHA1 hashes from stdin to URLs.")
    urls_parser.set_defaults(handler=sha1_to_url)
    urls_parser.add_argument("--batch", type=int, default=100,
                             help="The number of hashes to query at a time.")
    add_cassandra_args(urls_parser)
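
    # Example invocation (pipes hashes from a hypothetical file):
    #   cat sha1s.txt | python3 -m gemini urls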

    # TODO: retable [.....] -> [.] [.] [.] [.] [.]
    return parser


def main():
    """
    Create the argument parser and invoke the handler from set_defaults().

    :return: The result of the handler function.
    """
    parser = get_parser()
    args = parser.parse_args()
    args.log_level = logging._nameToLevel[args.log_level]
    setup_logging(args.log_level)
    try:
        handler = args.handler
    except AttributeError:
        def print_usage(_):
            parser.print_usage()

        handler = print_usage
    return handler(args)


if __name__ == "__main__":
    sys.exit(main())