-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_ns_whitelist.py
executable file
·125 lines (102 loc) · 4.83 KB
/
get_ns_whitelist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
"""
This script creates a list of namespaces used by published packages.
In most cases, it is the same as the package name list, with few exceptions.
For example, NPM module @scope/package will be imported as
`require('@scope/package')`. However, in Python, module scikit-learn will
be imported as sklearn. So, *package name* `scikit-learn` is different from
namespace `sklearn`.
This list will be used to whitelist vocabulary tokens. We do so to filter out
local imports and WoC parsing artifacts. Examples:
* `pl:Project`: build script for Postgres
* `JS:com.unity.modules.vehicles`: looks to be parsed from an arbitrary .json
file, not related to NPM
* `jl:"joinpath(dir, bubblesort"`: WoC import parsing artifact
"""
import argparse
import os
import pandas as pd
# array(['Alcatraz', 'Dub', 'Hex', 'Jam', 'Emacs', 'Bower', 'Sublime',
# 'Pub', 'NPM', 'Cargo', 'Pypi', 'Packagist', 'Rubygems', 'Hackage',
# 'Nimble', 'Maven', 'Go', 'Wordpress', 'NuGet', 'CPAN', 'CRAN',
# 'Meteor', 'Clojars', 'CocoaPods', 'Elm', 'Julia', 'PlatformIO',
# 'Atom', 'Inqlude', 'Homebrew', 'Carthage', 'Shards', 'SwiftPM',
# 'Haxelib', 'Puppet', 'Racket', 'PureScript'], dtype=object)
# From the ecosystems we use only Python and Java require parsing namespaces
LANGUAGES = { # correspondence of libraries.io platforms to WoC imports
# 'Pypi': 'PY', # Python
'CRAN': 'R',
'NPM': 'JS', # Javascript
'Go': 'Go',
# 'Maven': 'java', # needs special parsing
'Julia': 'jl', # Julia
'CPAN': 'pl', # Perl
}
def get_python_namespaces(package_names):
"""Getting this data takes up to 20 hours of processing.
It is better to cache the result"""
cached_fname = 'python_namespaces.csv'
if os.path.isfile(cached_fname):
namespaces = pd.read_csv(
cached_fname, index_col=0, squeeze=True, header=None,
names=['package', 'namespaces'])
namespaces = namespaces.apply(lambda x: str(x).split(','))
return namespaces
from stecosystems import pypi
from stutils import mapreduce # TODO: replace with joblib
def get_module(i, package_name):
try:
namespaces = pypi.Package(package_name).modules()
except:
# Package either does not exist or its setup.py has errors
namespaces = []
return namespaces or [package_name]
# higher number of workers hungs Docker
namespaces = mapreduce.map(get_module, package_names, num_workers=8)
namespaces = pd.Series(namespaces.values, index=list(package_names))
namespaces.apply(lambda x: ','.join(str(s) for s in x)).to_csv(cached_fname)
return namespaces
def main(projects_file_path, save_path):
df = pd.read_csv(projects_file_path, index_col=False,
usecols=['Platform', 'Name'],
dtype={'Platform': str, 'Name': str})
# strip github.com/.../ prefix from Go packages
df.loc[df['Platform'] == 'Go', 'Name'] = df.loc[
df['Platform'] == 'Go', 'Name'].apply(
lambda namespace: namespace.rsplit('/', 1)[-1])
for platform, language in LANGUAGES.items():
fname = language + '_whitelist.csv'
print("writing", fname)
combined_packages = set(
str(ns) for ns in df.loc[df['Platform'] == platform, 'Name'])
with open(os.path.join(save_path, fname), 'wb') as fh:
fh.write('\n'.join(combined_packages))
# Python
python_pkgnames = df.loc[df['Platform'] == 'Pypi', 'Name']
python_namespaces = get_python_namespaces(python_pkgnames)
# namespaces are multilevel, need to strip 2+ levels
# e.g. lib3to2.fixes -> lib3to2
# also, remove all the invalid ones (can't start with a digit)
python_namespaces = python_namespaces.apply(
lambda x: [s.split('.', 1)[0] for s in x if s and not s[0].isdigit()])
# remove non-unique ones
combined_namespaces = set().union(*python_namespaces)
print("writing", 'PY_whitelist.csv')
with open(os.path.join(save_path, 'PY_whitelist.csv'), 'wb') as fh:
fh.write('\n'.join(combined_namespaces))
# TODO: java
# the idea here is to map classes back to package name,
# or come up with a list of shortest prefix for each package
# non-trivial in both cases
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Build whitelist of namespaces')
parser.add_argument('packages_file',
type=argparse.FileType('r'),
help='projects_with_repository* file from libraries.io')
parser.add_argument('--save-path', default='ns_whitelist',
help='Directory to save whitelists to')
parser.add_argument('-v', '--verbose', action='store_true',
help="Log progress to stderr")
args = parser.parse_args()
main(args.packages_file, args.save_path)