import collections
import json
import re
import subprocess
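
# Cap on the number of hostnames kept in the final allowlist.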
URL_LIMIT = 100000

# Clean up hosts data; hosts.txt is assumed to be a hosts-file-style
# blocklist with lines of the form "0.0.0.0 example.com".
with open('hosts.txt') as f:
    hosts_data = f.read()
# Strip carriage returns, tabs, and "www." prefixes.
hosts_data = re.sub(r'\r|\t|www\.', '', hosts_data, flags=re.MULTILINE)
# Drop comments.
hosts_data = re.sub(r'#.+$', '', hosts_data, flags=re.MULTILINE)
# Keep only the hostname from each "<ip> <hostname>" line.
hosts_data = re.sub(r'^.+ (?P<url>.+)$', r'\g<url>', hosts_data, flags=re.MULTILINE)
# Skip the header section, up to and past the "0.0.0.0" sentinel line.
hosts_data = hosts_data[hosts_data.find('0.0.0.0\n') + 9:]
# Trim trailing spaces and drop empty or comment-only lines.
hosts_data = re.sub(r' +$', '', hosts_data, flags=re.MULTILINE)
hosts_data = re.sub(r'^#?\n', '', hosts_data, flags=re.MULTILINE)
# Sorting puts the empty string from the trailing newline first; drop it.
known_bad_urls = set(sorted(hosts_data.split('\n'))[1:])

# Clean up tranco data; tranco_custom.csv is assumed to be a Tranco-style
# ranking with lines of the form "<rank>,<hostname>".
with open('tranco_custom.csv') as f:
    tranco_data = f.read()

print('Cleaning data...')
# Strip carriage returns and the leading rank column.
tranco_data = tranco_data.replace('\r', '')
tranco_data = re.sub(r'^\d+,', '', tranco_data, flags=re.MULTILINE)
# Drop hostnames whose TLD contains a hyphen (e.g. punycode TLDs).
tranco_data = re.sub(r'^.+\.\w+-+\w*\n', '', tranco_data, flags=re.MULTILINE)
# Remove edu/gov/mil hosts (those TLDs are already allowed wholesale).
tranco_data = re.sub(r'^.+\.(?:edu|gov|mil)(?:\.\w\w)?\n', '', tranco_data, flags=re.MULTILINE)

# Remove common spam items
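# Each word below is matched as a substring of the hostname; the list is
# joined into a ".*word.*" alternation inside full_regex further down.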
forbidden_words = '.*|.*'.join("""\
bride
dating
wives
viagra
xxx
porn
nude
spank
fuck
jizz
urbate
hentai
peep
pussy
whore
slut
xvideo
casino
gamble
gambling
slots
darknet
darkweb""".split('\n'))
forbidden_regexes = '|'.join(f"""\
.*(anal((?!ytics|ysis).)*)
((?!mid|house|home|gracious|military).)*wife.*
.*(naughty((?!dog).)*)
((?!essex).)*sex((?!tant|ism|press).)*
.*fap((?!ps).)*
.*bonga-?cam.*
.*naked((?!capitalism|nutrition|-science).)*""".split('\n'))
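
# Strings that mark a hostname containing "cam" as adult content
# ("cams", "camgirl", "camshow", ...); matched by full_regex below.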
forbidden_cam_suffixes = '|'.join("""\
s
fox
girl
beaut
chick
model
show
site
smut
online
party
stream
teen
ming""".split('\n'))
full_regex = (
    f'^(?:(.*{forbidden_words}.*)'
    f'|{forbidden_regexes}'
    f'|(.*cam(?:{forbidden_cam_suffixes}).*))$'
)
# findall yields one string per capture group; log the longest for review.
matched = re.findall(full_regex, tranco_data, flags=re.MULTILINE)
with open('forbidden_matches.txt', 'w') as f:
    print('\n'.join(max(m, key=len) for m in matched), file=f)
tranco_data = re.sub(full_regex, '', tranco_data, flags=re.MULTILINE)

print('De-duplicating data...')
# Keep the first occurrence of each hostname, preserving rank order.
already_added = set()
out = []
for url in tranco_data.split('\n'):
    url = url.rstrip()
    if url in already_added:
        continue
    out.append(url)
    already_added.add(url)
already_added.clear()

print('Removing known bad actors...')
# Drop empty lines and any hostname that appears in the hosts blocklist.
out = [url for url in out if url.strip() != '' and url not in known_bad_urls]

# Prepend known good hosts so they survive the URL_LIMIT cut.
with open('known-urls.txt') as f:
    known_urls = f.readlines()
out = [url.strip() for url in known_urls if url.strip() != ''] + out
out = out[:URL_LIMIT]

with open('allowed_sites.txt', 'w') as f:
    print('\n'.join(out), file=f)

print('Sorting...')
# Group hostnames by TLD, e.g. "example.com" -> out_dict['com'] = ['example'].
out_dict = collections.defaultdict(list)
for url in out:
    hostname, _, tld = url.rpartition('.')
    out_dict[tld].append(hostname)
for url_list in out_dict.values():
    url_list.sort()

# Write compact JSON (no extra whitespace) to keep the file small.
with open('allowed-sites.json', 'w') as f:
    f.write(json.dumps(out_dict, separators=(",", ":")))

print('GZIPing allowed-sites.json...')
# Compress with the 7-Zip CLI (Windows install path); 7z picks the gzip
# format from the ".gz" extension.
subprocess.run(r'"C:/Program Files/7-Zip/7z.exe" a "./allowed-sites.json.gz" "./allowed-sites.json"')
print('Done!\n')