# adm_entity_clear.py
# coding=utf-8
import pywikibot
import re
from pywikibot import pagegenerators as pg
from utils.properties import PID_BIRTH_PLACE, PID_DEATH_PLACE, PID_RESIDENCE, PID_ADM_UNIT, PID_COUNTRY
# File name template for the per-property list of already handled item ids;
# formatted with the property id by mark_processed() and is_processed().
PID_FILE = 'adm_entity_%s'
# Edit summary passed to Claim.removeQualifiers() in remove_qualifiers().
REMOVE_SUMMARY = u'Remove administrative entity qualifiers that can be obtained from the target item'
# Wiki used by load_preview() to expand template wikitext.
site = pywikibot.Site('ru', 'wikipedia')
# Site handed to the SPARQL page generator in iterate_items().
repo = pywikibot.Site('wikidata', 'wikidata')
# In-memory cache: property id -> list of item ids already handled.
# Entries are created lazily by is_processed() and extended by mark_processed().
processed = {}
def mark_processed(qid, pid):
    """Record *qid* as handled for property *pid*.

    The id is appended to the in-memory ``processed[pid]`` list and then
    written as one line to the end of the ``PID_FILE % pid`` file.  The list
    for *pid* must already exist in ``processed``; a missing key raises
    ``KeyError`` before anything is written.
    """
    seen = processed[pid]
    seen.append(qid)
    line = '%s\n' % qid
    with open(PID_FILE % pid, 'a') as handle:
        handle.write(line)
def is_processed(qid, pid):
    """Return whether *qid* is already recorded for property *pid*.

    On the first call for a given *pid* the ``PID_FILE % pid`` file is read,
    one stripped line per id, and cached in ``processed``.  A missing file
    is treated as an empty list.
    """
    if pid in processed:
        return qid in processed[pid]

    try:
        with open(PID_FILE % pid) as handle:
            known = [line.strip() for line in handle]
    except FileNotFoundError:
        known = []

    processed[pid] = known
    return qid in known
def load_preview(code):
    """Expand wikitext *code* on ``site`` and return it without tags.

    Every substring matching ``<[^<]+?>`` in the expanded text is removed.
    """
    tag_pattern = re.compile('<[^<]+?>')
    return tag_pattern.sub('', site.expand_text(code))
def remove_qualifiers(claim):
    """Remove the ``PID_ADM_UNIT`` and ``PID_COUNTRY`` qualifiers of *claim*.

    All qualifiers stored under those two properties are collected and
    removed with a single ``claim.removeQualifiers`` call that uses
    ``REMOVE_SUMMARY`` as the edit summary.  When the claim carries neither
    qualifier no request is made.

    :param claim: claim object exposing a ``qualifiers`` mapping
        (property id -> list of qualifiers) and ``removeQualifiers``.
    """
    qualifiers = []
    for pid in (PID_ADM_UNIT, PID_COUNTRY):
        qualifiers.extend(claim.qualifiers.get(pid, []))

    if not qualifiers:
        # Nothing to remove: avoid sending an empty removal request.
        return

    claim.removeQualifiers(qualifiers, summary=REMOVE_SUMMARY)
def check_adm_entity(item, place_pid):
    """Check one item and drop redundant qualifiers from its place claim.

    Items already recorded for *place_pid* are skipped, and so are items
    that do not have exactly one *place_pid* claim.  For the remaining items
    the ``wikidata`` template and its ``песочница`` (Russian for "sandbox")
    subpage are both expanded; when the tag-stripped outputs are equal the
    claim's qualifiers are removed via ``remove_qualifiers``.
    """
    qid = item.getID()
    if is_processed(qid, place_pid):
        return

    data = item.get()
    claims = data.get('claims', {}).get(place_pid, [])
    if len(claims) != 1:
        return
    claim = claims[0]

    template_args = (place_pid, qid)
    current_html = load_preview('{{wikidata|%s|from=%s}}' % template_args)
    new_html = load_preview('{{wikidata/песочница|%s|from=%s}}' % template_args)

    if current_html != new_html:
        print('%s: SKIP' % qid)
    else:
        print('%s: REMOVE' % qid)
        remove_qualifiers(claim)

    # NOTE(review): assumes the item is recorded after either outcome — confirm.
    mark_processed(qid, place_pid)
def iterate_items(pid):
    """Run ``check_adm_entity`` on every item returned by the SPARQL query.

    The query selects items that are linked from ru.wikipedia.org and have a
    *pid* statement carrying a P17 or a P131 qualifier.  A failure on one
    item is reported together with the error and the loop moves on to the
    next item.

    :param pid: property id substituted into the query and passed on to
        ``check_adm_entity``.
    """
    print(pid)
    query = '''
SELECT DISTINCT ?item
WHERE {
?item ^schema:about/schema:isPartOf <https://ru.wikipedia.org/>;
p:%s ?place .
{ ?place pq:P17 ?country }
UNION
{ ?place pq:P131 ?unit }
}
''' % pid
    generator = pg.WikidataSPARQLPageGenerator(query, site=repo)
    for item in generator:
        try:
            check_adm_entity(item, pid)
        except Exception as error:
            # Catch Exception rather than everything so that Ctrl-C
            # (KeyboardInterrupt) and SystemExit can still stop the run.
            print('%s: OOPS!!! (%s: %s)'
                  % (item.getID(), type(error).__name__, error))
# Process the place properties one after another: birth place, death place,
# then residence.
for place_pid in (PID_BIRTH_PLACE, PID_DEATH_PLACE, PID_RESIDENCE):
    iterate_items(place_pid)