Commit 6806925 (initial commit): Create dokuwikidump.py
yzqzss committed Feb 14, 2023
1 changed file: dokuwikidump.py (389 additions, 0 deletions)
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# dokuwikidump.py: a generator of dumps for DokuWiki wikis (based on WikiTeam's dumpgenerator.py)
# Copyright (C) 2011-2014 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# To learn more, read the documentation:
# https://github.com/WikiTeam/wikiteam/wiki

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    print 'Need BeautifulSoup for current version. In the future it should use regex for scraping.'
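# Note: this is BeautifulSoup 3 (the Python 2 'BeautifulSoup' package), not bs4.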

import HTMLParser
import urlparse
import requests
import os
import socket
import re
from datetime import datetime
import gzip
import time
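
# The dump mirrors DokuWiki's data/ directory layout under a directory named after the
# wiki URL (see domain2prefix below): pages/ holds current page sources, attic/ the
# gzipped old revisions, meta/ the .changes changelogs, and media/ the uploaded files.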


def getTitles(url, ns=None):
    """Get titles given a doku.php URL and an (optional) namespace"""
    titles = []
    ajax = urlparse.urljoin(url, 'lib/exe/ajax.php')
    params = {'call': 'index'}
    if ns:
        params['idx'] = ns
    else:
        print 'Finding titles'
    ns = ns or ''
    depth = len(ns.split(':'))
    if ns:
        print '%sLooking in namespace %s' % (' ' * depth, ns)
    r = requests.post(ajax, params)
    if r.status_code != 200 or "AJAX call 'index' unknown!" in r.text:
        return getTitlesOld(url, ns=None)
    soup = BeautifulSoup(r.text)
    for a in soup.findAll('a', href=True):
        if a.has_key('title'):
            title = a['title']
        else:
            query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
            title = (query['idx' if 'idx' in query else 'id'])[0]
        if a['class'] == 'idx_dir':
            titles += getTitles(url, title)
        else:
            titles.append(title)
    time.sleep(1.5)
    print '%sFound %d title(s) in namespace %s' % (' ' * depth, len(titles), ns or '(all)')
    return titles


def getTitlesOld(url, ns=None, ancient=False):
    """Get titles using the doku.php?do=index"""

    titles = []
    params = {'do': 'index'}

    if ns:
        params['idx'] = ns
    ns = ns or ''
    depth = len(ns.split(':'))

    r = requests.get(url, params=params)
    soup = BeautifulSoup(r.text).findAll('ul', {'class': 'idx'})[0]
    attr = 'text' if ancient else 'title'

    if ns:
        print '%sSearching in namespace %s' % (' ' * depth, ns)

        def match(href):
            if not href:
                return False
            qs = urlparse.urlparse(href).query
            qs = urlparse.parse_qs(qs)
            return 'idx' in qs and qs['idx'][0] in (ns, ':' + ns)
        result = (soup.findAll('a', {'class': 'idx_dir', 'href': match})[0]
                  .findAllPrevious('li')[0]
                  .findAll('a', {'href': lambda x: x and not match(x)}))
    else:
        print 'Finding titles (?do=index)'
        result = soup.findAll('a')

    for a in result:
        query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
        if a['class'] == 'idx_dir':
            titles += getTitlesOld(url, query['idx'][0])
        else:
            titles.append(query['id'][0])

    print '%sFound %d title(s) in namespace %s' % (' ' * depth, len(titles), ns or '(all)')

    return titles


def getSourceExport(url, title, rev=''):
    """Export the raw source of a page (at a given revision)"""

    r = requests.get(url, params={'id': title, 'rev': rev, 'do': 'export_raw'})
    return r.text


def getSourceEdit(url, title, rev=''):
    """Export the raw source of a page by scraping the edit box content. Yuck."""

    r = requests.get(url, params={'id': title, 'rev': rev, 'do': 'edit'})
    soup = BeautifulSoup(r.text)
    return ''.join(soup.find('textarea', {'name': 'wikitext'}).contents).strip()


def domain2prefix(url):
    """Convert a wiki URL into a valid filename prefix for the dump."""

    domain = url

    domain = domain.lower()
    domain = re.sub(r'(https?://|www\.|/doku\.php)', '', domain)
    domain = re.sub(r'/', '_', domain)
    domain = re.sub(r'\.', '', domain)
    domain = re.sub(r'[^A-Za-z0-9]', '_', domain)

    return domain
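
# Illustrative example (hypothetical URL): domain2prefix('https://wiki.example.org/doku.php')
# returns 'wikiexampleorg', which is used as the top-level dump directory name.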


def getRevisions(url, title, use_hidden_rev=False, select_revs=False):
    """Get the revisions of a page. This is nontrivial because different versions of DokuWiki return completely different revision HTML."""

    revs = []
    h = HTMLParser.HTMLParser()
    if select_revs:
        r = requests.get(url, params={'id': title, 'do': 'diff'})
        soup = BeautifulSoup(r.text)
        select = soup.find('select', {'class': 'quickselect', 'name': 'rev2[1]'})
        for option in select.findAll('option'):
            text = option.text
            date = ' '.join(text.split(' ')[:2])
            username = len(text.split(' ')) > 2 and text.split(' ')[2]
            summary = ' '.join(text.split(' ')[3:])

            revs.append({'id': option['value'],
                         'user': username,
                         'sum': summary,
                         'date': date})

    i = 0
    continue_index = -1
    cont = True

    while cont:
        r = requests.get(url, params={'id': title, 'do': 'revisions', 'first': continue_index})

        soup = BeautifulSoup(r.text)
        lis = soup.findAll('div', {'class': 'level1'})[0].findNext('ul').findAll('li')

        for li in lis:
            rev = {}
            rev_hrefs = li.findAll('a', href=lambda href: href and ('&rev=' in href or '?rev=' in href))
            rev['minor'] = ('class', 'minor') in li.attrs

            if rev_hrefs:
                rev['id'] = urlparse.parse_qs(urlparse.urlparse(rev_hrefs[0]['href']).query)['rev'][0]

            sum_span = li.findAll('span', {'class': 'sum'})
            if sum_span and not select_revs:
                sum_span = sum_span[0]
                sum_text = sum_span.text.split(' ')[1:]
                if sum_span.findAll('bdi'):
                    rev['sum'] = h.unescape(sum_span.find('bdi').text).strip()
                else:
                    rev['sum'] = h.unescape(' '.join(sum_text)).strip()
            elif not select_revs:
                print repr(li.text)
                wikilink1 = li.find('a', {'class': 'wikilink1'})
                text_node = wikilink1 and wikilink1.next and wikilink1.next.next or ''
                if text_node.strip():
                    rev['sum'] = h.unescape(text_node).strip(u'\u2013 \n')

            date_span = li.find('span', {'class': 'date'})
            if date_span:
                rev['date'] = date_span.text.strip()
            else:
                rev['date'] = ' '.join(li.text.split(' ')[:2])
                matches = re.findall(r'([0-9./]+ [0-9]{1,2}:[0-9]{1,2})', rev['date'])
                if matches:
                    rev['date'] = matches[0]

            if not (select_revs and len(revs) > i and revs[i]['user']):
                user_span = li.find('span', {'class': 'user'})
                if user_span:
                    rev['user'] = user_span.text

            if select_revs and len(revs) > i:
                revs[i].update(rev)
            else:
                revs.append(rev)
            i += 1

        first = soup.findAll('input', {'name': 'first', 'value': True})
        continue_index = first and max(map(lambda x: x['value'], first))
        cont = soup.find('input', {'class': 'button', 'accesskey': 'n'})
        time.sleep(1.5)

    if revs and use_hidden_rev and not select_revs:
        soup2 = BeautifulSoup(requests.get(url, params={'id': title}).text)
        revs[0]['id'] = soup2.find('input', {'type': 'hidden', 'name': 'rev', 'value': True})['value']

    return revs
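
# Each returned revision is a dict with (depending on the DokuWiki version) keys such as
# 'id', 'date', 'user', 'sum' and 'minor', ordered newest first as listed by ?do=revisions;
# dumpContent relies on this when it treats revs[0] as the current revision.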


def getFiles(url, ns=''):
    """Return a list of media filenames of a wiki"""
    files = set()
    ajax = urlparse.urljoin(url, 'lib/exe/ajax.php')
    medialist = BeautifulSoup(
        requests.post(ajax, {'call': 'medialist', 'ns': ns, 'do': 'media'}).text)
    medians = BeautifulSoup(
        requests.post(ajax, {'call': 'medians', 'ns': ns, 'do': 'media'}).text)
    imagelinks = medialist.findAll(
        'a', href=lambda x: x and re.findall('[?&](media|image)=', x))
    for a in imagelinks:
        query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
        key = 'media' if 'media' in query else 'image'
        files.add(query[key][0])
    files = list(files)
    namespacelinks = medians.findAll('a', {'class': 'idx_dir', 'href': True})
    for a in namespacelinks:
        query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
        files += getFiles(url, query['ns'][0])
    print 'Found %d files in namespace %s' % (len(files), ns or '(all)')
    return files


def dumpContent(url):
    os.mkdir(domain2prefix(url) + '/pages')
    os.mkdir(domain2prefix(url) + '/attic')
    os.mkdir(domain2prefix(url) + '/meta')

    titles = getTitles(url)
    if not len(titles):
        print 'Empty wiki'
        return

    r1 = requests.get(url, params={'id': titles[0], 'do': 'export_raw'})
    r2 = requests.get(url, params={'id': titles[0]})
    r3 = requests.get(url, params={'id': titles[0], 'do': 'diff'})

    getSource = getSourceExport
    if 'html' in r1.headers['content-type']:
        getSource = getSourceEdit

    soup = BeautifulSoup(r2.text)
    hidden_rev = soup.findAll('input', {'type': 'hidden', 'name': 'rev', 'value': True})
    use_hidden_rev = hidden_rev and hidden_rev[0]['value']

    soup = BeautifulSoup(r3.text)
    select_revs = soup.findAll('select', {'class': 'quickselect', 'name': 'rev2[0]'})

    for title in titles:
        titleparts = title.split(':')
        for i in range(len(titleparts)):
            dir = "/".join(titleparts[:i])
            if not os.path.exists(domain2prefix(url) + '/pages/' + dir):
                os.mkdir(domain2prefix(url) + '/pages/' + dir)
            if not os.path.exists(domain2prefix(url) + '/meta/' + dir):
                os.mkdir(domain2prefix(url) + '/meta/' + dir)
            if not os.path.exists(domain2prefix(url) + '/attic/' + dir):
                os.mkdir(domain2prefix(url) + '/attic/' + dir)
        with open(domain2prefix(url) + '/pages/' + title.replace(':', '/') + '.txt', 'w') as f:
            f.write(getSource(url, title).encode("utf-8"))
        revs = getRevisions(url, title, use_hidden_rev, select_revs)
        for rev in revs[1:]:
            if 'id' in rev and rev['id']:
                with gzip.open(domain2prefix(url) + '/attic/' + title.replace(':', '/') + '.' + rev['id'] + '.txt.gz', 'w') as f:
                    f.write(getSource(url, title, rev['id']).encode("utf-8"))
                time.sleep(1.5)
                print 'Revision %s of %s' % (rev['id'], title)
        with open(domain2prefix(url) + '/meta/' + title.replace(':', '/') + '.changes', 'w') as f:
            # Loop through revisions in reverse.
            for rev in revs[::-1]:
                print rev, title
                sum = 'sum' in rev and rev['sum'].strip() or ''
                id = 0

                ip = '127.0.0.1'
                user = ''
                minor = 'minor' in rev and rev['minor']

                if 'id' in rev and rev['id']:
                    id = rev['id']
                else:
                    # Different date formats in different versions of DokuWiki.
                    # If no ID was found, make one up based on the date (since rev IDs are Unix times).
                    # Maybe this is evil. Not sure.

                    try:
                        date = datetime.strptime(rev['date'], "%Y/%m/%d %H:%M")
                        id = str(int(time.mktime(date.utctimetuple())))
                    except ValueError:
                        date = datetime.strptime(rev['date'], "%d.%m.%Y %H:%M")
                        id = str(int(time.mktime(date.utctimetuple())))

                rev['user'] = rev['user'] if 'user' in rev else 'unknown'
                try:
                    # inet_aton throws an exception if its argument is not an IPv4 address
                    socket.inet_aton(rev['user'])
                    ip = rev['user']
                except socket.error:
                    user = rev['user']

                # Write one tab-separated line per revision, mirroring DokuWiki's
                # meta/*.changes changelog format: timestamp, IP, change type
                # ('E' = edit, 'e' = minor edit), page id, user, edit summary.
                row = '\t'.join([id, ip, 'e' if minor else 'E', title, user, sum])
                row = row.replace('\n', ' ')
                row = row.replace('\r', ' ')

                f.write((row + '\n').encode("utf-8"))


def dumpMedia(url):
    prefix = domain2prefix(url)
    os.mkdir(prefix + '/media')
    os.mkdir(prefix + '/media_attic')
    os.mkdir(prefix + '/media_meta')

    fetch = urlparse.urljoin(url, 'lib/exe/fetch.php')

    files = getFiles(url)
    for title in files:
        titleparts = title.split(':')
        for i in range(len(titleparts)):
            dir = "/".join(titleparts[:i])
            if not os.path.exists(prefix + '/media/' + dir):
                os.mkdir(prefix + '/media/' + dir)
        with open(prefix + '/media/' + title.replace(':', '/'), 'wb') as f:
            f.write(requests.get(fetch, params={'media': title}).content)
        print 'File %s' % title
        time.sleep(1.5)


def dump(url):
    print domain2prefix(url)
    os.mkdir(domain2prefix(url))
    dumpContent(url)
    dumpMedia(url)
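
# Example usage, a minimal sketch: this commit ships no command-line entry point,
# and the URL below is a placeholder for a real DokuWiki doku.php URL.
#
# if __name__ == '__main__':
#     dump('https://wiki.example.org/doku.php')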
