Commit 6806925 (initial commit): Create dokuwikidump.py
yzqzss committed Feb 14, 2023
1 changed file: dokuwikidump.py (389 additions, 0 deletions)
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# dokuwikidump.py: a generator of dumps for DokuWiki wikis (based on WikiTeam's dumpgenerator.py)
# Copyright (C) 2011-2014 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# To learn more, read the documentation:
# https://github.com/WikiTeam/wikiteam/wiki

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    print 'Need BeautifulSoup for current version. In the future it should use regex for scraping.'
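# Note: this is BeautifulSoup 3 (the Python 2 'BeautifulSoup' package), not bs4.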

import HTMLParser
import urlparse
import requests
import os
import socket
import re
from datetime import datetime
import gzip
import time
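
# The dump mirrors DokuWiki's data/ directory layout under a directory named after the
# wiki URL (see domain2prefix below): pages/ holds current page sources, attic/ the
# gzipped old revisions, meta/ the .changes changelogs, and media/ the uploaded files.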


def getTitles(url, ns=None):
    """Get titles given a doku.php URL and an (optional) namespace"""
    titles = []
    ajax = urlparse.urljoin(url, 'lib/exe/ajax.php')
    params = {'call': 'index'}
    if ns:
        params['idx'] = ns
    else:
        print 'Finding titles'
    ns = ns or ''
    depth = len(ns.split(':'))
    if ns:
        print '%sLooking in namespace %s' % (' ' * depth, ns)
    r = requests.post(ajax, params)
    if r.status_code != 200 or "AJAX call 'index' unknown!" in r.text:
        return getTitlesOld(url, ns=None)
    soup = BeautifulSoup(r.text)
    for a in soup.findAll('a', href=True):
        if a.has_key('title'):
            title = a['title']
        else:
            query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
            title = (query['idx' if 'idx' in query else 'id'])[0]
        if a['class'] == 'idx_dir':
            titles += getTitles(url, title)
        else:
            titles.append(title)
    time.sleep(1.5)
    print '%sFound %d title(s) in namespace %s' % (' ' * depth, len(titles), ns or '(all)')
    return titles


def getTitlesOld(url, ns=None, ancient=False):
    """Get titles using the doku.php?do=index"""

    titles = []
    params = {'do': 'index'}

    if ns:
        params['idx'] = ns
    ns = ns or ''
    depth = len(ns.split(':'))

    r = requests.get(url, params=params)
    soup = BeautifulSoup(r.text).findAll('ul', {'class': 'idx'})[0]
    attr = 'text' if ancient else 'title'

    if ns:
        print '%sSearching in namespace %s' % (' ' * depth, ns)

        def match(href):
            if not href:
                return False
            qs = urlparse.urlparse(href).query
            qs = urlparse.parse_qs(qs)
            return 'idx' in qs and qs['idx'][0] in (ns, ':' + ns)
        result = (soup.findAll('a', {'class': 'idx_dir', 'href': match})[0]
                  .findAllPrevious('li')[0]
                  .findAll('a', {'href': lambda x: x and not match(x)}))
    else:
        print 'Finding titles (?do=index)'
        result = soup.findAll('a')

    for a in result:
        query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
        if a['class'] == 'idx_dir':
            titles += getTitlesOld(url, query['idx'][0])
        else:
            titles.append(query['id'][0])

    print '%sFound %d title(s) in namespace %s' % (' ' * depth, len(titles), ns or '(all)')

    return titles


def getSourceExport(url, title, rev=''):
    """Export the raw source of a page (at a given revision)"""

    r = requests.get(url, params={'id': title, 'rev': rev, 'do': 'export_raw'})
    return r.text


def getSourceEdit(url, title, rev=''):
    """Export the raw source of a page by scraping the edit box content. Yuck."""

    r = requests.get(url, params={'id': title, 'rev': rev, 'do': 'edit'})
    soup = BeautifulSoup(r.text)
    return ''.join(soup.find('textarea', {'name': 'wikitext'}).contents).strip()


def domain2prefix(url):
    """Convert a wiki URL into a valid filename prefix for the dump."""

    domain = url

    domain = domain.lower()
    domain = re.sub(r'(https?://|www\.|/doku\.php)', '', domain)
    domain = re.sub(r'/', '_', domain)
    domain = re.sub(r'\.', '', domain)
    domain = re.sub(r'[^A-Za-z0-9]', '_', domain)

    return domain
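
# Illustrative example (hypothetical URL): domain2prefix('https://wiki.example.org/doku.php')
# returns 'wikiexampleorg', which is used as the top-level dump directory name.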


def getRevisions(url, title, use_hidden_rev=False, select_revs=False):
    """Get the revisions of a page. This is nontrivial because different versions of DokuWiki return completely different revision HTML."""

    revs = []
    h = HTMLParser.HTMLParser()
    if select_revs:
        r = requests.get(url, params={'id': title, 'do': 'diff'})
        soup = BeautifulSoup(r.text)
        select = soup.find('select', {'class': 'quickselect', 'name': 'rev2[1]'})
        for option in select.findAll('option'):
            text = option.text
            date = ' '.join(text.split(' ')[:2])
            username = len(text.split(' ')) > 2 and text.split(' ')[2]
            summary = ' '.join(text.split(' ')[3:])

            revs.append({'id': option['value'],
                         'user': username,
                         'sum': summary,
                         'date': date})

    i = 0
    continue_index = -1
    cont = True

    while cont:
        r = requests.get(url, params={'id': title, 'do': 'revisions', 'first': continue_index})

        soup = BeautifulSoup(r.text)
        lis = soup.findAll('div', {'class': 'level1'})[0].findNext('ul').findAll('li')

        for li in lis:
            rev = {}
            rev_hrefs = li.findAll('a', href=lambda href: href and ('&rev=' in href or '?rev=' in href))
            rev['minor'] = ('class', 'minor') in li.attrs

            if rev_hrefs:
                rev['id'] = urlparse.parse_qs(urlparse.urlparse(rev_hrefs[0]['href']).query)['rev'][0]

            sum_span = li.findAll('span', {'class': 'sum'})
            if sum_span and not select_revs:
                sum_span = sum_span[0]
                sum_text = sum_span.text.split(' ')[1:]
                if sum_span.findAll('bdi'):
                    rev['sum'] = h.unescape(sum_span.find('bdi').text).strip()
                else:
                    rev['sum'] = h.unescape(' '.join(sum_text)).strip()
            elif not select_revs:
                print repr(li.text)
                wikilink1 = li.find('a', {'class': 'wikilink1'})
                text_node = wikilink1 and wikilink1.next and wikilink1.next.next or ''
                if text_node.strip():
                    rev['sum'] = h.unescape(text_node).strip(u'\u2013 \n')

            date_span = li.find('span', {'class': 'date'})
            if date_span:
                rev['date'] = date_span.text.strip()
            else:
                rev['date'] = ' '.join(li.text.split(' ')[:2])
                matches = re.findall(r'([0-9./]+ [0-9]{1,2}:[0-9]{1,2})', rev['date'])
                if matches:
                    rev['date'] = matches[0]

            if not (select_revs and len(revs) > i and revs[i]['user']):
                user_span = li.find('span', {'class': 'user'})
                if user_span:
                    rev['user'] = user_span.text

            if select_revs and len(revs) > i:
                revs[i].update(rev)
            else:
                revs.append(rev)
            i += 1

        first = soup.findAll('input', {'name': 'first', 'value': True})
        continue_index = first and max(map(lambda x: x['value'], first))
        cont = soup.find('input', {'class': 'button', 'accesskey': 'n'})
        time.sleep(1.5)

    if revs and use_hidden_rev and not select_revs:
        soup2 = BeautifulSoup(requests.get(url, params={'id': title}).text)
        revs[0]['id'] = soup2.find('input', {'type': 'hidden', 'name': 'rev', 'value': True})['value']

    return revs
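
# Each returned revision is a dict with (depending on the DokuWiki version) keys such as
# 'id', 'date', 'user', 'sum' and 'minor', ordered newest first as listed by ?do=revisions;
# dumpContent relies on this when it treats revs[0] as the current revision.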


def getFiles(url, ns=''):
    """Return a list of media filenames of a wiki"""
    files = set()
    ajax = urlparse.urljoin(url, 'lib/exe/ajax.php')
    medialist = BeautifulSoup(
        requests.post(ajax, {'call': 'medialist', 'ns': ns, 'do': 'media'}).text)
    medians = BeautifulSoup(
        requests.post(ajax, {'call': 'medians', 'ns': ns, 'do': 'media'}).text)
    imagelinks = medialist.findAll(
        'a', href=lambda x: x and re.findall('[?&](media|image)=', x))
    for a in imagelinks:
        query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
        key = 'media' if 'media' in query else 'image'
        files.add(query[key][0])
    files = list(files)
    namespacelinks = medians.findAll('a', {'class': 'idx_dir', 'href': True})
    for a in namespacelinks:
        query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
        files += getFiles(url, query['ns'][0])
    print 'Found %d files in namespace %s' % (len(files), ns or '(all)')
    return files


def dumpContent(url):
    os.mkdir(domain2prefix(url) + '/pages')
    os.mkdir(domain2prefix(url) + '/attic')
    os.mkdir(domain2prefix(url) + '/meta')

    titles = getTitles(url)
    if not len(titles):
        print 'Empty wiki'
        return

    r1 = requests.get(url, params={'id': titles[0], 'do': 'export_raw'})
    r2 = requests.get(url, params={'id': titles[0]})
    r3 = requests.get(url, params={'id': titles[0], 'do': 'diff'})

    getSource = getSourceExport
    if 'html' in r1.headers['content-type']:
        getSource = getSourceEdit

    soup = BeautifulSoup(r2.text)
    hidden_rev = soup.findAll('input', {'type': 'hidden', 'name': 'rev', 'value': True})
    use_hidden_rev = hidden_rev and hidden_rev[0]['value']

    soup = BeautifulSoup(r3.text)
    select_revs = soup.findAll('select', {'class': 'quickselect', 'name': 'rev2[0]'})

    for title in titles:
        titleparts = title.split(':')
        for i in range(len(titleparts)):
            dir = "/".join(titleparts[:i])
            if not os.path.exists(domain2prefix(url) + '/pages/' + dir):
                os.mkdir(domain2prefix(url) + '/pages/' + dir)
            if not os.path.exists(domain2prefix(url) + '/meta/' + dir):
                os.mkdir(domain2prefix(url) + '/meta/' + dir)
            if not os.path.exists(domain2prefix(url) + '/attic/' + dir):
                os.mkdir(domain2prefix(url) + '/attic/' + dir)
        with open(domain2prefix(url) + '/pages/' + title.replace(':', '/') + '.txt', 'w') as f:
            f.write(getSource(url, title).encode("utf-8"))
        revs = getRevisions(url, title, use_hidden_rev, select_revs)
        for rev in revs[1:]:
            if 'id' in rev and rev['id']:
                with gzip.open(domain2prefix(url) + '/attic/' + title.replace(':', '/') + '.' + rev['id'] + '.txt.gz', 'w') as f:
                    f.write(getSource(url, title, rev['id']).encode("utf-8"))
                time.sleep(1.5)
                print 'Revision %s of %s' % (rev['id'], title)
        with open(domain2prefix(url) + '/meta/' + title.replace(':', '/') + '.changes', 'w') as f:
            # Loop through revisions in reverse.
            for rev in revs[::-1]:
                print rev, title
                sum = 'sum' in rev and rev['sum'].strip() or ''
                id = 0

                ip = '127.0.0.1'
                user = ''
                minor = 'minor' in rev and rev['minor']

                if 'id' in rev and rev['id']:
                    id = rev['id']
                else:
                    # Different date formats in different versions of DokuWiki.
                    # If no ID was found, make one up based on the date (since rev IDs are Unix times).
                    # Maybe this is evil. Not sure.

                    try:
                        date = datetime.strptime(rev['date'], "%Y/%m/%d %H:%M")
                        id = str(int(time.mktime(date.utctimetuple())))
                    except ValueError:
                        date = datetime.strptime(rev['date'], "%d.%m.%Y %H:%M")
                        id = str(int(time.mktime(date.utctimetuple())))

                rev['user'] = rev['user'] if 'user' in rev else 'unknown'
                try:
                    # inet_aton throws an exception if its argument is not an IPv4 address
                    socket.inet_aton(rev['user'])
                    ip = rev['user']
                except socket.error:
                    user = rev['user']

                # Write one tab-separated line per revision, mirroring DokuWiki's
                # meta/*.changes changelog format: timestamp, IP, change type
                # ('E' = edit, 'e' = minor edit), page id, user, edit summary.
                row = '\t'.join([id, ip, 'e' if minor else 'E', title, user, sum])
                row = row.replace('\n', ' ')
                row = row.replace('\r', ' ')

                f.write((row + '\n').encode("utf-8"))


def dumpMedia(url):
    prefix = domain2prefix(url)
    os.mkdir(prefix + '/media')
    os.mkdir(prefix + '/media_attic')
    os.mkdir(prefix + '/media_meta')

    fetch = urlparse.urljoin(url, 'lib/exe/fetch.php')

    files = getFiles(url)
    for title in files:
        titleparts = title.split(':')
        for i in range(len(titleparts)):
            dir = "/".join(titleparts[:i])
            if not os.path.exists(prefix + '/media/' + dir):
                os.mkdir(prefix + '/media/' + dir)
        with open(prefix + '/media/' + title.replace(':', '/'), 'wb') as f:
            f.write(requests.get(fetch, params={'media': title}).content)
        print 'File %s' % title
        time.sleep(1.5)


def dump(url):
    print domain2prefix(url)
    os.mkdir(domain2prefix(url))
    dumpContent(url)
    dumpMedia(url)
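
# Example usage, a minimal sketch: this commit ships no command-line entry point,
# and the URL below is a placeholder for a real DokuWiki doku.php URL.
#
# if __name__ == '__main__':
#     dump('https://wiki.example.org/doku.php')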
